/*	$NetBSD: lfs_syscalls.c,v 1.179 2025/11/04 00:50:37 perseant Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2008
 *    The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Konrad E. Schroder <perseant (at) hhhh.org>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 1991, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)lfs_syscalls.c	8.10 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.179 2025/11/04 00:50:37 perseant Exp $");

#ifndef LFS
# define LFS		/* for prototypes in syscallargs.h */
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/syscallargs.h>

#include <ufs/lfs/ulfs_inode.h>
#include <ufs/lfs/ulfsmount.h>
#include <ufs/lfs/ulfs_extern.h>

#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_accessors.h>
#include <ufs/lfs/lfs_kernel.h>
#include <ufs/lfs/lfs_extern.h>

static int lfs_fastvget(struct mount *, ino_t, BLOCK_INFO *, int,
    struct vnode **);
static struct buf *lfs_fakebuf(struct lfs *, struct vnode *, daddr_t,
    size_t, void *);

/*
 * sys_lfs_markv:
 *
 * This will mark inodes and blocks dirty, so they are written into the log.
 * It will block until all the blocks have been written.  The segment create
 * time passed in the block_info and inode_info structures is used to decide
 * if the data is valid for each block (in case some process dirtied a block
 * or inode that is being cleaned between the determination that a block is
 * live and the lfs_markv call).
 *
 * 0 on success
 * -1/errno is returned on error.
 */
#ifdef USE_64BIT_SYSCALLS
int
sys_lfs_markv(struct lwp *l, const struct sys_lfs_markv_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */
	BLOCK_INFO *blkiov;
	int blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOULFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
		return (EINVAL);

	KERNEL_LOCK(1, NULL);
	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov,
			    blkcnt * sizeof(BLOCK_INFO))) != 0)
		goto out;

	if ((error = lfs_markv(l, &fsid, blkiov, blkcnt)) == 0)
		copyout(blkiov, SCARG(uap, blkiov),
			blkcnt * sizeof(BLOCK_INFO));
 out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	KERNEL_UNLOCK_ONE(NULL);
	return error;
}
#else
int
sys_lfs_markv(struct lwp *l, const struct sys_lfs_markv_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */
	BLOCK_INFO *blkiov;
	BLOCK_INFO_15 *blkiov15;
	int i, blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOULFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
		return (EINVAL);

	KERNEL_LOCK(1, NULL);
	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov15,
			    blkcnt * sizeof(BLOCK_INFO_15))) != 0)
		goto out;

	for (i = 0; i < blkcnt; i++) {
		blkiov[i].bi_inode = blkiov15[i].bi_inode;
		blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
		blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
		blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
		blkiov[i].bi_version = blkiov15[i].bi_version;
		blkiov[i].bi_bp = blkiov15[i].bi_bp;
		blkiov[i].bi_size = blkiov15[i].bi_size;
	}

	if ((error = lfs_markv(l, &fsid, blkiov, blkcnt)) == 0) {
		for (i = 0; i < blkcnt; i++) {
			blkiov15[i].bi_inode = blkiov[i].bi_inode;
			blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
			blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
			blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
			blkiov15[i].bi_version = blkiov[i].bi_version;
			blkiov15[i].bi_bp = blkiov[i].bi_bp;
			blkiov15[i].bi_size = blkiov[i].bi_size;
		}
		copyout(blkiov15, SCARG(uap, blkiov),
			blkcnt * sizeof(BLOCK_INFO_15));
	}
 out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	lfs_free(fs, blkiov15, LFS_NB_BLKIOV);
	KERNEL_UNLOCK_ONE(NULL);
	return error;
}
#endif
#define	LFS_MARKV_MAX_BLOCKS	(LFS_MAX_BUFS)

int
lfs_markv(struct lwp *l, fsid_t *fsidp, BLOCK_INFO *blkiov,
    int blkcnt)
{
	BLOCK_INFO *blkp;
	IFILE *ifp;
	struct buf *bp;
	struct inode *ip = NULL;
	struct lfs *fs;
	struct mount *mntp;
	struct ulfsmount *ump;
	struct vnode *vp;
	ino_t lastino;
	daddr_t b_daddr;
	int cnt, error;
	int do_again = 0;
	int numrefed = 0;
	ino_t maxino;
	size_t obsize;

	/* number of blocks/inodes that we have already bwrite'ed */
	int nblkwritten, ninowritten;

	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_LFS,
	    KAUTH_REQ_SYSTEM_LFS_MARKV, NULL, NULL, NULL);
	if (error)
		return (error);

	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		return (ENOENT);

	ump = VFSTOULFS(mntp);
	fs = ump->um_lfs;

	if (fs->lfs_ronly)
		return EROFS;

	maxino = (lfs_fragstoblks(fs, lfs_dino_getblocks(fs,
	    VTOI(fs->lfs_ivnode)->i_din)) -
	    lfs_sb_getcleansz(fs) - lfs_sb_getsegtabsz(fs)) *
	    lfs_sb_getifpb(fs);

	if ((error = vfs_busy(mntp)) != 0)
		return (error);

	/*
	 * Reference all the vnodes we will need before we lock.
	 * This prevents a reclaimed vnode from being written
	 * in the same partial segment with cleaning blocks.
	 */
	lfs_cleanerlock(fs);
	lastino = LFS_UNUSED_INUM;
	for (cnt = blkcnt, blkp = blkiov; cnt--; ++blkp) {
		/* Bounds-check incoming data, avoid panic for failed VGET */
		if (blkp->bi_inode <= 0 || blkp->bi_inode >= maxino) {
			error = EINVAL;
			goto err4;
		}

		if (lastino != blkp->bi_inode) {
			/* Load the vnode and add a cleaning reference */
			error = lfs_fastvget(mntp, blkp->bi_inode, blkp,
			    LK_EXCLUSIVE | LK_NOWAIT, &vp);
			if (error == 0) {
				lfs_setclean(fs, vp);
				vput(vp);
			}
			vp = NULL;

			lastino = blkp->bi_inode;
		}
	}

	/*
	 * Take the seglock so that, even though we may have to sleep
	 * below, our blocks cannot become invalid in the meantime.
	 *
	 * It is also important to note here that unless we specify SEGM_CKP,
	 * any Ifile blocks that we might be asked to clean will never get
	 * to the disk.
	 */
	lfs_seglock(fs, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);

	/* Mark blocks/inodes dirty.  */
	error = 0;

	/* these were inside the initialization for the for loop */
	vp = NULL;
	lastino = LFS_UNUSED_INUM;
	nblkwritten = ninowritten = 0;
	cnt = blkcnt;
	for (blkp = blkiov; cnt--; ++blkp)
	{
		/* Bounds-check incoming data, avoid panic for failed VGET */
		if (blkp->bi_inode <= 0 || blkp->bi_inode >= maxino) {
			error = EINVAL;
			goto err3;
		}
		/*
		 * Get the IFILE entry (only once) and see if the file still
		 * exists.
		 */
		if (lastino != blkp->bi_inode) {
			/*
			 * Finish the old file, if there was one.
			 */
			if (vp != NULL) {
				vput(vp);
				vp = NULL;
				numrefed--;
			}

			/*
			 * Start a new file
			 */
			lastino = blkp->bi_inode;

			/* Get the vnode/inode. */
			error = lfs_fastvget(mntp, blkp->bi_inode, blkp,
			    LK_EXCLUSIVE | LK_NOWAIT, &vp);
			if (error) {
				DLOG((DLOG_CLEAN, "lfs_markv: lfs_fastvget"
				    " failed with %d (ino %d, segment %d)\n",
				    error, blkp->bi_inode,
				    lfs_dtosn(fs, blkp->bi_daddr)));
				/*
				 * If we got EAGAIN, that means that the
				 * inode was locked.  This is
				 * recoverable: just clean the rest of
				 * this segment, and let the cleaner try
				 * again with another.  (When the
				 * cleaner runs again, this segment will
				 * sort high on the list, since it is
				 * now almost entirely empty.)
				 */
				if (error == EAGAIN) {
					error = 0;
					do_again++;
				} else
					KASSERT(error == ENOENT);
				KASSERT(vp == NULL);
				ip = NULL;
				continue;
			}

			ip = VTOI(vp);
			numrefed++;
			ninowritten++;
		} else if (vp == NULL) {
			/*
			 * This can only happen if the vnode is dead (or
			 * in any case we can't get it...e.g., it is
			 * locked).  Keep going.
			 */
			continue;
		}

		/* Past this point we are guaranteed that vp, ip are valid. */

		/* Can't clean VU_DIROP directories in case of truncation */
		/* XXX - maybe we should mark removed dirs specially? */
		if (vp->v_type == VDIR && (vp->v_uflag & VU_DIROP)) {
			do_again++;
			continue;
		}

		/* If this BLOCK_INFO didn't contain a block, keep going. */
		if (blkp->bi_lbn == LFS_UNUSED_LBN) {
			/* XXX need to make sure that the inode gets written in this case */
			/* XXX but only write the inode if it's the right one */
			if (blkp->bi_inode != LFS_IFILE_INUM) {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				if (lfs_if_getdaddr(fs, ifp) == blkp->bi_daddr) {
					lfs_setclean(fs, vp);
				}
				brelse(bp, 0);
			}
			continue;
		}

		b_daddr = 0;
		if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &b_daddr, NULL) ||
		    LFS_DBTOFSB(fs, b_daddr) != blkp->bi_daddr)
		{
			if (lfs_dtosn(fs, LFS_DBTOFSB(fs, b_daddr)) ==
			    lfs_dtosn(fs, blkp->bi_daddr))
			{
				DLOG((DLOG_CLEAN,
				    "lfs_markv: wrong da same seg: %jx vs %jx\n",
				    (intmax_t)blkp->bi_daddr,
				    (intmax_t)LFS_DBTOFSB(fs, b_daddr)));
			}
			do_again++;
			continue;
		}

		/*
		 * Check block sizes.  The blocks being cleaned come from
		 * disk, so they should have the same size as their on-disk
		 * counterparts.
		 */
		if (blkp->bi_lbn >= 0)
			obsize = lfs_blksize(fs, ip, blkp->bi_lbn);
		else
			obsize = lfs_sb_getbsize(fs);
		/* Check for fragment size change */
		if (blkp->bi_lbn >= 0 && blkp->bi_lbn < ULFS_NDADDR) {
			obsize = ip->i_lfs_fragsize[blkp->bi_lbn];
		}
		if (obsize != blkp->bi_size) {
			DLOG((DLOG_CLEAN, "lfs_markv: ino %d lbn %jd wrong"
			    " size (%ld != %d), try again\n",
			    blkp->bi_inode, (intmax_t)blkp->bi_lbn,
			    (long) obsize, blkp->bi_size));
			do_again++;
			continue;
		}

		/*
		 * If we get to here, then we are keeping the block.  If
		 * it is an indirect block, we want to actually put it
		 * in the buffer cache so that it can be updated in the
		 * finish_meta section.  If it's not, we need to
		 * allocate a fake buffer so that writeseg can perform
		 * the copyin and write the buffer.
		 */
		if (ip->i_number != LFS_IFILE_INUM && blkp->bi_lbn >= 0) {
			/* Data Block */
			bp = lfs_fakebuf(fs, vp, blkp->bi_lbn,
			    blkp->bi_size, blkp->bi_bp);
			if (bp == NULL) {
				/* lfs_fakebuf's copyin failed */
				error = EFAULT;
				goto err2;
			}
			/* Pretend we used bread() to get it */
			bp->b_blkno = LFS_FSBTODB(fs, blkp->bi_daddr);
		} else {
			/* Indirect block or ifile */
			if (blkp->bi_size != lfs_sb_getbsize(fs) &&
			    ip->i_number != LFS_IFILE_INUM)
				panic("lfs_markv: partial indirect block?"
				    " size=%d\n", blkp->bi_size);
			bp = getblk(vp, blkp->bi_lbn, blkp->bi_size, 0, 0);
			if (!(bp->b_oflags & (BO_DONE|BO_DELWRI))) {
				/*
				 * The block in question was not found
				 * in the cache; i.e., the block that
				 * getblk() returned is empty.  So, we
				 * can (and should) copy in the
				 * contents, because we've already
				 * determined that this was the right
				 * version of this block on disk.
				 *
				 * And, it can't have changed underneath
				 * us, because we have the segment lock.
				 */
				error = copyin(blkp->bi_bp, bp->b_data,
				    blkp->bi_size);
				if (error)
					goto err2;
			}
		}
		if ((error = lfs_bwrite_ext(bp, BW_CLEAN)) != 0)
			goto err2;

		nblkwritten++;
		/*
		 * XXX should account indirect blocks and ifile pages as well
		 */
		if (nblkwritten + lfs_lblkno(fs, ninowritten * DINOSIZE(fs))
		    > LFS_MARKV_MAX_BLOCKS) {
			DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos\n",
			    nblkwritten, ninowritten));
			lfs_segwrite(mntp, SEGM_CLEAN);
			nblkwritten = ninowritten = 0;
		}
	}

	/*
	 * Finish the old file, if there was one
	 */
	if (vp != NULL) {
		vput(vp);
		vp = NULL;
		numrefed--;
	}

	KASSERTMSG((numrefed == 0), "lfs_markv: numrefed=%d", numrefed);
	DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos (check point)\n",
	    nblkwritten, ninowritten));

	/*
	 * The last write has to be SEGM_SYNC, because of calling semantics.
	 * It also has to be SEGM_CKP, because otherwise we could write
	 * over the newly cleaned data contained in a checkpoint, and then
	 * we'd be unhappy at recovery time.
	 */
	lfs_segwrite(mntp, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);

	lfs_segunlock(fs);

	vfs_unbusy(mntp);
	if (error)
		return (error);
	else if (do_again)
		return EAGAIN;

	return 0;

 err2:
	DLOG((DLOG_CLEAN, "lfs_markv err2\n"));

	/*
	 * XXX we're here because copyin() failed.
	 * XXX it means that we can't trust the cleanerd.  too bad.
	 * XXX how can we recover from this?
	 */

 err3:
	/*
	 * XXX should do segwrite here anyway?
	 */

	if (vp != NULL) {
		vput(vp);
		vp = NULL;
		--numrefed;
	}

	lfs_segunlock(fs);

 err4:
	vfs_unbusy(mntp);
	KASSERTMSG((numrefed == 0), "lfs_markv: numrefed=%d", numrefed);

	return (error);
}

/*
 * sys_lfs_bmapv:
 *
 * This will fill in the current disk address for arrays of blocks.
 *
 * 0 on success
 * -1/errno is returned on error.
 */
#ifdef USE_64BIT_SYSCALLS
int
sys_lfs_bmapv(struct lwp *l, const struct sys_lfs_bmapv_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */
	BLOCK_INFO *blkiov;
	int blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOULFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
#if SIZE_T_MAX <= UINT_MAX
	if ((u_int) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
		return (EINVAL);
#endif
	KERNEL_LOCK(1, NULL);
	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov,
			    blkcnt * sizeof(BLOCK_INFO))) != 0)
		goto out;

	if ((error = lfs_bmapv(l, &fsid, blkiov, blkcnt)) == 0)
		copyout(blkiov, SCARG(uap, blkiov),
			blkcnt * sizeof(BLOCK_INFO));
 out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	KERNEL_UNLOCK_ONE(NULL);
	return error;
}
#else
int
sys_lfs_bmapv(struct lwp *l, const struct sys_lfs_bmapv_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */
	BLOCK_INFO *blkiov;
	BLOCK_INFO_15 *blkiov15;
	int i, blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOULFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((size_t) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
		return (EINVAL);
	KERNEL_LOCK(1, NULL);
	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov15,
			    blkcnt * sizeof(BLOCK_INFO_15))) != 0)
		goto out;

	for (i = 0; i < blkcnt; i++) {
		blkiov[i].bi_inode = blkiov15[i].bi_inode;
		blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
		blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
		blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
		blkiov[i].bi_version = blkiov15[i].bi_version;
		blkiov[i].bi_bp = blkiov15[i].bi_bp;
		blkiov[i].bi_size = blkiov15[i].bi_size;
	}

	if ((error = lfs_bmapv(l, &fsid, blkiov, blkcnt)) == 0) {
		for (i = 0; i < blkcnt; i++) {
			blkiov15[i].bi_inode = blkiov[i].bi_inode;
			blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
			blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
			blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
			blkiov15[i].bi_version = blkiov[i].bi_version;
			blkiov15[i].bi_bp = blkiov[i].bi_bp;
			blkiov15[i].bi_size = blkiov[i].bi_size;
		}
		copyout(blkiov15, SCARG(uap, blkiov),
			blkcnt * sizeof(BLOCK_INFO_15));
	}
 out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	lfs_free(fs, blkiov15, LFS_NB_BLKIOV);
	KERNEL_UNLOCK_ONE(NULL);
	return error;
}
#endif
int
lfs_bmapv(struct lwp *l, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
{
	BLOCK_INFO *blkp;
	IFILE *ifp;
	struct buf *bp;
	struct inode *ip = NULL;
	struct lfs *fs;
	struct mount *mntp;
	struct ulfsmount *ump;
	struct vnode *vp;
	ino_t lastino;
	daddr_t v_daddr;
	int cnt, error;
	int numrefed = 0;

	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_LFS,
	    KAUTH_REQ_SYSTEM_LFS_BMAPV, NULL, NULL, NULL);
	if (error)
		return (error);

	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		return (ENOENT);

	if ((error = vfs_busy(mntp)) != 0)
		return (error);

	ump = VFSTOULFS(mntp);
	fs = ump->um_lfs;

	if (fs->lfs_cleaner_thread == NULL)
		fs->lfs_cleaner_thread = curlwp;
	KASSERT(fs->lfs_cleaner_thread == curlwp);

	cnt = blkcnt;

	error = 0;

	/* these were inside the initialization for the for loop */
	vp = NULL;
	v_daddr = LFS_UNUSED_DADDR;
	lastino = LFS_UNUSED_INUM;
	for (blkp = blkiov; cnt--; ++blkp)
	{
		/*
		 * Get the IFILE entry (only once) and see if the file still
		 * exists.
		 */
		if (lastino != blkp->bi_inode) {
			/*
			 * Finish the old file, if there was one.
			 */
			if (vp != NULL) {
				vput(vp);
				vp = NULL;
				numrefed--;
			}

			/*
			 * Start a new file
			 */
			lastino = blkp->bi_inode;
			if (blkp->bi_inode == LFS_IFILE_INUM)
				v_daddr = lfs_sb_getidaddr(fs);
			else {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				v_daddr = lfs_if_getdaddr(fs, ifp);
				brelse(bp, 0);
			}
			if (v_daddr == LFS_UNUSED_DADDR) {
				blkp->bi_daddr = LFS_UNUSED_DADDR;
				continue;
			}
			error = lfs_fastvget(mntp, blkp->bi_inode, NULL,
			    LK_SHARED, &vp);
			if (error) {
				DLOG((DLOG_CLEAN, "lfs_bmapv: lfs_fastvget"
				    " ino %d failed with %d",
				    blkp->bi_inode, error));
				KASSERT(vp == NULL);
				continue;
			} else {
				KASSERT(VOP_ISLOCKED(vp));
				numrefed++;
			}
			ip = VTOI(vp);
		} else if (vp == NULL) {
			/*
			 * This can only happen if the vnode is dead.
			 * Keep going.  Note that we DO NOT set the
			 * bi_addr to anything -- if we failed to get
			 * the vnode, for example, we want to assume
			 * conservatively that all of its blocks *are*
			 * located in the segment in question.
			 * lfs_markv will throw them out if we are
			 * wrong.
			 */
			continue;
		}

		/* Past this point we are guaranteed that vp, ip are valid. */

		if (blkp->bi_lbn == LFS_UNUSED_LBN) {
			/*
			 * We just want the inode address, which is
			 * conveniently in v_daddr.
			 */
			blkp->bi_daddr = v_daddr;
		} else {
			daddr_t bi_daddr;

			error = VOP_BMAP(vp, blkp->bi_lbn, NULL,
			    &bi_daddr, NULL);
			if (error)
			{
				blkp->bi_daddr = LFS_UNUSED_DADDR;
				continue;
			}
			blkp->bi_daddr = LFS_DBTOFSB(fs, bi_daddr);
			/* Fill in the block size, too */
			if (blkp->bi_lbn >= 0)
				blkp->bi_size = lfs_blksize(fs, ip,
				    blkp->bi_lbn);
			else
				blkp->bi_size = lfs_sb_getbsize(fs);
		}
	}

	/*
	 * Finish the old file, if there was one.
	 */
	if (vp != NULL) {
		vput(vp);
		vp = NULL;
		numrefed--;
	}

	KASSERTMSG((numrefed == 0), "lfs_bmapv: numrefed=%d", numrefed);

	vfs_unbusy(mntp);

	return 0;
}

/*
 * sys_lfs_segclean:
 *
 * Mark the segment clean.
 *
 * 0 on success
 * -1/errno is returned on error.
 */
int
sys_lfs_segclean(struct lwp *l, const struct sys_lfs_segclean_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(u_long) segment;
	} */
	struct lfs *fs;
	struct mount *mntp;
	fsid_t fsid;
	int error;
	unsigned long segnum;

	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_LFS,
	    KAUTH_REQ_SYSTEM_LFS_SEGCLEAN, NULL, NULL, NULL);
	if (error)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);
	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);

	fs = VFSTOULFS(mntp)->um_lfs;
	segnum = SCARG(uap, segment);

	if ((error = vfs_busy(mntp)) != 0)
		return (error);

	KERNEL_LOCK(1, NULL);
	lfs_seglock(fs, SEGM_PROT);
	error = lfs_do_segclean(fs, segnum, l->l_cred, l);
	lfs_segunlock(fs);
	KERNEL_UNLOCK_ONE(NULL);
	vfs_unbusy(mntp);
	return error;
}

/*
 * Actually mark the segment clean.
 * Must be called with the segment lock held.
 */
int
lfs_do_segclean(struct lfs *fs, unsigned long segnum, kauth_cred_t cred,
    struct lwp *l)
{
	struct buf *bp;
	SEGUSE *sup;

	ASSERT_SEGLOCK(fs);

	if (lfs_dtosn(fs, lfs_sb_getcurseg(fs)) == segnum) {
		return (EBUSY);
	}

	LFS_SEGENTRY(sup, fs, segnum, bp);
	if (sup->su_nbytes) {
		DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
		    " %d live bytes\n", segnum, sup->su_nbytes));
		brelse(bp, 0);
		return (EBUSY);
	}
	if (sup->su_flags & SEGUSE_ACTIVE) {
		DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
		    " segment is active\n", segnum));
		brelse(bp, 0);
		return (EBUSY);
	}
	if (!(sup->su_flags & SEGUSE_DIRTY)) {
		DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
		    " segment is already clean\n", segnum));
		brelse(bp, 0);
		return (EALREADY);
	}

	lfs_markclean(fs, segnum, sup, cred, l);
	LFS_WRITESEGENTRY(sup, fs, segnum, bp);

	return 0;
}

int
lfs_markclean(struct lfs *fs, unsigned long segnum, SEGUSE *sup,
    kauth_cred_t cred, struct lwp *l)
{
	extern int lfs_dostats;
	struct buf *bp;
	CLEANERINFO *cip;

	ASSERT_SEGLOCK(fs);

#ifdef DEBUG
	if (lfs_checkempty(fs, segnum, cred, l) == EEXIST)
		panic("Live data in cleaned segment %jd\n", (intmax_t)segnum);
#endif /* DEBUG */

	lfs_sb_addavail(fs, lfs_segtod(fs, 1));
	if (sup->su_flags & SEGUSE_SUPERBLOCK)
		lfs_sb_subavail(fs, lfs_btofsb(fs, LFS_SBPAD));
	if (lfs_sb_getversion(fs) > 1 && segnum == 0 &&
	    lfs_sb_gets0addr(fs) < lfs_btofsb(fs, LFS_LABELPAD))
		lfs_sb_subavail(fs,
		    lfs_btofsb(fs, LFS_LABELPAD) - lfs_sb_gets0addr(fs));
	mutex_enter(&lfs_lock);
	lfs_sb_addbfree(fs, sup->su_nsums * lfs_btofsb(fs, lfs_sb_getsumsize(fs)) +
	    lfs_btofsb(fs, sup->su_ninos * lfs_sb_getibsize(fs)));
	lfs_sb_subdmeta(fs, sup->su_nsums * lfs_btofsb(fs, lfs_sb_getsumsize(fs)) +
	    lfs_btofsb(fs, sup->su_ninos * lfs_sb_getibsize(fs)));
	if (lfs_sb_getdmeta(fs) < 0)
		lfs_sb_setdmeta(fs, 0);
	mutex_exit(&lfs_lock);
	sup->su_flags &= ~(SEGUSE_ACTIVE | SEGUSE_DIRTY
	    | SEGUSE_EMPTY | SEGUSE_READY
	    | SEGUSE_ERROR);

	LFS_CLEANERINFO(cip, fs, bp);
	lfs_ci_shiftdirtytoclean(fs, cip, 1);
	lfs_sb_setnclean(fs, lfs_ci_getclean(fs, cip));
	mutex_enter(&lfs_lock);
	lfs_ci_setbfree(fs, cip, lfs_sb_getbfree(fs));
	lfs_ci_setavail(fs, cip, lfs_sb_getavail(fs)
	    - fs->lfs_ravail - fs->lfs_favail);
	wakeup(&fs->lfs_availsleep);
	mutex_exit(&lfs_lock);
	(void) LFS_BWRITE_LOG(bp);

	if (lfs_dostats)
		++lfs_stats.segs_reclaimed;

	return (0);
}
/*
 * This will block until a segment in file system fsid is written.  A timeout
 * may be specified, which will wake the cleaner automatically.
 * An fsid of -1 means any file system, and a timeout of 0 means forever.
 */
int
lfs_segwait(fsid_t *fsidp, struct timeval *tv)
{
	struct mount *mntp;
	void *addr;
	u_long timeout;
	int error;

	mutex_enter(&lfs_lock);
	if (fsidp == NULL || (mntp = vfs_getvfs(fsidp)) == NULL)
		addr = &lfs_allclean_wakeup;
	else
		addr = &VFSTOULFS(mntp)->um_lfs->lfs_nextsegsleep;
	/*
	 * XXX THIS COULD SLEEP FOREVER IF TIMEOUT IS {0,0}!
	 * XXX IS THAT WHAT IS INTENDED?
	 */
	timeout = tvtohz(tv);
	error = cv_timedwait_sig(addr, &lfs_lock, timeout);
	mutex_exit(&lfs_lock);
	return (error == ERESTART ? EINTR : 0);
}

/*
 * sys_lfs_segwait:
 *
 * System call wrapper around lfs_segwait().
 *
 * 0 on success
 * 1 on timeout
 * -1/errno is returned on error.
 */
int
sys___lfs_segwait50(struct lwp *l, const struct sys___lfs_segwait50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct timeval *) tv;
	} */
	struct timeval atv;
	fsid_t fsid;
	int error;

	/* XXX need we be su to segwait? */
	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_LFS,
	    KAUTH_REQ_SYSTEM_LFS_SEGWAIT, NULL, NULL, NULL);
	if (error)
		return (error);
	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if (SCARG(uap, tv)) {
		error = copyin(SCARG(uap, tv), &atv, sizeof(struct timeval));
		if (error)
			return (error);
		if (itimerfix(&atv))
			return (EINVAL);
	} else /* NULL or invalid */
		atv.tv_sec = atv.tv_usec = 0;
	return lfs_segwait(&fsid, &atv);
}

/*
 * VFS_VGET call specialized for the cleaner.  If the cleaner is
 * processing IINFO structures, it may have the ondisk inode already, so
 * don't go retrieving it again.
 *
 * Return the vnode referenced and locked.
 */
static int
lfs_fastvget(struct mount *mp, ino_t ino, BLOCK_INFO *blkp, int lk_flags,
    struct vnode **vpp)
{
	struct ulfsmount *ump;
	struct lfs *fs;
	int error;

	ump = VFSTOULFS(mp);
	fs = ump->um_lfs;
	fs->lfs_cleaner_hint = blkp;
	error = vcache_get(mp, &ino, sizeof(ino), vpp);
	fs->lfs_cleaner_hint = NULL;
	if (error)
		return error;
	error = vn_lock(*vpp, lk_flags);
	if (error) {
		if (error == EBUSY)
			error = EAGAIN;
		vrele(*vpp);
		*vpp = NULL;
		return error;
	}

	return 0;
}

/*
 * Make up a "fake" cleaner buffer, copy the data from userland into it.
 */
static struct buf *
lfs_fakebuf(struct lfs *fs, struct vnode *vp, daddr_t lbn, size_t size,
    void *uaddr)
{
	struct buf *bp;
	int error;

	KASSERT(VTOI(vp)->i_number != LFS_IFILE_INUM);

	bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, size, LFS_NB_CLEAN);
	error = copyin(uaddr, bp->b_data, size);
	if (error) {
		lfs_freebuf(fs, bp);
		return NULL;
	}
	KDASSERT(bp->b_iodone == lfs_free_aiodone);

#if 0
	mutex_enter(&lfs_lock);
	++fs->lfs_iocount;
	mutex_exit(&lfs_lock);
#endif
	bp->b_bufsize = size;
	bp->b_bcount = size;
	return (bp);
}