/*	$NetBSD: lfs_syscalls.c,v 1.176 2020/02/18 20:23:17 chs Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2008
 *    The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Konrad E. Schroder <perseant (at) hhhh.org>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 1991, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)lfs_syscalls.c	8.10 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.176 2020/02/18 20:23:17 chs Exp $");

#ifndef LFS
# define LFS		/* for prototypes in syscallargs.h */
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/syscallargs.h>

#include <ufs/lfs/ulfs_inode.h>
#include <ufs/lfs/ulfsmount.h>
#include <ufs/lfs/ulfs_extern.h>

#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_accessors.h>
#include <ufs/lfs/lfs_kernel.h>
#include <ufs/lfs/lfs_extern.h>

static int lfs_fastvget(struct mount *, ino_t, BLOCK_INFO *, int,
    struct vnode **);
static struct buf *lfs_fakebuf(struct lfs *, struct vnode *, daddr_t,
    size_t, void *);

/*
 * sys_lfs_markv:
 *
 * This will mark inodes and blocks dirty, so they are written into the log.
 * It will block until all the blocks have been written.  The segment create
 * time passed in the block_info and inode_info structures is used to decide
 * if the data is valid for each block (in case some process dirtied a block
 * or inode that is being cleaned between the determination that a block is
 * live and the lfs_markv call).
 *
 * 0 on success
 * -1/errno is returned on error.
 */
#ifdef USE_64BIT_SYSCALLS
int
sys_lfs_markv(struct lwp *l, const struct sys_lfs_markv_args *uap, register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */
	BLOCK_INFO *blkiov;
	int blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOULFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
		return (EINVAL);

	KERNEL_LOCK(1, NULL);
	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov,
	    blkcnt * sizeof(BLOCK_INFO))) != 0)
		goto out;

	if ((error = lfs_markv(l, &fsid, blkiov, blkcnt)) == 0)
		copyout(blkiov, SCARG(uap, blkiov),
		    blkcnt * sizeof(BLOCK_INFO));
out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	KERNEL_UNLOCK_ONE(NULL);
	return error;
}
#else
int
sys_lfs_markv(struct lwp *l, const struct sys_lfs_markv_args *uap, register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */
	BLOCK_INFO *blkiov;
	BLOCK_INFO_15 *blkiov15;
	int i, blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOULFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
		return (EINVAL);

	KERNEL_LOCK(1, NULL);
	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov15,
	    blkcnt * sizeof(BLOCK_INFO_15))) != 0)
		goto out;

	for (i = 0; i < blkcnt; i++) {
		blkiov[i].bi_inode = blkiov15[i].bi_inode;
		blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
		blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
		blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
		blkiov[i].bi_version = blkiov15[i].bi_version;
		blkiov[i].bi_bp = blkiov15[i].bi_bp;
		blkiov[i].bi_size = blkiov15[i].bi_size;
	}

	if ((error = lfs_markv(l, &fsid, blkiov, blkcnt)) == 0) {
		for (i = 0; i < blkcnt; i++) {
			blkiov15[i].bi_inode = blkiov[i].bi_inode;
			blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
			blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
			blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
			blkiov15[i].bi_version = blkiov[i].bi_version;
			blkiov15[i].bi_bp = blkiov[i].bi_bp;
			blkiov15[i].bi_size = blkiov[i].bi_size;
		}
		copyout(blkiov15, SCARG(uap, blkiov),
		    blkcnt * sizeof(BLOCK_INFO_15));
	}
out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	lfs_free(fs, blkiov15, LFS_NB_BLKIOV);
	KERNEL_UNLOCK_ONE(NULL);
	return error;
}
#endif
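
/*
 * Illustrative sketch only: roughly how a userland cleaner might drive
 * the syscall above for a single block.  The variable names and bi_*
 * values here are hypothetical; a real cleaner such as lfs_cleanerd(8)
 * fills them in from the segment summaries it has already parsed.
 *
 *	BLOCK_INFO bi;
 *
 *	bi.bi_inode = ino;		inode the block belongs to
 *	bi.bi_lbn = lbn;		logical block, or LFS_UNUSED_LBN
 *					to rewrite only the inode
 *	bi.bi_daddr = daddr;		address the cleaner found it at
 *	bi.bi_segcreate = segcreate;	segment creation time
 *	bi.bi_version = vers;		version from the ifile entry
 *	bi.bi_bp = databuf;		userland copy of the block
 *	bi.bi_size = size;
 *
 *	if (lfs_markv(&fsid, &bi, 1) < 0 && errno == EAGAIN)
 *		... offer this segment to the cleaner again later ...
 *
 * EAGAIN from lfs_markv() is not fatal: it means some blocks could not
 * be rewritten (e.g. a vnode was locked), so the segment should simply
 * be retried on a later pass.
 */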

#define	LFS_MARKV_MAX_BLOCKS	(LFS_MAX_BUFS)

int
lfs_markv(struct lwp *l, fsid_t *fsidp, BLOCK_INFO *blkiov,
    int blkcnt)
{
	BLOCK_INFO *blkp;
	IFILE *ifp;
	struct buf *bp;
	struct inode *ip = NULL;
	struct lfs *fs;
	struct mount *mntp;
	struct ulfsmount *ump;
	struct vnode *vp;
	ino_t lastino;
	daddr_t b_daddr;
	int cnt, error;
	int do_again = 0;
	int numrefed = 0;
	ino_t maxino;
	size_t obsize;

	/* number of blocks/inodes that we have already bwrite'ed */
	int nblkwritten, ninowritten;

	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_LFS,
	    KAUTH_REQ_SYSTEM_LFS_MARKV, NULL, NULL, NULL);
	if (error)
		return (error);

	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		return (ENOENT);

	ump = VFSTOULFS(mntp);
	fs = ump->um_lfs;

	if (fs->lfs_ronly)
		return EROFS;

	maxino = (lfs_fragstoblks(fs, lfs_dino_getblocks(fs, VTOI(fs->lfs_ivnode)->i_din)) -
	    lfs_sb_getcleansz(fs) - lfs_sb_getsegtabsz(fs)) * lfs_sb_getifpb(fs);

	cnt = blkcnt;

	if ((error = vfs_busy(mntp)) != 0)
		return (error);

	/*
	 * Take the seglock so that, if we are forced to sleep below, our
	 * blocks cannot become invalid in the meantime.
	 *
	 * Note also that unless we specify SEGM_CKP, any Ifile blocks
	 * that we might be asked to clean will never get to the disk.
	 */
	lfs_seglock(fs, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);

	/* Mark blocks/inodes dirty.  */
	error = 0;

	/* these were inside the initialization for the for loop */
	vp = NULL;
	lastino = LFS_UNUSED_INUM;
	nblkwritten = ninowritten = 0;
	for (blkp = blkiov; cnt--; ++blkp)
	{
		/* Bounds-check incoming data, avoid panic for failed VGET */
		if (blkp->bi_inode <= 0 || blkp->bi_inode >= maxino) {
			error = EINVAL;
			goto err3;
		}
		/*
		 * Get the IFILE entry (only once) and see if the file still
		 * exists.
		 */
		if (lastino != blkp->bi_inode) {
			/*
			 * Finish the old file, if there was one.
			 */
			if (vp != NULL) {
				vput(vp);
				vp = NULL;
				numrefed--;
			}

			/*
			 * Start a new file
			 */
			lastino = blkp->bi_inode;

			/* Get the vnode/inode. */
			error = lfs_fastvget(mntp, blkp->bi_inode, blkp,
			    LK_EXCLUSIVE | LK_NOWAIT, &vp);
			if (error) {
				DLOG((DLOG_CLEAN, "lfs_markv: lfs_fastvget"
				      " failed with %d (ino %d, segment %d)\n",
				      error, blkp->bi_inode,
				      lfs_dtosn(fs, blkp->bi_daddr)));
				/*
				 * If we got EAGAIN, that means that the
				 * Inode was locked.  This is
				 * recoverable: just clean the rest of
				 * this segment, and let the cleaner try
				 * again with another.  (When the
				 * cleaner runs again, this segment will
				 * sort high on the list, since it is
				 * now almost entirely empty.)
				 */
				if (error == EAGAIN) {
					error = 0;
					do_again++;
				} else
					KASSERT(error == ENOENT);
				KASSERT(vp == NULL);
				ip = NULL;
				continue;
			}

			ip = VTOI(vp);
			numrefed++;
			ninowritten++;
		} else if (vp == NULL) {
			/*
			 * This can only happen if the vnode is dead (or
			 * in any case we can't get it...e.g., it is
			 * locked).  Keep going.
			 */
			continue;
		}

		/* Past this point we are guaranteed that vp, ip are valid. */

		/* Can't clean VU_DIROP directories in case of truncation */
		/* XXX - maybe we should mark removed dirs specially? */
		if (vp->v_type == VDIR && (vp->v_uflag & VU_DIROP)) {
			do_again++;
			continue;
		}

		/* If this BLOCK_INFO didn't contain a block, keep going. */
		if (blkp->bi_lbn == LFS_UNUSED_LBN) {
			/* XXX need to make sure that the inode gets written in this case */
			/* XXX but only write the inode if it's the right one */
			if (blkp->bi_inode != LFS_IFILE_INUM) {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				if (lfs_if_getdaddr(fs, ifp) == blkp->bi_daddr) {
					mutex_enter(&lfs_lock);
					LFS_SET_UINO(ip, IN_CLEANING);
					mutex_exit(&lfs_lock);
				}
				brelse(bp, 0);
			}
			continue;
		}

		b_daddr = 0;
		if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &b_daddr, NULL) ||
		    LFS_DBTOFSB(fs, b_daddr) != blkp->bi_daddr)
		{
			if (lfs_dtosn(fs, LFS_DBTOFSB(fs, b_daddr)) ==
			    lfs_dtosn(fs, blkp->bi_daddr))
			{
				DLOG((DLOG_CLEAN, "lfs_markv: wrong da same seg: %jx vs %jx\n",
				      (intmax_t)blkp->bi_daddr, (intmax_t)LFS_DBTOFSB(fs, b_daddr)));
			}
			do_again++;
			continue;
		}

		/*
		 * Check block sizes.  The blocks being cleaned come from
		 * disk, so they should have the same size as their on-disk
		 * counterparts.
		 */
		if (blkp->bi_lbn >= 0)
			obsize = lfs_blksize(fs, ip, blkp->bi_lbn);
		else
			obsize = lfs_sb_getbsize(fs);
		/* Check for fragment size change */
		if (blkp->bi_lbn >= 0 && blkp->bi_lbn < ULFS_NDADDR) {
			obsize = ip->i_lfs_fragsize[blkp->bi_lbn];
		}
		if (obsize != blkp->bi_size) {
			DLOG((DLOG_CLEAN, "lfs_markv: ino %d lbn %jd wrong"
			      " size (%ld != %d), try again\n",
			      blkp->bi_inode, (intmax_t)blkp->bi_lbn,
			      (long) obsize, blkp->bi_size));
			do_again++;
			continue;
		}

		/*
		 * If we get to here, then we are keeping the block.  If
		 * it is an indirect block, we want to actually put it
		 * in the buffer cache so that it can be updated in the
		 * finish_meta section.  If it's not, we need to
		 * allocate a fake buffer so that writeseg can perform
		 * the copyin and write the buffer.
		 */
		if (ip->i_number != LFS_IFILE_INUM && blkp->bi_lbn >= 0) {
			/* Data Block */
			bp = lfs_fakebuf(fs, vp, blkp->bi_lbn,
			    blkp->bi_size, blkp->bi_bp);
			/* Pretend we used bread() to get it */
			bp->b_blkno = LFS_FSBTODB(fs, blkp->bi_daddr);
		} else {
			/* Indirect block or ifile */
			if (blkp->bi_size != lfs_sb_getbsize(fs) &&
			    ip->i_number != LFS_IFILE_INUM)
				panic("lfs_markv: partial indirect block?"
				    " size=%d\n", blkp->bi_size);
			bp = getblk(vp, blkp->bi_lbn, blkp->bi_size, 0, 0);
			if (!(bp->b_oflags & (BO_DONE|BO_DELWRI))) {
				/*
				 * The block in question was not found
				 * in the cache; i.e., the block that
				 * getblk() returned is empty.  So, we
				 * can (and should) copy in the
				 * contents, because we've already
				 * determined that this was the right
				 * version of this block on disk.
				 *
				 * And, it can't have changed underneath
				 * us, because we have the segment lock.
				 */
				error = copyin(blkp->bi_bp, bp->b_data, blkp->bi_size);
				if (error)
					goto err2;
			}
		}
		if ((error = lfs_bwrite_ext(bp, BW_CLEAN)) != 0)
			goto err2;

		nblkwritten++;
		/*
		 * XXX should account indirect blocks and ifile pages as well
		 */
		if (nblkwritten + lfs_lblkno(fs, ninowritten * DINOSIZE(fs))
		    > LFS_MARKV_MAX_BLOCKS) {
			DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos\n",
			      nblkwritten, ninowritten));
			lfs_segwrite(mntp, SEGM_CLEAN);
			nblkwritten = ninowritten = 0;
		}
	}

	/*
	 * Finish the old file, if there was one
	 */
	if (vp != NULL) {
		vput(vp);
		vp = NULL;
		numrefed--;
	}

	KASSERTMSG((numrefed == 0), "lfs_markv: numrefed=%d", numrefed);
	DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos (check point)\n",
	      nblkwritten, ninowritten));

	/*
	 * The last write has to be SEGM_SYNC, because of calling semantics.
	 * It also has to be SEGM_CKP, because otherwise we could write
	 * over the newly cleaned data contained in a checkpoint, and then
	 * we'd be unhappy at recovery time.
	 */
	lfs_segwrite(mntp, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);

	lfs_segunlock(fs);

	vfs_unbusy(mntp);
	if (error)
		return (error);
	else if (do_again)
		return EAGAIN;

	return 0;

err2:
	DLOG((DLOG_CLEAN, "lfs_markv err2\n"));

	/*
	 * XXX we're here because copyin() failed.
	 * XXX it means that we can't trust the cleanerd.  too bad.
	 * XXX how can we recover from this?
	 */

err3:
	/*
	 * XXX should do segwrite here anyway?
	 */

	if (vp != NULL) {
		vput(vp);
		vp = NULL;
		--numrefed;
	}

	lfs_segunlock(fs);
	vfs_unbusy(mntp);
	KASSERTMSG((numrefed == 0), "lfs_markv: numrefed=%d", numrefed);

	return (error);
}

/*
 * sys_lfs_bmapv:
 *
 * This will fill in the current disk address for arrays of blocks.
 *
 * 0 on success
 * -1/errno is returned on error.
 */
#ifdef USE_64BIT_SYSCALLS
int
sys_lfs_bmapv(struct lwp *l, const struct sys_lfs_bmapv_args *uap, register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */
	BLOCK_INFO *blkiov;
	int blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOULFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
#if SIZE_T_MAX <= UINT_MAX
	if ((u_int) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
		return (EINVAL);
#endif
	KERNEL_LOCK(1, NULL);
	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov,
	    blkcnt * sizeof(BLOCK_INFO))) != 0)
		goto out;

	if ((error = lfs_bmapv(l, &fsid, blkiov, blkcnt)) == 0)
		copyout(blkiov, SCARG(uap, blkiov),
		    blkcnt * sizeof(BLOCK_INFO));
out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	KERNEL_UNLOCK_ONE(NULL);
	return error;
}
#else
int
sys_lfs_bmapv(struct lwp *l, const struct sys_lfs_bmapv_args *uap, register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */
	BLOCK_INFO *blkiov;
	BLOCK_INFO_15 *blkiov15;
	int i, blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOULFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((size_t) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
		return (EINVAL);
	KERNEL_LOCK(1, NULL);
	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov15,
	    blkcnt * sizeof(BLOCK_INFO_15))) != 0)
		goto out;

	for (i = 0; i < blkcnt; i++) {
		blkiov[i].bi_inode = blkiov15[i].bi_inode;
		blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
		blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
		blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
		blkiov[i].bi_version = blkiov15[i].bi_version;
		blkiov[i].bi_bp = blkiov15[i].bi_bp;
		blkiov[i].bi_size = blkiov15[i].bi_size;
	}

	if ((error = lfs_bmapv(l, &fsid, blkiov, blkcnt)) == 0) {
		for (i = 0; i < blkcnt; i++) {
			blkiov15[i].bi_inode = blkiov[i].bi_inode;
			blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
			blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
			blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
			blkiov15[i].bi_version = blkiov[i].bi_version;
			blkiov15[i].bi_bp = blkiov[i].bi_bp;
			blkiov15[i].bi_size = blkiov[i].bi_size;
		}
		copyout(blkiov15, SCARG(uap, blkiov),
		    blkcnt * sizeof(BLOCK_INFO_15));
	}
out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	lfs_free(fs, blkiov15, LFS_NB_BLKIOV);
	KERNEL_UNLOCK_ONE(NULL);
	return error;
}
#endif
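
/*
 * Illustrative sketch only: lfs_bmapv() is the cleaner's liveness test.
 * For every block it found while scanning a segment, the cleaner
 * compares the address it read the block from with the address the
 * file system currently maps that block to; only matches are still
 * live and worth passing on to lfs_markv().  Variable names below are
 * hypothetical:
 *
 *	lfs_bmapv(&fsid, bip, nblocks);
 *	for (i = 0; i < nblocks; i++)
 *		if (bip[i].bi_daddr == found_daddr[i])
 *			... keep bip[i] for the lfs_markv() pass ...
 *
 * A dead or unmappable block is not an error here: it simply comes
 * back with bi_daddr set to LFS_UNUSED_DADDR.
 */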

int
lfs_bmapv(struct lwp *l, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
{
	BLOCK_INFO *blkp;
	IFILE *ifp;
	struct buf *bp;
	struct inode *ip = NULL;
	struct lfs *fs;
	struct mount *mntp;
	struct ulfsmount *ump;
	struct vnode *vp;
	ino_t lastino;
	daddr_t v_daddr;
	int cnt, error;
	int numrefed = 0;

	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_LFS,
	    KAUTH_REQ_SYSTEM_LFS_BMAPV, NULL, NULL, NULL);
	if (error)
		return (error);

	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		return (ENOENT);

	if ((error = vfs_busy(mntp)) != 0)
		return (error);

	ump = VFSTOULFS(mntp);
	fs = ump->um_lfs;

	if (fs->lfs_cleaner_thread == NULL)
		fs->lfs_cleaner_thread = curlwp;
	KASSERT(fs->lfs_cleaner_thread == curlwp);

	cnt = blkcnt;

	error = 0;

	/* these were inside the initialization for the for loop */
	vp = NULL;
	v_daddr = LFS_UNUSED_DADDR;
	lastino = LFS_UNUSED_INUM;
	for (blkp = blkiov; cnt--; ++blkp)
	{
		/*
		 * Get the IFILE entry (only once) and see if the file still
		 * exists.
		 */
		if (lastino != blkp->bi_inode) {
			/*
			 * Finish the old file, if there was one.
			 */
			if (vp != NULL) {
				vput(vp);
				vp = NULL;
				numrefed--;
			}

			/*
			 * Start a new file
			 */
			lastino = blkp->bi_inode;
			if (blkp->bi_inode == LFS_IFILE_INUM)
				v_daddr = lfs_sb_getidaddr(fs);
			else {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				v_daddr = lfs_if_getdaddr(fs, ifp);
				brelse(bp, 0);
			}
			if (v_daddr == LFS_UNUSED_DADDR) {
				blkp->bi_daddr = LFS_UNUSED_DADDR;
				continue;
			}
			error = lfs_fastvget(mntp, blkp->bi_inode, NULL,
			    LK_SHARED, &vp);
			if (error) {
				DLOG((DLOG_CLEAN, "lfs_bmapv: lfs_fastvget"
				      " ino %d failed with %d\n",
				      blkp->bi_inode, error));
				KASSERT(vp == NULL);
				continue;
			} else {
				KASSERT(VOP_ISLOCKED(vp));
				numrefed++;
			}
			ip = VTOI(vp);
		} else if (vp == NULL) {
			/*
			 * This can only happen if the vnode is dead.
			 * Keep going.  Note that we DO NOT set the
			 * bi_addr to anything -- if we failed to get
			 * the vnode, for example, we want to assume
			 * conservatively that all of its blocks *are*
			 * located in the segment in question.
			 * lfs_markv will throw them out if we are
			 * wrong.
			 */
			continue;
		}

		/* Past this point we are guaranteed that vp, ip are valid. */

		if (blkp->bi_lbn == LFS_UNUSED_LBN) {
			/*
			 * We just want the inode address, which is
			 * conveniently in v_daddr.
			 */
			blkp->bi_daddr = v_daddr;
		} else {
			daddr_t bi_daddr;

			error = VOP_BMAP(vp, blkp->bi_lbn, NULL,
			    &bi_daddr, NULL);
			if (error)
			{
				blkp->bi_daddr = LFS_UNUSED_DADDR;
				continue;
			}
			blkp->bi_daddr = LFS_DBTOFSB(fs, bi_daddr);
			/* Fill in the block size, too */
			if (blkp->bi_lbn >= 0)
				blkp->bi_size = lfs_blksize(fs, ip, blkp->bi_lbn);
			else
				blkp->bi_size = lfs_sb_getbsize(fs);
		}
	}

	/*
	 * Finish the old file, if there was one.
	 */
	if (vp != NULL) {
		vput(vp);
		vp = NULL;
		numrefed--;
	}

	KASSERTMSG((numrefed == 0), "lfs_bmapv: numrefed=%d", numrefed);

	vfs_unbusy(mntp);

	return 0;
}

/*
 * sys_lfs_segclean:
 *
 * Mark the segment clean.
 *
 * 0 on success
 * -1/errno is returned on error.
 */
int
sys_lfs_segclean(struct lwp *l, const struct sys_lfs_segclean_args *uap, register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(u_long) segment;
	} */
	struct lfs *fs;
	struct mount *mntp;
	fsid_t fsid;
	int error;
	unsigned long segnum;

	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_LFS,
	    KAUTH_REQ_SYSTEM_LFS_SEGCLEAN, NULL, NULL, NULL);
	if (error)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);
	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);

	fs = VFSTOULFS(mntp)->um_lfs;
	segnum = SCARG(uap, segment);

	if ((error = vfs_busy(mntp)) != 0)
		return (error);

	KERNEL_LOCK(1, NULL);
	lfs_seglock(fs, SEGM_PROT);
	error = lfs_do_segclean(fs, segnum);
	lfs_segunlock(fs);
	KERNEL_UNLOCK_ONE(NULL);
	vfs_unbusy(mntp);
	return error;
}

/*
 * Actually mark the segment clean.
 * Must be called with the segment lock held.
 */
int
lfs_do_segclean(struct lfs *fs, unsigned long segnum)
{
	extern int lfs_dostats;
	struct buf *bp;
	CLEANERINFO *cip;
	SEGUSE *sup;

	if (lfs_dtosn(fs, lfs_sb_getcurseg(fs)) == segnum) {
		return (EBUSY);
	}

	LFS_SEGENTRY(sup, fs, segnum, bp);
	if (sup->su_nbytes) {
		DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
		      " %d live bytes\n", segnum, sup->su_nbytes));
		brelse(bp, 0);
		return (EBUSY);
	}
	if (sup->su_flags & SEGUSE_ACTIVE) {
		DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
		      " segment is active\n", segnum));
		brelse(bp, 0);
		return (EBUSY);
	}
	if (!(sup->su_flags & SEGUSE_DIRTY)) {
		DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
		      " segment is already clean\n", segnum));
		brelse(bp, 0);
		return (EALREADY);
	}

	lfs_sb_addavail(fs, lfs_segtod(fs, 1));
	if (sup->su_flags & SEGUSE_SUPERBLOCK)
		lfs_sb_subavail(fs, lfs_btofsb(fs, LFS_SBPAD));
	if (lfs_sb_getversion(fs) > 1 && segnum == 0 &&
	    lfs_sb_gets0addr(fs) < lfs_btofsb(fs, LFS_LABELPAD))
		lfs_sb_subavail(fs, lfs_btofsb(fs, LFS_LABELPAD) - lfs_sb_gets0addr(fs));
	mutex_enter(&lfs_lock);
	lfs_sb_addbfree(fs, sup->su_nsums * lfs_btofsb(fs, lfs_sb_getsumsize(fs)) +
	    lfs_btofsb(fs, sup->su_ninos * lfs_sb_getibsize(fs)));
	lfs_sb_subdmeta(fs, sup->su_nsums * lfs_btofsb(fs, lfs_sb_getsumsize(fs)) +
	    lfs_btofsb(fs, sup->su_ninos * lfs_sb_getibsize(fs)));
	if (lfs_sb_getdmeta(fs) < 0)
		lfs_sb_setdmeta(fs, 0);
	mutex_exit(&lfs_lock);
	sup->su_flags &= ~SEGUSE_DIRTY;
	LFS_WRITESEGENTRY(sup, fs, segnum, bp);

	LFS_CLEANERINFO(cip, fs, bp);
	lfs_ci_shiftdirtytoclean(fs, cip, 1);
	lfs_sb_setnclean(fs, lfs_ci_getclean(fs, cip));
	mutex_enter(&lfs_lock);
	lfs_ci_setbfree(fs, cip, lfs_sb_getbfree(fs));
	lfs_ci_setavail(fs, cip, lfs_sb_getavail(fs)
	    - fs->lfs_ravail - fs->lfs_favail);
	wakeup(&fs->lfs_availsleep);
	mutex_exit(&lfs_lock);
	(void) LFS_BWRITE_LOG(bp);

	if (lfs_dostats)
		++lfs_stats.segs_reclaimed;

	return (0);
}
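
/*
 * Illustrative sketch only: taken together, the syscalls in this file
 * form the cleaner's main loop, roughly as follows (segment selection
 * and error handling omitted, names hypothetical):
 *
 *	for (;;) {
 *		sn = choose_dirty_segment();
 *		read segment sn, building bip[0..n-1] from its summaries;
 *		lfs_bmapv(&fsid, bip, n);	which blocks are live?
 *		drop entries whose bi_daddr no longer matches;
 *		lfs_markv(&fsid, bip, n);	rewrite the live ones
 *		lfs_segclean(&fsid, sn);	segment is now reusable
 *		lfs_segwait(&fsid, &tv);	sleep until more work
 *	}
 */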

/*
 * This will block until a segment in file system fsid is written.  A timeout
 * (a struct timeval, converted to clock ticks) may be specified, after which
 * the cleaner will awake automatically.  An fsid of -1 means any file
 * system, and a timeout of 0 means forever.
 */
int
lfs_segwait(fsid_t *fsidp, struct timeval *tv)
{
	struct mount *mntp;
	void *addr;
	u_long timeout;
	int error;

	mutex_enter(&lfs_lock);
	if (fsidp == NULL || (mntp = vfs_getvfs(fsidp)) == NULL)
		addr = &lfs_allclean_wakeup;
	else
		addr = &VFSTOULFS(mntp)->um_lfs->lfs_nextsegsleep;
	/*
	 * XXX THIS COULD SLEEP FOREVER IF TIMEOUT IS {0,0}!
	 * XXX IS THAT WHAT IS INTENDED?
	 */
	timeout = tvtohz(tv);
	error = cv_timedwait_sig(addr, &lfs_lock, timeout);
	mutex_exit(&lfs_lock);
	return (error == ERESTART ? EINTR : 0);
}

/*
 * sys_lfs_segwait:
 *
 * System call wrapper around lfs_segwait().
 *
 * 0 on success
 * 1 on timeout
 * -1/errno is returned on error.
 */
int
sys___lfs_segwait50(struct lwp *l, const struct sys___lfs_segwait50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct timeval *) tv;
	} */
	struct timeval atv;
	fsid_t fsid;
	int error;

	/* XXX need we be su to segwait? */
	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_LFS,
	    KAUTH_REQ_SYSTEM_LFS_SEGWAIT, NULL, NULL, NULL);
	if (error)
		return (error);
	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if (SCARG(uap, tv)) {
		error = copyin(SCARG(uap, tv), &atv, sizeof(struct timeval));
		if (error)
			return (error);
		if (itimerfix(&atv))
			return (EINVAL);
	} else /* NULL or invalid */
		atv.tv_sec = atv.tv_usec = 0;
	return lfs_segwait(&fsid, &atv);
}

/*
 * VFS_VGET call specialized for the cleaner.  If the cleaner is
 * processing IINFO structures, it may have the on-disk inode already, so
 * don't go retrieving it again.
 *
 * Return the vnode referenced and locked.
 */
static int
lfs_fastvget(struct mount *mp, ino_t ino, BLOCK_INFO *blkp, int lk_flags,
    struct vnode **vpp)
{
	struct ulfsmount *ump;
	struct lfs *fs;
	int error;

	ump = VFSTOULFS(mp);
	fs = ump->um_lfs;
	fs->lfs_cleaner_hint = blkp;
	error = vcache_get(mp, &ino, sizeof(ino), vpp);
	fs->lfs_cleaner_hint = NULL;
	if (error)
		return error;
	error = vn_lock(*vpp, lk_flags);
	if (error) {
		if (error == EBUSY)
			error = EAGAIN;
		vrele(*vpp);
		*vpp = NULL;
		return error;
	}

	return 0;
}

/*
 * Make up a "fake" cleaner buffer and copy the data from userland into it.
 */
static struct buf *
lfs_fakebuf(struct lfs *fs, struct vnode *vp, daddr_t lbn, size_t size, void *uaddr)
{
	struct buf *bp;
	int error;

	KASSERT(VTOI(vp)->i_number != LFS_IFILE_INUM);

	bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, size, LFS_NB_CLEAN);
	error = copyin(uaddr, bp->b_data, size);
	if (error) {
		lfs_freebuf(fs, bp);
		return NULL;
	}
	KDASSERT(bp->b_iodone == lfs_free_aiodone);

#if 0
	mutex_enter(&lfs_lock);
	++fs->lfs_iocount;
	mutex_exit(&lfs_lock);
#endif
	bp->b_bufsize = size;
	bp->b_bcount = size;
	return (bp);
}