1 /* $NetBSD: ffs_snapshot.c,v 1.155 2023/05/11 23:11:25 chs Exp $ */ 2 3 /* 4 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 5 * 6 * Further information about snapshots can be obtained from: 7 * 8 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 9 * 1614 Oxford Street mckusick (at) mckusick.com 10 * Berkeley, CA 94709-1608 +1-510-843-9542 11 * USA 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 23 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 24 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 25 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 26 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 27 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 36 * 37 * from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp 38 */ 39 40 #include <sys/cdefs.h> 41 __KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.155 2023/05/11 23:11:25 chs Exp $"); 42 43 #if defined(_KERNEL_OPT) 44 #include "opt_ffs.h" 45 #include "opt_quota.h" 46 #endif 47 48 #include <sys/param.h> 49 #include <sys/kernel.h> 50 #include <sys/systm.h> 51 #include <sys/conf.h> 52 #include <sys/buf.h> 53 #include <sys/proc.h> 54 #include <sys/namei.h> 55 #include <sys/sched.h> 56 #include <sys/stat.h> 57 #include <sys/malloc.h> 58 #include <sys/mount.h> 59 #include <sys/resource.h> 60 #include <sys/resourcevar.h> 61 #include <sys/vnode.h> 62 #include <sys/kauth.h> 63 #include <sys/fstrans.h> 64 #include <sys/wapbl.h> 65 66 #include <miscfs/specfs/specdev.h> 67 68 #include <ufs/ufs/quota.h> 69 #include <ufs/ufs/ufsmount.h> 70 #include <ufs/ufs/inode.h> 71 #include <ufs/ufs/ufs_extern.h> 72 #include <ufs/ufs/ufs_bswap.h> 73 #include <ufs/ufs/ufs_wapbl.h> 74 75 #include <ufs/ffs/fs.h> 76 #include <ufs/ffs/ffs_extern.h> 77 78 #include <uvm/uvm.h> 79 80 TAILQ_HEAD(inodelst, inode); /* List of active snapshots */ 81 82 struct snap_info { 83 kmutex_t si_lock; /* Lock this snapinfo */ 84 kmutex_t si_snaplock; /* Snapshot vnode common lock */ 85 lwp_t *si_owner; /* Snaplock owner */ 86 struct inodelst si_snapshots; /* List of active snapshots */ 87 daddr_t *si_snapblklist; /* Snapshot block hints list */ 88 uint32_t si_gen; /* Incremented on change */ 89 }; 90 91 #if !defined(FFS_NO_SNAPSHOT) 92 typedef int (*acctfunc_t) 93 (struct vnode *, void *, int, int, struct fs *, daddr_t, int); 94 95 static int snapshot_setup(struct mount *, struct vnode *); 96 static int snapshot_copyfs(struct mount *, struct vnode *, void **); 97 static int snapshot_expunge(struct mount *, struct vnode *, 98 struct fs *, daddr_t *, daddr_t **); 99 static int snapshot_expunge_snap(struct mount *, struct vnode *, 100 struct fs *, daddr_t); 101 static int snapshot_writefs(struct mount *, struct vnode *, void *); 102 static int cgaccount(struct vnode *, int, int *); 103 static int cgaccount1(int, struct vnode *, void *, int); 104 static int expunge(struct vnode *, struct inode *, struct fs *, 105 acctfunc_t, int); 106 static int indiracct(struct vnode *, struct vnode *, int, daddr_t, 107 daddr_t, daddr_t, daddr_t, daddr_t, struct fs *, acctfunc_t, int); 108 static int fullacct(struct vnode *, void *, int, int, struct fs *, 109 daddr_t, int); 110 static int snapacct(struct vnode *, void *, int, int, struct fs *, 111 daddr_t, int); 112 static int mapacct(struct vnode *, void *, int, int, struct fs *, 113 daddr_t, int); 114 #endif /* !defined(FFS_NO_SNAPSHOT) */ 115 116 static int ffs_copyonwrite(void *, struct buf *, bool); 117 static int snapblkaddr(struct vnode *, daddr_t, daddr_t *); 118 static int rwfsblk(struct vnode *, int, void *, daddr_t); 119 static int syncsnap(struct vnode *); 120 static int wrsnapblk(struct vnode *, void *, daddr_t); 121 #if !defined(FFS_NO_SNAPSHOT) 122 static int blocks_in_journal(struct fs *); 123 #endif 124 125 static inline bool is_active_snapshot(struct snap_info *, struct inode *); 126 static inline daddr_t db_get(struct inode *, int); 127 static inline void db_assign(struct inode *, int, daddr_t); 128 static inline daddr_t ib_get(struct inode *, int); 129 static inline daddr_t idb_get(struct inode *, void *, int); 130 static inline void idb_assign(struct inode *, void *, int, daddr_t); 131 132 #ifdef DEBUG 133 static int snapdebug = 0; 134 #endif 135 136 int 137 ffs_snapshot_init(struct ufsmount *ump) 138 { 139 struct snap_info *si; 140 141 si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP); 142 TAILQ_INIT(&si->si_snapshots); 143 mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE); 144 mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE); 145 si->si_owner = NULL; 146 si->si_gen = 0; 147 si->si_snapblklist = NULL; 148 149 return 0; 150 } 151 152 void 153 ffs_snapshot_fini(struct ufsmount *ump) 154 { 155 struct snap_info *si; 156 157 si = ump->um_snapinfo; 158 ump->um_snapinfo = NULL; 159 160 KASSERT(TAILQ_EMPTY(&si->si_snapshots)); 161 mutex_destroy(&si->si_lock); 162 mutex_destroy(&si->si_snaplock); 163 KASSERT(si->si_snapblklist == NULL); 164 kmem_free(si, sizeof(*si)); 165 } 166 167 /* 168 * Create a snapshot file and initialize it for the filesystem. 169 * Vnode is locked on entry and return. 170 */ 171 int 172 ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime) 173 { 174 #if defined(FFS_NO_SNAPSHOT) 175 return EOPNOTSUPP; 176 } 177 #else /* defined(FFS_NO_SNAPSHOT) */ 178 bool suspended = false; 179 int error, redo = 0, snaploc; 180 void *sbbuf = NULL; 181 daddr_t *snaplist = NULL, snaplistsize = 0; 182 struct buf *bp, *nbp; 183 struct fs *copy_fs = NULL; 184 struct fs *fs = VFSTOUFS(mp)->um_fs; 185 struct inode *ip = VTOI(vp); 186 struct lwp *l = curlwp; 187 struct snap_info *si = VFSTOUFS(mp)->um_snapinfo; 188 struct timespec ts; 189 struct timeval starttime; 190 #ifdef DEBUG 191 struct timeval endtime; 192 #endif 193 struct vnode *devvp = ip->i_devvp; 194 195 /* 196 * If the vnode already is a snapshot, return. 197 */ 198 if ((ip->i_flags & SF_SNAPSHOT)) { 199 if ((ip->i_flags & SF_SNAPINVAL)) 200 return EINVAL; 201 if (ctime) { 202 ctime->tv_sec = DIP(ip, mtime); 203 ctime->tv_nsec = DIP(ip, mtimensec); 204 } 205 return 0; 206 } 207 /* 208 * Check for free snapshot slot in the superblock. 209 */ 210 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 211 if (fs->fs_snapinum[snaploc] == 0) 212 break; 213 if (snaploc == FSMAXSNAP) 214 return (ENOSPC); 215 /* 216 * Prepare the vnode to become a snapshot. 217 */ 218 error = snapshot_setup(mp, vp); 219 if (error) 220 goto out; 221 222 /* 223 * Copy all the cylinder group maps. Although the 224 * filesystem is still active, we hope that only a few 225 * cylinder groups will change between now and when we 226 * suspend operations. Thus, we will be able to quickly 227 * touch up the few cylinder groups that changed during 228 * the suspension period. 229 */ 230 error = cgaccount(vp, 1, NULL); 231 if (error) 232 goto out; 233 234 /* 235 * snapshot is now valid 236 */ 237 ip->i_flags &= ~SF_SNAPINVAL; 238 DIP_ASSIGN(ip, flags, ip->i_flags); 239 ip->i_flag |= IN_CHANGE | IN_UPDATE; 240 241 /* 242 * Ensure that the snapshot is completely on disk. 243 * Since we have marked it as a snapshot it is safe to 244 * unlock it as no process will be allowed to write to it. 245 */ 246 error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0); 247 if (error) 248 goto out; 249 VOP_UNLOCK(vp); 250 /* 251 * All allocations are done, so we can now suspend the filesystem. 252 */ 253 error = vfs_suspend(vp->v_mount, 0); 254 if (error == 0) { 255 suspended = true; 256 vrele_flush(vp->v_mount); 257 error = VFS_SYNC(vp->v_mount, MNT_WAIT, curlwp->l_cred); 258 } 259 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 260 if (error) 261 goto out; 262 getmicrotime(&starttime); 263 /* 264 * First, copy all the cylinder group maps that have changed. 265 */ 266 error = cgaccount(vp, 2, &redo); 267 if (error) 268 goto out; 269 /* 270 * Create a copy of the superblock and its summary information. 271 */ 272 error = snapshot_copyfs(mp, vp, &sbbuf); 273 if (error) 274 goto out; 275 copy_fs = (struct fs *)((char *)sbbuf + ffs_blkoff(fs, fs->fs_sblockloc)); 276 /* 277 * Expunge unlinked files from our view. 278 */ 279 error = snapshot_expunge(mp, vp, copy_fs, &snaplistsize, &snaplist); 280 if (error) 281 goto out; 282 /* 283 * Record snapshot inode. Since this is the newest snapshot, 284 * it must be placed at the end of the list. 285 */ 286 if (ip->i_nlink > 0) 287 fs->fs_snapinum[snaploc] = ip->i_number; 288 289 mutex_enter(&si->si_lock); 290 if (is_active_snapshot(si, ip)) 291 panic("ffs_snapshot: %"PRIu64" already on list", ip->i_number); 292 TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap); 293 if (TAILQ_FIRST(&si->si_snapshots) == ip) { 294 /* 295 * If this is the first snapshot on this filesystem, put the 296 * preliminary list in place and establish the cow handler. 297 */ 298 si->si_snapblklist = snaplist; 299 fscow_establish(mp, ffs_copyonwrite, devvp); 300 } 301 si->si_gen++; 302 mutex_exit(&si->si_lock); 303 304 vp->v_vflag |= VV_SYSTEM; 305 /* 306 * Set the mtime to the time the snapshot has been taken. 307 */ 308 TIMEVAL_TO_TIMESPEC(&starttime, &ts); 309 if (ctime) 310 *ctime = ts; 311 DIP_ASSIGN(ip, mtime, ts.tv_sec); 312 DIP_ASSIGN(ip, mtimensec, ts.tv_nsec); 313 ip->i_flag |= IN_CHANGE | IN_UPDATE; 314 /* 315 * Copy allocation information from all snapshots and then 316 * expunge them from our view. 317 */ 318 error = snapshot_expunge_snap(mp, vp, copy_fs, snaplistsize); 319 if (error) 320 goto out; 321 /* 322 * Write the superblock and its summary information to the snapshot. 323 */ 324 error = snapshot_writefs(mp, vp, sbbuf); 325 if (error) 326 goto out; 327 /* 328 * We're nearly done, ensure that the snapshot is completely on disk. 329 */ 330 error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0); 331 if (error) 332 goto out; 333 /* 334 * Invalidate and free all pages on the snapshot vnode. 335 * We will read and write through the buffercache. 336 */ 337 rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); 338 error = VOP_PUTPAGES(vp, 0, 0, 339 PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO | PGO_FREE); 340 if (error) 341 goto out; 342 /* 343 * Invalidate short ( < fs_bsize ) buffers. We will always read 344 * full size buffers later. 345 */ 346 mutex_enter(&bufcache_lock); 347 KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL); 348 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { 349 nbp = LIST_NEXT(bp, b_vnbufs); 350 if (bp->b_bcount == fs->fs_bsize) 351 continue; 352 error = bbusy(bp, false, 0, NULL); 353 if (error != 0) { 354 if (error == EPASSTHROUGH) { 355 nbp = LIST_FIRST(&vp->v_cleanblkhd); 356 continue; 357 } 358 break; 359 } 360 brelsel(bp, BC_INVAL | BC_VFLUSH); 361 } 362 mutex_exit(&bufcache_lock); 363 364 out: 365 if (sbbuf != NULL) { 366 free(copy_fs->fs_csp, M_UFSMNT); 367 free(sbbuf, M_UFSMNT); 368 } 369 if (fs->fs_active != NULL) { 370 free(fs->fs_active, M_DEVBUF); 371 fs->fs_active = NULL; 372 } 373 374 mutex_enter(&si->si_lock); 375 if (snaplist != NULL) { 376 if (si->si_snapblklist == snaplist) 377 si->si_snapblklist = NULL; 378 free(snaplist, M_UFSMNT); 379 } 380 if (error) { 381 fs->fs_snapinum[snaploc] = 0; 382 } else { 383 /* 384 * As this is the newest list, it is the most inclusive, so 385 * should replace the previous list. 386 */ 387 si->si_snapblklist = ip->i_snapblklist; 388 } 389 si->si_gen++; 390 mutex_exit(&si->si_lock); 391 392 if (suspended) { 393 VOP_UNLOCK(vp); 394 vfs_resume(vp->v_mount); 395 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 396 #ifdef DEBUG 397 getmicrotime(&endtime); 398 timersub(&endtime, &starttime, &endtime); 399 printf("%s: suspended %lld.%03d sec, redo %d of %d\n", 400 mp->mnt_stat.f_mntonname, (long long)endtime.tv_sec, 401 endtime.tv_usec / 1000, redo, fs->fs_ncg); 402 #endif 403 } 404 if (error) { 405 if (UFS_WAPBL_BEGIN(mp) == 0) { 406 /* 407 * We depend on ffs_truncate() to call ffs_snapremove() 408 * before it may return an error. On failed 409 * ffs_truncate() we have normal file with leaked 410 * (meta-) data, but no snapshot to use. 411 */ 412 (void) ffs_truncate(vp, (off_t)0, 0, NOCRED); 413 UFS_WAPBL_END(mp); 414 } 415 } else if (ip->i_nlink > 0) 416 vref(vp); 417 return (error); 418 } 419 420 /* 421 * Prepare vnode to become a snapshot. 422 */ 423 static int 424 snapshot_setup(struct mount *mp, struct vnode *vp) 425 { 426 int error, n, len, loc, cg; 427 daddr_t blkno, numblks; 428 struct buf *ibp, *nbp; 429 struct fs *fs = VFSTOUFS(mp)->um_fs; 430 struct lwp *l = curlwp; 431 const int wbreak = blocks_in_journal(fs)/8; 432 struct inode *ip = VTOI(vp); 433 434 /* 435 * Check mount, readonly reference and owner. 436 */ 437 if (vp->v_mount != mp) 438 return EXDEV; 439 if (vp->v_writecount != 0) 440 return EBUSY; 441 error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_SNAPSHOT, 442 0, mp, vp, NULL); 443 if (error) 444 return EACCES; 445 446 /* 447 * Must completely truncate the file here. Allocated 448 * blocks on a snapshot mean that block has been copied 449 * on write, see ffs_copyonwrite() testing "blkno != 0" 450 */ 451 error = ufs_truncate_all(vp); 452 if (error) 453 return error; 454 455 /* Change inode to snapshot type file. */ 456 error = UFS_WAPBL_BEGIN(mp); 457 if (error) 458 return error; 459 #if defined(QUOTA) || defined(QUOTA2) 460 /* snapshot inodes are not accounted in quotas */ 461 chkiq(ip, -1, l->l_cred, 0); 462 #endif 463 ip->i_flags |= (SF_SNAPSHOT | SF_SNAPINVAL); 464 DIP_ASSIGN(ip, flags, ip->i_flags); 465 ip->i_flag |= IN_CHANGE | IN_UPDATE; 466 ffs_update(vp, NULL, NULL, UPDATE_WAIT); 467 UFS_WAPBL_END(mp); 468 469 KASSERT(ip->i_flags & SF_SNAPSHOT); 470 /* 471 * Write an empty list of preallocated blocks to the end of 472 * the snapshot to set size to at least that of the filesystem. 473 */ 474 numblks = howmany(fs->fs_size, fs->fs_frag); 475 blkno = 1; 476 blkno = ufs_rw64(blkno, UFS_FSNEEDSWAP(fs)); 477 error = vn_rdwr(UIO_WRITE, vp, 478 (void *)&blkno, sizeof(blkno), ffs_lblktosize(fs, (off_t)numblks), 479 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL); 480 if (error) 481 return error; 482 /* 483 * Preallocate critical data structures so that we can copy 484 * them in without further allocation after we suspend all 485 * operations on the filesystem. We would like to just release 486 * the allocated buffers without writing them since they will 487 * be filled in below once we are ready to go, but this upsets 488 * the soft update code, so we go ahead and write the new buffers. 489 * 490 * Allocate all indirect blocks and mark all of them as not 491 * needing to be copied. 492 */ 493 error = UFS_WAPBL_BEGIN(mp); 494 if (error) 495 return error; 496 for (blkno = UFS_NDADDR, n = 0; blkno < numblks; blkno += FFS_NINDIR(fs)) { 497 error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)blkno), 498 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); 499 if (error) 500 goto out; 501 brelse(ibp, 0); 502 if (wbreak > 0 && (++n % wbreak) == 0) { 503 UFS_WAPBL_END(mp); 504 error = UFS_WAPBL_BEGIN(mp); 505 if (error) 506 return error; 507 } 508 } 509 /* 510 * Allocate copies for the superblock and its summary information. 511 */ 512 error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, l->l_cred, 513 0, &nbp); 514 if (error) 515 goto out; 516 bawrite(nbp); 517 blkno = ffs_fragstoblks(fs, fs->fs_csaddr); 518 len = howmany(fs->fs_cssize, fs->fs_bsize); 519 for (loc = 0; loc < len; loc++) { 520 error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(blkno + loc)), 521 fs->fs_bsize, l->l_cred, 0, &nbp); 522 if (error) 523 goto out; 524 bawrite(nbp); 525 if (wbreak > 0 && (++n % wbreak) == 0) { 526 UFS_WAPBL_END(mp); 527 error = UFS_WAPBL_BEGIN(mp); 528 if (error) 529 return error; 530 } 531 } 532 /* 533 * Allocate all cylinder group blocks. 534 */ 535 for (cg = 0; cg < fs->fs_ncg; cg++) { 536 error = ffs_balloc(vp, ffs_lfragtosize(fs, cgtod(fs, cg)), 537 fs->fs_bsize, l->l_cred, 0, &nbp); 538 if (error) 539 goto out; 540 bawrite(nbp); 541 if (wbreak > 0 && (++n % wbreak) == 0) { 542 UFS_WAPBL_END(mp); 543 error = UFS_WAPBL_BEGIN(mp); 544 if (error) 545 return error; 546 } 547 } 548 549 out: 550 UFS_WAPBL_END(mp); 551 return error; 552 } 553 554 /* 555 * Create a copy of the superblock and its summary information. 556 * It is up to the caller to free copyfs and copy_fs->fs_csp. 557 */ 558 static int 559 snapshot_copyfs(struct mount *mp, struct vnode *vp, void **sbbuf) 560 { 561 int error, i, len, loc, size; 562 void *space; 563 int32_t *lp; 564 struct buf *bp; 565 struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs; 566 struct vnode *devvp = VTOI(vp)->i_devvp; 567 568 /* 569 * Grab a copy of the superblock and its summary information. 570 * We delay writing it until the suspension is released below. 571 */ 572 *sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 573 loc = ffs_blkoff(fs, fs->fs_sblockloc); 574 if (loc > 0) 575 memset(*sbbuf, 0, loc); 576 copyfs = (struct fs *)((char *)(*sbbuf) + loc); 577 memcpy(copyfs, fs, fs->fs_sbsize); 578 size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; 579 if (fs->fs_sbsize < size) 580 memset((char *)(*sbbuf) + loc + fs->fs_sbsize, 0, 581 size - fs->fs_sbsize); 582 size = ffs_blkroundup(fs, fs->fs_cssize); 583 if (fs->fs_contigsumsize > 0) 584 size += fs->fs_ncg * sizeof(int32_t); 585 space = malloc(size, M_UFSMNT, M_WAITOK); 586 copyfs->fs_csp = space; 587 memcpy(copyfs->fs_csp, fs->fs_csp, fs->fs_cssize); 588 space = (char *)space + fs->fs_cssize; 589 loc = howmany(fs->fs_cssize, fs->fs_fsize); 590 i = fs->fs_frag - loc % fs->fs_frag; 591 len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize; 592 if (len > 0) { 593 if ((error = bread(devvp, FFS_FSBTODB(fs, fs->fs_csaddr + loc), 594 len, 0, &bp)) != 0) { 595 free(copyfs->fs_csp, M_UFSMNT); 596 free(*sbbuf, M_UFSMNT); 597 *sbbuf = NULL; 598 return error; 599 } 600 memcpy(space, bp->b_data, (u_int)len); 601 space = (char *)space + len; 602 brelse(bp, BC_INVAL | BC_NOCACHE); 603 } 604 if (fs->fs_contigsumsize > 0) { 605 copyfs->fs_maxcluster = lp = space; 606 for (i = 0; i < fs->fs_ncg; i++) 607 *lp++ = fs->fs_contigsumsize; 608 } 609 if (mp->mnt_wapbl) 610 copyfs->fs_flags &= ~FS_DOWAPBL; 611 return 0; 612 } 613 614 struct snapshot_expunge_ctx { 615 struct vnode *logvp; 616 struct vnode *vp; 617 struct fs *copy_fs; 618 }; 619 620 static bool 621 snapshot_expunge_selector(void *cl, struct vnode *xvp) 622 { 623 struct snapshot_expunge_ctx *c = cl; 624 struct inode *xp; 625 626 KASSERT(mutex_owned(xvp->v_interlock)); 627 628 xp = VTOI(xvp); 629 if (xvp->v_type == VNON || VTOI(xvp) == NULL || 630 (xp->i_flags & SF_SNAPSHOT)) 631 return false; 632 #ifdef DEBUG 633 if (snapdebug) 634 vprint("ffs_snapshot: busy vnode", xvp); 635 #endif 636 637 if (xvp == c->logvp) 638 return true; 639 640 if (xp->i_nlink > 0) 641 return false; 642 643 if (ffs_checkfreefile(c->copy_fs, c->vp, xp->i_number)) 644 return false; 645 646 return true; 647 } 648 649 /* 650 * We must check for active files that have been unlinked (e.g., with a zero 651 * link count). We have to expunge all trace of these files from the snapshot 652 * so that they are not reclaimed prematurely by fsck or unnecessarily dumped. 653 * Note that we skip unlinked snapshot files as they will be handled separately. 654 * Calculate the snapshot list size and create a preliminary list. 655 */ 656 static int 657 snapshot_expunge(struct mount *mp, struct vnode *vp, struct fs *copy_fs, 658 daddr_t *snaplistsize, daddr_t **snaplist) 659 { 660 int cg, error = 0, len, loc; 661 daddr_t blkno, *blkp; 662 struct fs *fs = VFSTOUFS(mp)->um_fs; 663 struct inode *xp; 664 struct vnode *logvp = NULL, *xvp; 665 struct vnode_iterator *marker; 666 struct snapshot_expunge_ctx ctx; 667 668 *snaplist = NULL; 669 /* 670 * Get the log inode if any. 671 */ 672 if ((fs->fs_flags & FS_DOWAPBL) && 673 fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) { 674 error = VFS_VGET(mp, fs->fs_journallocs[UFS_WAPBL_INFS_INO], 675 LK_EXCLUSIVE, &logvp); 676 if (error) 677 goto out; 678 } 679 /* 680 * We also calculate the needed size for the snapshot list. 681 */ 682 *snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + 683 FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; 684 685 vfs_vnode_iterator_init(mp, &marker); 686 ctx.logvp = logvp; 687 ctx.vp = vp; 688 ctx.copy_fs = copy_fs; 689 while ((xvp = vfs_vnode_iterator_next(marker, snapshot_expunge_selector, 690 &ctx))) 691 { 692 /* 693 * If there is a fragment, clear it here. 694 */ 695 xp = VTOI(xvp); 696 blkno = 0; 697 loc = howmany(xp->i_size, fs->fs_bsize) - 1; 698 if (loc < UFS_NDADDR) { 699 len = ffs_fragroundup(fs, ffs_blkoff(fs, xp->i_size)); 700 if (len > 0 && len < fs->fs_bsize) { 701 error = UFS_WAPBL_BEGIN(mp); 702 if (error) { 703 vrele(xvp); 704 vfs_vnode_iterator_destroy(marker); 705 goto out; 706 } 707 ffs_blkfree_snap(copy_fs, vp, db_get(xp, loc), 708 len, xp->i_number); 709 blkno = db_get(xp, loc); 710 db_assign(xp, loc, 0); 711 UFS_WAPBL_END(mp); 712 } 713 } 714 *snaplistsize += 1; 715 error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY); 716 if (blkno) 717 db_assign(xp, loc, blkno); 718 if (!error) { 719 error = UFS_WAPBL_BEGIN(mp); 720 if (!error) { 721 error = ffs_freefile_snap(copy_fs, vp, 722 xp->i_number, xp->i_mode); 723 UFS_WAPBL_END(mp); 724 } 725 } 726 vrele(xvp); 727 if (error) { 728 vfs_vnode_iterator_destroy(marker); 729 goto out; 730 } 731 } 732 vfs_vnode_iterator_destroy(marker); 733 734 /* 735 * Create a preliminary list of preallocated snapshot blocks. 736 */ 737 *snaplist = malloc(*snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); 738 blkp = &(*snaplist)[1]; 739 *blkp++ = ffs_lblkno(fs, fs->fs_sblockloc); 740 blkno = ffs_fragstoblks(fs, fs->fs_csaddr); 741 for (cg = 0; cg < fs->fs_ncg; cg++) { 742 if (ffs_fragstoblks(fs, cgtod(fs, cg)) > blkno) 743 break; 744 *blkp++ = ffs_fragstoblks(fs, cgtod(fs, cg)); 745 } 746 len = howmany(fs->fs_cssize, fs->fs_bsize); 747 for (loc = 0; loc < len; loc++) 748 *blkp++ = blkno + loc; 749 for (; cg < fs->fs_ncg; cg++) 750 *blkp++ = ffs_fragstoblks(fs, cgtod(fs, cg)); 751 (*snaplist)[0] = blkp - &(*snaplist)[0]; 752 753 out: 754 if (logvp != NULL) 755 vput(logvp); 756 if (error && *snaplist != NULL) { 757 free(*snaplist, M_UFSMNT); 758 *snaplist = NULL; 759 } 760 761 return error; 762 } 763 764 /* 765 * Copy allocation information from all the snapshots in this snapshot and 766 * then expunge them from its view. Also, collect the list of allocated 767 * blocks in i_snapblklist. 768 */ 769 static int 770 snapshot_expunge_snap(struct mount *mp, struct vnode *vp, 771 struct fs *copy_fs, daddr_t snaplistsize) 772 { 773 int error = 0, i; 774 daddr_t numblks, *snaplist = NULL; 775 struct fs *fs = VFSTOUFS(mp)->um_fs; 776 struct inode *ip = VTOI(vp), *xp; 777 struct lwp *l = curlwp; 778 struct snap_info *si = VFSTOUFS(mp)->um_snapinfo; 779 780 TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) { 781 if (xp != ip) { 782 error = expunge(vp, xp, fs, snapacct, BLK_SNAP); 783 if (error) 784 break; 785 } 786 if (xp->i_nlink != 0) 787 continue; 788 error = UFS_WAPBL_BEGIN(mp); 789 if (error) 790 break; 791 error = ffs_freefile_snap(copy_fs, vp, xp->i_number, xp->i_mode); 792 UFS_WAPBL_END(mp); 793 if (error) 794 break; 795 } 796 if (error) 797 goto out; 798 /* 799 * Allocate space for the full list of preallocated snapshot blocks. 800 */ 801 snaplist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); 802 ip->i_snapblklist = &snaplist[1]; 803 /* 804 * Expunge the blocks used by the snapshots from the set of 805 * blocks marked as used in the snapshot bitmaps. Also, collect 806 * the list of allocated blocks in i_snapblklist. 807 */ 808 error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP); 809 if (error) 810 goto out; 811 if (snaplistsize < ip->i_snapblklist - snaplist) 812 panic("ffs_snapshot: list too small"); 813 snaplistsize = ip->i_snapblklist - snaplist; 814 snaplist[0] = snaplistsize; 815 ip->i_snapblklist = &snaplist[0]; 816 /* 817 * Write out the list of allocated blocks to the end of the snapshot. 818 */ 819 numblks = howmany(fs->fs_size, fs->fs_frag); 820 for (i = 0; i < snaplistsize; i++) 821 snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs)); 822 error = vn_rdwr(UIO_WRITE, vp, (void *)snaplist, 823 snaplistsize * sizeof(daddr_t), ffs_lblktosize(fs, (off_t)numblks), 824 UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT, l->l_cred, NULL, NULL); 825 for (i = 0; i < snaplistsize; i++) 826 snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs)); 827 out: 828 if (error && snaplist != NULL) { 829 free(snaplist, M_UFSMNT); 830 ip->i_snapblklist = NULL; 831 } 832 return error; 833 } 834 835 /* 836 * Write the superblock and its summary information to the snapshot. 837 * Make sure, the first UFS_NDADDR blocks get copied to the snapshot. 838 */ 839 static int 840 snapshot_writefs(struct mount *mp, struct vnode *vp, void *sbbuf) 841 { 842 int error, len, loc; 843 void *space; 844 daddr_t blkno; 845 struct buf *bp; 846 struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs; 847 struct inode *ip = VTOI(vp); 848 struct lwp *l = curlwp; 849 850 copyfs = (struct fs *)((char *)sbbuf + ffs_blkoff(fs, fs->fs_sblockloc)); 851 852 /* 853 * Write the superblock and its summary information 854 * to the snapshot. 855 */ 856 blkno = ffs_fragstoblks(fs, fs->fs_csaddr); 857 len = howmany(fs->fs_cssize, fs->fs_bsize); 858 space = copyfs->fs_csp; 859 #ifdef FFS_EI 860 if (UFS_FSNEEDSWAP(fs)) { 861 ffs_sb_swap(copyfs, copyfs); 862 ffs_csum_swap(space, space, fs->fs_cssize); 863 } 864 #endif 865 error = UFS_WAPBL_BEGIN(mp); 866 if (error) 867 return error; 868 for (loc = 0; loc < len; loc++) { 869 error = bread(vp, blkno + loc, fs->fs_bsize, 870 B_MODIFY, &bp); 871 if (error) { 872 break; 873 } 874 memcpy(bp->b_data, space, fs->fs_bsize); 875 space = (char *)space + fs->fs_bsize; 876 bawrite(bp); 877 } 878 if (error) 879 goto out; 880 error = bread(vp, ffs_lblkno(fs, fs->fs_sblockloc), 881 fs->fs_bsize, B_MODIFY, &bp); 882 if (error) { 883 goto out; 884 } else { 885 memcpy(bp->b_data, sbbuf, fs->fs_bsize); 886 bawrite(bp); 887 } 888 /* 889 * Copy the first UFS_NDADDR blocks to the snapshot so 890 * ffs_copyonwrite() and ffs_snapblkfree() will always work on 891 * indirect blocks. 892 */ 893 for (loc = 0; loc < UFS_NDADDR; loc++) { 894 if (db_get(ip, loc) != 0) 895 continue; 896 error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)loc), 897 fs->fs_bsize, l->l_cred, 0, &bp); 898 if (error) 899 break; 900 error = rwfsblk(vp, B_READ, bp->b_data, loc); 901 if (error) { 902 brelse(bp, 0); 903 break; 904 } 905 bawrite(bp); 906 } 907 908 out: 909 UFS_WAPBL_END(mp); 910 return error; 911 } 912 913 /* 914 * Copy all cylinder group maps. 915 */ 916 static int 917 cgaccount(struct vnode *vp, int passno, int *redo) 918 { 919 int cg, error = 0; 920 struct buf *nbp; 921 struct fs *fs = VTOI(vp)->i_fs; 922 923 if (redo != NULL) 924 *redo = 0; 925 if (passno == 1) 926 fs->fs_active = malloc(howmany(fs->fs_ncg, NBBY), 927 M_DEVBUF, M_WAITOK | M_ZERO); 928 for (cg = 0; cg < fs->fs_ncg; cg++) { 929 if (passno == 2 && ACTIVECG_ISSET(fs, cg)) 930 continue; 931 932 if (redo != NULL) 933 *redo += 1; 934 error = UFS_WAPBL_BEGIN(vp->v_mount); 935 if (error) 936 return error; 937 error = ffs_balloc(vp, ffs_lfragtosize(fs, cgtod(fs, cg)), 938 fs->fs_bsize, curlwp->l_cred, 0, &nbp); 939 if (error) { 940 UFS_WAPBL_END(vp->v_mount); 941 break; 942 } 943 error = cgaccount1(cg, vp, nbp->b_data, passno); 944 bawrite(nbp); 945 UFS_WAPBL_END(vp->v_mount); 946 if (error) 947 break; 948 } 949 return error; 950 } 951 952 /* 953 * Copy a cylinder group map. All the unallocated blocks are marked 954 * BLK_NOCOPY so that the snapshot knows that it need not copy them 955 * if they are later written. If passno is one, then this is a first 956 * pass, so only setting needs to be done. If passno is 2, then this 957 * is a revision to a previous pass which must be undone as the 958 * replacement pass is done. 959 */ 960 static int 961 cgaccount1(int cg, struct vnode *vp, void *data, int passno) 962 { 963 struct buf *bp, *ibp; 964 struct inode *ip; 965 struct cg *cgp; 966 struct fs *fs; 967 struct lwp *l = curlwp; 968 daddr_t base, numblks; 969 int error, len, loc, ns __unused, indiroff; 970 971 ip = VTOI(vp); 972 fs = ip->i_fs; 973 ns = UFS_FSNEEDSWAP(fs); 974 error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)), 975 (int)fs->fs_cgsize, 0, &bp); 976 if (error) { 977 return (error); 978 } 979 cgp = (struct cg *)bp->b_data; 980 if (!cg_chkmagic(cgp, ns)) { 981 brelse(bp, 0); 982 return (EIO); 983 } 984 ACTIVECG_SET(fs, cg); 985 986 memcpy(data, bp->b_data, fs->fs_cgsize); 987 brelse(bp, 0); 988 if (fs->fs_cgsize < fs->fs_bsize) 989 memset((char *)data + fs->fs_cgsize, 0, 990 fs->fs_bsize - fs->fs_cgsize); 991 numblks = howmany(fs->fs_size, fs->fs_frag); 992 len = howmany(fs->fs_fpg, fs->fs_frag); 993 base = cgbase(fs, cg) / fs->fs_frag; 994 if (base + len >= numblks) 995 len = numblks - base - 1; 996 loc = 0; 997 if (base < UFS_NDADDR) { 998 for ( ; loc < UFS_NDADDR; loc++) { 999 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) 1000 db_assign(ip, loc, BLK_NOCOPY); 1001 else if (db_get(ip, loc) == BLK_NOCOPY) { 1002 if (passno == 2) 1003 db_assign(ip, loc, 0); 1004 else if (passno == 1) 1005 panic("ffs_snapshot: lost direct block"); 1006 } 1007 } 1008 } 1009 if ((error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(base + loc)), 1010 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0) 1011 return (error); 1012 indiroff = (base + loc - UFS_NDADDR) % FFS_NINDIR(fs); 1013 for ( ; loc < len; loc++, indiroff++) { 1014 if (indiroff >= FFS_NINDIR(fs)) { 1015 bawrite(ibp); 1016 if ((error = ffs_balloc(vp, 1017 ffs_lblktosize(fs, (off_t)(base + loc)), 1018 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0) 1019 return (error); 1020 indiroff = 0; 1021 } 1022 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) 1023 idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY); 1024 else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) { 1025 if (passno == 2) 1026 idb_assign(ip, ibp->b_data, indiroff, 0); 1027 else if (passno == 1) 1028 panic("ffs_snapshot: lost indirect block"); 1029 } 1030 } 1031 bdwrite(ibp); 1032 return (0); 1033 } 1034 1035 /* 1036 * Before expunging a snapshot inode, note all the 1037 * blocks that it claims with BLK_SNAP so that fsck will 1038 * be able to account for those blocks properly and so 1039 * that this snapshot knows that it need not copy them 1040 * if the other snapshot holding them is freed. 1041 */ 1042 static int 1043 expunge(struct vnode *snapvp, struct inode *cancelip, struct fs *fs, 1044 acctfunc_t acctfunc, int expungetype) 1045 { 1046 int i, error, ns __unused; 1047 daddr_t lbn, rlbn; 1048 daddr_t len, blkno, numblks, blksperindir; 1049 struct ufs1_dinode *dip1; 1050 struct ufs2_dinode *dip2; 1051 struct lwp *l = curlwp; 1052 void *bap; 1053 struct buf *bp; 1054 struct mount *mp; 1055 1056 ns = UFS_FSNEEDSWAP(fs); 1057 mp = snapvp->v_mount; 1058 1059 error = UFS_WAPBL_BEGIN(mp); 1060 if (error) 1061 return error; 1062 /* 1063 * Prepare to expunge the inode. If its inode block has not 1064 * yet been copied, then allocate and fill the copy. 1065 */ 1066 lbn = ffs_fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 1067 error = snapblkaddr(snapvp, lbn, &blkno); 1068 if (error) 1069 return error; 1070 if (blkno != 0) { 1071 error = bread(snapvp, lbn, fs->fs_bsize, 1072 B_MODIFY, &bp); 1073 } else { 1074 error = ffs_balloc(snapvp, ffs_lblktosize(fs, (off_t)lbn), 1075 fs->fs_bsize, l->l_cred, 0, &bp); 1076 if (! error) 1077 error = rwfsblk(snapvp, B_READ, bp->b_data, lbn); 1078 } 1079 if (error) { 1080 UFS_WAPBL_END(mp); 1081 return error; 1082 } 1083 /* 1084 * Set a snapshot inode to be a zero length file, regular files 1085 * or unlinked snapshots to be completely unallocated. 1086 */ 1087 if (fs->fs_magic == FS_UFS1_MAGIC) { 1088 dip1 = (struct ufs1_dinode *)bp->b_data + 1089 ino_to_fsbo(fs, cancelip->i_number); 1090 if (cancelip->i_flags & SF_SNAPSHOT) { 1091 dip1->di_flags = 1092 ufs_rw32(ufs_rw32(dip1->di_flags, ns) | 1093 SF_SNAPINVAL, ns); 1094 } 1095 if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0) 1096 dip1->di_mode = 0; 1097 dip1->di_size = 0; 1098 dip1->di_blocks = 0; 1099 memset(&dip1->di_db[0], 0, (UFS_NDADDR + UFS_NIADDR) * sizeof(int32_t)); 1100 } else { 1101 dip2 = (struct ufs2_dinode *)bp->b_data + 1102 ino_to_fsbo(fs, cancelip->i_number); 1103 if (cancelip->i_flags & SF_SNAPSHOT) { 1104 dip2->di_flags = 1105 ufs_rw32(ufs_rw32(dip2->di_flags, ns) | 1106 SF_SNAPINVAL, ns); 1107 } 1108 if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0) 1109 dip2->di_mode = 0; 1110 dip2->di_size = 0; 1111 dip2->di_blocks = 0; 1112 memset(&dip2->di_db[0], 0, (UFS_NDADDR + UFS_NIADDR) * sizeof(int64_t)); 1113 } 1114 bdwrite(bp); 1115 UFS_WAPBL_END(mp); 1116 /* 1117 * Now go through and expunge all the blocks in the file 1118 * using the function requested. 1119 */ 1120 numblks = howmany(cancelip->i_size, fs->fs_bsize); 1121 if (fs->fs_magic == FS_UFS1_MAGIC) 1122 bap = &cancelip->i_ffs1_db[0]; 1123 else 1124 bap = &cancelip->i_ffs2_db[0]; 1125 error = (*acctfunc)(snapvp, bap, 0, UFS_NDADDR, fs, 0, expungetype); 1126 if (error) 1127 return (error); 1128 if (fs->fs_magic == FS_UFS1_MAGIC) 1129 bap = &cancelip->i_ffs1_ib[0]; 1130 else 1131 bap = &cancelip->i_ffs2_ib[0]; 1132 error = (*acctfunc)(snapvp, bap, 0, UFS_NIADDR, fs, -1, expungetype); 1133 if (error) 1134 return (error); 1135 blksperindir = 1; 1136 lbn = -UFS_NDADDR; 1137 len = numblks - UFS_NDADDR; 1138 rlbn = UFS_NDADDR; 1139 for (i = 0; len > 0 && i < UFS_NIADDR; i++) { 1140 error = indiracct(snapvp, ITOV(cancelip), i, 1141 ib_get(cancelip, i), lbn, rlbn, len, 1142 blksperindir, fs, acctfunc, expungetype); 1143 if (error) 1144 return (error); 1145 blksperindir *= FFS_NINDIR(fs); 1146 lbn -= blksperindir + 1; 1147 len -= blksperindir; 1148 rlbn += blksperindir; 1149 } 1150 return (0); 1151 } 1152 1153 /* 1154 * Descend an indirect block chain for vnode cancelvp accounting for all 1155 * its indirect blocks in snapvp. 1156 */ 1157 static int 1158 indiracct(struct vnode *snapvp, struct vnode *cancelvp, int level, 1159 daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks, 1160 daddr_t blksperindir, struct fs *fs, acctfunc_t acctfunc, int expungetype) 1161 { 1162 int error, num, i; 1163 daddr_t subblksperindir; 1164 struct indir indirs[UFS_NIADDR + 2]; 1165 daddr_t last; 1166 void *bap; 1167 struct buf *bp; 1168 1169 if (blkno == 0) { 1170 if (expungetype == BLK_NOCOPY) 1171 return (0); 1172 panic("indiracct: missing indir"); 1173 } 1174 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 1175 return (error); 1176 if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 1177 panic("indiracct: botched params"); 1178 /* 1179 * We have to expand bread here since it will deadlock looking 1180 * up the block number for any blocks that are not in the cache. 1181 */ 1182 error = ffs_getblk(cancelvp, lbn, FFS_FSBTODB(fs, blkno), fs->fs_bsize, 1183 false, &bp); 1184 if (error) 1185 return error; 1186 if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error = 1187 rwfsblk(bp->b_vp, B_READ, bp->b_data, ffs_fragstoblks(fs, blkno)))) { 1188 brelse(bp, 0); 1189 return (error); 1190 } 1191 /* 1192 * Account for the block pointers in this indirect block. 1193 */ 1194 last = howmany(remblks, blksperindir); 1195 if (last > FFS_NINDIR(fs)) 1196 last = FFS_NINDIR(fs); 1197 bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK | M_ZERO); 1198 memcpy((void *)bap, bp->b_data, fs->fs_bsize); 1199 brelse(bp, 0); 1200 error = (*acctfunc)(snapvp, bap, 0, last, 1201 fs, level == 0 ? rlbn : -1, expungetype); 1202 if (error || level == 0) 1203 goto out; 1204 /* 1205 * Account for the block pointers in each of the indirect blocks 1206 * in the levels below us. 1207 */ 1208 subblksperindir = blksperindir / FFS_NINDIR(fs); 1209 for (lbn++, level--, i = 0; i < last; i++) { 1210 error = indiracct(snapvp, cancelvp, level, 1211 idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks, 1212 subblksperindir, fs, acctfunc, expungetype); 1213 if (error) 1214 goto out; 1215 rlbn += blksperindir; 1216 lbn -= blksperindir; 1217 remblks -= blksperindir; 1218 } 1219 out: 1220 free(bap, M_DEVBUF); 1221 return (error); 1222 } 1223 1224 /* 1225 * Do both snap accounting and map accounting. 1226 */ 1227 static int 1228 fullacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, 1229 struct fs *fs, daddr_t lblkno, 1230 int exptype /* BLK_SNAP or BLK_NOCOPY */) 1231 { 1232 int error; 1233 1234 if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype))) 1235 return (error); 1236 return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)); 1237 } 1238 1239 /* 1240 * Identify a set of blocks allocated in a snapshot inode. 1241 */ 1242 static int 1243 snapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, 1244 struct fs *fs, daddr_t lblkno, 1245 int expungetype /* BLK_SNAP or BLK_NOCOPY */) 1246 { 1247 struct inode *ip = VTOI(vp); 1248 struct lwp *l = curlwp; 1249 struct mount *mp = vp->v_mount; 1250 daddr_t blkno; 1251 daddr_t lbn; 1252 struct buf *ibp; 1253 int error, n; 1254 const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8; 1255 1256 error = UFS_WAPBL_BEGIN(mp); 1257 if (error) 1258 return error; 1259 for ( n = 0; oldblkp < lastblkp; oldblkp++) { 1260 blkno = idb_get(ip, bap, oldblkp); 1261 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 1262 continue; 1263 lbn = ffs_fragstoblks(fs, blkno); 1264 if (lbn < UFS_NDADDR) { 1265 blkno = db_get(ip, lbn); 1266 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1267 } else { 1268 error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn), 1269 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); 1270 if (error) 1271 break; 1272 blkno = idb_get(ip, ibp->b_data, 1273 (lbn - UFS_NDADDR) % FFS_NINDIR(fs)); 1274 } 1275 /* 1276 * If we are expunging a snapshot vnode and we 1277 * find a block marked BLK_NOCOPY, then it is 1278 * one that has been allocated to this snapshot after 1279 * we took our current snapshot and can be ignored. 1280 */ 1281 if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) { 1282 if (lbn >= UFS_NDADDR) 1283 brelse(ibp, 0); 1284 } else { 1285 if (blkno != 0) 1286 panic("snapacct: bad block"); 1287 if (lbn < UFS_NDADDR) 1288 db_assign(ip, lbn, expungetype); 1289 else { 1290 idb_assign(ip, ibp->b_data, 1291 (lbn - UFS_NDADDR) % FFS_NINDIR(fs), expungetype); 1292 bdwrite(ibp); 1293 } 1294 } 1295 if (wbreak > 0 && (++n % wbreak) == 0) { 1296 UFS_WAPBL_END(mp); 1297 error = UFS_WAPBL_BEGIN(mp); 1298 if (error) 1299 return error; 1300 } 1301 } 1302 UFS_WAPBL_END(mp); 1303 return error; 1304 } 1305 1306 /* 1307 * Account for a set of blocks allocated in a snapshot inode. 1308 */ 1309 static int 1310 mapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp, 1311 struct fs *fs, daddr_t lblkno, int expungetype) 1312 { 1313 daddr_t blkno; 1314 struct inode *ip; 1315 struct mount *mp = vp->v_mount; 1316 ino_t inum; 1317 int acctit, error, n; 1318 const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8; 1319 1320 error = UFS_WAPBL_BEGIN(mp); 1321 if (error) 1322 return error; 1323 ip = VTOI(vp); 1324 inum = ip->i_number; 1325 if (lblkno == -1) 1326 acctit = 0; 1327 else 1328 acctit = 1; 1329 for ( n = 0; oldblkp < lastblkp; oldblkp++, lblkno++) { 1330 blkno = idb_get(ip, bap, oldblkp); 1331 if (blkno == 0 || blkno == BLK_NOCOPY) 1332 continue; 1333 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) 1334 *ip->i_snapblklist++ = lblkno; 1335 if (blkno == BLK_SNAP) 1336 blkno = ffs_blkstofrags(fs, lblkno); 1337 ffs_blkfree_snap(fs, vp, blkno, fs->fs_bsize, inum); 1338 if (wbreak > 0 && (++n % wbreak) == 0) { 1339 UFS_WAPBL_END(mp); 1340 error = UFS_WAPBL_BEGIN(mp); 1341 if (error) 1342 return error; 1343 } 1344 } 1345 UFS_WAPBL_END(mp); 1346 return (0); 1347 } 1348 1349 /* 1350 * Number of blocks that fit into the journal or zero if not logging. 1351 */ 1352 static int 1353 blocks_in_journal(struct fs *fs) 1354 { 1355 off_t bpj; 1356 1357 if ((fs->fs_flags & FS_DOWAPBL) == 0) 1358 return 0; 1359 bpj = 1; 1360 if (fs->fs_journal_version == UFS_WAPBL_VERSION) { 1361 switch (fs->fs_journal_location) { 1362 case UFS_WAPBL_JOURNALLOC_END_PARTITION: 1363 bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]* 1364 fs->fs_journallocs[UFS_WAPBL_EPART_COUNT]; 1365 break; 1366 case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM: 1367 bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]* 1368 fs->fs_journallocs[UFS_WAPBL_INFS_COUNT]; 1369 break; 1370 } 1371 } 1372 bpj /= fs->fs_bsize; 1373 return (bpj > 0 ? bpj : 1); 1374 } 1375 #endif /* defined(FFS_NO_SNAPSHOT) */ 1376 1377 /* 1378 * Decrement extra reference on snapshot when last name is removed. 1379 * It will not be freed until the last open reference goes away. 1380 */ 1381 void 1382 ffs_snapgone(struct vnode *vp) 1383 { 1384 struct inode *xp, *ip = VTOI(vp); 1385 struct mount *mp = spec_node_getmountedfs(ip->i_devvp); 1386 struct fs *fs; 1387 struct snap_info *si; 1388 int snaploc; 1389 1390 si = VFSTOUFS(mp)->um_snapinfo; 1391 1392 /* 1393 * Find snapshot in incore list. 1394 */ 1395 mutex_enter(&si->si_lock); 1396 TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) 1397 if (xp == ip) 1398 break; 1399 mutex_exit(&si->si_lock); 1400 if (xp != NULL) 1401 vrele(ITOV(ip)); 1402 #ifdef DEBUG 1403 else if (snapdebug) 1404 printf("ffs_snapgone: lost snapshot vnode %llu\n", 1405 (unsigned long long)ip->i_number); 1406 #endif 1407 /* 1408 * Delete snapshot inode from superblock. Keep list dense. 1409 */ 1410 mutex_enter(&si->si_lock); 1411 fs = ip->i_fs; 1412 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 1413 if (fs->fs_snapinum[snaploc] == ip->i_number) 1414 break; 1415 if (snaploc < FSMAXSNAP) { 1416 for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { 1417 if (fs->fs_snapinum[snaploc] == 0) 1418 break; 1419 fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; 1420 } 1421 fs->fs_snapinum[snaploc - 1] = 0; 1422 } 1423 si->si_gen++; 1424 mutex_exit(&si->si_lock); 1425 } 1426 1427 /* 1428 * Prepare a snapshot file for being removed. 1429 */ 1430 void 1431 ffs_snapremove(struct vnode *vp) 1432 { 1433 struct inode *ip = VTOI(vp), *xp; 1434 struct vnode *devvp = ip->i_devvp; 1435 struct fs *fs = ip->i_fs; 1436 struct mount *mp = spec_node_getmountedfs(devvp); 1437 struct buf *ibp; 1438 struct snap_info *si; 1439 struct lwp *l = curlwp; 1440 daddr_t numblks, blkno, dblk; 1441 int error, loc, last; 1442 1443 si = VFSTOUFS(mp)->um_snapinfo; 1444 /* 1445 * If active, delete from incore list (this snapshot may 1446 * already have been in the process of being deleted, so 1447 * would not have been active). 1448 * 1449 * Clear copy-on-write flag if last snapshot. 1450 */ 1451 mutex_enter(&si->si_snaplock); 1452 mutex_enter(&si->si_lock); 1453 if (is_active_snapshot(si, ip)) { 1454 TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap); 1455 if (TAILQ_FIRST(&si->si_snapshots) != 0) { 1456 /* Roll back the list of preallocated blocks. */ 1457 xp = TAILQ_LAST(&si->si_snapshots, inodelst); 1458 si->si_snapblklist = xp->i_snapblklist; 1459 si->si_gen++; 1460 mutex_exit(&si->si_lock); 1461 mutex_exit(&si->si_snaplock); 1462 } else { 1463 si->si_snapblklist = 0; 1464 si->si_gen++; 1465 mutex_exit(&si->si_lock); 1466 mutex_exit(&si->si_snaplock); 1467 fscow_disestablish(mp, ffs_copyonwrite, devvp); 1468 } 1469 if (ip->i_snapblklist != NULL) { 1470 free(ip->i_snapblklist, M_UFSMNT); 1471 ip->i_snapblklist = NULL; 1472 } 1473 } else { 1474 mutex_exit(&si->si_lock); 1475 mutex_exit(&si->si_snaplock); 1476 } 1477 /* 1478 * Clear all BLK_NOCOPY fields. Pass any block claims to other 1479 * snapshots that want them (see ffs_snapblkfree below). 1480 */ 1481 for (blkno = 1; blkno < UFS_NDADDR; blkno++) { 1482 dblk = db_get(ip, blkno); 1483 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 1484 db_assign(ip, blkno, 0); 1485 else if ((dblk == ffs_blkstofrags(fs, blkno) && 1486 ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize, 1487 ip->i_number))) { 1488 DIP_ADD(ip, blocks, -btodb(fs->fs_bsize)); 1489 db_assign(ip, blkno, 0); 1490 } 1491 } 1492 numblks = howmany(ip->i_size, fs->fs_bsize); 1493 for (blkno = UFS_NDADDR; blkno < numblks; blkno += FFS_NINDIR(fs)) { 1494 error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)blkno), 1495 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); 1496 if (error) 1497 continue; 1498 if (fs->fs_size - blkno > FFS_NINDIR(fs)) 1499 last = FFS_NINDIR(fs); 1500 else 1501 last = fs->fs_size - blkno; 1502 for (loc = 0; loc < last; loc++) { 1503 dblk = idb_get(ip, ibp->b_data, loc); 1504 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 1505 idb_assign(ip, ibp->b_data, loc, 0); 1506 else if (dblk == ffs_blkstofrags(fs, blkno) && 1507 ffs_snapblkfree(fs, ip->i_devvp, dblk, 1508 fs->fs_bsize, ip->i_number)) { 1509 DIP_ADD(ip, blocks, -btodb(fs->fs_bsize)); 1510 idb_assign(ip, ibp->b_data, loc, 0); 1511 } 1512 } 1513 bawrite(ibp); 1514 UFS_WAPBL_END(mp); 1515 error = UFS_WAPBL_BEGIN(mp); 1516 KASSERT(error == 0); 1517 } 1518 /* 1519 * Clear snapshot flag and drop reference. 1520 */ 1521 ip->i_flags &= ~(SF_SNAPSHOT | SF_SNAPINVAL); 1522 DIP_ASSIGN(ip, flags, ip->i_flags); 1523 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1524 #if defined(QUOTA) || defined(QUOTA2) 1525 chkdq(ip, DIP(ip, blocks), l->l_cred, FORCE); 1526 chkiq(ip, 1, l->l_cred, FORCE); 1527 #endif 1528 } 1529 1530 /* 1531 * Notification that a block is being freed. Return zero if the free 1532 * should be allowed to proceed. Return non-zero if the snapshot file 1533 * wants to claim the block. The block will be claimed if it is an 1534 * uncopied part of one of the snapshots. It will be freed if it is 1535 * either a BLK_NOCOPY or has already been copied in all of the snapshots. 1536 * If a fragment is being freed, then all snapshots that care about 1537 * it must make a copy since a snapshot file can only claim full sized 1538 * blocks. Note that if more than one snapshot file maps the block, 1539 * we can pick one at random to claim it. Since none of the snapshots 1540 * can change, we are assurred that they will all see the same unmodified 1541 * image. When deleting a snapshot file (see ffs_snapremove above), we 1542 * must push any of these claimed blocks to one of the other snapshots 1543 * that maps it. These claimed blocks are easily identified as they will 1544 * have a block number equal to their logical block number within the 1545 * snapshot. A copied block can never have this property because they 1546 * must always have been allocated from a BLK_NOCOPY location. 1547 */ 1548 int 1549 ffs_snapblkfree(struct fs *fs, struct vnode *devvp, daddr_t bno, 1550 long size, ino_t inum) 1551 { 1552 struct mount *mp = spec_node_getmountedfs(devvp); 1553 struct buf *ibp; 1554 struct inode *ip; 1555 struct vnode *vp = NULL; 1556 struct snap_info *si; 1557 void *saved_data = NULL; 1558 daddr_t lbn; 1559 daddr_t blkno; 1560 uint32_t gen; 1561 int indiroff = 0, error = 0, claimedblk = 0; 1562 1563 si = VFSTOUFS(mp)->um_snapinfo; 1564 lbn = ffs_fragstoblks(fs, bno); 1565 mutex_enter(&si->si_snaplock); 1566 mutex_enter(&si->si_lock); 1567 si->si_owner = curlwp; 1568 1569 retry: 1570 gen = si->si_gen; 1571 TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) { 1572 vp = ITOV(ip); 1573 /* 1574 * Lookup block being written. 1575 */ 1576 if (lbn < UFS_NDADDR) { 1577 blkno = db_get(ip, lbn); 1578 } else { 1579 mutex_exit(&si->si_lock); 1580 error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn), 1581 fs->fs_bsize, FSCRED, B_METAONLY, &ibp); 1582 if (error) { 1583 mutex_enter(&si->si_lock); 1584 break; 1585 } 1586 indiroff = (lbn - UFS_NDADDR) % FFS_NINDIR(fs); 1587 blkno = idb_get(ip, ibp->b_data, indiroff); 1588 mutex_enter(&si->si_lock); 1589 if (gen != si->si_gen) { 1590 brelse(ibp, 0); 1591 goto retry; 1592 } 1593 } 1594 /* 1595 * Check to see if block needs to be copied. 1596 */ 1597 if (blkno == 0) { 1598 /* 1599 * A block that we map is being freed. If it has not 1600 * been claimed yet, we will claim or copy it (below). 1601 */ 1602 claimedblk = 1; 1603 } else if (blkno == BLK_SNAP) { 1604 /* 1605 * No previous snapshot claimed the block, 1606 * so it will be freed and become a BLK_NOCOPY 1607 * (don't care) for us. 1608 */ 1609 if (claimedblk) 1610 panic("snapblkfree: inconsistent block type"); 1611 if (lbn < UFS_NDADDR) { 1612 db_assign(ip, lbn, BLK_NOCOPY); 1613 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1614 } else { 1615 idb_assign(ip, ibp->b_data, indiroff, 1616 BLK_NOCOPY); 1617 mutex_exit(&si->si_lock); 1618 if (ip->i_nlink > 0) 1619 bwrite(ibp); 1620 else 1621 bdwrite(ibp); 1622 mutex_enter(&si->si_lock); 1623 if (gen != si->si_gen) 1624 goto retry; 1625 } 1626 continue; 1627 } else /* BLK_NOCOPY or default */ { 1628 /* 1629 * If the snapshot has already copied the block 1630 * (default), or does not care about the block, 1631 * it is not needed. 1632 */ 1633 if (lbn >= UFS_NDADDR) 1634 brelse(ibp, 0); 1635 continue; 1636 } 1637 /* 1638 * If this is a full size block, we will just grab it 1639 * and assign it to the snapshot inode. Otherwise we 1640 * will proceed to copy it. See explanation for this 1641 * routine as to why only a single snapshot needs to 1642 * claim this block. 1643 */ 1644 if (size == fs->fs_bsize) { 1645 #ifdef DEBUG 1646 if (snapdebug) 1647 printf("%s %llu lbn %" PRId64 1648 "from inum %llu\n", 1649 "Grabonremove: snapino", 1650 (unsigned long long)ip->i_number, 1651 lbn, (unsigned long long)inum); 1652 #endif 1653 mutex_exit(&si->si_lock); 1654 if (lbn < UFS_NDADDR) { 1655 db_assign(ip, lbn, bno); 1656 } else { 1657 idb_assign(ip, ibp->b_data, indiroff, bno); 1658 if (ip->i_nlink > 0) 1659 bwrite(ibp); 1660 else 1661 bdwrite(ibp); 1662 } 1663 DIP_ADD(ip, blocks, btodb(size)); 1664 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1665 if (ip->i_nlink > 0 && mp->mnt_wapbl) 1666 error = syncsnap(vp); 1667 else 1668 error = 0; 1669 mutex_enter(&si->si_lock); 1670 si->si_owner = NULL; 1671 mutex_exit(&si->si_lock); 1672 mutex_exit(&si->si_snaplock); 1673 return (error == 0); 1674 } 1675 if (lbn >= UFS_NDADDR) 1676 brelse(ibp, 0); 1677 #ifdef DEBUG 1678 if (snapdebug) 1679 printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n", 1680 "Copyonremove: snapino ", 1681 (unsigned long long)ip->i_number, 1682 lbn, "for inum", (unsigned long long)inum, size); 1683 #endif 1684 /* 1685 * If we have already read the old block contents, then 1686 * simply copy them to the new block. Note that we need 1687 * to synchronously write snapshots that have not been 1688 * unlinked, and hence will be visible after a crash, 1689 * to ensure their integrity. 1690 */ 1691 mutex_exit(&si->si_lock); 1692 if (saved_data == NULL) { 1693 saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 1694 error = rwfsblk(vp, B_READ, saved_data, lbn); 1695 if (error) { 1696 free(saved_data, M_UFSMNT); 1697 saved_data = NULL; 1698 mutex_enter(&si->si_lock); 1699 break; 1700 } 1701 } 1702 error = wrsnapblk(vp, saved_data, lbn); 1703 if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl) 1704 error = syncsnap(vp); 1705 mutex_enter(&si->si_lock); 1706 if (error) 1707 break; 1708 if (gen != si->si_gen) 1709 goto retry; 1710 } 1711 si->si_owner = NULL; 1712 mutex_exit(&si->si_lock); 1713 mutex_exit(&si->si_snaplock); 1714 if (saved_data) 1715 free(saved_data, M_UFSMNT); 1716 /* 1717 * If we have been unable to allocate a block in which to do 1718 * the copy, then return non-zero so that the fragment will 1719 * not be freed. Although space will be lost, the snapshot 1720 * will stay consistent. 1721 */ 1722 return (error); 1723 } 1724 1725 /* 1726 * Associate snapshot files when mounting. 1727 */ 1728 void 1729 ffs_snapshot_mount(struct mount *mp) 1730 { 1731 struct vnode *devvp = VFSTOUFS(mp)->um_devvp; 1732 struct fs *fs = VFSTOUFS(mp)->um_fs; 1733 struct lwp *l = curlwp; 1734 struct vnode *vp; 1735 struct inode *ip, *xp; 1736 struct snap_info *si; 1737 daddr_t snaplistsize, *snapblklist; 1738 int i, error, ns __unused, snaploc, loc; 1739 1740 /* 1741 * No persistent snapshots on apple ufs file systems. 1742 */ 1743 if (UFS_MPISAPPLEUFS(VFSTOUFS(mp))) 1744 return; 1745 1746 si = VFSTOUFS(mp)->um_snapinfo; 1747 ns = UFS_FSNEEDSWAP(fs); 1748 /* 1749 * XXX The following needs to be set before ffs_truncate or 1750 * VOP_READ can be called. 1751 */ 1752 mp->mnt_stat.f_iosize = fs->fs_bsize; 1753 /* 1754 * Process each snapshot listed in the superblock. 1755 */ 1756 vp = NULL; 1757 mutex_enter(&si->si_lock); 1758 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { 1759 if (fs->fs_snapinum[snaploc] == 0) 1760 break; 1761 if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], 1762 LK_EXCLUSIVE, &vp)) != 0) { 1763 printf("ffs_snapshot_mount: vget failed %d\n", error); 1764 continue; 1765 } 1766 ip = VTOI(vp); 1767 if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) != 1768 SF_SNAPSHOT) { 1769 printf("ffs_snapshot_mount: non-snapshot inode %d\n", 1770 fs->fs_snapinum[snaploc]); 1771 vput(vp); 1772 vp = NULL; 1773 for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { 1774 if (fs->fs_snapinum[loc] == 0) 1775 break; 1776 fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; 1777 } 1778 fs->fs_snapinum[loc - 1] = 0; 1779 snaploc--; 1780 continue; 1781 } 1782 1783 /* 1784 * Read the block hints list. Use an empty list on 1785 * read errors. 1786 */ 1787 error = vn_rdwr(UIO_READ, vp, 1788 (void *)&snaplistsize, sizeof(snaplistsize), 1789 ffs_lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)), 1790 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS, 1791 l->l_cred, NULL, NULL); 1792 if (error) { 1793 printf("ffs_snapshot_mount: read_1 failed %d\n", error); 1794 snaplistsize = 1; 1795 } else 1796 snaplistsize = ufs_rw64(snaplistsize, ns); 1797 snapblklist = malloc( 1798 snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); 1799 if (error) 1800 snapblklist[0] = 1; 1801 else { 1802 error = vn_rdwr(UIO_READ, vp, (void *)snapblklist, 1803 snaplistsize * sizeof(daddr_t), 1804 ffs_lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)), 1805 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS, 1806 l->l_cred, NULL, NULL); 1807 for (i = 0; i < snaplistsize; i++) 1808 snapblklist[i] = ufs_rw64(snapblklist[i], ns); 1809 if (error) { 1810 printf("ffs_snapshot_mount: read_2 failed %d\n", 1811 error); 1812 snapblklist[0] = 1; 1813 } 1814 } 1815 ip->i_snapblklist = &snapblklist[0]; 1816 1817 /* 1818 * Link it onto the active snapshot list. 1819 */ 1820 if (is_active_snapshot(si, ip)) 1821 panic("ffs_snapshot_mount: %"PRIu64" already on list", 1822 ip->i_number); 1823 else 1824 TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap); 1825 vp->v_vflag |= VV_SYSTEM; 1826 VOP_UNLOCK(vp); 1827 } 1828 /* 1829 * No usable snapshots found. 1830 */ 1831 if (vp == NULL) { 1832 mutex_exit(&si->si_lock); 1833 return; 1834 } 1835 /* 1836 * Attach the block hints list. We always want to 1837 * use the list from the newest snapshot. 1838 */ 1839 xp = TAILQ_LAST(&si->si_snapshots, inodelst); 1840 si->si_snapblklist = xp->i_snapblklist; 1841 fscow_establish(mp, ffs_copyonwrite, devvp); 1842 si->si_gen++; 1843 mutex_exit(&si->si_lock); 1844 } 1845 1846 /* 1847 * Disassociate snapshot files when unmounting. 1848 */ 1849 void 1850 ffs_snapshot_unmount(struct mount *mp) 1851 { 1852 struct vnode *devvp = VFSTOUFS(mp)->um_devvp; 1853 struct inode *xp; 1854 struct vnode *vp = NULL; 1855 struct snap_info *si; 1856 1857 si = VFSTOUFS(mp)->um_snapinfo; 1858 mutex_enter(&si->si_lock); 1859 while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) { 1860 vp = ITOV(xp); 1861 TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap); 1862 if (xp->i_snapblklist == si->si_snapblklist) 1863 si->si_snapblklist = NULL; 1864 free(xp->i_snapblklist, M_UFSMNT); 1865 if (xp->i_nlink > 0) { 1866 si->si_gen++; 1867 mutex_exit(&si->si_lock); 1868 vrele(vp); 1869 mutex_enter(&si->si_lock); 1870 } 1871 } 1872 si->si_gen++; 1873 mutex_exit(&si->si_lock); 1874 if (vp) 1875 fscow_disestablish(mp, ffs_copyonwrite, devvp); 1876 } 1877 1878 /* 1879 * Check for need to copy block that is about to be written, 1880 * copying the block if necessary. 1881 */ 1882 static int 1883 ffs_copyonwrite(void *v, struct buf *bp, bool data_valid) 1884 { 1885 struct fs *fs; 1886 struct inode *ip; 1887 struct vnode *devvp = v, *vp = NULL; 1888 struct mount *mp = spec_node_getmountedfs(devvp); 1889 struct snap_info *si; 1890 void *saved_data = NULL; 1891 daddr_t lbn, blkno, *snapblklist; 1892 uint32_t gen; 1893 int lower, upper, mid, snapshot_locked = 0, error = 0; 1894 1895 /* 1896 * Check for valid snapshots. 1897 */ 1898 si = VFSTOUFS(mp)->um_snapinfo; 1899 mutex_enter(&si->si_lock); 1900 ip = TAILQ_FIRST(&si->si_snapshots); 1901 if (ip == NULL) { 1902 mutex_exit(&si->si_lock); 1903 return 0; 1904 } 1905 /* 1906 * First check to see if it is after the file system, 1907 * in the journal or in the preallocated list. 1908 * By doing these checks we avoid several potential deadlocks. 1909 */ 1910 fs = ip->i_fs; 1911 lbn = ffs_fragstoblks(fs, FFS_DBTOFSB(fs, bp->b_blkno)); 1912 if (bp->b_blkno >= FFS_FSBTODB(fs, fs->fs_size)) { 1913 mutex_exit(&si->si_lock); 1914 return 0; 1915 } 1916 if ((fs->fs_flags & FS_DOWAPBL) && 1917 fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) { 1918 off_t blk_off, log_start, log_end; 1919 1920 log_start = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] * 1921 fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]; 1922 log_end = log_start + fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] * 1923 fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]; 1924 blk_off = dbtob(bp->b_blkno); 1925 if (blk_off >= log_start && blk_off < log_end) { 1926 mutex_exit(&si->si_lock); 1927 return 0; 1928 } 1929 } 1930 snapblklist = si->si_snapblklist; 1931 upper = (snapblklist != NULL ? snapblklist[0] - 1 : 0); 1932 lower = 1; 1933 while (lower <= upper) { 1934 mid = (lower + upper) / 2; 1935 if (snapblklist[mid] == lbn) 1936 break; 1937 if (snapblklist[mid] < lbn) 1938 lower = mid + 1; 1939 else 1940 upper = mid - 1; 1941 } 1942 if (lower <= upper) { 1943 mutex_exit(&si->si_lock); 1944 return 0; 1945 } 1946 /* 1947 * Not in the precomputed list, so check the snapshots. 1948 */ 1949 if (si->si_owner != curlwp) { 1950 if (!mutex_tryenter(&si->si_snaplock)) { 1951 mutex_exit(&si->si_lock); 1952 mutex_enter(&si->si_snaplock); 1953 mutex_enter(&si->si_lock); 1954 } 1955 si->si_owner = curlwp; 1956 snapshot_locked = 1; 1957 } 1958 if (data_valid && bp->b_bcount == fs->fs_bsize) 1959 saved_data = bp->b_data; 1960 retry: 1961 gen = si->si_gen; 1962 TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) { 1963 vp = ITOV(ip); 1964 /* 1965 * We ensure that everything of our own that needs to be 1966 * copied will be done at the time that ffs_snapshot is 1967 * called. Thus we can skip the check here which can 1968 * deadlock in doing the lookup in ffs_balloc. 1969 */ 1970 if (bp->b_vp == vp) 1971 continue; 1972 /* 1973 * Check to see if block needs to be copied. 1974 */ 1975 if (lbn < UFS_NDADDR) { 1976 blkno = db_get(ip, lbn); 1977 } else { 1978 mutex_exit(&si->si_lock); 1979 blkno = 0; /* XXX: GCC */ 1980 if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) { 1981 mutex_enter(&si->si_lock); 1982 break; 1983 } 1984 mutex_enter(&si->si_lock); 1985 if (gen != si->si_gen) 1986 goto retry; 1987 } 1988 KASSERTMSG((blkno != BLK_SNAP || bp->b_lblkno < 0), 1989 "ffs_copyonwrite: bad copy block: blkno %jd, lblkno %jd", 1990 (intmax_t)blkno, (intmax_t)bp->b_lblkno); 1991 if (blkno != 0) 1992 continue; 1993 1994 if (curlwp == uvm.pagedaemon_lwp) { 1995 error = ENOMEM; 1996 break; 1997 } 1998 /* Only one level of recursion allowed. */ 1999 KASSERT(snapshot_locked); 2000 /* 2001 * Allocate the block into which to do the copy. Since 2002 * multiple processes may all try to copy the same block, 2003 * we have to recheck our need to do a copy if we sleep 2004 * waiting for the lock. 2005 * 2006 * Because all snapshots on a filesystem share a single 2007 * lock, we ensure that we will never be in competition 2008 * with another process to allocate a block. 2009 */ 2010 #ifdef DEBUG 2011 if (snapdebug) { 2012 printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ", 2013 (unsigned long long)ip->i_number, lbn); 2014 if (bp->b_vp == devvp) 2015 printf("fs metadata"); 2016 else 2017 printf("inum %llu", (unsigned long long) 2018 VTOI(bp->b_vp)->i_number); 2019 printf(" lblkno %" PRId64 "\n", bp->b_lblkno); 2020 } 2021 #endif 2022 /* 2023 * If we have already read the old block contents, then 2024 * simply copy them to the new block. Note that we need 2025 * to synchronously write snapshots that have not been 2026 * unlinked, and hence will be visible after a crash, 2027 * to ensure their integrity. 2028 */ 2029 mutex_exit(&si->si_lock); 2030 if (saved_data == NULL) { 2031 saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 2032 error = rwfsblk(vp, B_READ, saved_data, lbn); 2033 if (error) { 2034 free(saved_data, M_UFSMNT); 2035 saved_data = NULL; 2036 mutex_enter(&si->si_lock); 2037 break; 2038 } 2039 } 2040 error = wrsnapblk(vp, saved_data, lbn); 2041 if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl) 2042 error = syncsnap(vp); 2043 mutex_enter(&si->si_lock); 2044 if (error) 2045 break; 2046 if (gen != si->si_gen) 2047 goto retry; 2048 } 2049 /* 2050 * Note that we need to synchronously write snapshots that 2051 * have not been unlinked, and hence will be visible after 2052 * a crash, to ensure their integrity. 2053 */ 2054 if (snapshot_locked) { 2055 si->si_owner = NULL; 2056 mutex_exit(&si->si_lock); 2057 mutex_exit(&si->si_snaplock); 2058 } else 2059 mutex_exit(&si->si_lock); 2060 if (saved_data && saved_data != bp->b_data) 2061 free(saved_data, M_UFSMNT); 2062 return error; 2063 } 2064 2065 /* 2066 * Read from a snapshot. 2067 */ 2068 int 2069 ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag) 2070 { 2071 struct inode *ip = VTOI(vp); 2072 struct fs *fs = ip->i_fs; 2073 struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo; 2074 struct buf *bp; 2075 daddr_t lbn, nextlbn; 2076 off_t fsbytes, bytesinfile; 2077 long size, xfersize, blkoffset; 2078 int error; 2079 2080 mutex_enter(&si->si_snaplock); 2081 2082 if (ioflag & IO_ALTSEMANTICS) 2083 fsbytes = ip->i_size; 2084 else 2085 fsbytes = ffs_lfragtosize(fs, fs->fs_size); 2086 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { 2087 bytesinfile = fsbytes - uio->uio_offset; 2088 if (bytesinfile <= 0) 2089 break; 2090 lbn = ffs_lblkno(fs, uio->uio_offset); 2091 nextlbn = lbn + 1; 2092 size = fs->fs_bsize; 2093 blkoffset = ffs_blkoff(fs, uio->uio_offset); 2094 xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid), 2095 bytesinfile); 2096 2097 if (ffs_lblktosize(fs, nextlbn + 1) >= fsbytes) { 2098 if (ffs_lblktosize(fs, lbn) + size > fsbytes) 2099 size = ffs_fragroundup(fs, 2100 fsbytes - ffs_lblktosize(fs, lbn)); 2101 error = bread(vp, lbn, size, 0, &bp); 2102 } else { 2103 int nextsize = fs->fs_bsize; 2104 error = breadn(vp, lbn, 2105 size, &nextlbn, &nextsize, 1, 0, &bp); 2106 } 2107 if (error) 2108 break; 2109 2110 /* 2111 * We should only get non-zero b_resid when an I/O error 2112 * has occurred, which should cause us to break above. 2113 * However, if the short read did not cause an error, 2114 * then we want to ensure that we do not uiomove bad 2115 * or uninitialized data. 2116 */ 2117 size -= bp->b_resid; 2118 if (size < blkoffset + xfersize) { 2119 xfersize = size - blkoffset; 2120 if (xfersize <= 0) 2121 break; 2122 } 2123 error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); 2124 if (error) 2125 break; 2126 brelse(bp, BC_AGE); 2127 } 2128 if (bp != NULL) 2129 brelse(bp, BC_AGE); 2130 2131 mutex_exit(&si->si_snaplock); 2132 return error; 2133 } 2134 2135 /* 2136 * Lookup a snapshots data block address. 2137 * Simpler than UFS_BALLOC() as we know all metadata is already allocated 2138 * and safe even for the pagedaemon where we cannot bread(). 2139 */ 2140 static int 2141 snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res) 2142 { 2143 struct indir indirs[UFS_NIADDR + 2]; 2144 struct inode *ip = VTOI(vp); 2145 struct fs *fs = ip->i_fs; 2146 struct buf *bp; 2147 int error, num; 2148 2149 KASSERT(lbn >= 0); 2150 2151 if (lbn < UFS_NDADDR) { 2152 *res = db_get(ip, lbn); 2153 return 0; 2154 } 2155 if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0) 2156 return error; 2157 if (curlwp == uvm.pagedaemon_lwp) { 2158 mutex_enter(&bufcache_lock); 2159 bp = incore(vp, indirs[num-1].in_lbn); 2160 if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) { 2161 *res = idb_get(ip, bp->b_data, indirs[num-1].in_off); 2162 error = 0; 2163 } else 2164 error = ENOMEM; 2165 mutex_exit(&bufcache_lock); 2166 return error; 2167 } 2168 error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, 0, &bp); 2169 if (error == 0) { 2170 *res = idb_get(ip, bp->b_data, indirs[num-1].in_off); 2171 brelse(bp, 0); 2172 } 2173 2174 return error; 2175 } 2176 2177 /* 2178 * Read or write the specified block of the filesystem vp resides on 2179 * from or to the disk bypassing the buffer cache. 2180 */ 2181 static int 2182 rwfsblk(struct vnode *vp, int flags, void *data, daddr_t lbn) 2183 { 2184 int error; 2185 struct inode *ip = VTOI(vp); 2186 struct fs *fs = ip->i_fs; 2187 struct buf *nbp; 2188 2189 nbp = getiobuf(NULL, true); 2190 nbp->b_flags = flags; 2191 nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize; 2192 nbp->b_error = 0; 2193 nbp->b_data = data; 2194 nbp->b_blkno = nbp->b_rawblkno = FFS_FSBTODB(fs, ffs_blkstofrags(fs, lbn)); 2195 nbp->b_proc = NULL; 2196 nbp->b_dev = ip->i_devvp->v_rdev; 2197 SET(nbp->b_cflags, BC_BUSY); /* mark buffer busy */ 2198 2199 bdev_strategy(nbp); 2200 2201 error = biowait(nbp); 2202 2203 putiobuf(nbp); 2204 2205 return error; 2206 } 2207 2208 /* 2209 * Write all dirty buffers to disk and invalidate them. 2210 */ 2211 static int 2212 syncsnap(struct vnode *vp) 2213 { 2214 int error; 2215 buf_t *bp; 2216 struct fs *fs = VTOI(vp)->i_fs; 2217 2218 mutex_enter(&bufcache_lock); 2219 while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) { 2220 error = bbusy(bp, false, 0, NULL); 2221 if (error == EPASSTHROUGH) 2222 continue; 2223 else if (error != 0) { 2224 mutex_exit(&bufcache_lock); 2225 return error; 2226 } 2227 KASSERT(bp->b_bcount == fs->fs_bsize); 2228 mutex_exit(&bufcache_lock); 2229 error = rwfsblk(vp, B_WRITE, bp->b_data, 2230 ffs_fragstoblks(fs, FFS_DBTOFSB(fs, bp->b_blkno))); 2231 brelse(bp, BC_INVAL | BC_VFLUSH); 2232 if (error) 2233 return error; 2234 mutex_enter(&bufcache_lock); 2235 } 2236 mutex_exit(&bufcache_lock); 2237 2238 return 0; 2239 } 2240 2241 /* 2242 * Write the specified block to a snapshot. 2243 */ 2244 static int 2245 wrsnapblk(struct vnode *vp, void *data, daddr_t lbn) 2246 { 2247 struct inode *ip = VTOI(vp); 2248 struct fs *fs = ip->i_fs; 2249 struct buf *bp; 2250 int error; 2251 2252 error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn), fs->fs_bsize, 2253 FSCRED, (ip->i_nlink > 0 ? B_SYNC : 0), &bp); 2254 if (error) 2255 return error; 2256 memcpy(bp->b_data, data, fs->fs_bsize); 2257 if (ip->i_nlink > 0) 2258 error = bwrite(bp); 2259 else 2260 bawrite(bp); 2261 2262 return error; 2263 } 2264 2265 /* 2266 * Check if this inode is present on the active snapshot list. 2267 * Must be called with snapinfo locked. 2268 */ 2269 static inline bool 2270 is_active_snapshot(struct snap_info *si, struct inode *ip) 2271 { 2272 struct inode *xp; 2273 2274 KASSERT(mutex_owned(&si->si_lock)); 2275 2276 TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) 2277 if (xp == ip) 2278 return true; 2279 return false; 2280 } 2281 2282 /* 2283 * Get/Put direct block from inode or buffer containing disk addresses. Take 2284 * care for fs type (UFS1/UFS2) and byte swapping. These functions should go 2285 * into a global include. 2286 */ 2287 static inline daddr_t 2288 db_get(struct inode *ip, int loc) 2289 { 2290 if (ip->i_ump->um_fstype == UFS1) 2291 return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip)); 2292 else 2293 return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip)); 2294 } 2295 2296 static inline void 2297 db_assign(struct inode *ip, int loc, daddr_t val) 2298 { 2299 if (ip->i_ump->um_fstype == UFS1) 2300 ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip)); 2301 else 2302 ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip)); 2303 } 2304 2305 __unused static inline daddr_t 2306 ib_get(struct inode *ip, int loc) 2307 { 2308 if (ip->i_ump->um_fstype == UFS1) 2309 return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip)); 2310 else 2311 return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip)); 2312 } 2313 2314 static inline daddr_t 2315 idb_get(struct inode *ip, void *bf, int loc) 2316 { 2317 if (ip->i_ump->um_fstype == UFS1) 2318 return ufs_rw32(((int32_t *)(bf))[loc], UFS_IPNEEDSWAP(ip)); 2319 else 2320 return ufs_rw64(((int64_t *)(bf))[loc], UFS_IPNEEDSWAP(ip)); 2321 } 2322 2323 static inline void 2324 idb_assign(struct inode *ip, void *bf, int loc, daddr_t val) 2325 { 2326 if (ip->i_ump->um_fstype == UFS1) 2327 ((int32_t *)(bf))[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip)); 2328 else 2329 ((int64_t *)(bf))[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip)); 2330 } 2331