/*	$NetBSD: ffs_snapshot.c,v 1.150 2020/01/17 20:08:10 ad Exp $	*/

/*
 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
 *
 * Further information about snapshots can be obtained from:
 *
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
 *	1614 Oxford Street		mckusick@mckusick.com
 *	Berkeley, CA 94709-1608		+1-510-843-9542
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
 *
 *	from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.150 2020/01/17 20:08:10 ad Exp $");

#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
#include "opt_quota.h"
#endif

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/sched.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/kauth.h>
#include <sys/fstrans.h>
#include <sys/wapbl.h>

#include <miscfs/specfs/specdev.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_wapbl.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#include <uvm/uvm.h>

TAILQ_HEAD(inodelst, inode);			/* List of active snapshots */

struct snap_info {
	kmutex_t si_lock;			/* Lock this snapinfo */
	kmutex_t si_snaplock;			/* Snapshot vnode common lock */
	lwp_t *si_owner;			/* Snaplock owner */
	struct inodelst si_snapshots;		/* List of active snapshots */
	daddr_t *si_snapblklist;		/* Snapshot block hints list */
	uint32_t si_gen;			/* Incremented on change */
};
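
/*
 * Note on lock order, as used throughout this file: si_snaplock is
 * always acquired before si_lock, and si_owner records the lwp that
 * holds si_snaplock so that ffs_copyonwrite() can recognize writes
 * issued by the snapshot code itself and avoid deadlocking on it.
 */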

#if !defined(FFS_NO_SNAPSHOT)
typedef int (*acctfunc_t)
    (struct vnode *, void *, int, int, struct fs *, daddr_t, int);

static int snapshot_setup(struct mount *, struct vnode *);
static int snapshot_copyfs(struct mount *, struct vnode *, void **);
static int snapshot_expunge(struct mount *, struct vnode *,
    struct fs *, daddr_t *, daddr_t **);
static int snapshot_expunge_snap(struct mount *, struct vnode *,
    struct fs *, daddr_t);
static int snapshot_writefs(struct mount *, struct vnode *, void *);
static int cgaccount(struct vnode *, int, int *);
static int cgaccount1(int, struct vnode *, void *, int);
static int expunge(struct vnode *, struct inode *, struct fs *,
    acctfunc_t, int);
static int indiracct(struct vnode *, struct vnode *, int, daddr_t,
    daddr_t, daddr_t, daddr_t, daddr_t, struct fs *, acctfunc_t, int);
static int fullacct(struct vnode *, void *, int, int, struct fs *,
    daddr_t, int);
static int snapacct(struct vnode *, void *, int, int, struct fs *,
    daddr_t, int);
static int mapacct(struct vnode *, void *, int, int, struct fs *,
    daddr_t, int);
#endif /* !defined(FFS_NO_SNAPSHOT) */

static int ffs_copyonwrite(void *, struct buf *, bool);
static int snapblkaddr(struct vnode *, daddr_t, daddr_t *);
static int rwfsblk(struct vnode *, int, void *, daddr_t);
static int syncsnap(struct vnode *);
static int wrsnapblk(struct vnode *, void *, daddr_t);
#if !defined(FFS_NO_SNAPSHOT)
static int blocks_in_journal(struct fs *);
#endif

static inline bool is_active_snapshot(struct snap_info *, struct inode *);
static inline daddr_t db_get(struct inode *, int);
static inline void db_assign(struct inode *, int, daddr_t);
static inline daddr_t ib_get(struct inode *, int);
static inline daddr_t idb_get(struct inode *, void *, int);
static inline void idb_assign(struct inode *, void *, int, daddr_t);

#ifdef DEBUG
static int snapdebug = 0;
#endif
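
/*
 * Two special block-pointer values appear in a snapshot's block map
 * (see cgaccount1() and snapacct() below): BLK_NOCOPY marks a block
 * that was free when the snapshot was taken and therefore never needs
 * to be copied; BLK_SNAP marks a block claimed by a snapshot, so this
 * snapshot need not copy it unless the snapshot holding it is freed.
 */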

int
ffs_snapshot_init(struct ufsmount *ump)
{
	struct snap_info *si;

	si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP);
	TAILQ_INIT(&si->si_snapshots);
	mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE);
	si->si_owner = NULL;
	si->si_gen = 0;
	si->si_snapblklist = NULL;

	return 0;
}

void
ffs_snapshot_fini(struct ufsmount *ump)
{
	struct snap_info *si;

	si = ump->um_snapinfo;
	ump->um_snapinfo = NULL;

	KASSERT(TAILQ_EMPTY(&si->si_snapshots));
	mutex_destroy(&si->si_lock);
	mutex_destroy(&si->si_snaplock);
	KASSERT(si->si_snapblklist == NULL);
	kmem_free(si, sizeof(*si));
}

/*
 * Create a snapshot file and initialize it for the filesystem.
 * Vnode is locked on entry and return.
 */
int
ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime)
{
#if defined(FFS_NO_SNAPSHOT)
	return EOPNOTSUPP;
}
#else /* defined(FFS_NO_SNAPSHOT) */
	bool suspended = false;
	int error, redo = 0, snaploc;
	void *sbbuf = NULL;
	daddr_t *snaplist = NULL, snaplistsize = 0;
	struct buf *bp, *nbp;
	struct fs *copy_fs = NULL;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct inode *ip = VTOI(vp);
	struct lwp *l = curlwp;
	struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
	struct timespec ts;
	struct timeval starttime;
#ifdef DEBUG
	struct timeval endtime;
#endif
	struct vnode *devvp = ip->i_devvp;

	/*
	 * If the vnode already is a snapshot, return.
	 */
	if ((ip->i_flags & SF_SNAPSHOT)) {
		if ((ip->i_flags & SF_SNAPINVAL))
			return EINVAL;
		if (ctime) {
			ctime->tv_sec = DIP(ip, mtime);
			ctime->tv_nsec = DIP(ip, mtimensec);
		}
		return 0;
	}
	/*
	 * Check for free snapshot slot in the superblock.
	 */
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == 0)
			break;
	if (snaploc == FSMAXSNAP)
		return (ENOSPC);
	/*
	 * Prepare the vnode to become a snapshot.
	 */
	error = snapshot_setup(mp, vp);
	if (error)
		goto out;

	/*
	 * Copy all the cylinder group maps. Although the
	 * filesystem is still active, we hope that only a few
	 * cylinder groups will change between now and when we
	 * suspend operations. Thus, we will be able to quickly
	 * touch up the few cylinder groups that changed during
	 * the suspension period.
	 */
	error = cgaccount(vp, 1, NULL);
	if (error)
		goto out;

	/*
	 * snapshot is now valid
	 */
	ip->i_flags &= ~SF_SNAPINVAL;
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;

	/*
	 * Ensure that the snapshot is completely on disk.
	 * Since we have marked it as a snapshot it is safe to
	 * unlock it as no process will be allowed to write to it.
	 */
	error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
	if (error)
		goto out;
	VOP_UNLOCK(vp);
	/*
	 * All allocations are done, so we can now suspend the filesystem.
	 */
	error = vfs_suspend(vp->v_mount, 0);
	if (error == 0) {
		suspended = true;
		vrele_flush(vp->v_mount);
		error = VFS_SYNC(vp->v_mount, MNT_WAIT, curlwp->l_cred);
	}
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (error)
		goto out;
	getmicrotime(&starttime);
	/*
	 * First, copy all the cylinder group maps that have changed.
	 */
	error = cgaccount(vp, 2, &redo);
	if (error)
		goto out;
	/*
	 * Create a copy of the superblock and its summary information.
	 */
	error = snapshot_copyfs(mp, vp, &sbbuf);
	if (error)
		goto out;
	copy_fs = (struct fs *)((char *)sbbuf + ffs_blkoff(fs, fs->fs_sblockloc));
	/*
	 * Expunge unlinked files from our view.
	 */
	error = snapshot_expunge(mp, vp, copy_fs, &snaplistsize, &snaplist);
	if (error)
		goto out;
	/*
	 * Record snapshot inode. Since this is the newest snapshot,
	 * it must be placed at the end of the list.
	 */
	if (ip->i_nlink > 0)
		fs->fs_snapinum[snaploc] = ip->i_number;

	mutex_enter(&si->si_lock);
	if (is_active_snapshot(si, ip))
		panic("ffs_snapshot: %"PRIu64" already on list", ip->i_number);
	TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
	if (TAILQ_FIRST(&si->si_snapshots) == ip) {
		/*
		 * If this is the first snapshot on this filesystem, put the
		 * preliminary list in place and establish the cow handler.
		 */
		si->si_snapblklist = snaplist;
		fscow_establish(mp, ffs_copyonwrite, devvp);
	}
	si->si_gen++;
	mutex_exit(&si->si_lock);

	vp->v_vflag |= VV_SYSTEM;
	/*
	 * Set the mtime to the time the snapshot has been taken.
	 */
	TIMEVAL_TO_TIMESPEC(&starttime, &ts);
	if (ctime)
		*ctime = ts;
	DIP_ASSIGN(ip, mtime, ts.tv_sec);
	DIP_ASSIGN(ip, mtimensec, ts.tv_nsec);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Copy allocation information from all snapshots and then
	 * expunge them from our view.
	 */
	error = snapshot_expunge_snap(mp, vp, copy_fs, snaplistsize);
	if (error)
		goto out;
	/*
	 * Write the superblock and its summary information to the snapshot.
	 */
	error = snapshot_writefs(mp, vp, sbbuf);
	if (error)
		goto out;
	/*
	 * We're nearly done, ensure that the snapshot is completely on disk.
	 */
	error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
	if (error)
		goto out;
	/*
	 * Invalidate and free all pages on the snapshot vnode.
	 * We will read and write through the buffercache.
	 */
	mutex_enter(vp->v_interlock);
	error = VOP_PUTPAGES(vp, 0, 0,
	    PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO | PGO_FREE);
	if (error)
		goto out;
	/*
	 * Invalidate short ( < fs_bsize ) buffers. We will always read
	 * full size buffers later.
	 */
	mutex_enter(&bufcache_lock);
	KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		if (bp->b_bcount == fs->fs_bsize)
			continue;
		error = bbusy(bp, false, 0, NULL);
		if (error != 0) {
			if (error == EPASSTHROUGH) {
				nbp = LIST_FIRST(&vp->v_cleanblkhd);
				continue;
			}
			break;
		}
		brelsel(bp, BC_INVAL | BC_VFLUSH);
	}
	mutex_exit(&bufcache_lock);

out:
	if (sbbuf != NULL) {
		free(copy_fs->fs_csp, M_UFSMNT);
		free(sbbuf, M_UFSMNT);
	}
	if (fs->fs_active != NULL) {
		free(fs->fs_active, M_DEVBUF);
		fs->fs_active = NULL;
	}

	mutex_enter(&si->si_lock);
	if (snaplist != NULL) {
		if (si->si_snapblklist == snaplist)
			si->si_snapblklist = NULL;
		free(snaplist, M_UFSMNT);
	}
	if (error) {
		fs->fs_snapinum[snaploc] = 0;
	} else {
		/*
		 * As this is the newest list, it is the most inclusive,
		 * so it should replace the previous list.
		 */
		si->si_snapblklist = ip->i_snapblklist;
	}
	si->si_gen++;
	mutex_exit(&si->si_lock);

	if (suspended) {
		VOP_UNLOCK(vp);
		vfs_resume(vp->v_mount);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef DEBUG
		getmicrotime(&endtime);
		timersub(&endtime, &starttime, &endtime);
		printf("%s: suspended %lld.%03d sec, redo %d of %d\n",
		    mp->mnt_stat.f_mntonname, (long long)endtime.tv_sec,
		    endtime.tv_usec / 1000, redo, fs->fs_ncg);
#endif
	}
	if (error) {
		if (UFS_WAPBL_BEGIN(mp) == 0) {
			/*
			 * We depend on ffs_truncate() to call ffs_snapremove()
			 * before it may return an error. On failed
			 * ffs_truncate() we have a normal file with leaked
			 * (meta-) data, but no snapshot to use.
			 */
			(void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
			UFS_WAPBL_END(mp);
		}
	} else if (ip->i_nlink > 0)
		vref(vp);
	return (error);
}

/*
 * Prepare vnode to become a snapshot.
 */
static int
snapshot_setup(struct mount *mp, struct vnode *vp)
{
	int error, n, len, loc, cg;
	daddr_t blkno, numblks;
	struct buf *ibp, *nbp;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct lwp *l = curlwp;
	const int wbreak = blocks_in_journal(fs)/8;
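	/*
	 * With logging enabled, the preallocation loops below are split
	 * into multiple WAPBL transactions: roughly every wbreak blocks
	 * the journal is ended and restarted so that no single
	 * transaction can outgrow it.
	 */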
	struct inode *ip = VTOI(vp);

	/*
	 * Check mount, readonly reference and owner.
	 */
	if (vp->v_mount != mp)
		return EXDEV;
	if (vp->v_writecount != 0)
		return EBUSY;
	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_SNAPSHOT,
	    0, mp, vp, NULL);
	if (error)
		return EACCES;

	if (vp->v_size != 0) {
		/*
		 * Must completely truncate the file here. An allocated
		 * block on a snapshot means that the block has been
		 * copied on write; see the "blkno != 0" test in
		 * ffs_copyonwrite().
		 */
		error = ufs_truncate_retry(vp, 0, NOCRED);
		if (error)
			return error;
	}

	/* Change inode to snapshot type file. */
	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
#if defined(QUOTA) || defined(QUOTA2)
	/* snapshot inodes are not accounted in quotas */
	chkiq(ip, -1, l->l_cred, 0);
#endif
	ip->i_flags |= (SF_SNAPSHOT | SF_SNAPINVAL);
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	ffs_update(vp, NULL, NULL, UPDATE_WAIT);
	UFS_WAPBL_END(mp);

	KASSERT(ip->i_flags & SF_SNAPSHOT);
	/*
	 * Write an empty list of preallocated blocks to the end of
	 * the snapshot to set size to at least that of the filesystem.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	blkno = 1;
	blkno = ufs_rw64(blkno, UFS_FSNEEDSWAP(fs));
	error = vn_rdwr(UIO_WRITE, vp,
	    (void *)&blkno, sizeof(blkno), ffs_lblktosize(fs, (off_t)numblks),
	    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL);
	if (error)
		return error;
	/*
	 * Preallocate critical data structures so that we can copy
	 * them in without further allocation after we suspend all
	 * operations on the filesystem. We would like to just release
	 * the allocated buffers without writing them since they will
	 * be filled in below once we are ready to go, but this upsets
	 * the soft update code, so we go ahead and write the new buffers.
	 *
	 * Allocate all indirect blocks and mark all of them as not
	 * needing to be copied.
	 */
	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	for (blkno = UFS_NDADDR, n = 0; blkno < numblks; blkno += FFS_NINDIR(fs)) {
		error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
		if (error)
			goto out;
		brelse(ibp, 0);
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}
	/*
	 * Allocate copies for the superblock and its summary information.
	 */
	error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, l->l_cred,
	    0, &nbp);
	if (error)
		goto out;
	bawrite(nbp);
	blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++) {
		error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, l->l_cred, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}
	/*
	 * Allocate all cylinder group blocks.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = ffs_balloc(vp, ffs_lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, l->l_cred, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}

out:
	UFS_WAPBL_END(mp);
	return error;
}

/*
 * Create a copy of the superblock and its summary information.
 * It is up to the caller to free the copy (*sbbuf) and copyfs->fs_csp.
 */
static int
snapshot_copyfs(struct mount *mp, struct vnode *vp, void **sbbuf)
{
	int error, i, len, loc, size;
	void *space;
	int32_t *lp;
	struct buf *bp;
	struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
	struct vnode *devvp = VTOI(vp)->i_devvp;

	/*
	 * Grab a copy of the superblock and its summary information.
	 * We delay writing it until the suspension is released below.
	 */
	*sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
	loc = ffs_blkoff(fs, fs->fs_sblockloc);
	if (loc > 0)
		memset(*sbbuf, 0, loc);
	copyfs = (struct fs *)((char *)(*sbbuf) + loc);
	memcpy(copyfs, fs, fs->fs_sbsize);
	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
	if (fs->fs_sbsize < size)
		memset((char *)(*sbbuf) + loc + fs->fs_sbsize, 0,
		    size - fs->fs_sbsize);
	size = ffs_blkroundup(fs, fs->fs_cssize);
	if (fs->fs_contigsumsize > 0)
		size += fs->fs_ncg * sizeof(int32_t);
	space = malloc(size, M_UFSMNT, M_WAITOK);
	copyfs->fs_csp = space;
	memcpy(copyfs->fs_csp, fs->fs_csp, fs->fs_cssize);
	space = (char *)space + fs->fs_cssize;
	loc = howmany(fs->fs_cssize, fs->fs_fsize);
	i = fs->fs_frag - loc % fs->fs_frag;
	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
	if (len > 0) {
		if ((error = bread(devvp, FFS_FSBTODB(fs, fs->fs_csaddr + loc),
		    len, 0, &bp)) != 0) {
			free(copyfs->fs_csp, M_UFSMNT);
			free(*sbbuf, M_UFSMNT);
			*sbbuf = NULL;
			return error;
		}
		memcpy(space, bp->b_data, (u_int)len);
		space = (char *)space + len;
		brelse(bp, BC_INVAL | BC_NOCACHE);
	}
	if (fs->fs_contigsumsize > 0) {
		copyfs->fs_maxcluster = lp = space;
		for (i = 0; i < fs->fs_ncg; i++)
			*lp++ = fs->fs_contigsumsize;
	}
	if (mp->mnt_wapbl)
		copyfs->fs_flags &= ~FS_DOWAPBL;
	return 0;
}

struct snapshot_expunge_ctx {
	struct vnode *logvp;
	struct lwp *l;
	struct vnode *vp;
	struct fs *copy_fs;
};

static bool
snapshot_expunge_selector(void *cl, struct vnode *xvp)
{
	struct vattr vat;
	struct snapshot_expunge_ctx *c = cl;
	struct inode *xp;

	KASSERT(mutex_owned(xvp->v_interlock));

	xp = VTOI(xvp);
	if (xvp->v_type == VNON || VTOI(xvp) == NULL ||
	    (xp->i_flags & SF_SNAPSHOT))
		return false;
#ifdef DEBUG
	if (snapdebug)
		vprint("ffs_snapshot: busy vnode", xvp);
#endif

	if (xvp == c->logvp)
		return true;

	if (VOP_GETATTR(xvp, &vat, c->l->l_cred) == 0 &&
	    vat.va_nlink > 0)
		return false;

	if (ffs_checkfreefile(c->copy_fs, c->vp, xp->i_number))
		return false;

	return true;
}

/*
 * We must check for active files that have been unlinked (i.e., those with a zero
 * link count). We have to expunge all trace of these files from the snapshot
 * so that they are not reclaimed prematurely by fsck or unnecessarily dumped.
 * Note that we skip unlinked snapshot files as they will be handled separately.
 * Calculate the snapshot list size and create a preliminary list.
 */
static int
snapshot_expunge(struct mount *mp, struct vnode *vp, struct fs *copy_fs,
    daddr_t *snaplistsize, daddr_t **snaplist)
{
	int cg, error = 0, len, loc;
	daddr_t blkno, *blkp;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct inode *xp;
	struct lwp *l = curlwp;
	struct vnode *logvp = NULL, *xvp;
	struct vnode_iterator *marker;
	struct snapshot_expunge_ctx ctx;

	*snaplist = NULL;
	/*
	 * Get the log inode if any.
	 */
	if ((fs->fs_flags & FS_DOWAPBL) &&
	    fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
		error = VFS_VGET(mp, fs->fs_journallocs[UFS_WAPBL_INFS_INO],
		    LK_EXCLUSIVE, &logvp);
		if (error)
			goto out;
	}
	/*
	 * We also calculate the needed size for the snapshot list.
	 */
	*snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
	    FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;

	vfs_vnode_iterator_init(mp, &marker);
	ctx.logvp = logvp;
	ctx.l = l;
	ctx.vp = vp;
	ctx.copy_fs = copy_fs;
	while ((xvp = vfs_vnode_iterator_next(marker, snapshot_expunge_selector,
	    &ctx)))
	{
		/*
		 * If there is a fragment, clear it here.
		 */
		xp = VTOI(xvp);
		blkno = 0;
		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
		if (loc < UFS_NDADDR) {
			len = ffs_fragroundup(fs, ffs_blkoff(fs, xp->i_size));
			if (len > 0 && len < fs->fs_bsize) {
				error = UFS_WAPBL_BEGIN(mp);
				if (error) {
					vrele(xvp);
					vfs_vnode_iterator_destroy(marker);
					goto out;
				}
				ffs_blkfree_snap(copy_fs, vp, db_get(xp, loc),
				    len, xp->i_number);
				blkno = db_get(xp, loc);
				db_assign(xp, loc, 0);
				UFS_WAPBL_END(mp);
			}
		}
		*snaplistsize += 1;
		error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY);
		if (blkno)
			db_assign(xp, loc, blkno);
		if (!error) {
			error = UFS_WAPBL_BEGIN(mp);
			if (!error) {
				error = ffs_freefile_snap(copy_fs, vp,
				    xp->i_number, xp->i_mode);
				UFS_WAPBL_END(mp);
			}
		}
		vrele(xvp);
		if (error) {
			vfs_vnode_iterator_destroy(marker);
			goto out;
		}
	}
	vfs_vnode_iterator_destroy(marker);

	/*
	 * Create a preliminary list of preallocated snapshot blocks.
	 */
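	/*
	 * List layout: entry [0] holds the total entry count, entry [1]
	 * the superblock, followed by the cylinder group blocks located
	 * before fs_csaddr, then the summary information blocks, and
	 * finally the remaining cylinder group blocks.
	 */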
	*snaplist = malloc(*snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
	blkp = &(*snaplist)[1];
	*blkp++ = ffs_lblkno(fs, fs->fs_sblockloc);
	blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if (ffs_fragstoblks(fs, cgtod(fs, cg)) > blkno)
			break;
		*blkp++ = ffs_fragstoblks(fs, cgtod(fs, cg));
	}
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++)
		*blkp++ = blkno + loc;
	for (; cg < fs->fs_ncg; cg++)
		*blkp++ = ffs_fragstoblks(fs, cgtod(fs, cg));
	(*snaplist)[0] = blkp - &(*snaplist)[0];

out:
	if (logvp != NULL)
		vput(logvp);
	if (error && *snaplist != NULL) {
		free(*snaplist, M_UFSMNT);
		*snaplist = NULL;
	}

	return error;
}

/*
 * Copy allocation information from all existing snapshots into this
 * snapshot and then expunge them from its view. Also, collect the
 * list of allocated blocks in i_snapblklist.
 */
static int
snapshot_expunge_snap(struct mount *mp, struct vnode *vp,
    struct fs *copy_fs, daddr_t snaplistsize)
{
	int error = 0, i;
	daddr_t numblks, *snaplist = NULL;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct inode *ip = VTOI(vp), *xp;
	struct lwp *l = curlwp;
	struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;

	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) {
		if (xp != ip) {
			error = expunge(vp, xp, fs, snapacct, BLK_SNAP);
			if (error)
				break;
		}
		if (xp->i_nlink != 0)
			continue;
		error = UFS_WAPBL_BEGIN(mp);
		if (error)
			break;
		error = ffs_freefile_snap(copy_fs, vp, xp->i_number, xp->i_mode);
		UFS_WAPBL_END(mp);
		if (error)
			break;
	}
	if (error)
		goto out;
	/*
	 * Allocate space for the full list of preallocated snapshot blocks.
	 */
	snaplist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
	ip->i_snapblklist = &snaplist[1];
	/*
	 * Expunge the blocks used by the snapshots from the set of
	 * blocks marked as used in the snapshot bitmaps. Also, collect
	 * the list of allocated blocks in i_snapblklist.
	 */
	error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP);
	if (error)
		goto out;
	if (snaplistsize < ip->i_snapblklist - snaplist)
		panic("ffs_snapshot: list too small");
	snaplistsize = ip->i_snapblklist - snaplist;
	snaplist[0] = snaplistsize;
	ip->i_snapblklist = &snaplist[0];
	/*
	 * Write out the list of allocated blocks to the end of the snapshot.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	for (i = 0; i < snaplistsize; i++)
		snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
	error = vn_rdwr(UIO_WRITE, vp, (void *)snaplist,
	    snaplistsize * sizeof(daddr_t), ffs_lblktosize(fs, (off_t)numblks),
	    UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT, l->l_cred, NULL, NULL);
	for (i = 0; i < snaplistsize; i++)
		snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
out:
	if (error && snaplist != NULL) {
		free(snaplist, M_UFSMNT);
		ip->i_snapblklist = NULL;
	}
	return error;
}

/*
 * Write the superblock and its summary information to the snapshot.
 * Make sure the first UFS_NDADDR blocks get copied to the snapshot.
 */
static int
snapshot_writefs(struct mount *mp, struct vnode *vp, void *sbbuf)
{
	int error, len, loc;
	void *space;
	daddr_t blkno;
	struct buf *bp;
	struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
	struct inode *ip = VTOI(vp);
	struct lwp *l = curlwp;

	copyfs = (struct fs *)((char *)sbbuf + ffs_blkoff(fs, fs->fs_sblockloc));

	/*
	 * Write the superblock and its summary information
	 * to the snapshot.
	 */
	blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	space = copyfs->fs_csp;
#ifdef FFS_EI
	if (UFS_FSNEEDSWAP(fs)) {
		ffs_sb_swap(copyfs, copyfs);
		ffs_csum_swap(space, space, fs->fs_cssize);
	}
#endif
	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	for (loc = 0; loc < len; loc++) {
		error = bread(vp, blkno + loc, fs->fs_bsize,
		    B_MODIFY, &bp);
		if (error) {
			break;
		}
		memcpy(bp->b_data, space, fs->fs_bsize);
		space = (char *)space + fs->fs_bsize;
		bawrite(bp);
	}
	if (error)
		goto out;
	error = bread(vp, ffs_lblkno(fs, fs->fs_sblockloc),
	    fs->fs_bsize, B_MODIFY, &bp);
	if (error) {
		goto out;
	} else {
		memcpy(bp->b_data, sbbuf, fs->fs_bsize);
		bawrite(bp);
	}
	/*
	 * Copy the first UFS_NDADDR blocks to the snapshot so
	 * ffs_copyonwrite() and ffs_snapblkfree() will always work on
	 * indirect blocks.
	 */
	for (loc = 0; loc < UFS_NDADDR; loc++) {
		if (db_get(ip, loc) != 0)
			continue;
		error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)loc),
		    fs->fs_bsize, l->l_cred, 0, &bp);
		if (error)
			break;
		error = rwfsblk(vp, B_READ, bp->b_data, loc);
		if (error) {
			brelse(bp, 0);
			break;
		}
		bawrite(bp);
	}

out:
	UFS_WAPBL_END(mp);
	return error;
}

/*
 * Copy all cylinder group maps.
 */
static int
cgaccount(struct vnode *vp, int passno, int *redo)
{
	int cg, error = 0;
	struct buf *nbp;
	struct fs *fs = VTOI(vp)->i_fs;

	if (redo != NULL)
		*redo = 0;
	if (passno == 1)
		fs->fs_active = malloc(howmany(fs->fs_ncg, NBBY),
		    M_DEVBUF, M_WAITOK | M_ZERO);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if (passno == 2 && ACTIVECG_ISSET(fs, cg))
			continue;

		if (redo != NULL)
			*redo += 1;
		error = UFS_WAPBL_BEGIN(vp->v_mount);
		if (error)
			return error;
		error = ffs_balloc(vp, ffs_lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, curlwp->l_cred, 0, &nbp);
		if (error) {
			UFS_WAPBL_END(vp->v_mount);
			break;
		}
		error = cgaccount1(cg, vp, nbp->b_data, passno);
		bawrite(nbp);
		UFS_WAPBL_END(vp->v_mount);
		if (error)
			break;
	}
	return error;
}

/*
 * Copy a cylinder group map. All the unallocated blocks are marked
 * BLK_NOCOPY so that the snapshot knows that it need not copy them
 * if they are later written. If passno is one, then this is a first
 * pass, so only setting needs to be done. If passno is 2, then this
 * is a revision to a previous pass which must be undone as the
 * replacement pass is done.
 */
static int
cgaccount1(int cg, struct vnode *vp, void *data, int passno)
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	struct lwp *l = curlwp;
	daddr_t base, numblks;
	int error, len, loc, ns __unused, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	ns = UFS_FSNEEDSWAP(fs);
	error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, 0, &bp);
	if (error) {
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, ns)) {
		brelse(bp, 0);
		return (EIO);
	}
	ACTIVECG_SET(fs, cg);

	memcpy(data, bp->b_data, fs->fs_cgsize);
	brelse(bp, 0);
	if (fs->fs_cgsize < fs->fs_bsize)
		memset((char *)data + fs->fs_cgsize, 0,
		    fs->fs_bsize - fs->fs_cgsize);
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cg * fs->fs_fpg / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
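	/*
	 * Walk every block of this cylinder group: direct block
	 * pointers first, then the corresponding slots in the
	 * snapshot's indirect blocks, marking each free block
	 * BLK_NOCOPY.
	 */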
	loc = 0;
	if (base < UFS_NDADDR) {
		for ( ; loc < UFS_NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
				db_assign(ip, loc, BLK_NOCOPY);
			else if (db_get(ip, loc) == BLK_NOCOPY) {
				if (passno == 2)
					db_assign(ip, loc, 0);
				else if (passno == 1)
					panic("ffs_snapshot: lost direct block");
			}
		}
	}
	if ((error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
		return (error);
	indiroff = (base + loc - UFS_NDADDR) % FFS_NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= FFS_NINDIR(fs)) {
			bawrite(ibp);
			if ((error = ffs_balloc(vp,
			    ffs_lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
				return (error);
			indiroff = 0;
		}
		if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
			idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY);
		else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) {
			if (passno == 2)
				idb_assign(ip, ibp->b_data, indiroff, 0);
			else if (passno == 1)
				panic("ffs_snapshot: lost indirect block");
		}
	}
	bdwrite(ibp);
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed.
 */
static int
expunge(struct vnode *snapvp, struct inode *cancelip, struct fs *fs,
    acctfunc_t acctfunc, int expungetype)
{
	int i, error, ns __unused;
	daddr_t lbn, rlbn;
	daddr_t len, blkno, numblks, blksperindir;
	struct ufs1_dinode *dip1;
	struct ufs2_dinode *dip2;
	struct lwp *l = curlwp;
	void *bap;
	struct buf *bp;
	struct mount *mp;

	ns = UFS_FSNEEDSWAP(fs);
	mp = snapvp->v_mount;

	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = ffs_fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	error = snapblkaddr(snapvp, lbn, &blkno);
	if (error)
		return error;
	if (blkno != 0) {
		error = bread(snapvp, lbn, fs->fs_bsize,
		    B_MODIFY, &bp);
	} else {
		error = ffs_balloc(snapvp, ffs_lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, l->l_cred, 0, &bp);
		if (! error)
			error = rwfsblk(snapvp, B_READ, bp->b_data, lbn);
	}
	if (error) {
		UFS_WAPBL_END(mp);
		return error;
	}
	/*
	 * Set a snapshot inode to be a zero-length file; set regular
	 * files or unlinked snapshots to be completely unallocated.
	 */
	if (fs->fs_magic == FS_UFS1_MAGIC) {
		dip1 = (struct ufs1_dinode *)bp->b_data +
		    ino_to_fsbo(fs, cancelip->i_number);
		if (cancelip->i_flags & SF_SNAPSHOT) {
			dip1->di_flags =
			    ufs_rw32(ufs_rw32(dip1->di_flags, ns) |
			    SF_SNAPINVAL, ns);
		}
		if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
			dip1->di_mode = 0;
		dip1->di_size = 0;
		dip1->di_blocks = 0;
		memset(&dip1->di_db[0], 0, (UFS_NDADDR + UFS_NIADDR) * sizeof(int32_t));
	} else {
		dip2 = (struct ufs2_dinode *)bp->b_data +
		    ino_to_fsbo(fs, cancelip->i_number);
		if (cancelip->i_flags & SF_SNAPSHOT) {
			dip2->di_flags =
			    ufs_rw32(ufs_rw32(dip2->di_flags, ns) |
			    SF_SNAPINVAL, ns);
		}
		if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
			dip2->di_mode = 0;
		dip2->di_size = 0;
		dip2->di_blocks = 0;
		memset(&dip2->di_db[0], 0, (UFS_NDADDR + UFS_NIADDR) * sizeof(int64_t));
	}
	bdwrite(bp);
	UFS_WAPBL_END(mp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	if (fs->fs_magic == FS_UFS1_MAGIC)
		bap = &cancelip->i_ffs1_db[0];
	else
		bap = &cancelip->i_ffs2_db[0];
	error = (*acctfunc)(snapvp, bap, 0, UFS_NDADDR, fs, 0, expungetype);
	if (error)
		return (error);
	if (fs->fs_magic == FS_UFS1_MAGIC)
		bap = &cancelip->i_ffs1_ib[0];
	else
		bap = &cancelip->i_ffs2_ib[0];
	error = (*acctfunc)(snapvp, bap, 0, UFS_NIADDR, fs, -1, expungetype);
	if (error)
		return (error);
	blksperindir = 1;
	lbn = -UFS_NDADDR;
	len = numblks - UFS_NDADDR;
	rlbn = UFS_NDADDR;
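	/*
	 * Negative lbn values address the indirect blocks themselves
	 * (the ufs_getlbns() convention); rlbn tracks the first data
	 * block covered by each indirect level.
	 */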
	for (i = 0; len > 0 && i < UFS_NIADDR; i++) {
		error = indiracct(snapvp, ITOV(cancelip), i,
		    ib_get(cancelip, i), lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= FFS_NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct(struct vnode *snapvp, struct vnode *cancelvp, int level,
    daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks,
    daddr_t blksperindir, struct fs *fs, acctfunc_t acctfunc, int expungetype)
{
	int error, num, i;
	daddr_t subblksperindir;
	struct indir indirs[UFS_NIADDR + 2];
	daddr_t last;
	void *bap;
	struct buf *bp;

	if (blkno == 0) {
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct: missing indir");
	}
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	error = ffs_getblk(cancelvp, lbn, FFS_FSBTODB(fs, blkno), fs->fs_bsize,
	    false, &bp);
	if (error)
		return error;
	if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error =
	    rwfsblk(bp->b_vp, B_READ, bp->b_data, ffs_fragstoblks(fs, blkno)))) {
		brelse(bp, 0);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > FFS_NINDIR(fs))
		last = FFS_NINDIR(fs);
	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK | M_ZERO);
	memcpy((void *)bap, bp->b_data, fs->fs_bsize);
	brelse(bp, 0);
	error = (*acctfunc)(snapvp, bap, 0, last,
	    fs, level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / FFS_NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct(snapvp, cancelvp, level,
		    idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks,
		    subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	free(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
    struct fs *fs, daddr_t lblkno,
    int exptype /* BLK_SNAP or BLK_NOCOPY */)
{
	int error;

	if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
    struct fs *fs, daddr_t lblkno,
    int expungetype /* BLK_SNAP or BLK_NOCOPY */)
{
	struct inode *ip = VTOI(vp);
	struct lwp *l = curlwp;
	struct mount *mp = vp->v_mount;
	daddr_t blkno;
	daddr_t lbn;
	struct buf *ibp;
	int error, n;
	const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;

	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	for ( n = 0; oldblkp < lastblkp; oldblkp++) {
		blkno = idb_get(ip, bap, oldblkp);
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = ffs_fragstoblks(fs, blkno);
		if (lbn < UFS_NDADDR) {
			blkno = db_get(ip, lbn);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
			if (error)
				break;
			blkno = idb_get(ip, ibp->b_data,
			    (lbn - UFS_NDADDR) % FFS_NINDIR(fs));
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
			if (lbn >= UFS_NDADDR)
				brelse(ibp, 0);
		} else {
			if (blkno != 0)
				panic("snapacct: bad block");
			if (lbn < UFS_NDADDR)
				db_assign(ip, lbn, expungetype);
			else {
				idb_assign(ip, ibp->b_data,
				    (lbn - UFS_NDADDR) % FFS_NINDIR(fs), expungetype);
				bdwrite(ibp);
			}
		}
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}
	UFS_WAPBL_END(mp);
	return error;
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
    struct fs *fs, daddr_t lblkno, int expungetype)
{
	daddr_t blkno;
	struct inode *ip;
	struct mount *mp = vp->v_mount;
	ino_t inum;
	int acctit, error, n;
	const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;

	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	ip = VTOI(vp);
	inum = ip->i_number;
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( n = 0; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = idb_get(ip, bap, oldblkp);
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = ffs_blkstofrags(fs, lblkno);
		ffs_blkfree_snap(fs, vp, blkno, fs->fs_bsize, inum);
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}
	UFS_WAPBL_END(mp);
	return (0);
}

/*
 * Number of blocks that fit into the journal or zero if not logging.
 */
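/*
 * For example (illustrative numbers only): a 16 MB in-filesystem
 * journal on a filesystem with 8 KB blocks yields 2048 journal
 * blocks, so callers chunking work with blocks_in_journal(fs)/8
 * commit roughly every 256 blocks.
 */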
static int
blocks_in_journal(struct fs *fs)
{
	off_t bpj;

	if ((fs->fs_flags & FS_DOWAPBL) == 0)
		return 0;
	bpj = 1;
	if (fs->fs_journal_version == UFS_WAPBL_VERSION) {
		switch (fs->fs_journal_location) {
		case UFS_WAPBL_JOURNALLOC_END_PARTITION:
			bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]*
			    fs->fs_journallocs[UFS_WAPBL_EPART_COUNT];
			break;
		case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
			bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]*
			    fs->fs_journallocs[UFS_WAPBL_INFS_COUNT];
			break;
		}
	}
	bpj /= fs->fs_bsize;
	return (bpj > 0 ? bpj : 1);
}
#endif /* defined(FFS_NO_SNAPSHOT) */

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(struct vnode *vp)
{
	struct inode *xp, *ip = VTOI(vp);
	struct mount *mp = spec_node_getmountedfs(ip->i_devvp);
	struct fs *fs;
	struct snap_info *si;
	int snaploc;

	si = VFSTOUFS(mp)->um_snapinfo;

	/*
	 * Find snapshot in incore list.
	 */
	mutex_enter(&si->si_lock);
	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	mutex_exit(&si->si_lock);
	if (xp != NULL)
		vrele(ITOV(ip));
#ifdef DEBUG
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %llu\n",
		    (unsigned long long)ip->i_number);
#endif
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	mutex_enter(&si->si_lock);
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
	si->si_gen++;
	mutex_exit(&si->si_lock);
}

/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(struct vnode *vp)
{
	struct inode *ip = VTOI(vp), *xp;
	struct vnode *devvp = ip->i_devvp;
	struct fs *fs = ip->i_fs;
	struct mount *mp = spec_node_getmountedfs(devvp);
	struct buf *ibp;
	struct snap_info *si;
	struct lwp *l = curlwp;
	daddr_t numblks, blkno, dblk;
	int error, loc, last;

	si = VFSTOUFS(mp)->um_snapinfo;
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	mutex_enter(&si->si_snaplock);
	mutex_enter(&si->si_lock);
	if (is_active_snapshot(si, ip)) {
		TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap);
		if (TAILQ_FIRST(&si->si_snapshots) != 0) {
			/* Roll back the list of preallocated blocks. */
			xp = TAILQ_LAST(&si->si_snapshots, inodelst);
			si->si_snapblklist = xp->i_snapblklist;
			si->si_gen++;
			mutex_exit(&si->si_lock);
			mutex_exit(&si->si_snaplock);
		} else {
			si->si_snapblklist = 0;
			si->si_gen++;
			mutex_exit(&si->si_lock);
			mutex_exit(&si->si_snaplock);
			fscow_disestablish(mp, ffs_copyonwrite, devvp);
		}
		if (ip->i_snapblklist != NULL) {
			free(ip->i_snapblklist, M_UFSMNT);
			ip->i_snapblklist = NULL;
		}
	} else {
		mutex_exit(&si->si_lock);
		mutex_exit(&si->si_snaplock);
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	for (blkno = 1; blkno < UFS_NDADDR; blkno++) {
		dblk = db_get(ip, blkno);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			db_assign(ip, blkno, 0);
		else if ((dblk == ffs_blkstofrags(fs, blkno) &&
		    ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		    ip->i_number))) {
			DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
			db_assign(ip, blkno, 0);
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = UFS_NDADDR; blkno < numblks; blkno += FFS_NINDIR(fs)) {
		error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > FFS_NINDIR(fs))
			last = FFS_NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			dblk = idb_get(ip, ibp->b_data, loc);
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				idb_assign(ip, ibp->b_data, loc, 0);
			else if (dblk == ffs_blkstofrags(fs, blkno) &&
			    ffs_snapblkfree(fs, ip->i_devvp, dblk,
			    fs->fs_bsize, ip->i_number)) {
				DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
				idb_assign(ip, ibp->b_data, loc, 0);
			}
		}
		bawrite(ibp);
		UFS_WAPBL_END(mp);
		error = UFS_WAPBL_BEGIN(mp);
		KASSERT(error == 0);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~(SF_SNAPSHOT | SF_SNAPINVAL);
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
#if defined(QUOTA) || defined(QUOTA2)
	chkdq(ip, DIP(ip, blocks), l->l_cred, FORCE);
	chkiq(ip, 1, l->l_cred, FORCE);
#endif
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because it
 * must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(struct fs *fs, struct vnode *devvp, daddr_t bno,
    long size, ino_t inum)
{
	struct mount *mp = spec_node_getmountedfs(devvp);
	struct buf *ibp;
	struct inode *ip;
	struct vnode *vp = NULL;
	struct snap_info *si;
	void *saved_data = NULL;
	daddr_t lbn;
	daddr_t blkno;
	uint32_t gen;
	int indiroff = 0, error = 0, claimedblk = 0;

	si = VFSTOUFS(mp)->um_snapinfo;
	lbn = ffs_fragstoblks(fs, bno);
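	/*
	 * Take the common snapshot lock and record ourselves as its
	 * owner: writes issued below may re-enter ffs_copyonwrite(),
	 * which checks si_owner to avoid deadlocking against us.
	 */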
	mutex_enter(&si->si_snaplock);
	mutex_enter(&si->si_lock);
	si->si_owner = curlwp;

retry:
	gen = si->si_gen;
	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < UFS_NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			mutex_exit(&si->si_lock);
			error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, FSCRED, B_METAONLY, &ibp);
			if (error) {
				mutex_enter(&si->si_lock);
				break;
			}
			indiroff = (lbn - UFS_NDADDR) % FFS_NINDIR(fs);
			blkno = idb_get(ip, ibp->b_data, indiroff);
			mutex_enter(&si->si_lock);
			if (gen != si->si_gen) {
				brelse(ibp, 0);
				goto retry;
			}
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			if (lbn < UFS_NDADDR) {
				db_assign(ip, lbn, BLK_NOCOPY);
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else {
				idb_assign(ip, ibp->b_data, indiroff,
				    BLK_NOCOPY);
				mutex_exit(&si->si_lock);
				if (ip->i_nlink > 0)
					bwrite(ibp);
				else
					bdwrite(ibp);
				mutex_enter(&si->si_lock);
				if (gen != si->si_gen)
					goto retry;
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= UFS_NDADDR)
				brelse(ibp, 0);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %llu lbn %" PRId64
				    " from inum %llu\n",
				    "Grabonremove: snapino",
				    (unsigned long long)ip->i_number,
				    lbn, (unsigned long long)inum);
1659 #endif
1660 mutex_exit(&si->si_lock);
1661 if (lbn < UFS_NDADDR) {
1662 db_assign(ip, lbn, bno);
1663 } else {
1664 idb_assign(ip, ibp->b_data, indiroff, bno);
1665 if (ip->i_nlink > 0)
1666 bwrite(ibp);
1667 else
1668 bdwrite(ibp);
1669 }
1670 DIP_ADD(ip, blocks, btodb(size));
1671 ip->i_flag |= IN_CHANGE | IN_UPDATE;
1672 if (ip->i_nlink > 0 && mp->mnt_wapbl)
1673 error = syncsnap(vp);
1674 else
1675 error = 0;
1676 mutex_enter(&si->si_lock);
1677 si->si_owner = NULL;
1678 mutex_exit(&si->si_lock);
1679 mutex_exit(&si->si_snaplock);
1680 return (error == 0);
1681 }
1682 if (lbn >= UFS_NDADDR)
1683 brelse(ibp, 0);
1684 #ifdef DEBUG
1685 if (snapdebug)
1686 printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n",
1687 "Copyonremove: snapino ",
1688 (unsigned long long)ip->i_number,
1689 lbn, "for inum", (unsigned long long)inum, size);
1690 #endif
1691 /*
1692 * If we have already read the old block contents, then
1693 * simply copy them to the new block. Note that we need
1694 * to synchronously write snapshots that have not been
1695 * unlinked, and hence will be visible after a crash,
1696 * to ensure their integrity.
1697 */
1698 mutex_exit(&si->si_lock);
1699 if (saved_data == NULL) {
1700 saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
1701 error = rwfsblk(vp, B_READ, saved_data, lbn);
1702 if (error) {
1703 free(saved_data, M_UFSMNT);
1704 saved_data = NULL;
1705 mutex_enter(&si->si_lock);
1706 break;
1707 }
1708 }
1709 error = wrsnapblk(vp, saved_data, lbn);
1710 if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
1711 error = syncsnap(vp);
1712 mutex_enter(&si->si_lock);
1713 if (error)
1714 break;
1715 if (gen != si->si_gen)
1716 goto retry;
1717 }
1718 si->si_owner = NULL;
1719 mutex_exit(&si->si_lock);
1720 mutex_exit(&si->si_snaplock);
1721 if (saved_data)
1722 free(saved_data, M_UFSMNT);
1723 /*
1724 * If we have been unable to allocate a block in which to do
1725 * the copy, then return non-zero so that the fragment will
1726 * not be freed. Although space will be lost, the snapshot
1727 * will stay consistent.
1728 */
1729 return (error);
1730 }
1731
1732 /*
1733 * Associate snapshot files when mounting.
1734 */
1735 void
1736 ffs_snapshot_mount(struct mount *mp)
1737 {
1738 struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
1739 struct fs *fs = VFSTOUFS(mp)->um_fs;
1740 struct lwp *l = curlwp;
1741 struct vnode *vp;
1742 struct inode *ip, *xp;
1743 struct snap_info *si;
1744 daddr_t snaplistsize, *snapblklist;
1745 int i, error, ns __unused, snaploc, loc;
1746
1747 /*
1748 * No persistent snapshots on apple ufs file systems.
1749 */
1750 if (UFS_MPISAPPLEUFS(VFSTOUFS(mp)))
1751 return;
1752
1753 si = VFSTOUFS(mp)->um_snapinfo;
1754 ns = UFS_FSNEEDSWAP(fs);
1755 /*
1756 * XXX The following needs to be set before ffs_truncate or
1757 * VOP_READ can be called.
1758 */
1759 mp->mnt_stat.f_iosize = fs->fs_bsize;
1760 /*
1761 * Process each snapshot listed in the superblock.
1762 */
1763 vp = NULL;
1764 mutex_enter(&si->si_lock);
1765 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
1766 if (fs->fs_snapinum[snaploc] == 0)
1767 break;
1768 if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
1769 LK_EXCLUSIVE, &vp)) != 0) {
1770 printf("ffs_snapshot_mount: vget failed %d\n", error);
1771 continue;
1772 }
1773 ip = VTOI(vp);
1774 if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) !=
1775 SF_SNAPSHOT) {
1776 printf("ffs_snapshot_mount: non-snapshot inode %d\n",
1777 fs->fs_snapinum[snaploc]);
1778 vput(vp);
1779 vp = NULL;
1780 for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
1781 if (fs->fs_snapinum[loc] == 0)
1782 break;
1783 fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
1784 }
1785 fs->fs_snapinum[loc - 1] = 0;
1786 snaploc--;
1787 continue;
1788 }
1789
1790 /*
1791 * Read the block hints list. Use an empty list on
1792 * read errors.
1793 */
1794 error = vn_rdwr(UIO_READ, vp,
1795 (void *)&snaplistsize, sizeof(snaplistsize),
1796 ffs_lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
1797 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
1798 l->l_cred, NULL, NULL);
1799 if (error) {
1800 printf("ffs_snapshot_mount: read_1 failed %d\n", error);
1801 snaplistsize = 1;
1802 } else
1803 snaplistsize = ufs_rw64(snaplistsize, ns);
1804 snapblklist = malloc(
1805 snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
1806 if (error)
1807 snapblklist[0] = 1;
1808 else {
1809 error = vn_rdwr(UIO_READ, vp, (void *)snapblklist,
1810 snaplistsize * sizeof(daddr_t),
1811 ffs_lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
1812 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
1813 l->l_cred, NULL, NULL);
1814 for (i = 0; i < snaplistsize; i++)
1815 snapblklist[i] = ufs_rw64(snapblklist[i], ns);
1816 if (error) {
1817 printf("ffs_snapshot_mount: read_2 failed %d\n",
1818 error);
1819 snapblklist[0] = 1;
1820 }
1821 }
1822 ip->i_snapblklist = &snapblklist[0];
1823
1824 /*
1825 * Link it onto the active snapshot list.
1826 */
1827 if (is_active_snapshot(si, ip))
1828 panic("ffs_snapshot_mount: %"PRIu64" already on list",
1829 ip->i_number);
1830 else
1831 TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
1832 vp->v_vflag |= VV_SYSTEM;
1833 VOP_UNLOCK(vp);
1834 }
1835 /*
1836 * No usable snapshots found.
1837 */
1838 if (vp == NULL) {
1839 mutex_exit(&si->si_lock);
1840 return;
1841 }
1842 /*
1843 * Attach the block hints list. We always want to
1844 * use the list from the newest snapshot.
1845 */
1846 xp = TAILQ_LAST(&si->si_snapshots, inodelst);
1847 si->si_snapblklist = xp->i_snapblklist;
1848 fscow_establish(mp, ffs_copyonwrite, devvp);
1849 si->si_gen++;
1850 mutex_exit(&si->si_lock);
1851 }
1852
1853 /*
1854 * Disassociate snapshot files when unmounting.
1855 */
1856 void
1857 ffs_snapshot_unmount(struct mount *mp)
1858 {
1859 struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
1860 struct inode *xp;
1861 struct vnode *vp = NULL;
1862 struct snap_info *si;
1863
1864 si = VFSTOUFS(mp)->um_snapinfo;
1865 mutex_enter(&si->si_lock);
1866 while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) {
1867 vp = ITOV(xp);
1868 TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap);
1869 if (xp->i_snapblklist == si->si_snapblklist)
1870 si->si_snapblklist = NULL;
1871 free(xp->i_snapblklist, M_UFSMNT);
1872 if (xp->i_nlink > 0) {
1873 si->si_gen++;
1874 mutex_exit(&si->si_lock);
1875 vrele(vp);
1876 mutex_enter(&si->si_lock);
1877 }
1878 }
1879 si->si_gen++;
1880 mutex_exit(&si->si_lock);
1881 if (vp)
1882 fscow_disestablish(mp, ffs_copyonwrite, devvp);
1883 }
1884
1885 /*
1886  * Check whether the block about to be written needs to be copied
1887  * to the snapshots, and copy it if necessary.
1888 */
1889 static int
1890 ffs_copyonwrite(void *v, struct buf *bp, bool data_valid)
1891 {
1892 struct fs *fs;
1893 struct inode *ip;
1894 struct vnode *devvp = v, *vp = NULL;
1895 struct mount *mp = spec_node_getmountedfs(devvp);
1896 struct snap_info *si;
1897 void *saved_data = NULL;
1898 daddr_t lbn, blkno, *snapblklist;
1899 uint32_t gen;
1900 int lower, upper, mid, snapshot_locked = 0, error = 0;
1901
1902 /*
1903 * Check for valid snapshots.
1904 */
1905 si = VFSTOUFS(mp)->um_snapinfo;
1906 mutex_enter(&si->si_lock);
1907 ip = TAILQ_FIRST(&si->si_snapshots);
1908 if (ip == NULL) {
1909 mutex_exit(&si->si_lock);
1910 return 0;
1911 }
1912 /*
1913 	 * First check whether the block lies beyond the end of the file
1914 	 * system, inside the journal, or in the preallocated hint list.
1915 * By doing these checks we avoid several potential deadlocks.
1916 */
1917 fs = ip->i_fs;
1918 lbn = ffs_fragstoblks(fs, FFS_DBTOFSB(fs, bp->b_blkno));
1919 if (bp->b_blkno >= FFS_FSBTODB(fs, fs->fs_size)) {
1920 mutex_exit(&si->si_lock);
1921 return 0;
1922 }
1923 if ((fs->fs_flags & FS_DOWAPBL) &&
1924 fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
1925 off_t blk_off, log_start, log_end;
1926
1927 log_start = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] *
1928 fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
1929 log_end = log_start + fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] *
1930 fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
1931 blk_off = dbtob(bp->b_blkno);
1932 if (blk_off >= log_start && blk_off < log_end) {
1933 mutex_exit(&si->si_lock);
1934 return 0;
1935 }
1936 }
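	/*
	 * A worked example of the range check above, with hypothetical
	 * journal parameters: for fs_journallocs[] giving addr = 1000,
	 * blksz = 8192 and count = 128, the journal occupies byte offsets
	 * [1000 * 8192, 1000 * 8192 + 128 * 8192) on the device, and any
	 * write whose dbtob(b_blkno) falls inside that window is left to
	 * WAPBL rather than copied to the snapshots.
	 */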
1937 snapblklist = si->si_snapblklist;
1938 upper = (snapblklist != NULL ? snapblklist[0] - 1 : 0);
1939 lower = 1;
1940 while (lower <= upper) {
1941 mid = (lower + upper) / 2;
1942 if (snapblklist[mid] == lbn)
1943 break;
1944 if (snapblklist[mid] < lbn)
1945 lower = mid + 1;
1946 else
1947 upper = mid - 1;
1948 }
1949 if (lower <= upper) {
1950 mutex_exit(&si->si_lock);
1951 return 0;
1952 }
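	/*
	 * The search above relies on the hint list layout established in
	 * ffs_snapshot_mount(): slot 0 is the element count, so the
	 * payload occupies slots 1..count-1 in ascending order.  A
	 * minimal sketch of the same search (the helper name is
	 * hypothetical; the block below is illustrative only and not
	 * compiled):
	 */
#if 0
	static bool
	hintlist_contains(const daddr_t *list, daddr_t lbn)
	{
		int lower = 1, upper = (int)list[0] - 1, mid;

		while (lower <= upper) {
			mid = (lower + upper) / 2;
			if (list[mid] == lbn)
				return true;	/* present: no copy needed */
			if (list[mid] < lbn)
				lower = mid + 1;
			else
				upper = mid - 1;
		}
		return false;
	}
#endif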
1953 /*
1954 * Not in the precomputed list, so check the snapshots.
1955 */
1956 if (si->si_owner != curlwp) {
1957 if (!mutex_tryenter(&si->si_snaplock)) {
1958 mutex_exit(&si->si_lock);
1959 mutex_enter(&si->si_snaplock);
1960 mutex_enter(&si->si_lock);
1961 }
1962 si->si_owner = curlwp;
1963 snapshot_locked = 1;
1964 }
1965 if (data_valid && bp->b_bcount == fs->fs_bsize)
1966 saved_data = bp->b_data;
1967 retry:
1968 gen = si->si_gen;
1969 TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
1970 vp = ITOV(ip);
1971 /*
1972 		 * Everything the snapshot itself needs copied is copied at
1973 		 * the time ffs_snapshot() is called, so the check can be
1974 		 * skipped here; doing the lookup in ffs_balloc() could
1975 		 * deadlock.
1976 */
1977 if (bp->b_vp == vp)
1978 continue;
1979 /*
1980 * Check to see if block needs to be copied.
1981 */
1982 if (lbn < UFS_NDADDR) {
1983 blkno = db_get(ip, lbn);
1984 } else {
1985 mutex_exit(&si->si_lock);
1986 blkno = 0; /* XXX: GCC */
1987 if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
1988 mutex_enter(&si->si_lock);
1989 break;
1990 }
1991 mutex_enter(&si->si_lock);
1992 if (gen != si->si_gen)
1993 goto retry;
1994 }
1995 KASSERTMSG((blkno != BLK_SNAP || bp->b_lblkno < 0),
1996 "ffs_copyonwrite: bad copy block: blkno %jd, lblkno %jd",
1997 (intmax_t)blkno, (intmax_t)bp->b_lblkno);
1998 if (blkno != 0)
1999 continue;
2000
2001 if (curlwp == uvm.pagedaemon_lwp) {
2002 error = ENOMEM;
2003 break;
2004 }
2005 /* Only one level of recursion allowed. */
2006 KASSERT(snapshot_locked);
2007 /*
2008 * Allocate the block into which to do the copy. Since
2009 * multiple processes may all try to copy the same block,
2010 * we have to recheck our need to do a copy if we sleep
2011 * waiting for the lock.
2012 *
2013 * Because all snapshots on a filesystem share a single
2014 * lock, we ensure that we will never be in competition
2015 * with another process to allocate a block.
2016 */
2017 #ifdef DEBUG
2018 if (snapdebug) {
2019 printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
2020 (unsigned long long)ip->i_number, lbn);
2021 if (bp->b_vp == devvp)
2022 printf("fs metadata");
2023 else
2024 printf("inum %llu", (unsigned long long)
2025 VTOI(bp->b_vp)->i_number);
2026 printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
2027 }
2028 #endif
2029 /*
2030 * If we have already read the old block contents, then
2031 * simply copy them to the new block. Note that we need
2032 * to synchronously write snapshots that have not been
2033 * unlinked, and hence will be visible after a crash,
2034 * to ensure their integrity.
2035 */
2036 mutex_exit(&si->si_lock);
2037 if (saved_data == NULL) {
2038 saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
2039 error = rwfsblk(vp, B_READ, saved_data, lbn);
2040 if (error) {
2041 free(saved_data, M_UFSMNT);
2042 saved_data = NULL;
2043 mutex_enter(&si->si_lock);
2044 break;
2045 }
2046 }
2047 error = wrsnapblk(vp, saved_data, lbn);
2048 if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
2049 error = syncsnap(vp);
2050 mutex_enter(&si->si_lock);
2051 if (error)
2052 break;
2053 if (gen != si->si_gen)
2054 goto retry;
2055 }
2056 	/*
2057 	 * All done.  Drop the snapshot lock if we took it above and free
2058 	 * the copy buffer unless it still belongs to the caller
2059 	 * (i.e. saved_data == bp->b_data).
2060 	 */
2061 if (snapshot_locked) {
2062 si->si_owner = NULL;
2063 mutex_exit(&si->si_lock);
2064 mutex_exit(&si->si_snaplock);
2065 } else
2066 mutex_exit(&si->si_lock);
2067 if (saved_data && saved_data != bp->b_data)
2068 free(saved_data, M_UFSMNT);
2069 return error;
2070 }
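/*
 * A note on the si_gen handshake used above: ffs_snapshot_mount() and
 * ffs_snapshot_unmount() increment si_gen under si_lock whenever the
 * snapshot list changes.  ffs_copyonwrite() samples the counter before
 * dropping si_lock to sleep and restarts its walk from "retry" if the
 * counter moved, since the inode it was examining may have left the
 * list in the meantime.
 */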
2071
2072 /*
2073 * Read from a snapshot.
2074 */
2075 int
2076 ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag)
2077 {
2078 struct inode *ip = VTOI(vp);
2079 struct fs *fs = ip->i_fs;
2080 struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo;
2081 struct buf *bp;
2082 daddr_t lbn, nextlbn;
2083 off_t fsbytes, bytesinfile;
2084 long size, xfersize, blkoffset;
2085 int error;
2086
2087 mutex_enter(&si->si_snaplock);
2088
2089 if (ioflag & IO_ALTSEMANTICS)
2090 fsbytes = ip->i_size;
2091 else
2092 fsbytes = ffs_lfragtosize(fs, fs->fs_size);
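	/*
	 * With IO_ALTSEMANTICS the read is bounded by the inode size,
	 * which also covers the block hint list stored past the last
	 * file system block (this is how ffs_snapshot_mount() reads the
	 * list); ordinary reads are bounded by the file system size, so
	 * the snapshot reads like an image of the file system it was
	 * taken from.
	 */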
2093 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
2094 bytesinfile = fsbytes - uio->uio_offset;
2095 if (bytesinfile <= 0)
2096 break;
2097 lbn = ffs_lblkno(fs, uio->uio_offset);
2098 nextlbn = lbn + 1;
2099 size = fs->fs_bsize;
2100 blkoffset = ffs_blkoff(fs, uio->uio_offset);
2101 xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
2102 bytesinfile);
2103
2104 if (ffs_lblktosize(fs, nextlbn + 1) >= fsbytes) {
2105 if (ffs_lblktosize(fs, lbn) + size > fsbytes)
2106 size = ffs_fragroundup(fs,
2107 fsbytes - ffs_lblktosize(fs, lbn));
2108 error = bread(vp, lbn, size, 0, &bp);
2109 } else {
2110 int nextsize = fs->fs_bsize;
2111 error = breadn(vp, lbn,
2112 size, &nextlbn, &nextsize, 1, 0, &bp);
2113 }
2114 if (error)
2115 break;
2116
2117 /*
2118 * We should only get non-zero b_resid when an I/O error
2119 * has occurred, which should cause us to break above.
2120 * However, if the short read did not cause an error,
2121 * then we want to ensure that we do not uiomove bad
2122 * or uninitialized data.
2123 */
2124 size -= bp->b_resid;
2125 if (size < blkoffset + xfersize) {
2126 xfersize = size - blkoffset;
2127 if (xfersize <= 0)
2128 break;
2129 }
2130 error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
2131 if (error)
2132 break;
2133 brelse(bp, BC_AGE);
2134 }
2135 if (bp != NULL)
2136 brelse(bp, BC_AGE);
2137
2138 mutex_exit(&si->si_snaplock);
2139 return error;
2140 }
2141
2142 /*
2143  * Look up a snapshot's data block address.
2144  * Simpler than UFS_BALLOC() as we know all metadata is already allocated,
2145  * and safe even for the pagedaemon, where we cannot bread().
2146 */
2147 static int
2148 snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res)
2149 {
2150 struct indir indirs[UFS_NIADDR + 2];
2151 struct inode *ip = VTOI(vp);
2152 struct fs *fs = ip->i_fs;
2153 struct buf *bp;
2154 int error, num;
2155
2156 KASSERT(lbn >= 0);
2157
2158 if (lbn < UFS_NDADDR) {
2159 *res = db_get(ip, lbn);
2160 return 0;
2161 }
2162 if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
2163 return error;
2164 if (curlwp == uvm.pagedaemon_lwp) {
2165 mutex_enter(&bufcache_lock);
2166 bp = incore(vp, indirs[num-1].in_lbn);
2167 if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) {
2168 *res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
2169 error = 0;
2170 } else
2171 error = ENOMEM;
2172 mutex_exit(&bufcache_lock);
2173 return error;
2174 }
2175 error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, 0, &bp);
2176 if (error == 0) {
2177 *res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
2178 brelse(bp, 0);
2179 }
2180
2181 return error;
2182 }
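/*
 * A worked example of the lookup above: for the first block past the
 * direct range (lbn == UFS_NDADDR), ufs_getlbns() returns a two-entry
 * chain whose last entry names the single indirect block and offset 0
 * within it, so one bread() of that indirect block followed by
 * idb_get() yields the data block address.
 */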
2183
2184 /*
2185  * Read or write the specified block of the file system vp resides on,
2186  * going directly to the disk and bypassing the buffer cache.
2187 */
2188 static int
2189 rwfsblk(struct vnode *vp, int flags, void *data, daddr_t lbn)
2190 {
2191 int error;
2192 struct inode *ip = VTOI(vp);
2193 struct fs *fs = ip->i_fs;
2194 struct buf *nbp;
2195
2196 nbp = getiobuf(NULL, true);
2197 nbp->b_flags = flags;
2198 nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize;
2199 nbp->b_error = 0;
2200 nbp->b_data = data;
2201 nbp->b_blkno = nbp->b_rawblkno = FFS_FSBTODB(fs, ffs_blkstofrags(fs, lbn));
2202 nbp->b_proc = NULL;
2203 nbp->b_dev = ip->i_devvp->v_rdev;
2204 SET(nbp->b_cflags, BC_BUSY); /* mark buffer busy */
2205
2206 bdev_strategy(nbp);
2207
2208 error = biowait(nbp);
2209
2210 putiobuf(nbp);
2211
2212 return error;
2213 }
2214
2215 /*
2216 * Write all dirty buffers to disk and invalidate them.
2217 */
2218 static int
2219 syncsnap(struct vnode *vp)
2220 {
2221 int error;
2222 buf_t *bp;
2223 struct fs *fs = VTOI(vp)->i_fs;
2224
2225 mutex_enter(&bufcache_lock);
2226 while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) {
2227 error = bbusy(bp, false, 0, NULL);
2228 if (error == EPASSTHROUGH)
2229 continue;
2230 else if (error != 0) {
2231 mutex_exit(&bufcache_lock);
2232 return error;
2233 }
2234 KASSERT(bp->b_bcount == fs->fs_bsize);
2235 mutex_exit(&bufcache_lock);
2236 error = rwfsblk(vp, B_WRITE, bp->b_data,
2237 ffs_fragstoblks(fs, FFS_DBTOFSB(fs, bp->b_blkno)));
2238 brelse(bp, BC_INVAL | BC_VFLUSH);
2239 if (error)
2240 return error;
2241 mutex_enter(&bufcache_lock);
2242 }
2243 mutex_exit(&bufcache_lock);
2244
2245 return 0;
2246 }
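/*
 * bbusy() returns EPASSTHROUGH when it had to sleep and the buffer may
 * have changed identity in the meantime, so the loop above simply
 * refetches the head of v_dirtyblkhd and tries again.
 */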
2247
2248 /*
2249 * Write the specified block to a snapshot.
2250 */
2251 static int
2252 wrsnapblk(struct vnode *vp, void *data, daddr_t lbn)
2253 {
2254 struct inode *ip = VTOI(vp);
2255 struct fs *fs = ip->i_fs;
2256 struct buf *bp;
2257 int error;
2258
2259 error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn), fs->fs_bsize,
2260 FSCRED, (ip->i_nlink > 0 ? B_SYNC : 0), &bp);
2261 if (error)
2262 return error;
2263 memcpy(bp->b_data, data, fs->fs_bsize);
2264 if (ip->i_nlink > 0)
2265 error = bwrite(bp);
2266 else
2267 bawrite(bp);
2268
2269 return error;
2270 }
2271
2272 /*
2273 * Check if this inode is present on the active snapshot list.
2274 * Must be called with snapinfo locked.
2275 */
2276 static inline bool
2277 is_active_snapshot(struct snap_info *si, struct inode *ip)
2278 {
2279 struct inode *xp;
2280
2281 KASSERT(mutex_owned(&si->si_lock));
2282
2283 TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
2284 if (xp == ip)
2285 return true;
2286 return false;
2287 }
2288
2289 /*
2290  * Get/put block addresses from an inode or from a buffer containing disk
2291  * addresses, taking care of the fs type (UFS1/UFS2) and byte swapping.
2292  * These functions should go into a global include.
2293 */
2294 static inline daddr_t
2295 db_get(struct inode *ip, int loc)
2296 {
2297 if (ip->i_ump->um_fstype == UFS1)
2298 return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip));
2299 else
2300 return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip));
2301 }
2302
2303 static inline void
2304 db_assign(struct inode *ip, int loc, daddr_t val)
2305 {
2306 if (ip->i_ump->um_fstype == UFS1)
2307 ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2308 else
2309 ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2310 }
2311
2312 __unused static inline daddr_t
2313 ib_get(struct inode *ip, int loc)
2314 {
2315 if (ip->i_ump->um_fstype == UFS1)
2316 return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip));
2317 else
2318 return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip));
2319 }
2320
2321 static inline daddr_t
2322 idb_get(struct inode *ip, void *bf, int loc)
2323 {
2324 if (ip->i_ump->um_fstype == UFS1)
2325 return ufs_rw32(((int32_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
2326 else
2327 return ufs_rw64(((int64_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
2328 }
2329
2330 static inline void
2331 idb_assign(struct inode *ip, void *bf, int loc, daddr_t val)
2332 {
2333 if (ip->i_ump->um_fstype == UFS1)
2334 ((int32_t *)(bf))[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2335 else
2336 ((int64_t *)(bf))[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2337 }
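/*
 * Example use of the accessors above, assuming "src" and "dst" are
 * buffers holding indirect blocks of the same inode: copying one slot
 *
 *	idb_assign(ip, dst, off, idb_get(ip, src, off));
 *
 * preserves both the on-disk width and the byte order without the
 * caller having to know either.
 */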
2338