ffs_snapshot.c revision 1.109 1 /* $NetBSD: ffs_snapshot.c,v 1.109 2011/02/23 17:05:33 dyoung Exp $ */
2
3 /*
4 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
5 *
6 * Further information about snapshots can be obtained from:
7 *
8 * Marshall Kirk McKusick http://www.mckusick.com/softdep/
9 * 1614 Oxford Street mckusick (at) mckusick.com
10 * Berkeley, CA 94709-1608 +1-510-843-9542
11 * USA
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 *
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 *
23 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
24 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
25 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
26 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
27 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00
36 *
37 * from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp
38 */
39
40 #include <sys/cdefs.h>
41 __KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.109 2011/02/23 17:05:33 dyoung Exp $");
42
43 #if defined(_KERNEL_OPT)
44 #include "opt_ffs.h"
45 #endif
46
47 #include <sys/param.h>
48 #include <sys/kernel.h>
49 #include <sys/systm.h>
50 #include <sys/conf.h>
51 #include <sys/buf.h>
52 #include <sys/proc.h>
53 #include <sys/namei.h>
54 #include <sys/sched.h>
55 #include <sys/stat.h>
56 #include <sys/malloc.h>
57 #include <sys/mount.h>
58 #include <sys/resource.h>
59 #include <sys/resourcevar.h>
60 #include <sys/vnode.h>
61 #include <sys/kauth.h>
62 #include <sys/fstrans.h>
63 #include <sys/wapbl.h>
64
65 #include <miscfs/specfs/specdev.h>
66
67 #include <ufs/ufs/quota.h>
68 #include <ufs/ufs/ufsmount.h>
69 #include <ufs/ufs/inode.h>
70 #include <ufs/ufs/ufs_extern.h>
71 #include <ufs/ufs/ufs_bswap.h>
72 #include <ufs/ufs/ufs_wapbl.h>
73
74 #include <ufs/ffs/fs.h>
75 #include <ufs/ffs/ffs_extern.h>
76
77 #include <uvm/uvm.h>
78
/*
 * Per-mount snapshot state, reached through ump->um_snapinfo.
 * Allocated by ffs_snapshot_init() and released by ffs_snapshot_fini().
 */
struct snap_info {
	kmutex_t si_lock;		/* Lock this snapinfo */
	kmutex_t si_snaplock;		/* Snapshot vnode common lock */
	lwp_t *si_owner;		/* Snaplock owner */
	/* New snapshots are appended, so the newest one is at the tail. */
	TAILQ_HEAD(inodelst, inode) si_snapshots; /* List of active snapshots */
	daddr_t *si_snapblklist;	/* Snapshot block hints list */
	uint32_t si_gen;		/* Incremented on change */
};
87
#if !defined(FFS_NO_SNAPSHOT)
/*
 * Accounting callback type; fullacct(), snapacct() and mapacct() below
 * have this signature and are handed to expunge() / indiracct().
 */
typedef int (*acctfunc_t)
    (struct vnode *, void *, int, int, struct fs *, daddr_t, int);

/* Stages of snapshot creation, called in sequence from ffs_snapshot(). */
static int snapshot_setup(struct mount *, struct vnode *);
static int snapshot_copyfs(struct mount *, struct vnode *, void **);
static int snapshot_expunge(struct mount *, struct vnode *,
    struct fs *, daddr_t *, daddr_t **);
static int snapshot_expunge_snap(struct mount *, struct vnode *,
    struct fs *, daddr_t);
static int snapshot_writefs(struct mount *, struct vnode *, void *);
/* Cylinder group map copying. */
static int cgaccount(struct vnode *, int, int *);
static int cgaccount1(int, struct vnode *, void *, int);
/* Inode expunging and the accounting callbacks it can be given. */
static int expunge(struct vnode *, struct inode *, struct fs *,
    acctfunc_t, int);
static int indiracct(struct vnode *, struct vnode *, int, daddr_t,
    daddr_t, daddr_t, daddr_t, daddr_t, struct fs *, acctfunc_t, int);
static int fullacct(struct vnode *, void *, int, int, struct fs *,
    daddr_t, int);
static int snapacct(struct vnode *, void *, int, int, struct fs *,
    daddr_t, int);
static int mapacct(struct vnode *, void *, int, int, struct fs *,
    daddr_t, int);
#endif /* !defined(FFS_NO_SNAPSHOT) */

/* Helpers declared regardless of FFS_NO_SNAPSHOT. */
static int ffs_copyonwrite(void *, struct buf *, bool);
static int snapblkaddr(struct vnode *, daddr_t, daddr_t *);
static int rwfsblk(struct vnode *, int, void *, daddr_t);
static int syncsnap(struct vnode *);
static int wrsnapblk(struct vnode *, void *, daddr_t);
#if !defined(FFS_NO_SNAPSHOT)
static int blocks_in_journal(struct fs *);
#endif

/* Accessors for list membership and for direct/indirect block pointers. */
static inline bool is_active_snapshot(struct snap_info *, struct inode *);
static inline daddr_t db_get(struct inode *, int);
static inline void db_assign(struct inode *, int, daddr_t);
static inline daddr_t ib_get(struct inode *, int);
static inline void ib_assign(struct inode *, int, daddr_t);
static inline daddr_t idb_get(struct inode *, void *, int);
static inline void idb_assign(struct inode *, void *, int, daddr_t);

#ifdef DEBUG
/* When non-zero, snapshot_expunge() prints each busy vnode it visits. */
static int snapdebug = 0;
#endif
133
134 int
135 ffs_snapshot_init(struct ufsmount *ump)
136 {
137 struct snap_info *si;
138
139 si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP);
140 if (si == NULL)
141 return ENOMEM;
142
143 TAILQ_INIT(&si->si_snapshots);
144 mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE);
145 mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE);
146 si->si_owner = NULL;
147 si->si_gen = 0;
148 si->si_snapblklist = NULL;
149
150 return 0;
151 }
152
153 void
154 ffs_snapshot_fini(struct ufsmount *ump)
155 {
156 struct snap_info *si;
157
158 si = ump->um_snapinfo;
159 ump->um_snapinfo = NULL;
160
161 KASSERT(TAILQ_EMPTY(&si->si_snapshots));
162 mutex_destroy(&si->si_lock);
163 mutex_destroy(&si->si_snaplock);
164 KASSERT(si->si_snapblklist == NULL);
165 kmem_free(si, sizeof(*si));
166 }
167
168 /*
169 * Create a snapshot file and initialize it for the filesystem.
170 * Vnode is locked on entry and return.
171 */
172 int
173 ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime)
174 {
175 #if defined(FFS_NO_SNAPSHOT)
176 return EOPNOTSUPP;
177 }
178 #else /* defined(FFS_NO_SNAPSHOT) */
179 bool suspended = false;
180 int error, redo = 0, snaploc;
181 void *sbbuf = NULL;
182 daddr_t *snaplist = NULL, snaplistsize = 0;
183 struct buf *bp, *nbp;
184 struct fs *copy_fs = NULL;
185 struct fs *fs = VFSTOUFS(mp)->um_fs;
186 struct inode *ip = VTOI(vp);
187 struct lwp *l = curlwp;
188 struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
189 struct timespec ts;
190 struct timeval starttime;
191 #ifdef DEBUG
192 struct timeval endtime;
193 #endif
194 struct vnode *devvp = ip->i_devvp;
195
196 /*
197 * If the vnode already is a snapshot, return.
198 */
199 if (VTOI(vp)->i_flags & SF_SNAPSHOT) {
200 if (ctime) {
201 ctime->tv_sec = DIP(VTOI(vp), mtime);
202 ctime->tv_nsec = DIP(VTOI(vp), mtimensec);
203 }
204 return 0;
205 }
206 /*
207 * Check for free snapshot slot in the superblock.
208 */
209 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
210 if (fs->fs_snapinum[snaploc] == 0)
211 break;
212 if (snaploc == FSMAXSNAP)
213 return (ENOSPC);
214 /*
215 * Prepare the vnode to become a snapshot.
216 */
217 error = snapshot_setup(mp, vp);
218 if (error)
219 goto out;
220 /*
221 * Change inode to snapshot type file.
222 */
223 ip->i_flags |= SF_SNAPSHOT;
224 DIP_ASSIGN(ip, flags, ip->i_flags);
225 ip->i_flag |= IN_CHANGE | IN_UPDATE;
226 /*
227 * Copy all the cylinder group maps. Although the
228 * filesystem is still active, we hope that only a few
229 * cylinder groups will change between now and when we
230 * suspend operations. Thus, we will be able to quickly
231 * touch up the few cylinder groups that changed during
232 * the suspension period.
233 */
234 error = cgaccount(vp, 1, NULL);
235 if (error)
236 goto out;
237 /*
238 * Ensure that the snapshot is completely on disk.
239 * Since we have marked it as a snapshot it is safe to
240 * unlock it as no process will be allowed to write to it.
241 */
242 error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
243 if (error)
244 goto out;
245 VOP_UNLOCK(vp);
246 /*
247 * All allocations are done, so we can now suspend the filesystem.
248 */
249 error = vfs_suspend(vp->v_mount, 0);
250 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
251 if (error)
252 goto out;
253 suspended = true;
254 getmicrotime(&starttime);
255 /*
256 * First, copy all the cylinder group maps that have changed.
257 */
258 error = cgaccount(vp, 2, &redo);
259 if (error)
260 goto out;
261 /*
262 * Create a copy of the superblock and its summary information.
263 */
264 error = snapshot_copyfs(mp, vp, &sbbuf);
265 copy_fs = (struct fs *)((char *)sbbuf + blkoff(fs, fs->fs_sblockloc));
266 if (error)
267 goto out;
268 /*
269 * Expunge unlinked files from our view.
270 */
271 error = snapshot_expunge(mp, vp, copy_fs, &snaplistsize, &snaplist);
272 if (error)
273 goto out;
274 /*
275 * Record snapshot inode. Since this is the newest snapshot,
276 * it must be placed at the end of the list.
277 */
278 fs->fs_snapinum[snaploc] = ip->i_number;
279
280 mutex_enter(&si->si_lock);
281 if (is_active_snapshot(si, ip))
282 panic("ffs_snapshot: %"PRIu64" already on list", ip->i_number);
283 TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
284 if (TAILQ_FIRST(&si->si_snapshots) == ip) {
285 /*
286 * If this is the first snapshot on this filesystem, put the
287 * preliminary list in place and establish the cow handler.
288 */
289 si->si_snapblklist = snaplist;
290 fscow_establish(mp, ffs_copyonwrite, devvp);
291 }
292 si->si_gen++;
293 mutex_exit(&si->si_lock);
294
295 vp->v_vflag |= VV_SYSTEM;
296 /*
297 * Set the mtime to the time the snapshot has been taken.
298 */
299 TIMEVAL_TO_TIMESPEC(&starttime, &ts);
300 if (ctime)
301 *ctime = ts;
302 DIP_ASSIGN(ip, mtime, ts.tv_sec);
303 DIP_ASSIGN(ip, mtimensec, ts.tv_nsec);
304 ip->i_flag |= IN_CHANGE | IN_UPDATE;
305 /*
306 * Copy allocation information from all snapshots and then
307 * expunge them from our view.
308 */
309 error = snapshot_expunge_snap(mp, vp, copy_fs, snaplistsize);
310 if (error)
311 goto out;
312 /*
313 * Write the superblock and its summary information to the snapshot.
314 */
315 error = snapshot_writefs(mp, vp, sbbuf);
316 if (error)
317 goto out;
318 /*
319 * We're nearly done, ensure that the snapshot is completely on disk.
320 */
321 error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
322 if (error)
323 goto out;
324 /*
325 * Invalidate and free all pages on the snapshot vnode.
326 * We will read and write through the buffercache.
327 */
328 mutex_enter(&vp->v_interlock);
329 error = VOP_PUTPAGES(vp, 0, 0,
330 PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO | PGO_FREE);
331 if (error)
332 goto out;
333 /*
334 * Invalidate short ( < fs_bsize ) buffers. We will always read
335 * full size buffers later.
336 */
337 mutex_enter(&bufcache_lock);
338 KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
339 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
340 nbp = LIST_NEXT(bp, b_vnbufs);
341 KASSERT((bp->b_cflags & BC_BUSY) == 0);
342 if (bp->b_bcount < fs->fs_bsize) {
343 bp->b_cflags |= BC_BUSY;
344 brelsel(bp, BC_INVAL | BC_VFLUSH);
345 }
346 }
347 mutex_exit(&bufcache_lock);
348
349 out:
350 if (sbbuf != NULL) {
351 free(copy_fs->fs_csp, M_UFSMNT);
352 free(sbbuf, M_UFSMNT);
353 }
354 if (fs->fs_active != NULL) {
355 free(fs->fs_active, M_DEVBUF);
356 fs->fs_active = NULL;
357 }
358
359 mutex_enter(&si->si_lock);
360 if (snaplist != NULL) {
361 if (si->si_snapblklist == snaplist)
362 si->si_snapblklist = NULL;
363 free(snaplist, M_UFSMNT);
364 }
365 if (error) {
366 fs->fs_snapinum[snaploc] = 0;
367 } else {
368 /*
369 * As this is the newest list, it is the most inclusive, so
370 * should replace the previous list.
371 */
372 si->si_snapblklist = ip->i_snapblklist;
373 }
374 si->si_gen++;
375 mutex_exit(&si->si_lock);
376
377 if (suspended) {
378 vfs_resume(vp->v_mount);
379 #ifdef DEBUG
380 getmicrotime(&endtime);
381 timersub(&endtime, &starttime, &endtime);
382 printf("%s: suspended %lld.%03d sec, redo %d of %d\n",
383 mp->mnt_stat.f_mntonname, (long long)endtime.tv_sec,
384 endtime.tv_usec / 1000, redo, fs->fs_ncg);
385 #endif
386 }
387 if (error) {
388 if (!UFS_WAPBL_BEGIN(mp)) {
389 (void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
390 UFS_WAPBL_END(mp);
391 }
392 } else
393 vref(vp);
394 return (error);
395 }
396
397 /*
398 * Prepare vnode to become a snapshot.
399 */
400 static int
401 snapshot_setup(struct mount *mp, struct vnode *vp)
402 {
403 int error, n, len, loc;
404 daddr_t blkno, numblks;
405 struct buf *ibp, *nbp;
406 struct fs *fs = VFSTOUFS(mp)->um_fs;
407 struct lwp *l = curlwp;
408 const int wbreak = blocks_in_journal(fs)/8;
409
410 /*
411 * Check mount, exclusive reference and owner.
412 */
413 if (vp->v_mount != mp)
414 return EXDEV;
415 if (vp->v_usecount != 1 || vp->v_writecount != 0)
416 return EBUSY;
417 if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
418 NULL) != 0 &&
419 VTOI(vp)->i_uid != kauth_cred_geteuid(l->l_cred))
420 return EACCES;
421
422 if (vp->v_size != 0) {
423 error = ffs_truncate(vp, 0, 0, NOCRED);
424 if (error)
425 return error;
426 }
427 /*
428 * Write an empty list of preallocated blocks to the end of
429 * the snapshot to set size to at least that of the filesystem.
430 */
431 numblks = howmany(fs->fs_size, fs->fs_frag);
432 blkno = 1;
433 blkno = ufs_rw64(blkno, UFS_FSNEEDSWAP(fs));
434 error = vn_rdwr(UIO_WRITE, vp,
435 (void *)&blkno, sizeof(blkno), lblktosize(fs, (off_t)numblks),
436 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL);
437 if (error)
438 return error;
439 /*
440 * Preallocate critical data structures so that we can copy
441 * them in without further allocation after we suspend all
442 * operations on the filesystem. We would like to just release
443 * the allocated buffers without writing them since they will
444 * be filled in below once we are ready to go, but this upsets
445 * the soft update code, so we go ahead and write the new buffers.
446 *
447 * Allocate all indirect blocks and mark all of them as not
448 * needing to be copied.
449 */
450 error = UFS_WAPBL_BEGIN(mp);
451 if (error)
452 return error;
453 for (blkno = NDADDR, n = 0; blkno < numblks; blkno += NINDIR(fs)) {
454 error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno),
455 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
456 if (error)
457 goto out;
458 brelse(ibp, 0);
459 if (wbreak > 0 && (++n % wbreak) == 0) {
460 UFS_WAPBL_END(mp);
461 error = UFS_WAPBL_BEGIN(mp);
462 if (error)
463 return error;
464 }
465 }
466 /*
467 * Allocate copies for the superblock and its summary information.
468 */
469 error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, l->l_cred,
470 0, &nbp);
471 if (error)
472 goto out;
473 bawrite(nbp);
474 blkno = fragstoblks(fs, fs->fs_csaddr);
475 len = howmany(fs->fs_cssize, fs->fs_bsize);
476 for (loc = 0; loc < len; loc++) {
477 error = ffs_balloc(vp, lblktosize(fs, (off_t)(blkno + loc)),
478 fs->fs_bsize, l->l_cred, 0, &nbp);
479 if (error)
480 goto out;
481 bawrite(nbp);
482 }
483
484 out:
485 UFS_WAPBL_END(mp);
486 return error;
487 }
488
489 /*
490 * Create a copy of the superblock and its summary information.
491 * It is up to the caller to free copyfs and copy_fs->fs_csp.
492 */
493 static int
494 snapshot_copyfs(struct mount *mp, struct vnode *vp, void **sbbuf)
495 {
496 int error, i, len, loc, size;
497 void *space;
498 int32_t *lp;
499 struct buf *bp;
500 struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
501 struct lwp *l = curlwp;
502 struct vnode *devvp = VTOI(vp)->i_devvp;
503
504 /*
505 * Grab a copy of the superblock and its summary information.
506 * We delay writing it until the suspension is released below.
507 */
508 *sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
509 loc = blkoff(fs, fs->fs_sblockloc);
510 if (loc > 0)
511 memset(*sbbuf, 0, loc);
512 copyfs = (struct fs *)((char *)(*sbbuf) + loc);
513 memcpy(copyfs, fs, fs->fs_sbsize);
514 size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
515 if (fs->fs_sbsize < size)
516 memset((char *)(*sbbuf) + loc + fs->fs_sbsize, 0,
517 size - fs->fs_sbsize);
518 size = blkroundup(fs, fs->fs_cssize);
519 if (fs->fs_contigsumsize > 0)
520 size += fs->fs_ncg * sizeof(int32_t);
521 space = malloc(size, M_UFSMNT, M_WAITOK);
522 copyfs->fs_csp = space;
523 memcpy(copyfs->fs_csp, fs->fs_csp, fs->fs_cssize);
524 space = (char *)space + fs->fs_cssize;
525 loc = howmany(fs->fs_cssize, fs->fs_fsize);
526 i = fs->fs_frag - loc % fs->fs_frag;
527 len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
528 if (len > 0) {
529 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
530 len, l->l_cred, 0, &bp)) != 0) {
531 brelse(bp, 0);
532 free(copyfs->fs_csp, M_UFSMNT);
533 free(*sbbuf, M_UFSMNT);
534 *sbbuf = NULL;
535 return error;
536 }
537 memcpy(space, bp->b_data, (u_int)len);
538 space = (char *)space + len;
539 brelse(bp, BC_INVAL | BC_NOCACHE);
540 }
541 if (fs->fs_contigsumsize > 0) {
542 copyfs->fs_maxcluster = lp = space;
543 for (i = 0; i < fs->fs_ncg; i++)
544 *lp++ = fs->fs_contigsumsize;
545 }
546 if (mp->mnt_wapbl)
547 copyfs->fs_flags &= ~FS_DOWAPBL;
548 return 0;
549 }
550
551 /*
552 * We must check for active files that have been unlinked (e.g., with a zero
553 * link count). We have to expunge all trace of these files from the snapshot
554 * so that they are not reclaimed prematurely by fsck or unnecessarily dumped.
555 * Note that we skip unlinked snapshot files as they will be handled separately.
556 * Calculate the snapshot list size and create a preliminary list.
557 */
558 static int
559 snapshot_expunge(struct mount *mp, struct vnode *vp, struct fs *copy_fs,
560 daddr_t *snaplistsize, daddr_t **snaplist)
561 {
562 int cg, error = 0, len, loc;
563 daddr_t blkno, *blkp;
564 struct fs *fs = VFSTOUFS(mp)->um_fs;
565 struct inode *xp;
566 struct lwp *l = curlwp;
567 struct vattr vat;
568 struct vnode *logvp = NULL, *mvp = NULL, *xvp;
569
570 *snaplist = NULL;
571 /*
572 * Get the log inode if any.
573 */
574 if ((fs->fs_flags & FS_DOWAPBL) &&
575 fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
576 error = VFS_VGET(mp,
577 fs->fs_journallocs[UFS_WAPBL_INFS_INO], &logvp);
578 if (error)
579 goto out;
580 }
581 /*
582 * Allocate a marker vnode.
583 */
584 if ((mvp = vnalloc(mp)) == NULL) {
585 error = ENOMEM;
586 goto out;
587 }
588 /*
589 * We also calculate the needed size for the snapshot list.
590 */
591 *snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
592 FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
593 mutex_enter(&mntvnode_lock);
594 /*
595 * NOTE: not using the TAILQ_FOREACH here since in this loop vgone()
596 * and vclean() can be called indirectly
597 */
598 for (xvp = TAILQ_FIRST(&mp->mnt_vnodelist); xvp; xvp = vunmark(mvp)) {
599 vmark(mvp, xvp);
600 /*
601 * Make sure this vnode wasn't reclaimed in getnewvnode().
602 * Start over if it has (it won't be on the list anymore).
603 */
604 if (xvp->v_mount != mp || vismarker(xvp))
605 continue;
606 mutex_enter(&xvp->v_interlock);
607 if ((xvp->v_iflag & VI_XLOCK) ||
608 xvp->v_usecount == 0 || xvp->v_type == VNON ||
609 VTOI(xvp) == NULL ||
610 (VTOI(xvp)->i_flags & SF_SNAPSHOT)) {
611 mutex_exit(&xvp->v_interlock);
612 continue;
613 }
614 mutex_exit(&mntvnode_lock);
615 /*
616 * XXXAD should increase vnode ref count to prevent it
617 * disappearing or being recycled.
618 */
619 mutex_exit(&xvp->v_interlock);
620 #ifdef DEBUG
621 if (snapdebug)
622 vprint("ffs_snapshot: busy vnode", xvp);
623 #endif
624 xp = VTOI(xvp);
625 if (xvp != logvp) {
626 if (VOP_GETATTR(xvp, &vat, l->l_cred) == 0 &&
627 vat.va_nlink > 0) {
628 mutex_enter(&mntvnode_lock);
629 continue;
630 }
631 if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
632 mutex_enter(&mntvnode_lock);
633 continue;
634 }
635 }
636 /*
637 * If there is a fragment, clear it here.
638 */
639 blkno = 0;
640 loc = howmany(xp->i_size, fs->fs_bsize) - 1;
641 if (loc < NDADDR) {
642 len = fragroundup(fs, blkoff(fs, xp->i_size));
643 if (len > 0 && len < fs->fs_bsize) {
644 error = UFS_WAPBL_BEGIN(mp);
645 if (error) {
646 (void)vunmark(mvp);
647 goto out;
648 }
649 ffs_blkfree_snap(copy_fs, vp, db_get(xp, loc),
650 len, xp->i_number);
651 blkno = db_get(xp, loc);
652 db_assign(xp, loc, 0);
653 UFS_WAPBL_END(mp);
654 }
655 }
656 *snaplistsize += 1;
657 error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY);
658 if (blkno)
659 db_assign(xp, loc, blkno);
660 if (!error) {
661 error = UFS_WAPBL_BEGIN(mp);
662 if (!error) {
663 error = ffs_freefile_snap(copy_fs, vp,
664 xp->i_number, xp->i_mode);
665 UFS_WAPBL_END(mp);
666 }
667 }
668 if (error) {
669 (void)vunmark(mvp);
670 goto out;
671 }
672 mutex_enter(&mntvnode_lock);
673 }
674 mutex_exit(&mntvnode_lock);
675 /*
676 * Create a preliminary list of preallocated snapshot blocks.
677 */
678 *snaplist = malloc(*snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
679 blkp = &(*snaplist)[1];
680 *blkp++ = lblkno(fs, fs->fs_sblockloc);
681 blkno = fragstoblks(fs, fs->fs_csaddr);
682 for (cg = 0; cg < fs->fs_ncg; cg++) {
683 if (fragstoblks(fs, cgtod(fs, cg)) > blkno)
684 break;
685 *blkp++ = fragstoblks(fs, cgtod(fs, cg));
686 }
687 len = howmany(fs->fs_cssize, fs->fs_bsize);
688 for (loc = 0; loc < len; loc++)
689 *blkp++ = blkno + loc;
690 for (; cg < fs->fs_ncg; cg++)
691 *blkp++ = fragstoblks(fs, cgtod(fs, cg));
692 (*snaplist)[0] = blkp - &(*snaplist)[0];
693
694 out:
695 if (mvp != NULL)
696 vnfree(mvp);
697 if (logvp != NULL)
698 vput(logvp);
699 if (error && *snaplist != NULL) {
700 free(*snaplist, M_UFSMNT);
701 *snaplist = NULL;
702 }
703
704 return error;
705 }
706
707 /*
708 * Copy allocation information from all the snapshots in this snapshot and
709 * then expunge them from its view. Also, collect the list of allocated
710 * blocks in i_snapblklist.
711 */
712 static int
713 snapshot_expunge_snap(struct mount *mp, struct vnode *vp,
714 struct fs *copy_fs, daddr_t snaplistsize)
715 {
716 int error = 0, i;
717 daddr_t numblks, *snaplist = NULL;
718 struct fs *fs = VFSTOUFS(mp)->um_fs;
719 struct inode *ip = VTOI(vp), *xp;
720 struct lwp *l = curlwp;
721 struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
722
723 TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) {
724 if (xp == ip)
725 break;
726 error = expunge(vp, xp, fs, snapacct, BLK_SNAP);
727 if (error)
728 break;
729 if (xp->i_nlink != 0)
730 continue;
731 error = UFS_WAPBL_BEGIN(mp);
732 if (error)
733 break;
734 error = ffs_freefile_snap(copy_fs, vp, xp->i_number, xp->i_mode);
735 UFS_WAPBL_END(mp);
736 if (error)
737 break;
738 }
739 if (error)
740 goto out;
741 /*
742 * Allocate space for the full list of preallocated snapshot blocks.
743 */
744 snaplist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
745 ip->i_snapblklist = &snaplist[1];
746 /*
747 * Expunge the blocks used by the snapshots from the set of
748 * blocks marked as used in the snapshot bitmaps. Also, collect
749 * the list of allocated blocks in i_snapblklist.
750 */
751 error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP);
752 if (error)
753 goto out;
754 if (snaplistsize < ip->i_snapblklist - snaplist)
755 panic("ffs_snapshot: list too small");
756 snaplistsize = ip->i_snapblklist - snaplist;
757 snaplist[0] = snaplistsize;
758 ip->i_snapblklist = &snaplist[0];
759 /*
760 * Write out the list of allocated blocks to the end of the snapshot.
761 */
762 numblks = howmany(fs->fs_size, fs->fs_frag);
763 for (i = 0; i < snaplistsize; i++)
764 snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
765 error = vn_rdwr(UIO_WRITE, vp, (void *)snaplist,
766 snaplistsize * sizeof(daddr_t), lblktosize(fs, (off_t)numblks),
767 UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT, l->l_cred, NULL, NULL);
768 for (i = 0; i < snaplistsize; i++)
769 snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
770 out:
771 if (error && snaplist != NULL) {
772 free(snaplist, M_UFSMNT);
773 ip->i_snapblklist = NULL;
774 }
775 return error;
776 }
777
778 /*
779 * Write the superblock and its summary information to the snapshot.
780 * Make sure, the first NDADDR blocks get copied to the snapshot.
781 */
782 static int
783 snapshot_writefs(struct mount *mp, struct vnode *vp, void *sbbuf)
784 {
785 int error, len, loc;
786 void *space;
787 daddr_t blkno;
788 struct buf *bp;
789 struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
790 struct inode *ip = VTOI(vp);
791 struct lwp *l = curlwp;
792
793 copyfs = (struct fs *)((char *)sbbuf + blkoff(fs, fs->fs_sblockloc));
794
795 /*
796 * Write the superblock and its summary information
797 * to the snapshot.
798 */
799 blkno = fragstoblks(fs, fs->fs_csaddr);
800 len = howmany(fs->fs_cssize, fs->fs_bsize);
801 space = copyfs->fs_csp;
802 #ifdef FFS_EI
803 if (UFS_FSNEEDSWAP(fs)) {
804 ffs_sb_swap(copyfs, copyfs);
805 ffs_csum_swap(space, space, fs->fs_cssize);
806 }
807 #endif
808 error = UFS_WAPBL_BEGIN(mp);
809 if (error)
810 return error;
811 for (loc = 0; loc < len; loc++) {
812 error = bread(vp, blkno + loc, fs->fs_bsize, l->l_cred,
813 B_MODIFY, &bp);
814 if (error) {
815 brelse(bp, 0);
816 break;
817 }
818 memcpy(bp->b_data, space, fs->fs_bsize);
819 space = (char *)space + fs->fs_bsize;
820 bawrite(bp);
821 }
822 if (error)
823 goto out;
824 error = bread(vp, lblkno(fs, fs->fs_sblockloc),
825 fs->fs_bsize, l->l_cred, B_MODIFY, &bp);
826 if (error) {
827 brelse(bp, 0);
828 goto out;
829 } else {
830 memcpy(bp->b_data, sbbuf, fs->fs_bsize);
831 bawrite(bp);
832 }
833 /*
834 * Copy the first NDADDR blocks to the snapshot so ffs_copyonwrite()
835 * and ffs_snapblkfree() will always work on indirect blocks.
836 */
837 for (loc = 0; loc < NDADDR; loc++) {
838 if (db_get(ip, loc) != 0)
839 continue;
840 error = ffs_balloc(vp, lblktosize(fs, (off_t)loc),
841 fs->fs_bsize, l->l_cred, 0, &bp);
842 if (error)
843 break;
844 error = rwfsblk(vp, B_READ, bp->b_data, loc);
845 if (error) {
846 brelse(bp, 0);
847 break;
848 }
849 bawrite(bp);
850 }
851
852 out:
853 UFS_WAPBL_END(mp);
854 return error;
855 }
856
857 /*
858 * Copy all cylinder group maps.
859 */
860 static int
861 cgaccount(struct vnode *vp, int passno, int *redo)
862 {
863 int cg, error = 0;
864 struct buf *nbp;
865 struct fs *fs = VTOI(vp)->i_fs;
866
867 if (redo != NULL)
868 *redo = 0;
869 if (passno == 1)
870 fs->fs_active = malloc(howmany(fs->fs_ncg, NBBY),
871 M_DEVBUF, M_WAITOK | M_ZERO);
872 for (cg = 0; cg < fs->fs_ncg; cg++) {
873 if (passno == 2 && ACTIVECG_ISSET(fs, cg))
874 continue;
875
876 if (redo != NULL)
877 *redo += 1;
878 error = UFS_WAPBL_BEGIN(vp->v_mount);
879 if (error)
880 return error;
881 error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)),
882 fs->fs_bsize, curlwp->l_cred, 0, &nbp);
883 if (error) {
884 UFS_WAPBL_END(vp->v_mount);
885 break;
886 }
887 error = cgaccount1(cg, vp, nbp->b_data, passno);
888 bawrite(nbp);
889 UFS_WAPBL_END(vp->v_mount);
890 if (error)
891 break;
892 }
893 return error;
894 }
895
896 /*
897 * Copy a cylinder group map. All the unallocated blocks are marked
898 * BLK_NOCOPY so that the snapshot knows that it need not copy them
899 * if they are later written. If passno is one, then this is a first
900 * pass, so only setting needs to be done. If passno is 2, then this
901 * is a revision to a previous pass which must be undone as the
902 * replacement pass is done.
903 */
904 static int
905 cgaccount1(int cg, struct vnode *vp, void *data, int passno)
906 {
907 struct buf *bp, *ibp;
908 struct inode *ip;
909 struct cg *cgp;
910 struct fs *fs;
911 struct lwp *l = curlwp;
912 daddr_t base, numblks;
913 int error, len, loc, ns, indiroff;
914
915 ip = VTOI(vp);
916 fs = ip->i_fs;
917 ns = UFS_FSNEEDSWAP(fs);
918 error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
919 (int)fs->fs_cgsize, l->l_cred, 0, &bp);
920 if (error) {
921 brelse(bp, 0);
922 return (error);
923 }
924 cgp = (struct cg *)bp->b_data;
925 if (!cg_chkmagic(cgp, ns)) {
926 brelse(bp, 0);
927 return (EIO);
928 }
929 ACTIVECG_SET(fs, cg);
930
931 memcpy(data, bp->b_data, fs->fs_cgsize);
932 brelse(bp, 0);
933 if (fs->fs_cgsize < fs->fs_bsize)
934 memset((char *)data + fs->fs_cgsize, 0,
935 fs->fs_bsize - fs->fs_cgsize);
936 numblks = howmany(fs->fs_size, fs->fs_frag);
937 len = howmany(fs->fs_fpg, fs->fs_frag);
938 base = cg * fs->fs_fpg / fs->fs_frag;
939 if (base + len >= numblks)
940 len = numblks - base - 1;
941 loc = 0;
942 if (base < NDADDR) {
943 for ( ; loc < NDADDR; loc++) {
944 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
945 db_assign(ip, loc, BLK_NOCOPY);
946 else if (db_get(ip, loc) == BLK_NOCOPY) {
947 if (passno == 2)
948 db_assign(ip, loc, 0);
949 else if (passno == 1)
950 panic("ffs_snapshot: lost direct block");
951 }
952 }
953 }
954 if ((error = ffs_balloc(vp, lblktosize(fs, (off_t)(base + loc)),
955 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
956 return (error);
957 indiroff = (base + loc - NDADDR) % NINDIR(fs);
958 for ( ; loc < len; loc++, indiroff++) {
959 if (indiroff >= NINDIR(fs)) {
960 bawrite(ibp);
961 if ((error = ffs_balloc(vp,
962 lblktosize(fs, (off_t)(base + loc)),
963 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
964 return (error);
965 indiroff = 0;
966 }
967 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
968 idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY);
969 else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) {
970 if (passno == 2)
971 idb_assign(ip, ibp->b_data, indiroff, 0);
972 else if (passno == 1)
973 panic("ffs_snapshot: lost indirect block");
974 }
975 }
976 bdwrite(ibp);
977 return (0);
978 }
979
980 /*
981 * Before expunging a snapshot inode, note all the
982 * blocks that it claims with BLK_SNAP so that fsck will
983 * be able to account for those blocks properly and so
984 * that this snapshot knows that it need not copy them
985 * if the other snapshot holding them is freed.
986 */
987 static int
988 expunge(struct vnode *snapvp, struct inode *cancelip, struct fs *fs,
989 acctfunc_t acctfunc, int expungetype)
990 {
991 int i, error, ns;
992 daddr_t lbn, rlbn;
993 daddr_t len, blkno, numblks, blksperindir;
994 struct ufs1_dinode *dip1;
995 struct ufs2_dinode *dip2;
996 struct lwp *l = curlwp;
997 void *bap;
998 struct buf *bp;
999 struct mount *mp;
1000
1001 ns = UFS_FSNEEDSWAP(fs);
1002 mp = snapvp->v_mount;
1003
1004 error = UFS_WAPBL_BEGIN(mp);
1005 if (error)
1006 return error;
1007 /*
1008 * Prepare to expunge the inode. If its inode block has not
1009 * yet been copied, then allocate and fill the copy.
1010 */
1011 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
1012 error = snapblkaddr(snapvp, lbn, &blkno);
1013 if (error)
1014 return error;
1015 if (blkno != 0) {
1016 error = bread(snapvp, lbn, fs->fs_bsize, l->l_cred,
1017 B_MODIFY, &bp);
1018 } else {
1019 error = ffs_balloc(snapvp, lblktosize(fs, (off_t)lbn),
1020 fs->fs_bsize, l->l_cred, 0, &bp);
1021 if (! error)
1022 error = rwfsblk(snapvp, B_READ, bp->b_data, lbn);
1023 }
1024 if (error) {
1025 UFS_WAPBL_END(mp);
1026 return error;
1027 }
1028 /*
1029 * Set a snapshot inode to be a zero length file, regular files
1030 * or unlinked snapshots to be completely unallocated.
1031 */
1032 if (fs->fs_magic == FS_UFS1_MAGIC) {
1033 dip1 = (struct ufs1_dinode *)bp->b_data +
1034 ino_to_fsbo(fs, cancelip->i_number);
1035 if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
1036 dip1->di_mode = 0;
1037 dip1->di_size = 0;
1038 dip1->di_blocks = 0;
1039 dip1->di_flags =
1040 ufs_rw32(ufs_rw32(dip1->di_flags, ns) & ~SF_SNAPSHOT, ns);
1041 memset(&dip1->di_db[0], 0, (NDADDR + NIADDR) * sizeof(int32_t));
1042 } else {
1043 dip2 = (struct ufs2_dinode *)bp->b_data +
1044 ino_to_fsbo(fs, cancelip->i_number);
1045 if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
1046 dip2->di_mode = 0;
1047 dip2->di_size = 0;
1048 dip2->di_blocks = 0;
1049 dip2->di_flags =
1050 ufs_rw32(ufs_rw32(dip2->di_flags, ns) & ~SF_SNAPSHOT, ns);
1051 memset(&dip2->di_db[0], 0, (NDADDR + NIADDR) * sizeof(int64_t));
1052 }
1053 bdwrite(bp);
1054 UFS_WAPBL_END(mp);
1055 /*
1056 * Now go through and expunge all the blocks in the file
1057 * using the function requested.
1058 */
1059 numblks = howmany(cancelip->i_size, fs->fs_bsize);
1060 if (fs->fs_magic == FS_UFS1_MAGIC)
1061 bap = &cancelip->i_ffs1_db[0];
1062 else
1063 bap = &cancelip->i_ffs2_db[0];
1064 error = (*acctfunc)(snapvp, bap, 0, NDADDR, fs, 0, expungetype);
1065 if (error)
1066 return (error);
1067 if (fs->fs_magic == FS_UFS1_MAGIC)
1068 bap = &cancelip->i_ffs1_ib[0];
1069 else
1070 bap = &cancelip->i_ffs2_ib[0];
1071 error = (*acctfunc)(snapvp, bap, 0, NIADDR, fs, -1, expungetype);
1072 if (error)
1073 return (error);
1074 blksperindir = 1;
1075 lbn = -NDADDR;
1076 len = numblks - NDADDR;
1077 rlbn = NDADDR;
1078 for (i = 0; len > 0 && i < NIADDR; i++) {
1079 error = indiracct(snapvp, ITOV(cancelip), i,
1080 ib_get(cancelip, i), lbn, rlbn, len,
1081 blksperindir, fs, acctfunc, expungetype);
1082 if (error)
1083 return (error);
1084 blksperindir *= NINDIR(fs);
1085 lbn -= blksperindir + 1;
1086 len -= blksperindir;
1087 rlbn += blksperindir;
1088 }
1089 return (0);
1090 }
1091
1092 /*
1093 * Descend an indirect block chain for vnode cancelvp accounting for all
1094 * its indirect blocks in snapvp.
1095 */
1096 static int
1097 indiracct(struct vnode *snapvp, struct vnode *cancelvp, int level,
1098 daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks,
1099 daddr_t blksperindir, struct fs *fs, acctfunc_t acctfunc, int expungetype)
1100 {
1101 int error, num, i;
1102 daddr_t subblksperindir;
1103 struct indir indirs[NIADDR + 2];
1104 daddr_t last;
1105 void *bap;
1106 struct buf *bp;
1107
1108 if (blkno == 0) {
1109 if (expungetype == BLK_NOCOPY)
1110 return (0);
1111 panic("indiracct: missing indir");
1112 }
1113 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
1114 return (error);
1115 if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
1116 panic("indiracct: botched params");
1117 /*
1118 * We have to expand bread here since it will deadlock looking
1119 * up the block number for any blocks that are not in the cache.
1120 */
1121 error = ffs_getblk(cancelvp, lbn, fsbtodb(fs, blkno), fs->fs_bsize,
1122 false, &bp);
1123 if (error)
1124 return error;
1125 if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error =
1126 rwfsblk(bp->b_vp, B_READ, bp->b_data, fragstoblks(fs, blkno)))) {
1127 brelse(bp, 0);
1128 return (error);
1129 }
1130 /*
1131 * Account for the block pointers in this indirect block.
1132 */
1133 last = howmany(remblks, blksperindir);
1134 if (last > NINDIR(fs))
1135 last = NINDIR(fs);
1136 bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK | M_ZERO);
1137 memcpy((void *)bap, bp->b_data, fs->fs_bsize);
1138 brelse(bp, 0);
1139 error = (*acctfunc)(snapvp, bap, 0, last,
1140 fs, level == 0 ? rlbn : -1, expungetype);
1141 if (error || level == 0)
1142 goto out;
1143 /*
1144 * Account for the block pointers in each of the indirect blocks
1145 * in the levels below us.
1146 */
1147 subblksperindir = blksperindir / NINDIR(fs);
1148 for (lbn++, level--, i = 0; i < last; i++) {
1149 error = indiracct(snapvp, cancelvp, level,
1150 idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks,
1151 subblksperindir, fs, acctfunc, expungetype);
1152 if (error)
1153 goto out;
1154 rlbn += blksperindir;
1155 lbn -= blksperindir;
1156 remblks -= blksperindir;
1157 }
1158 out:
1159 free(bap, M_DEVBUF);
1160 return (error);
1161 }
1162
1163 /*
1164 * Do both snap accounting and map accounting.
1165 */
1166 static int
1167 fullacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
1168 struct fs *fs, daddr_t lblkno,
1169 int exptype /* BLK_SNAP or BLK_NOCOPY */)
1170 {
1171 int error;
1172
1173 if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)))
1174 return (error);
1175 return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype));
1176 }
1177
1178 /*
1179 * Identify a set of blocks allocated in a snapshot inode.
1180 */
1181 static int
1182 snapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
1183 struct fs *fs, daddr_t lblkno,
1184 int expungetype /* BLK_SNAP or BLK_NOCOPY */)
1185 {
1186 struct inode *ip = VTOI(vp);
1187 struct lwp *l = curlwp;
1188 struct mount *mp = vp->v_mount;
1189 daddr_t blkno;
1190 daddr_t lbn;
1191 struct buf *ibp;
1192 int error, n;
1193 const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;
1194
1195 error = UFS_WAPBL_BEGIN(mp);
1196 if (error)
1197 return error;
1198 for ( n = 0; oldblkp < lastblkp; oldblkp++) {
1199 blkno = idb_get(ip, bap, oldblkp);
1200 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
1201 continue;
1202 lbn = fragstoblks(fs, blkno);
1203 if (lbn < NDADDR) {
1204 blkno = db_get(ip, lbn);
1205 ip->i_flag |= IN_CHANGE | IN_UPDATE;
1206 } else {
1207 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
1208 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
1209 if (error)
1210 break;
1211 blkno = idb_get(ip, ibp->b_data,
1212 (lbn - NDADDR) % NINDIR(fs));
1213 }
1214 /*
1215 * If we are expunging a snapshot vnode and we
1216 * find a block marked BLK_NOCOPY, then it is
1217 * one that has been allocated to this snapshot after
1218 * we took our current snapshot and can be ignored.
1219 */
1220 if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
1221 if (lbn >= NDADDR)
1222 brelse(ibp, 0);
1223 } else {
1224 if (blkno != 0)
1225 panic("snapacct: bad block");
1226 if (lbn < NDADDR)
1227 db_assign(ip, lbn, expungetype);
1228 else {
1229 idb_assign(ip, ibp->b_data,
1230 (lbn - NDADDR) % NINDIR(fs), expungetype);
1231 bdwrite(ibp);
1232 }
1233 }
1234 if (wbreak > 0 && (++n % wbreak) == 0) {
1235 UFS_WAPBL_END(mp);
1236 error = UFS_WAPBL_BEGIN(mp);
1237 if (error)
1238 return error;
1239 }
1240 }
1241 UFS_WAPBL_END(mp);
1242 return error;
1243 }
1244
1245 /*
1246 * Account for a set of blocks allocated in a snapshot inode.
1247 */
1248 static int
1249 mapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
1250 struct fs *fs, daddr_t lblkno, int expungetype)
1251 {
1252 daddr_t blkno;
1253 struct inode *ip;
1254 struct mount *mp = vp->v_mount;
1255 ino_t inum;
1256 int acctit, error, n;
1257 const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;
1258
1259 error = UFS_WAPBL_BEGIN(mp);
1260 if (error)
1261 return error;
1262 ip = VTOI(vp);
1263 inum = ip->i_number;
1264 if (lblkno == -1)
1265 acctit = 0;
1266 else
1267 acctit = 1;
1268 for ( n = 0; oldblkp < lastblkp; oldblkp++, lblkno++) {
1269 blkno = idb_get(ip, bap, oldblkp);
1270 if (blkno == 0 || blkno == BLK_NOCOPY)
1271 continue;
1272 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
1273 *ip->i_snapblklist++ = lblkno;
1274 if (blkno == BLK_SNAP)
1275 blkno = blkstofrags(fs, lblkno);
1276 ffs_blkfree_snap(fs, vp, blkno, fs->fs_bsize, inum);
1277 if (wbreak > 0 && (++n % wbreak) == 0) {
1278 UFS_WAPBL_END(mp);
1279 error = UFS_WAPBL_BEGIN(mp);
1280 if (error)
1281 return error;
1282 }
1283 }
1284 UFS_WAPBL_END(mp);
1285 return (0);
1286 }
1287
1288 /*
1289 * Number of blocks that fit into the journal or zero if not logging.
1290 */
1291 static int
1292 blocks_in_journal(struct fs *fs)
1293 {
1294 off_t bpj;
1295
1296 if ((fs->fs_flags & FS_DOWAPBL) == 0)
1297 return 0;
1298 bpj = 1;
1299 if (fs->fs_journal_version == UFS_WAPBL_VERSION) {
1300 switch (fs->fs_journal_location) {
1301 case UFS_WAPBL_JOURNALLOC_END_PARTITION:
1302 bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]*
1303 fs->fs_journallocs[UFS_WAPBL_EPART_COUNT];
1304 break;
1305 case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
1306 bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]*
1307 fs->fs_journallocs[UFS_WAPBL_INFS_COUNT];
1308 break;
1309 }
1310 }
1311 bpj /= fs->fs_bsize;
1312 return (bpj > 0 ? bpj : 1);
1313 }
1314 #endif /* defined(FFS_NO_SNAPSHOT) */
1315
1316 /*
1317 * Decrement extra reference on snapshot when last name is removed.
1318 * It will not be freed until the last open reference goes away.
1319 */
1320 void
1321 ffs_snapgone(struct inode *ip)
1322 {
1323 struct mount *mp = ip->i_devvp->v_specmountpoint;
1324 struct inode *xp;
1325 struct fs *fs;
1326 struct snap_info *si;
1327 int snaploc;
1328
1329 si = VFSTOUFS(mp)->um_snapinfo;
1330
1331 /*
1332 * Find snapshot in incore list.
1333 */
1334 mutex_enter(&si->si_lock);
1335 TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
1336 if (xp == ip)
1337 break;
1338 mutex_exit(&si->si_lock);
1339 if (xp != NULL)
1340 vrele(ITOV(ip));
1341 #ifdef DEBUG
1342 else if (snapdebug)
1343 printf("ffs_snapgone: lost snapshot vnode %llu\n",
1344 (unsigned long long)ip->i_number);
1345 #endif
1346 /*
1347 * Delete snapshot inode from superblock. Keep list dense.
1348 */
1349 mutex_enter(&si->si_lock);
1350 fs = ip->i_fs;
1351 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
1352 if (fs->fs_snapinum[snaploc] == ip->i_number)
1353 break;
1354 if (snaploc < FSMAXSNAP) {
1355 for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
1356 if (fs->fs_snapinum[snaploc] == 0)
1357 break;
1358 fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
1359 }
1360 fs->fs_snapinum[snaploc - 1] = 0;
1361 }
1362 si->si_gen++;
1363 mutex_exit(&si->si_lock);
1364 }
1365
1366 /*
1367 * Prepare a snapshot file for being removed.
1368 */
1369 void
1370 ffs_snapremove(struct vnode *vp)
1371 {
1372 struct inode *ip = VTOI(vp), *xp;
1373 struct vnode *devvp = ip->i_devvp;
1374 struct fs *fs = ip->i_fs;
1375 struct mount *mp = devvp->v_specmountpoint;
1376 struct buf *ibp;
1377 struct snap_info *si;
1378 struct lwp *l = curlwp;
1379 daddr_t numblks, blkno, dblk;
1380 int error, loc, last;
1381
1382 si = VFSTOUFS(mp)->um_snapinfo;
1383 /*
1384 * If active, delete from incore list (this snapshot may
1385 * already have been in the process of being deleted, so
1386 * would not have been active).
1387 *
1388 * Clear copy-on-write flag if last snapshot.
1389 */
1390 mutex_enter(&si->si_snaplock);
1391 mutex_enter(&si->si_lock);
1392 if (is_active_snapshot(si, ip)) {
1393 TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap);
1394 if (TAILQ_FIRST(&si->si_snapshots) != 0) {
1395 /* Roll back the list of preallocated blocks. */
1396 xp = TAILQ_LAST(&si->si_snapshots, inodelst);
1397 si->si_snapblklist = xp->i_snapblklist;
1398 si->si_gen++;
1399 mutex_exit(&si->si_lock);
1400 mutex_exit(&si->si_snaplock);
1401 } else {
1402 si->si_snapblklist = 0;
1403 si->si_gen++;
1404 mutex_exit(&si->si_lock);
1405 mutex_exit(&si->si_snaplock);
1406 fscow_disestablish(mp, ffs_copyonwrite, devvp);
1407 }
1408 if (ip->i_snapblklist != NULL) {
1409 free(ip->i_snapblklist, M_UFSMNT);
1410 ip->i_snapblklist = NULL;
1411 }
1412 } else {
1413 mutex_exit(&si->si_lock);
1414 mutex_exit(&si->si_snaplock);
1415 }
1416 /*
1417 * Clear all BLK_NOCOPY fields. Pass any block claims to other
1418 * snapshots that want them (see ffs_snapblkfree below).
1419 */
1420 for (blkno = 1; blkno < NDADDR; blkno++) {
1421 dblk = db_get(ip, blkno);
1422 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
1423 db_assign(ip, blkno, 0);
1424 else if ((dblk == blkstofrags(fs, blkno) &&
1425 ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
1426 ip->i_number))) {
1427 DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
1428 db_assign(ip, blkno, 0);
1429 }
1430 }
1431 numblks = howmany(ip->i_size, fs->fs_bsize);
1432 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
1433 error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno),
1434 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
1435 if (error)
1436 continue;
1437 if (fs->fs_size - blkno > NINDIR(fs))
1438 last = NINDIR(fs);
1439 else
1440 last = fs->fs_size - blkno;
1441 for (loc = 0; loc < last; loc++) {
1442 dblk = idb_get(ip, ibp->b_data, loc);
1443 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
1444 idb_assign(ip, ibp->b_data, loc, 0);
1445 else if (dblk == blkstofrags(fs, blkno) &&
1446 ffs_snapblkfree(fs, ip->i_devvp, dblk,
1447 fs->fs_bsize, ip->i_number)) {
1448 DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
1449 idb_assign(ip, ibp->b_data, loc, 0);
1450 }
1451 }
1452 bawrite(ibp);
1453 UFS_WAPBL_END(mp);
1454 error = UFS_WAPBL_BEGIN(mp);
1455 KASSERT(error == 0);
1456 }
1457 /*
1458 * Clear snapshot flag and drop reference.
1459 */
1460 ip->i_flags &= ~SF_SNAPSHOT;
1461 DIP_ASSIGN(ip, flags, ip->i_flags);
1462 ip->i_flag |= IN_CHANGE | IN_UPDATE;
1463 }
1464
1465 /*
1466 * Notification that a block is being freed. Return zero if the free
1467 * should be allowed to proceed. Return non-zero if the snapshot file
1468 * wants to claim the block. The block will be claimed if it is an
1469 * uncopied part of one of the snapshots. It will be freed if it is
1470 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
1471 * If a fragment is being freed, then all snapshots that care about
1472 * it must make a copy since a snapshot file can only claim full sized
1473 * blocks. Note that if more than one snapshot file maps the block,
1474 * we can pick one at random to claim it. Since none of the snapshots
1475 * can change, we are assurred that they will all see the same unmodified
1476 * image. When deleting a snapshot file (see ffs_snapremove above), we
1477 * must push any of these claimed blocks to one of the other snapshots
1478 * that maps it. These claimed blocks are easily identified as they will
1479 * have a block number equal to their logical block number within the
1480 * snapshot. A copied block can never have this property because they
1481 * must always have been allocated from a BLK_NOCOPY location.
1482 */
1483 int
1484 ffs_snapblkfree(struct fs *fs, struct vnode *devvp, daddr_t bno,
1485 long size, ino_t inum)
1486 {
1487 struct mount *mp = devvp->v_specmountpoint;
1488 struct buf *ibp;
1489 struct inode *ip;
1490 struct vnode *vp = NULL;
1491 struct snap_info *si;
1492 void *saved_data = NULL;
1493 daddr_t lbn;
1494 daddr_t blkno;
1495 uint32_t gen;
1496 int indiroff = 0, error = 0, claimedblk = 0;
1497
1498 si = VFSTOUFS(mp)->um_snapinfo;
1499 lbn = fragstoblks(fs, bno);
1500 mutex_enter(&si->si_snaplock);
1501 mutex_enter(&si->si_lock);
1502 si->si_owner = curlwp;
1503
1504 retry:
1505 gen = si->si_gen;
1506 TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
1507 vp = ITOV(ip);
1508 /*
1509 * Lookup block being written.
1510 */
1511 if (lbn < NDADDR) {
1512 blkno = db_get(ip, lbn);
1513 } else {
1514 mutex_exit(&si->si_lock);
1515 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
1516 fs->fs_bsize, FSCRED, B_METAONLY, &ibp);
1517 if (error) {
1518 mutex_enter(&si->si_lock);
1519 break;
1520 }
1521 indiroff = (lbn - NDADDR) % NINDIR(fs);
1522 blkno = idb_get(ip, ibp->b_data, indiroff);
1523 mutex_enter(&si->si_lock);
1524 if (gen != si->si_gen) {
1525 brelse(ibp, 0);
1526 goto retry;
1527 }
1528 }
1529 /*
1530 * Check to see if block needs to be copied.
1531 */
1532 if (blkno == 0) {
1533 /*
1534 * A block that we map is being freed. If it has not
1535 * been claimed yet, we will claim or copy it (below).
1536 */
1537 claimedblk = 1;
1538 } else if (blkno == BLK_SNAP) {
1539 /*
1540 * No previous snapshot claimed the block,
1541 * so it will be freed and become a BLK_NOCOPY
1542 * (don't care) for us.
1543 */
1544 if (claimedblk)
1545 panic("snapblkfree: inconsistent block type");
1546 if (lbn < NDADDR) {
1547 db_assign(ip, lbn, BLK_NOCOPY);
1548 ip->i_flag |= IN_CHANGE | IN_UPDATE;
1549 } else {
1550 idb_assign(ip, ibp->b_data, indiroff,
1551 BLK_NOCOPY);
1552 mutex_exit(&si->si_lock);
1553 if (ip->i_nlink > 0)
1554 bwrite(ibp);
1555 else
1556 bdwrite(ibp);
1557 mutex_enter(&si->si_lock);
1558 if (gen != si->si_gen)
1559 goto retry;
1560 }
1561 continue;
1562 } else /* BLK_NOCOPY or default */ {
1563 /*
1564 * If the snapshot has already copied the block
1565 * (default), or does not care about the block,
1566 * it is not needed.
1567 */
1568 if (lbn >= NDADDR)
1569 brelse(ibp, 0);
1570 continue;
1571 }
1572 /*
1573 * If this is a full size block, we will just grab it
1574 * and assign it to the snapshot inode. Otherwise we
1575 * will proceed to copy it. See explanation for this
1576 * routine as to why only a single snapshot needs to
1577 * claim this block.
1578 */
1579 if (size == fs->fs_bsize) {
1580 #ifdef DEBUG
1581 if (snapdebug)
1582 printf("%s %llu lbn %" PRId64
1583 "from inum %llu\n",
1584 "Grabonremove: snapino",
1585 (unsigned long long)ip->i_number,
1586 lbn, (unsigned long long)inum);
1587 #endif
1588 mutex_exit(&si->si_lock);
1589 if (lbn < NDADDR) {
1590 db_assign(ip, lbn, bno);
1591 } else {
1592 idb_assign(ip, ibp->b_data, indiroff, bno);
1593 if (ip->i_nlink > 0)
1594 bwrite(ibp);
1595 else
1596 bdwrite(ibp);
1597 }
1598 DIP_ADD(ip, blocks, btodb(size));
1599 ip->i_flag |= IN_CHANGE | IN_UPDATE;
1600 if (ip->i_nlink > 0 && mp->mnt_wapbl)
1601 error = syncsnap(vp);
1602 else
1603 error = 0;
1604 mutex_enter(&si->si_lock);
1605 si->si_owner = NULL;
1606 mutex_exit(&si->si_lock);
1607 mutex_exit(&si->si_snaplock);
1608 return (error == 0);
1609 }
1610 if (lbn >= NDADDR)
1611 brelse(ibp, 0);
1612 #ifdef DEBUG
1613 if (snapdebug)
1614 printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n",
1615 "Copyonremove: snapino ",
1616 (unsigned long long)ip->i_number,
1617 lbn, "for inum", (unsigned long long)inum, size);
1618 #endif
1619 /*
1620 * If we have already read the old block contents, then
1621 * simply copy them to the new block. Note that we need
1622 * to synchronously write snapshots that have not been
1623 * unlinked, and hence will be visible after a crash,
1624 * to ensure their integrity.
1625 */
1626 mutex_exit(&si->si_lock);
1627 if (saved_data == NULL) {
1628 saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
1629 error = rwfsblk(vp, B_READ, saved_data, lbn);
1630 if (error) {
1631 free(saved_data, M_UFSMNT);
1632 saved_data = NULL;
1633 mutex_enter(&si->si_lock);
1634 break;
1635 }
1636 }
1637 error = wrsnapblk(vp, saved_data, lbn);
1638 if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
1639 error = syncsnap(vp);
1640 mutex_enter(&si->si_lock);
1641 if (error)
1642 break;
1643 if (gen != si->si_gen)
1644 goto retry;
1645 }
1646 si->si_owner = NULL;
1647 mutex_exit(&si->si_lock);
1648 mutex_exit(&si->si_snaplock);
1649 if (saved_data)
1650 free(saved_data, M_UFSMNT);
1651 /*
1652 * If we have been unable to allocate a block in which to do
1653 * the copy, then return non-zero so that the fragment will
1654 * not be freed. Although space will be lost, the snapshot
1655 * will stay consistent.
1656 */
1657 return (error);
1658 }
1659
1660 /*
1661 * Associate snapshot files when mounting.
1662 */
1663 void
1664 ffs_snapshot_mount(struct mount *mp)
1665 {
1666 struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
1667 struct fs *fs = VFSTOUFS(mp)->um_fs;
1668 struct lwp *l = curlwp;
1669 struct vnode *vp;
1670 struct inode *ip, *xp;
1671 struct snap_info *si;
1672 daddr_t snaplistsize, *snapblklist;
1673 int i, error, ns, snaploc, loc;
1674
1675 /*
1676 * No persistent snapshots on apple ufs file systems.
1677 */
1678 if (UFS_MPISAPPLEUFS(VFSTOUFS(mp)))
1679 return;
1680
1681 si = VFSTOUFS(mp)->um_snapinfo;
1682 ns = UFS_FSNEEDSWAP(fs);
1683 /*
1684 * XXX The following needs to be set before ffs_truncate or
1685 * VOP_READ can be called.
1686 */
1687 mp->mnt_stat.f_iosize = fs->fs_bsize;
1688 /*
1689 * Process each snapshot listed in the superblock.
1690 */
1691 vp = NULL;
1692 mutex_enter(&si->si_lock);
1693 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
1694 if (fs->fs_snapinum[snaploc] == 0)
1695 break;
1696 if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
1697 &vp)) != 0) {
1698 printf("ffs_snapshot_mount: vget failed %d\n", error);
1699 continue;
1700 }
1701 ip = VTOI(vp);
1702 if ((ip->i_flags & SF_SNAPSHOT) == 0) {
1703 printf("ffs_snapshot_mount: non-snapshot inode %d\n",
1704 fs->fs_snapinum[snaploc]);
1705 vput(vp);
1706 vp = NULL;
1707 for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
1708 if (fs->fs_snapinum[loc] == 0)
1709 break;
1710 fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
1711 }
1712 fs->fs_snapinum[loc - 1] = 0;
1713 snaploc--;
1714 continue;
1715 }
1716
1717 /*
1718 * Read the block hints list. Use an empty list on
1719 * read errors.
1720 */
1721 error = vn_rdwr(UIO_READ, vp,
1722 (void *)&snaplistsize, sizeof(snaplistsize),
1723 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
1724 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
1725 l->l_cred, NULL, NULL);
1726 if (error) {
1727 printf("ffs_snapshot_mount: read_1 failed %d\n", error);
1728 snaplistsize = 1;
1729 } else
1730 snaplistsize = ufs_rw64(snaplistsize, ns);
1731 snapblklist = malloc(
1732 snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
1733 if (error)
1734 snapblklist[0] = 1;
1735 else {
1736 error = vn_rdwr(UIO_READ, vp, (void *)snapblklist,
1737 snaplistsize * sizeof(daddr_t),
1738 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
1739 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
1740 l->l_cred, NULL, NULL);
1741 for (i = 0; i < snaplistsize; i++)
1742 snapblklist[i] = ufs_rw64(snapblklist[i], ns);
1743 if (error) {
1744 printf("ffs_snapshot_mount: read_2 failed %d\n",
1745 error);
1746 snapblklist[0] = 1;
1747 }
1748 }
1749 ip->i_snapblklist = &snapblklist[0];
1750
1751 /*
1752 * Link it onto the active snapshot list.
1753 */
1754 if (is_active_snapshot(si, ip))
1755 panic("ffs_snapshot_mount: %"PRIu64" already on list",
1756 ip->i_number);
1757 else
1758 TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
1759 vp->v_vflag |= VV_SYSTEM;
1760 VOP_UNLOCK(vp);
1761 }
1762 /*
1763 * No usable snapshots found.
1764 */
1765 if (vp == NULL) {
1766 mutex_exit(&si->si_lock);
1767 return;
1768 }
1769 /*
1770 * Attach the block hints list. We always want to
1771 * use the list from the newest snapshot.
1772 */
1773 xp = TAILQ_LAST(&si->si_snapshots, inodelst);
1774 si->si_snapblklist = xp->i_snapblklist;
1775 fscow_establish(mp, ffs_copyonwrite, devvp);
1776 si->si_gen++;
1777 mutex_exit(&si->si_lock);
1778 }
1779
1780 /*
1781 * Disassociate snapshot files when unmounting.
1782 */
1783 void
1784 ffs_snapshot_unmount(struct mount *mp)
1785 {
1786 struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
1787 struct inode *xp;
1788 struct vnode *vp = NULL;
1789 struct snap_info *si;
1790
1791 si = VFSTOUFS(mp)->um_snapinfo;
1792 mutex_enter(&si->si_lock);
1793 while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) {
1794 vp = ITOV(xp);
1795 TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap);
1796 if (xp->i_snapblklist == si->si_snapblklist)
1797 si->si_snapblklist = NULL;
1798 free(xp->i_snapblklist, M_UFSMNT);
1799 if (xp->i_nlink > 0) {
1800 si->si_gen++;
1801 mutex_exit(&si->si_lock);
1802 vrele(vp);
1803 mutex_enter(&si->si_lock);
1804 }
1805 }
1806 si->si_gen++;
1807 mutex_exit(&si->si_lock);
1808 if (vp)
1809 fscow_disestablish(mp, ffs_copyonwrite, devvp);
1810 }
1811
1812 /*
1813 * Check for need to copy block that is about to be written,
1814 * copying the block if necessary.
1815 */
1816 static int
1817 ffs_copyonwrite(void *v, struct buf *bp, bool data_valid)
1818 {
1819 struct fs *fs;
1820 struct inode *ip;
1821 struct vnode *devvp = v, *vp = NULL;
1822 struct mount *mp = devvp->v_specmountpoint;
1823 struct snap_info *si;
1824 void *saved_data = NULL;
1825 daddr_t lbn, blkno, *snapblklist;
1826 uint32_t gen;
1827 int lower, upper, mid, snapshot_locked = 0, error = 0;
1828
1829 /*
1830 * Check for valid snapshots.
1831 */
1832 si = VFSTOUFS(mp)->um_snapinfo;
1833 mutex_enter(&si->si_lock);
1834 ip = TAILQ_FIRST(&si->si_snapshots);
1835 if (ip == NULL) {
1836 mutex_exit(&si->si_lock);
1837 return 0;
1838 }
1839 /*
1840 * First check to see if it is after the file system or
1841 * in the preallocated list.
1842 * By doing this check we avoid several potential deadlocks.
1843 */
1844 fs = ip->i_fs;
1845 lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
1846 if (bp->b_blkno >= fsbtodb(fs, fs->fs_size)) {
1847 mutex_exit(&si->si_lock);
1848 return 0;
1849 }
1850 snapblklist = si->si_snapblklist;
1851 upper = (snapblklist != NULL ? snapblklist[0] - 1 : 0);
1852 lower = 1;
1853 while (lower <= upper) {
1854 mid = (lower + upper) / 2;
1855 if (snapblklist[mid] == lbn)
1856 break;
1857 if (snapblklist[mid] < lbn)
1858 lower = mid + 1;
1859 else
1860 upper = mid - 1;
1861 }
1862 if (lower <= upper) {
1863 mutex_exit(&si->si_lock);
1864 return 0;
1865 }
1866 /*
1867 * Not in the precomputed list, so check the snapshots.
1868 */
1869 if (si->si_owner != curlwp) {
1870 if (!mutex_tryenter(&si->si_snaplock)) {
1871 mutex_exit(&si->si_lock);
1872 mutex_enter(&si->si_snaplock);
1873 mutex_enter(&si->si_lock);
1874 }
1875 si->si_owner = curlwp;
1876 snapshot_locked = 1;
1877 }
1878 if (data_valid && bp->b_bcount == fs->fs_bsize)
1879 saved_data = bp->b_data;
1880 retry:
1881 gen = si->si_gen;
1882 TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
1883 vp = ITOV(ip);
1884 /*
1885 * We ensure that everything of our own that needs to be
1886 * copied will be done at the time that ffs_snapshot is
1887 * called. Thus we can skip the check here which can
1888 * deadlock in doing the lookup in ffs_balloc.
1889 */
1890 if (bp->b_vp == vp)
1891 continue;
1892 /*
1893 * Check to see if block needs to be copied.
1894 */
1895 if (lbn < NDADDR) {
1896 blkno = db_get(ip, lbn);
1897 } else {
1898 mutex_exit(&si->si_lock);
1899 blkno = 0; /* XXX: GCC */
1900 if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
1901 mutex_enter(&si->si_lock);
1902 break;
1903 }
1904 mutex_enter(&si->si_lock);
1905 if (gen != si->si_gen)
1906 goto retry;
1907 }
1908 #ifdef DIAGNOSTIC
1909 if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
1910 panic("ffs_copyonwrite: bad copy block");
1911 #endif
1912 if (blkno != 0)
1913 continue;
1914
1915 if (curlwp == uvm.pagedaemon_lwp) {
1916 error = ENOMEM;
1917 break;
1918 }
1919 /* Only one level of recursion allowed. */
1920 KASSERT(snapshot_locked);
1921 /*
1922 * Allocate the block into which to do the copy. Since
1923 * multiple processes may all try to copy the same block,
1924 * we have to recheck our need to do a copy if we sleep
1925 * waiting for the lock.
1926 *
1927 * Because all snapshots on a filesystem share a single
1928 * lock, we ensure that we will never be in competition
1929 * with another process to allocate a block.
1930 */
1931 #ifdef DEBUG
1932 if (snapdebug) {
1933 printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
1934 (unsigned long long)ip->i_number, lbn);
1935 if (bp->b_vp == devvp)
1936 printf("fs metadata");
1937 else
1938 printf("inum %llu", (unsigned long long)
1939 VTOI(bp->b_vp)->i_number);
1940 printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
1941 }
1942 #endif
1943 /*
1944 * If we have already read the old block contents, then
1945 * simply copy them to the new block. Note that we need
1946 * to synchronously write snapshots that have not been
1947 * unlinked, and hence will be visible after a crash,
1948 * to ensure their integrity.
1949 */
1950 mutex_exit(&si->si_lock);
1951 if (saved_data == NULL) {
1952 saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
1953 error = rwfsblk(vp, B_READ, saved_data, lbn);
1954 if (error) {
1955 free(saved_data, M_UFSMNT);
1956 saved_data = NULL;
1957 mutex_enter(&si->si_lock);
1958 break;
1959 }
1960 }
1961 error = wrsnapblk(vp, saved_data, lbn);
1962 if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
1963 error = syncsnap(vp);
1964 mutex_enter(&si->si_lock);
1965 if (error)
1966 break;
1967 if (gen != si->si_gen)
1968 goto retry;
1969 }
1970 /*
1971 * Note that we need to synchronously write snapshots that
1972 * have not been unlinked, and hence will be visible after
1973 * a crash, to ensure their integrity.
1974 */
1975 if (snapshot_locked) {
1976 si->si_owner = NULL;
1977 mutex_exit(&si->si_lock);
1978 mutex_exit(&si->si_snaplock);
1979 } else
1980 mutex_exit(&si->si_lock);
1981 if (saved_data && saved_data != bp->b_data)
1982 free(saved_data, M_UFSMNT);
1983 return error;
1984 }
1985
1986 /*
1987 * Read from a snapshot.
1988 */
1989 int
1990 ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag)
1991 {
1992 struct inode *ip = VTOI(vp);
1993 struct fs *fs = ip->i_fs;
1994 struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo;
1995 struct buf *bp;
1996 daddr_t lbn, nextlbn;
1997 off_t fsbytes, bytesinfile;
1998 long size, xfersize, blkoffset;
1999 int error;
2000
2001 fstrans_start(vp->v_mount, FSTRANS_SHARED);
2002 mutex_enter(&si->si_snaplock);
2003
2004 if (ioflag & IO_ALTSEMANTICS)
2005 fsbytes = ip->i_size;
2006 else
2007 fsbytes = lfragtosize(fs, fs->fs_size);
2008 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
2009 bytesinfile = fsbytes - uio->uio_offset;
2010 if (bytesinfile <= 0)
2011 break;
2012 lbn = lblkno(fs, uio->uio_offset);
2013 nextlbn = lbn + 1;
2014 size = fs->fs_bsize;
2015 blkoffset = blkoff(fs, uio->uio_offset);
2016 xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
2017 bytesinfile);
2018
2019 if (lblktosize(fs, nextlbn + 1) >= fsbytes) {
2020 if (lblktosize(fs, lbn) + size > fsbytes)
2021 size = fragroundup(fs,
2022 fsbytes - lblktosize(fs, lbn));
2023 error = bread(vp, lbn, size, NOCRED, 0, &bp);
2024 } else {
2025 int nextsize = fs->fs_bsize;
2026 error = breadn(vp, lbn,
2027 size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp);
2028 }
2029 if (error)
2030 break;
2031
2032 /*
2033 * We should only get non-zero b_resid when an I/O error
2034 * has occurred, which should cause us to break above.
2035 * However, if the short read did not cause an error,
2036 * then we want to ensure that we do not uiomove bad
2037 * or uninitialized data.
2038 */
2039 size -= bp->b_resid;
2040 if (size < blkoffset + xfersize) {
2041 xfersize = size - blkoffset;
2042 if (xfersize <= 0)
2043 break;
2044 }
2045 error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
2046 if (error)
2047 break;
2048 brelse(bp, BC_AGE);
2049 }
2050 if (bp != NULL)
2051 brelse(bp, BC_AGE);
2052
2053 mutex_exit(&si->si_snaplock);
2054 fstrans_done(vp->v_mount);
2055 return error;
2056 }
2057
2058 /*
2059 * Lookup a snapshots data block address.
2060 * Simpler than UFS_BALLOC() as we know all metadata is already allocated
2061 * and safe even for the pagedaemon where we cannot bread().
2062 */
2063 static int
2064 snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res)
2065 {
2066 struct indir indirs[NIADDR + 2];
2067 struct inode *ip = VTOI(vp);
2068 struct fs *fs = ip->i_fs;
2069 struct buf *bp;
2070 int error, num;
2071
2072 KASSERT(lbn >= 0);
2073
2074 if (lbn < NDADDR) {
2075 *res = db_get(ip, lbn);
2076 return 0;
2077 }
2078 if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
2079 return error;
2080 if (curlwp == uvm.pagedaemon_lwp) {
2081 mutex_enter(&bufcache_lock);
2082 bp = incore(vp, indirs[num-1].in_lbn);
2083 if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) {
2084 *res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
2085 error = 0;
2086 } else
2087 error = ENOMEM;
2088 mutex_exit(&bufcache_lock);
2089 return error;
2090 }
2091 error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, NOCRED, 0, &bp);
2092 if (error == 0)
2093 *res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
2094 brelse(bp, 0);
2095
2096 return error;
2097 }
2098
2099 /*
2100 * Read or write the specified block of the filesystem vp resides on
2101 * from or to the disk bypassing the buffer cache.
2102 */
2103 static int
2104 rwfsblk(struct vnode *vp, int flags, void *data, daddr_t lbn)
2105 {
2106 int error;
2107 struct inode *ip = VTOI(vp);
2108 struct fs *fs = ip->i_fs;
2109 struct buf *nbp;
2110
2111 nbp = getiobuf(NULL, true);
2112 nbp->b_flags = flags;
2113 nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize;
2114 nbp->b_error = 0;
2115 nbp->b_data = data;
2116 nbp->b_blkno = nbp->b_rawblkno = fsbtodb(fs, blkstofrags(fs, lbn));
2117 nbp->b_proc = NULL;
2118 nbp->b_dev = ip->i_devvp->v_rdev;
2119 SET(nbp->b_cflags, BC_BUSY); /* mark buffer busy */
2120
2121 bdev_strategy(nbp);
2122
2123 error = biowait(nbp);
2124
2125 putiobuf(nbp);
2126
2127 return error;
2128 }
2129
2130 /*
2131 * Write all dirty buffers to disk and invalidate them.
2132 */
2133 static int
2134 syncsnap(struct vnode *vp)
2135 {
2136 int error;
2137 buf_t *bp;
2138 struct fs *fs = VTOI(vp)->i_fs;
2139
2140 mutex_enter(&bufcache_lock);
2141 while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) {
2142 error = bbusy(bp, false, 0, NULL);
2143 if (error == EPASSTHROUGH)
2144 continue;
2145 else if (error != 0) {
2146 mutex_exit(&bufcache_lock);
2147 return error;
2148 }
2149 KASSERT(bp->b_bcount == fs->fs_bsize);
2150 mutex_exit(&bufcache_lock);
2151 error = rwfsblk(vp, B_WRITE, bp->b_data,
2152 fragstoblks(fs, dbtofsb(fs, bp->b_blkno)));
2153 brelse(bp, BC_INVAL | BC_VFLUSH);
2154 if (error)
2155 return error;
2156 mutex_enter(&bufcache_lock);
2157 }
2158 mutex_exit(&bufcache_lock);
2159
2160 return 0;
2161 }
2162
2163 /*
2164 * Write the specified block to a snapshot.
2165 */
2166 static int
2167 wrsnapblk(struct vnode *vp, void *data, daddr_t lbn)
2168 {
2169 struct inode *ip = VTOI(vp);
2170 struct fs *fs = ip->i_fs;
2171 struct buf *bp;
2172 int error;
2173
2174 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize,
2175 FSCRED, (ip->i_nlink > 0 ? B_SYNC : 0), &bp);
2176 if (error)
2177 return error;
2178 memcpy(bp->b_data, data, fs->fs_bsize);
2179 if (ip->i_nlink > 0)
2180 error = bwrite(bp);
2181 else
2182 bawrite(bp);
2183
2184 return error;
2185 }
2186
2187 /*
2188 * Check if this inode is present on the active snapshot list.
2189 * Must be called with snapinfo locked.
2190 */
2191 static inline bool
2192 is_active_snapshot(struct snap_info *si, struct inode *ip)
2193 {
2194 struct inode *xp;
2195
2196 KASSERT(mutex_owned(&si->si_lock));
2197
2198 TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
2199 if (xp == ip)
2200 return true;
2201 return false;
2202 }
2203
2204 /*
2205 * Get/Put direct block from inode or buffer containing disk addresses. Take
2206 * care for fs type (UFS1/UFS2) and byte swapping. These functions should go
2207 * into a global include.
2208 */
2209 static inline daddr_t
2210 db_get(struct inode *ip, int loc)
2211 {
2212 if (ip->i_ump->um_fstype == UFS1)
2213 return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip));
2214 else
2215 return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip));
2216 }
2217
2218 static inline void
2219 db_assign(struct inode *ip, int loc, daddr_t val)
2220 {
2221 if (ip->i_ump->um_fstype == UFS1)
2222 ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2223 else
2224 ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2225 }
2226
2227 static inline daddr_t
2228 ib_get(struct inode *ip, int loc)
2229 {
2230 if (ip->i_ump->um_fstype == UFS1)
2231 return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip));
2232 else
2233 return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip));
2234 }
2235
2236 static inline void
2237 ib_assign(struct inode *ip, int loc, daddr_t val)
2238 {
2239 if (ip->i_ump->um_fstype == UFS1)
2240 ip->i_ffs1_ib[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2241 else
2242 ip->i_ffs2_ib[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2243 }
2244
2245 static inline daddr_t
2246 idb_get(struct inode *ip, void *bf, int loc)
2247 {
2248 if (ip->i_ump->um_fstype == UFS1)
2249 return ufs_rw32(((int32_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
2250 else
2251 return ufs_rw64(((int64_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
2252 }
2253
2254 static inline void
2255 idb_assign(struct inode *ip, void *bf, int loc, daddr_t val)
2256 {
2257 if (ip->i_ump->um_fstype == UFS1)
2258 ((int32_t *)(bf))[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2259 else
2260 ((int64_t *)(bf))[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2261 }
2262