/*	$NetBSD: ffs_snapshot.c,v 1.75 2008/08/22 10:48:22 hannken Exp $	*/
2
3 /*
4 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
5 *
6 * Further information about snapshots can be obtained from:
7 *
8 * Marshall Kirk McKusick http://www.mckusick.com/softdep/
9 * 1614 Oxford Street mckusick (at) mckusick.com
10 * Berkeley, CA 94709-1608 +1-510-843-9542
11 * USA
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 *
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 *
23 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
24 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
25 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
26 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
27 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00
36 *
37 * from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp
38 */
39
40 #include <sys/cdefs.h>
41 __KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.75 2008/08/22 10:48:22 hannken Exp $");
42
43 #if defined(_KERNEL_OPT)
44 #include "opt_ffs.h"
45 #include "opt_wapbl.h"
46 #endif
47
48 #include <sys/param.h>
49 #include <sys/kernel.h>
50 #include <sys/systm.h>
51 #include <sys/conf.h>
52 #include <sys/buf.h>
53 #include <sys/proc.h>
54 #include <sys/namei.h>
55 #include <sys/sched.h>
56 #include <sys/stat.h>
57 #include <sys/malloc.h>
58 #include <sys/mount.h>
59 #include <sys/resource.h>
60 #include <sys/resourcevar.h>
61 #include <sys/vnode.h>
62 #include <sys/kauth.h>
63 #include <sys/fstrans.h>
64 #include <sys/wapbl.h>
65
66 #include <miscfs/specfs/specdev.h>
67
68 #include <ufs/ufs/quota.h>
69 #include <ufs/ufs/ufsmount.h>
70 #include <ufs/ufs/inode.h>
71 #include <ufs/ufs/ufs_extern.h>
72 #include <ufs/ufs/ufs_bswap.h>
73 #include <ufs/ufs/ufs_wapbl.h>
74
75 #include <ufs/ffs/fs.h>
76 #include <ufs/ffs/ffs_extern.h>
77
78 #include <uvm/uvm.h>
79
/*
 * FreeBSD -> NetBSD conversion.
 * This file is derived from the FreeBSD implementation (see the header
 * comment), so map the FreeBSD names onto their NetBSD equivalents here
 * instead of rewriting every use site.
 */
#define KERNCRED lwp0.l_cred
#define ufs1_daddr_t int32_t
#define ufs2_daddr_t int64_t
#define ufs_lbn_t daddr_t
#define VI_MTX(v) (&(v)->v_interlock)
#define VI_LOCK(v) mutex_enter(&(v)->v_interlock)
#define VI_UNLOCK(v) mutex_exit(&(v)->v_interlock)
/* NOTE(review): these take the global mntvnode_lock, ignoring "v". */
#define MNT_ILOCK(v) mutex_enter(&mntvnode_lock)
#define MNT_IUNLOCK(v) mutex_exit(&mntvnode_lock)

#if !defined(FFS_NO_SNAPSHOT)
/*
 * Snapshot creation helpers.  Each accounting pass exists in a UFS1 and
 * a UFS2 flavor; the acctfunc callback passed to the expunge/indiracct
 * routines is one of the {full,snap,map}acct_* functions below.
 */
static int cgaccount(int, struct vnode *, void *, int);
static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int indiracct_ufs1(struct vnode *, struct vnode *, int,
    ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int indiracct_ufs2(struct vnode *, struct vnode *, int,
    ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
#endif /* !defined(FFS_NO_SNAPSHOT) */

/* Helpers used both with and without snapshot support. */
static int ffs_copyonwrite(void *, struct buf *, bool);
static int snapblkaddr(struct vnode *, daddr_t, daddr_t *);
static int rwfsblk(struct vnode *, int, void *, ufs2_daddr_t);
static int syncsnap(struct vnode *);
static int wrsnapblk(struct vnode *, void *, ufs2_daddr_t);
/* Accessors that hide the UFS1/UFS2 difference in direct/indirect
 * block pointer width (32 vs 64 bit). */
static inline ufs2_daddr_t db_get(struct inode *, int);
static inline void db_assign(struct inode *, int, ufs2_daddr_t);
static inline ufs2_daddr_t idb_get(struct inode *, void *, int);
static inline void idb_assign(struct inode *, void *, int, ufs2_daddr_t);
130
/*
 * Per-mount snapshot state, hung off struct ufsmount (um_snapinfo).
 * Allocated in ffs_snapshot_init() and freed in ffs_snapshot_fini().
 */
struct snap_info {
	kmutex_t si_lock;			/* Lock this snapinfo */
	kmutex_t si_snaplock;			/* Snapshot vnode common lock */
	TAILQ_HEAD(inodelst, inode) si_snapshots; /* List of active snapshots */
	daddr_t *si_snapblklist;		/* Snapshot block hints list */
	uint32_t si_gen;			/* Incremented on change */
};
138
#ifdef DEBUG
/* Set non-zero to print busy vnodes encountered during snapshot creation. */
static int snapdebug = 0;
#endif
142
143 int
144 ffs_snapshot_init(struct ufsmount *ump)
145 {
146 struct snap_info *si;
147
148 si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP);
149 if (si == NULL)
150 return ENOMEM;
151
152 TAILQ_INIT(&si->si_snapshots);
153 mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE);
154 mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE);
155 si->si_gen = 0;
156 si->si_snapblklist = NULL;
157
158 return 0;
159 }
160
161 void
162 ffs_snapshot_fini(struct ufsmount *ump)
163 {
164 struct snap_info *si;
165
166 si = ump->um_snapinfo;
167 ump->um_snapinfo = NULL;
168
169 KASSERT(TAILQ_EMPTY(&si->si_snapshots));
170 mutex_destroy(&si->si_lock);
171 mutex_destroy(&si->si_snaplock);
172 KASSERT(si->si_snapblklist == NULL);
173 kmem_free(si, sizeof(*si));
174 }
175
/*
 * Create a snapshot file and initialize it for the filesystem.
 * Vnode is locked on entry and return.
 *
 * mp    - mount point being snapshotted
 * vp    - vnode of the (regular) file that will become the snapshot
 * ctime - if not NULL, receives the time the snapshot was taken
 *
 * Broad outline: preallocate every block the snapshot will ever need
 * while the filesystem is still live, then suspend the filesystem,
 * touch up whatever changed during preallocation, record the snapshot
 * on the per-mount list, and resume.
 */
int
ffs_snapshot(struct mount *mp, struct vnode *vp,
    struct timespec *ctime)
{
#if defined(FFS_NO_SNAPSHOT)
	return EOPNOTSUPP;
}
#else /* defined(FFS_NO_SNAPSHOT) */
	ufs2_daddr_t numblks, blkno, *blkp, snaplistsize = 0, *snapblklist;
	int error, ns, cg, snaploc;
	int i, size, len, loc;
	int flag = mp->mnt_flag;
	struct timeval starttime;
#ifdef DEBUG
	struct timeval endtime;
#endif
	struct timespec ts;
	long redo = 0;		/* cylinder groups redone after suspension */
	int32_t *lp;
	void *space;
	void *sbbuf = NULL;	/* private copy of the superblock */
	struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs;
	struct lwp *l = curlwp;
	struct inode *ip, *xp;
	struct buf *bp, *ibp, *nbp;
	struct vattr vat;
	struct vnode *xvp, *mvp, *logvp, *devvp;
	struct snap_info *si;
	bool suspended = false;
	bool snapshot_locked = false;

	ns = UFS_FSNEEDSWAP(fs);
	si = VFSTOUFS(mp)->um_snapinfo;

	/*
	 * Need to serialize access to snapshot code per filesystem.
	 */
	/*
	 * If the vnode already is a snapshot, return.
	 */
	if (VTOI(vp)->i_flags & SF_SNAPSHOT) {
		if (ctime) {
			ctime->tv_sec = DIP(VTOI(vp), mtime);
			ctime->tv_nsec = DIP(VTOI(vp), mtimensec);
		}
		return 0;
	}
	/*
	 * Check mount, exclusive reference and owner.
	 */
	if (vp->v_mount != mp)
		return EXDEV;
	if (vp->v_usecount != 1 || vp->v_writecount != 0)
		return EBUSY;
	if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
	    NULL) != 0 &&
	    VTOI(vp)->i_uid != kauth_cred_geteuid(l->l_cred))
		return EACCES;

	/* Start from an empty file. */
	if (vp->v_size != 0) {
		error = ffs_truncate(vp, 0, 0, NOCRED);
		if (error)
			return error;
	}
	/*
	 * Assign a snapshot slot in the superblock.
	 */
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == 0)
			break;
	if (snaploc == FSMAXSNAP)
		return (ENOSPC);
	ip = VTOI(vp);
	devvp = ip->i_devvp;
	/*
	 * If the journal lives inside the filesystem, get its vnode so
	 * the busy-vnode scan below can treat it specially.
	 */
	if ((fs->fs_flags & FS_DOWAPBL) &&
	    fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
		error = VFS_VGET(mp,
		    fs->fs_journallocs[UFS_WAPBL_INFS_INO], &logvp);
		if (error)
			return error;
	} else
		logvp = NULL;
	/*
	 * Write an empty list of preallocated blocks to the end of
	 * the snapshot to set size to at least that of the filesystem.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	blkno = 1;
	blkno = ufs_rw64(blkno, ns);
	error = vn_rdwr(UIO_WRITE, vp,
	    (void *)&blkno, sizeof(blkno), lblktosize(fs, (off_t)numblks),
	    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL);
	if (error)
		goto out;
	/*
	 * Preallocate critical data structures so that we can copy
	 * them in without further allocation after we suspend all
	 * operations on the filesystem. We would like to just release
	 * the allocated buffers without writing them since they will
	 * be filled in below once we are ready to go, but this upsets
	 * the soft update code, so we go ahead and write the new buffers.
	 *
	 * Allocate all indirect blocks and mark all of them as not
	 * needing to be copied.
	 */
	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		goto out;
	for (blkno = NDADDR, i = 0; blkno < numblks; blkno += NINDIR(fs)) {
		error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
		if (error) {
			UFS_WAPBL_END(mp);
			goto out;
		}
		if (DOINGSOFTDEP(vp))
			bawrite(ibp);
		else
			brelse(ibp, 0);
		/*
		 * Cycle the journal transaction every 16 allocations so
		 * it does not grow without bound.
		 */
		if ((++i % 16) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				goto out;
		}
	}
	/*
	 * Allocate copies for the superblock and its summary information.
	 */
	error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED,
	    0, &nbp);
	if (error) {
		UFS_WAPBL_END(mp);
		goto out;
	}
	bawrite(nbp);
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++) {
		error = ffs_balloc(vp, lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error) {
			UFS_WAPBL_END(mp);
			goto out;
		}
		bawrite(nbp);
	}
	/*
	 * Copy all the cylinder group maps. Although the
	 * filesystem is still active, we hope that only a few
	 * cylinder groups will change between now and when we
	 * suspend operations. Thus, we will be able to quickly
	 * touch up the few cylinder groups that changed during
	 * the suspension period.
	 */
	len = howmany(fs->fs_ncg, NBBY);
	/* fs_active: one bit per cg, set by cgaccount(), cleared on write. */
	fs->fs_active = malloc(len, M_DEVBUF, M_WAITOK | M_ZERO);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if ((error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp)) != 0)
			break;
		error = cgaccount(cg, vp, nbp->b_data, 1);
		bawrite(nbp);
		if (error)
			break;
	}
	UFS_WAPBL_END(mp);
	if (error)
		goto out;
	/*
	 * Change inode to snapshot type file.
	 */
	ip->i_flags |= SF_SNAPSHOT;
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Ensure that the snapshot is completely on disk.
	 * Since we have marked it as a snapshot it is safe to
	 * unlock it as no process will be allowed to write to it.
	 */
	if ((error = VOP_FSYNC(vp, KERNCRED, FSYNC_WAIT, 0, 0)) != 0)
		goto out;
	VOP_UNLOCK(vp, 0);
	/*
	 * All allocations are done, so we can now snapshot the system.
	 *
	 * Suspend operation on filesystem.
	 */
	if ((error = vfs_suspend(vp->v_mount, 0)) != 0) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		goto out;
	}
	suspended = true;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	getmicrotime(&starttime);
	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		goto out;
	/*
	 * First, copy all the cylinder group maps that have changed.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if (ACTIVECG_ISSET(fs, cg))
			continue;
		redo++;
		if ((error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp)) != 0)
			break;
		error = cgaccount(cg, vp, nbp->b_data, 2);
		bawrite(nbp);
		if (error)
			break;
	}
	if (error) {
		UFS_WAPBL_END(mp);
		goto out;
	}
	/*
	 * Grab a copy of the superblock and its summary information.
	 * We delay writing it until the suspension is released below.
	 */
	sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
	loc = blkoff(fs, fs->fs_sblockloc);
	if (loc > 0)
		memset(sbbuf, 0, loc);
	copy_fs = (struct fs *)((char *)sbbuf + loc);
	bcopy(fs, copy_fs, fs->fs_sbsize);
	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
	if (fs->fs_sbsize < size)
		memset((char *)sbbuf + loc + fs->fs_sbsize, 0,
		    size - fs->fs_sbsize);
	size = blkroundup(fs, fs->fs_cssize);
	if (fs->fs_contigsumsize > 0)
		size += fs->fs_ncg * sizeof(int32_t);
	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
	copy_fs->fs_csp = space;
	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
	space = (char *)space + fs->fs_cssize;
	/*
	 * The summary area may end mid-block; read the tail fragments of
	 * the last summary block straight from the device.
	 */
	loc = howmany(fs->fs_cssize, fs->fs_fsize);
	i = fs->fs_frag - loc % fs->fs_frag;
	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
	if (len > 0) {
		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
		    len, KERNCRED, 0, &bp)) != 0) {
			brelse(bp, 0);
			free(copy_fs->fs_csp, M_UFSMNT);
			goto out;
		}
		bcopy(bp->b_data, space, (u_int)len);
		space = (char *)space + len;
		brelse(bp, BC_INVAL | BC_NOCACHE);
	}
	if (fs->fs_contigsumsize > 0) {
		copy_fs->fs_maxcluster = lp = space;
		for (i = 0; i < fs->fs_ncg; i++)
			*lp++ = fs->fs_contigsumsize;
	}
	/*
	 * We must check for active files that have been unlinked
	 * (e.g., with a zero link count). We have to expunge all
	 * trace of these files from the snapshot so that they are
	 * not reclaimed prematurely by fsck or unnecessarily dumped.
	 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
	 * spec_strategy about writing on a suspended filesystem.
	 * Note that we skip unlinked snapshot files as they will
	 * be handled separately below.
	 *
	 * We also calculate the needed size for the snapshot list.
	 */
	snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
	    FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
	/* Allocate a marker vnode */
	if ((mvp = vnalloc(mp)) == NULL) {
		error = ENOMEM;
		goto out;
	}
	MNT_ILOCK(mp);
	/*
	 * NOTE: not using the TAILQ_FOREACH here since in this loop vgone()
	 * and vclean() can be called indirectly
	 */
	for (xvp = TAILQ_FIRST(&mp->mnt_vnodelist); xvp; xvp = vunmark(mvp)) {
		vmark(mvp, xvp);
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (xvp->v_mount != mp || vismarker(xvp))
			continue;
		VI_LOCK(xvp);
		if ((xvp->v_iflag & VI_XLOCK) ||
		    xvp->v_usecount == 0 || xvp->v_type == VNON ||
		    VTOI(xvp) == NULL ||
		    (VTOI(xvp)->i_flags & SF_SNAPSHOT)) {
			VI_UNLOCK(xvp);
			continue;
		}
		MNT_IUNLOCK(mp);
		/*
		 * XXXAD should increase vnode ref count to prevent it
		 * disappearing or being recycled.
		 */
		VI_UNLOCK(xvp);
#ifdef DEBUG
		if (snapdebug)
			vprint("ffs_snapshot: busy vnode", xvp);
#endif
		/* Vnodes still linked somewhere need no expunging. */
		if (xvp != logvp && VOP_GETATTR(xvp, &vat, l->l_cred) == 0 &&
		    vat.va_nlink > 0) {
			MNT_ILOCK(mp);
			continue;
		}
		xp = VTOI(xvp);
		if (xvp != logvp &&
		    ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
			MNT_ILOCK(mp);
			continue;
		}
		/*
		 * If there is a fragment, clear it here.
		 */
		blkno = 0;
		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
		if (loc < NDADDR) {
			len = fragroundup(fs, blkoff(fs, xp->i_size));
			if (len > 0 && len < fs->fs_bsize) {
				ffs_blkfree(copy_fs, vp, db_get(xp, loc),
				    len, xp->i_number);
				blkno = db_get(xp, loc);
				db_assign(xp, loc, 0);
			}
		}
		snaplistsize += 1;
		if (xp->i_ump->um_fstype == UFS1)
			error = expunge_ufs1(vp, xp, copy_fs,
			    fullacct_ufs1, BLK_NOCOPY);
		else
			error = expunge_ufs2(vp, xp, copy_fs,
			    fullacct_ufs2, BLK_NOCOPY);
		/* Restore the fragment pointer saved above. */
		if (blkno)
			db_assign(xp, loc, blkno);
		if (!error)
			error = ffs_freefile(copy_fs, vp, xp->i_number,
			    xp->i_mode);
		if (error) {
			free(copy_fs->fs_csp, M_UFSMNT);
			(void)vunmark(mvp);
			goto out;
		}
		MNT_ILOCK(mp);
	}
	MNT_IUNLOCK(mp);
	vnfree(mvp);
	UFS_WAPBL_END(mp);
	/*
	 * Acquire the snapshot lock.
	 */
	mutex_enter(&si->si_snaplock);
	snapshot_locked = true;
	/*
	 * If this is the first snapshot on this filesystem, then we need
	 * to allocate the space for the list of preallocated snapshot blocks.
	 * This list will be refined below, but this preliminary one will
	 * keep us out of deadlock until the full one is ready.
	 */
	mutex_enter(&si->si_lock);
	if ((xp = TAILQ_FIRST(&si->si_snapshots)) == NULL) {
		mutex_exit(&si->si_lock);
		snapblklist = malloc(
		    snaplistsize * sizeof(ufs2_daddr_t), M_UFSMNT, M_WAITOK);
		blkp = &snapblklist[1];
		*blkp++ = lblkno(fs, fs->fs_sblockloc);
		blkno = fragstoblks(fs, fs->fs_csaddr);
		/* Cylinder group blocks located before the summary area. */
		for (cg = 0; cg < fs->fs_ncg; cg++) {
			if (fragstoblks(fs, cgtod(fs, cg)) > blkno)
				break;
			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
		}
		len = howmany(fs->fs_cssize, fs->fs_bsize);
		for (loc = 0; loc < len; loc++)
			*blkp++ = blkno + loc;
		/* Remaining cylinder group blocks. */
		for (; cg < fs->fs_ncg; cg++)
			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
		snapblklist[0] = blkp - snapblklist;
		mutex_enter(&si->si_lock);
		if (si->si_snapblklist != NULL)
			panic("ffs_snapshot: non-empty list");
		si->si_snapblklist = snapblklist;
	}
	/*
	 * Record snapshot inode. Since this is the newest snapshot,
	 * it must be placed at the end of the list.
	 */
	fs->fs_snapinum[snaploc] = ip->i_number;
	if (ip->i_nextsnap.tqe_prev != 0)
		panic("ffs_snapshot: %llu already on list",
		    (unsigned long long)ip->i_number);
	TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
	/* First snapshot on this mount: hook copy-on-write into writes. */
	if (xp == NULL)
		fscow_establish(mp, ffs_copyonwrite, devvp);
	si->si_gen++;
	mutex_exit(&si->si_lock);
	vp->v_vflag |= VV_SYSTEM;
	/*
	 * Set the mtime to the time the snapshot has been taken.
	 */
	TIMEVAL_TO_TIMESPEC(&starttime, &ts);
	if (ctime)
		*ctime = ts;
	DIP_ASSIGN(ip, mtime, ts.tv_sec);
	DIP_ASSIGN(ip, mtimensec, ts.tv_nsec);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Copy allocation information from all the snapshots in
	 * this snapshot and then expunge them from its view.
	 */
	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) {
		if (xp == ip)
			break;
		if ((error = UFS_WAPBL_BEGIN(mp)) == 0) {
			if (xp->i_ump->um_fstype == UFS1)
				error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
				    BLK_SNAP);
			else
				error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
				    BLK_SNAP);
			if (error == 0 && xp->i_ffs_effnlink == 0)
				error = ffs_freefile(copy_fs, vp,
				    xp->i_number, xp->i_mode);
			UFS_WAPBL_END(mp);
		}
		if (error) {
			fs->fs_snapinum[snaploc] = 0;
			goto done;
		}
	}
	/*
	 * Allocate space for the full list of preallocated snapshot blocks.
	 */
	snapblklist = malloc(snaplistsize * sizeof(ufs2_daddr_t),
	    M_UFSMNT, M_WAITOK);
	ip->i_snapblklist = &snapblklist[1];
	/*
	 * Expunge the blocks used by the snapshots from the set of
	 * blocks marked as used in the snapshot bitmaps. Also, collect
	 * the list of allocated blocks in i_snapblklist.
	 */
	if ((error = UFS_WAPBL_BEGIN(mp)) == 0) {
		if (ip->i_ump->um_fstype == UFS1)
			error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1,
			    BLK_SNAP);
		else
			error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2,
			    BLK_SNAP);
		UFS_WAPBL_END(mp);
	}
	if (error) {
		fs->fs_snapinum[snaploc] = 0;
		FREE(snapblklist, M_UFSMNT);
		goto done;
	}
	if (snaplistsize < ip->i_snapblklist - snapblklist)
		panic("ffs_snapshot: list too small");
	snaplistsize = ip->i_snapblklist - snapblklist;
	snapblklist[0] = snaplistsize;
	ip->i_snapblklist = &snapblklist[0];
	/*
	 * Write out the list of allocated blocks to the end of the snapshot.
	 * The list is byte-swapped for the write, then swapped back so the
	 * in-core copy stays in host order.
	 */
	for (i = 0; i < snaplistsize; i++)
		snapblklist[i] = ufs_rw64(snapblklist[i], ns);
	error = vn_rdwr(UIO_WRITE, vp, (void *)snapblklist,
	    snaplistsize*sizeof(ufs2_daddr_t), lblktosize(fs, (off_t)numblks),
	    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL);
	for (i = 0; i < snaplistsize; i++)
		snapblklist[i] = ufs_rw64(snapblklist[i], ns);
	if (error) {
		fs->fs_snapinum[snaploc] = 0;
		FREE(snapblklist, M_UFSMNT);
		goto done;
	}
	/*
	 * Write the superblock and its summary information
	 * to the snapshot.
	 */
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	space = copy_fs->fs_csp;
#ifdef FFS_EI
	if (ns) {
		ffs_sb_swap(copy_fs, copy_fs);
		ffs_csum_swap(space, space, fs->fs_cssize);
	}
#endif
	error = UFS_WAPBL_BEGIN(mp);
	if (error) {
		fs->fs_snapinum[snaploc] = 0;
		FREE(snapblklist, M_UFSMNT);
		goto done;
	}
	for (loc = 0; loc < len; loc++) {
		error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED,
		    B_MODIFY, &nbp);
		if (error) {
			brelse(nbp, 0);
			fs->fs_snapinum[snaploc] = 0;
			FREE(snapblklist, M_UFSMNT);
			goto done;
		}
		bcopy(space, nbp->b_data, fs->fs_bsize);
		space = (char *)space + fs->fs_bsize;
		bawrite(nbp);
	}
	/*
	 * Copy the first NDADDR blocks to the snapshot so ffs_copyonwrite()
	 * and ffs_snapblkfree() will always work on indirect blocks.
	 */
	for (loc = 0; loc < NDADDR; loc++) {
		if (db_get(ip, loc) != 0)
			continue;
		error = ffs_balloc(vp, lblktosize(fs, (off_t)loc),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			break;
		error = rwfsblk(vp, B_READ, nbp->b_data, loc);
		if (error) {
			brelse(nbp, 0);
			fs->fs_snapinum[snaploc] = 0;
			FREE(snapblklist, M_UFSMNT);
			goto done;
		}
		bawrite(nbp);
	}
	UFS_WAPBL_END(mp);
	/*
	 * As this is the newest list, it is the most inclusive, so
	 * should replace the previous list. If this is the first snapshot
	 * free the preliminary list.
	 */
	mutex_enter(&si->si_lock);
	space = si->si_snapblklist;
	si->si_snapblklist = snapblklist;
	if (TAILQ_FIRST(&si->si_snapshots) == ip)
		FREE(space, M_UFSMNT);
	si->si_gen++;
	mutex_exit(&si->si_lock);
done:
	if (mp->mnt_wapbl)
		copy_fs->fs_flags &= ~FS_DOWAPBL;
	free(copy_fs->fs_csp, M_UFSMNT);
	/* Flush the delayed superblock copy into the snapshot file. */
	if (!error) {
		error = UFS_WAPBL_BEGIN(mp);
		if (!error) {
			error = bread(vp, lblkno(fs, fs->fs_sblockloc),
			    fs->fs_bsize, KERNCRED, B_MODIFY, &nbp);
			if (error) {
				brelse(nbp, 0);
			} else {
				bcopy(sbbuf, nbp->b_data, fs->fs_bsize);
				bawrite(nbp);
			}
			UFS_WAPBL_END(mp);
		}
		if (error)
			fs->fs_snapinum[snaploc] = 0;
	}
out:
	/*
	 * Invalidate and free all pages on the snapshot vnode.
	 * We will read and write through the buffercache.
	 */
	if (!error) {
		mutex_enter(&vp->v_interlock);
		error = VOP_PUTPAGES(vp, 0, 0,
		    PGO_ALLPAGES|PGO_CLEANIT|PGO_SYNCIO|PGO_FREE);
	}
#ifdef WAPBL
	if (!error && mp->mnt_wapbl)
		error = wapbl_flush(mp->mnt_wapbl, 1);
#endif
	if (suspended) {
		vfs_resume(vp->v_mount);
#ifdef DEBUG
		if (starttime.tv_sec > 0) {
			getmicrotime(&endtime);
			timersub(&endtime, &starttime, &endtime);
			printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n",
			    vp->v_mount->mnt_stat.f_mntonname,
			    (long)endtime.tv_sec, endtime.tv_usec / 1000,
			    redo, fs->fs_ncg);
		}
#endif
	}
	if (sbbuf)
		free(sbbuf, M_UFSMNT);
	if (fs->fs_active != 0) {
		FREE(fs->fs_active, M_DEVBUF);
		fs->fs_active = 0;
	}
	mp->mnt_flag = flag;
	/* On failure, release all blocks allocated to the snapshot file. */
	if (error) {
		if (!UFS_WAPBL_BEGIN(mp)) {
			(void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
			UFS_WAPBL_END(mp);
		}
	} else
		vref(vp);
	if (snapshot_locked)
		mutex_exit(&si->si_snaplock);
	return (error);
}
791
/*
 * Copy a cylinder group map. All the unallocated blocks are marked
 * BLK_NOCOPY so that the snapshot knows that it need not copy them
 * if they are later written. If passno is one, then this is a first
 * pass, so only setting needs to be done. If passno is 2, then this
 * is a revision to a previous pass which must be undone as the
 * replacement pass is done.
 *
 * cg     - cylinder group number to copy
 * vp     - the snapshot vnode
 * data   - destination buffer (one fs block) receiving the cg map
 * passno - 1 for the pre-suspension pass, 2 for the touch-up pass
 */
static int
cgaccount(int cg, struct vnode *vp, void *data, int passno)
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	ufs2_daddr_t base, numblks;
	int error, len, loc, ns, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	ns = UFS_FSNEEDSWAP(fs);
	/* Read the on-disk cylinder group and sanity-check its magic. */
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, KERNCRED, 0, &bp);
	if (error) {
		brelse(bp, 0);
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, ns)) {
		brelse(bp, 0);
		return (EIO);
	}
	/* Mark this cg as copied so the second pass can skip it. */
	ACTIVECG_SET(fs, cg);

	bcopy(bp->b_data, data, fs->fs_cgsize);
	brelse(bp, 0);
	if (fs->fs_cgsize < fs->fs_bsize)
		memset((char *)data + fs->fs_cgsize, 0,
		    fs->fs_bsize - fs->fs_cgsize);
	/*
	 * Walk the blocks covered by this cylinder group and mark every
	 * free block BLK_NOCOPY in the snapshot's own block pointers.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cg * fs->fs_fpg / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	if (base < NDADDR) {
		/* Blocks mapped by the snapshot inode's direct pointers. */
		for ( ; loc < NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
				db_assign(ip, loc, BLK_NOCOPY);
			else if (db_get(ip, loc) == BLK_NOCOPY) {
				if (passno == 2)
					db_assign(ip, loc, 0);
				else if (passno == 1)
					panic("ffs_snapshot: lost direct block");
			}
		}
	}
	/* Remaining blocks live behind indirect pointers. */
	if ((error = ffs_balloc(vp, lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0)
		return (error);
	indiroff = (base + loc - NDADDR) % NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= NINDIR(fs)) {
			/* Advance to the next indirect block. */
			bawrite(ibp);
			if ((error = ffs_balloc(vp,
			    lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0)
				return (error);
			indiroff = 0;
		}
		if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
			idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY);
		else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) {
			if (passno == 2)
				idb_assign(ip, ibp->b_data, indiroff, 0);
			else if (passno == 1)
				panic("ffs_snapshot: lost indirect block");
		}
	}
	bdwrite(ibp);
	return (0);
}
874
/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 *
 * snapvp      - the snapshot being built
 * cancelip    - inode whose blocks are to be expunged from the snapshot
 * fs          - superblock (may be the in-core copy) to account against
 * acctfunc    - per-block accounting callback ({full,snap,map}acct_ufs1)
 * expungetype - BLK_NOCOPY or BLK_SNAP, passed through to acctfunc
 */
static int
expunge_ufs1(struct vnode *snapvp, struct inode *cancelip, struct fs *fs,
    int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int),
    int expungetype)
{
	int i, error, ns;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs1_dinode *dip;
	struct buf *bp;

	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	error = snapblkaddr(snapvp, lbn, &blkno);
	if (error)
		return error;
	if (blkno != 0) {
		/* Already copied: read it from the snapshot itself. */
		error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED,
		    B_MODIFY, &bp);
	} else {
		/* Not yet copied: allocate and fill from the device. */
		error = ffs_balloc(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (! error)
			error = rwfsblk(snapvp, B_READ, bp->b_data, lbn);
	}
	if (error)
		return error;
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * or unlinked snapshots to be completely unallocated.
	 */
	dip = (struct ufs1_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY || cancelip->i_ffs_effnlink == 0)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags =
	    ufs_rw32(ufs_rw32(dip->di_flags, ns) & ~SF_SNAPSHOT, ns);
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t));
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	/* Direct block pointers, then the indirect pointer array itself. */
	if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs1_db[0],
	    &cancelip->i_ffs1_db[NDADDR], fs, 0, expungetype)))
		return (error);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs1_ib[0],
	    &cancelip->i_ffs1_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	/* Descend each level of indirection that the file size reaches. */
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
		    ufs_rw32(cancelip->i_ffs1_ib[i], ns), lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}
957
/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 *
 * level        - levels of indirection remaining below this block
 * blkno        - filesystem block number of this indirect block
 * lbn          - logical (negative, metadata) block number of this block
 * rlbn         - logical block number of the first data block it covers
 * remblks      - data blocks remaining to be accounted
 * blksperindir - data blocks covered by each pointer at this level
 * acctfunc     - accounting callback applied to each pointer
 * expungetype  - BLK_NOCOPY or BLK_SNAP, passed through to acctfunc
 */
static int
indiracct_ufs1(struct vnode *snapvp, struct vnode *cancelvp, int level,
    ufs1_daddr_t blkno, ufs_lbn_t lbn, ufs_lbn_t rlbn, ufs_lbn_t remblks,
    ufs_lbn_t blksperindir, struct fs *fs,
    int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int),
    int expungetype)
{
	int error, ns, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs1_daddr_t last, *bap;
	struct buf *bp;

	ns = UFS_FSNEEDSWAP(fs);

	if (blkno == 0) {
		/* A hole is fine when expunging; anything else is corrupt. */
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs1: missing indir");
	}
	/* Cross-check lbn against the canonical indirect chain for rlbn. */
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs1: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	error = ffs_getblk(cancelvp, lbn, fsbtodb(fs, blkno), fs->fs_bsize,
	    false, &bp);
	if (error)
		return error;
	if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error =
	    rwfsblk(bp->b_vp, B_READ, bp->b_data, fragstoblks(fs, blkno)))) {
		brelse(bp, 0);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	/* Work from a private copy so the buffer can be released early. */
	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (void *)bap, fs->fs_bsize);
	brelse(bp, 0);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs1(snapvp, cancelvp, level,
		    ufs_rw32(bap[i], ns), lbn, rlbn, remblks, subblksperindir,
		    fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}
1032
1033 /*
1034 * Do both snap accounting and map accounting.
1035 */
1036 static int
1037 fullacct_ufs1(struct vnode *vp, ufs1_daddr_t *oldblkp, ufs1_daddr_t *lastblkp,
1038 struct fs *fs, ufs_lbn_t lblkno,
1039 int exptype /* BLK_SNAP or BLK_NOCOPY */)
1040 {
1041 int error;
1042
1043 if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
1044 return (error);
1045 return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
1046 }
1047
1048 /*
1049 * Identify a set of blocks allocated in a snapshot inode.
1050 */
/*
 * For every block address in [oldblkp, lastblkp) claim the corresponding
 * slot of snapshot vnode vp by storing expungetype (BLK_SNAP or
 * BLK_NOCOPY) into it. Slots already holding 0/BLK_NOCOPY/BLK_SNAP in
 * the source range are skipped.
 *
 * Returns 0 on success or an error from ffs_balloc.
 */
static int
snapacct_ufs1(struct vnode *vp, ufs1_daddr_t *oldblkp, ufs1_daddr_t *lastblkp,
    struct fs *fs, ufs_lbn_t lblkno,
    int expungetype /* BLK_SNAP or BLK_NOCOPY */)
{
	struct inode *ip = VTOI(vp);
	ufs1_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error, ns;

	ns = UFS_FSNEEDSWAP(fs);

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = ufs_rw32(*oldblkp, ns);
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		/* The snapshot maps the fs block at its own logical lbn. */
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			/* Direct block: point into the inode itself. */
			blkp = &ip->i_ffs1_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			/* Indirect: fetch the indirect block (metadata only). */
			error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs1_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		blkno = ufs_rw32(*blkp, ns);
		if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp, 0);
		} else {
			/* Slot must be empty before we claim it. */
			if (blkno != 0)
				panic("snapacct_ufs1: bad block");
			*blkp = ufs_rw32(expungetype, ns);
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}
1100
1101 /*
1102 * Account for a set of blocks allocated in a snapshot inode.
1103 */
1104 static int
1105 mapacct_ufs1(struct vnode *vp, ufs1_daddr_t *oldblkp, ufs1_daddr_t *lastblkp,
1106 struct fs *fs, ufs_lbn_t lblkno, int expungetype)
1107 {
1108 ufs1_daddr_t blkno;
1109 struct inode *ip;
1110 ino_t inum;
1111 int acctit, ns;
1112
1113 ns = UFS_FSNEEDSWAP(fs);
1114 ip = VTOI(vp);
1115 inum = ip->i_number;
1116 if (lblkno == -1)
1117 acctit = 0;
1118 else
1119 acctit = 1;
1120 for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
1121 blkno = ufs_rw32(*oldblkp, ns);
1122 if (blkno == 0 || blkno == BLK_NOCOPY)
1123 continue;
1124 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
1125 *ip->i_snapblklist++ = lblkno;
1126 if (blkno == BLK_SNAP)
1127 blkno = blkstofrags(fs, lblkno);
1128 ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
1129 }
1130 return (0);
1131 }
1132
1133 /*
1134 * Before expunging a snapshot inode, note all the
1135 * blocks that it claims with BLK_SNAP so that fsck will
1136 * be able to account for those blocks properly and so
1137 * that this snapshot knows that it need not copy them
1138 * if the other snapshot holding them is freed. This code
1139 * is reproduced once each for UFS1 and UFS2.
1140 */
static int
expunge_ufs2(struct vnode *snapvp, struct inode *cancelip, struct fs *fs,
    int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int),
    int expungetype)
{
	int i, error, ns;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs2_dinode *dip;
	struct buf *bp;

	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	error = snapblkaddr(snapvp, lbn, &blkno);
	if (error)
		return error;
	if (blkno != 0) {
		/* Already copied into the snapshot: read that copy. */
		error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED,
		    B_MODIFY, &bp);
	} else {
		/* Not copied yet: allocate it and fill from the raw device. */
		error = ffs_balloc(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (! error)
			error = rwfsblk(snapvp, B_READ, bp->b_data, lbn);
	}
	if (error)
		return error;
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * or unlinked snapshots to be completely unallocated.
	 */
	dip = (struct ufs2_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY || cancelip->i_ffs_effnlink == 0)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	/* Clear the snapshot flag in the copy; it is no snapshot there. */
	dip->di_flags =
	    ufs_rw32(ufs_rw32(dip->di_flags, ns) & ~SF_SNAPSHOT, ns);
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t));
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	/* Direct blocks first (lblkno 0 enables hint-list accounting)... */
	if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs2_db[0],
	    &cancelip->i_ffs2_db[NDADDR], fs, 0, expungetype)))
		return (error);
	/* ...then the indirect pointers themselves (-1: no hint list). */
	if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs2_ib[0],
	    &cancelip->i_ffs2_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	/* Finally descend each level of indirection that is in use. */
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
		    ufs_rw64(cancelip->i_ffs2_ib[i], ns), lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}
1215
1216 /*
1217 * Descend an indirect block chain for vnode cancelvp accounting for all
1218 * its indirect blocks in snapvp.
1219 */
/*
 * Walk one indirect block (and, recursively, the indirect blocks below
 * it) of vnode cancelvp, applying acctfunc to every block pointer found.
 * UFS2 twin of indiracct_ufs1; only the pointer width and byte-swap
 * helper (ufs_rw64) differ.
 *
 * Returns 0 on success or an error from ufs_getlbns/ffs_getblk/rwfsblk/
 * acctfunc.
 */
static int
indiracct_ufs2(struct vnode *snapvp, struct vnode *cancelvp, int level,
    ufs2_daddr_t blkno, ufs_lbn_t lbn, ufs_lbn_t rlbn, ufs_lbn_t remblks,
    ufs_lbn_t blksperindir, struct fs *fs,
    int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int),
    int expungetype)
{
	int error, ns, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs2_daddr_t last, *bap;
	struct buf *bp;

	ns = UFS_FSNEEDSWAP(fs);

	if (blkno == 0) {
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs2: missing indir");
	}
	/* Sanity-check that lbn really is the level'th indirect for rlbn. */
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs2: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	error = ffs_getblk(cancelvp, lbn, fsbtodb(fs, blkno), fs->fs_bsize,
	    false, &bp);
	if (error)
		return error;
	/* Only read from disk if the buffer holds no valid data yet. */
	if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error =
	    rwfsblk(bp->b_vp, B_READ, bp->b_data, fragstoblks(fs, blkno)))) {
		brelse(bp, 0);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	/*
	 * Work on a private copy of the pointers so the buffer can be
	 * released before acctfunc/recursion re-enter the buffer cache.
	 */
	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (void *)bap, fs->fs_bsize);
	brelse(bp, 0);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs2(snapvp, cancelvp, level,
		    ufs_rw64(bap[i], ns), lbn, rlbn, remblks, subblksperindir,
		    fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}
1290
1291 /*
1292 * Do both snap accounting and map accounting.
1293 */
1294 static int
1295 fullacct_ufs2(struct vnode *vp, ufs2_daddr_t *oldblkp, ufs2_daddr_t *lastblkp,
1296 struct fs *fs, ufs_lbn_t lblkno,
1297 int exptype /* BLK_SNAP or BLK_NOCOPY */)
1298 {
1299 int error;
1300
1301 if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
1302 return (error);
1303 return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
1304 }
1305
1306 /*
1307 * Identify a set of blocks allocated in a snapshot inode.
1308 */
/*
 * For every block address in [oldblkp, lastblkp) claim the corresponding
 * slot of snapshot vnode vp by storing expungetype (BLK_SNAP or
 * BLK_NOCOPY) into it. UFS2 twin of snapacct_ufs1.
 *
 * Returns 0 on success or an error from ffs_balloc.
 */
static int
snapacct_ufs2(struct vnode *vp, ufs2_daddr_t *oldblkp, ufs2_daddr_t *lastblkp,
    struct fs *fs, ufs_lbn_t lblkno,
    int expungetype /* BLK_SNAP or BLK_NOCOPY */)
{
	struct inode *ip = VTOI(vp);
	ufs2_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error, ns;

	ns = UFS_FSNEEDSWAP(fs);

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = ufs_rw64(*oldblkp, ns);
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		/* The snapshot maps the fs block at its own logical lbn. */
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			/* Direct block: point into the inode itself. */
			blkp = &ip->i_ffs2_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			/* Indirect: fetch the indirect block (metadata only). */
			error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs2_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		blkno = ufs_rw64(*blkp, ns);
		if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp, 0);
		} else {
			/* Slot must be empty before we claim it. */
			if (blkno != 0)
				panic("snapacct_ufs2: bad block");
			*blkp = ufs_rw64(expungetype, ns);
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}
1358
1359 /*
1360 * Account for a set of blocks allocated in a snapshot inode.
1361 */
1362 static int
1363 mapacct_ufs2(struct vnode *vp, ufs2_daddr_t *oldblkp, ufs2_daddr_t *lastblkp,
1364 struct fs *fs, ufs_lbn_t lblkno, int expungetype)
1365 {
1366 ufs2_daddr_t blkno;
1367 struct inode *ip;
1368 ino_t inum;
1369 int acctit, ns;
1370
1371 ns = UFS_FSNEEDSWAP(fs);
1372 ip = VTOI(vp);
1373 inum = ip->i_number;
1374 if (lblkno == -1)
1375 acctit = 0;
1376 else
1377 acctit = 1;
1378 for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
1379 blkno = ufs_rw64(*oldblkp, ns);
1380 if (blkno == 0 || blkno == BLK_NOCOPY)
1381 continue;
1382 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
1383 *ip->i_snapblklist++ = lblkno;
1384 if (blkno == BLK_SNAP)
1385 blkno = blkstofrags(fs, lblkno);
1386 ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
1387 }
1388 return (0);
1389 }
1390 #endif /* defined(FFS_NO_SNAPSHOT) */
1391
1392 /*
1393 * Decrement extra reference on snapshot when last name is removed.
1394 * It will not be freed until the last open reference goes away.
1395 */
void
ffs_snapgone(struct inode *ip)
{
	struct mount *mp = ip->i_devvp->v_specmountpoint;
	struct inode *xp;
	struct fs *fs;
	struct snap_info *si;
	int snaploc;

	si = VFSTOUFS(mp)->um_snapinfo;

	/*
	 * Find snapshot in incore list.
	 */
	mutex_enter(&si->si_lock);
	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	mutex_exit(&si->si_lock);
	/*
	 * Drop the extra reference held while the snapshot had a name;
	 * vrele() may sleep, so it is done outside si_lock.
	 */
	if (xp != NULL)
		vrele(ITOV(ip));
#ifdef DEBUG
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %llu\n",
		    (unsigned long long)ip->i_number);
#endif
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	mutex_enter(&si->si_lock);
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		/* Shift the remaining entries down over the removed slot. */
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
	/* Bump the generation so lockless retry loops re-scan the list. */
	si->si_gen++;
	mutex_exit(&si->si_lock);
}
1441
1442 /*
1443 * Prepare a snapshot file for being removed.
1444 */
void
ffs_snapremove(struct vnode *vp)
{
	struct inode *ip = VTOI(vp), *xp;
	struct vnode *devvp = ip->i_devvp;
	struct fs *fs = ip->i_fs;
	struct mount *mp = devvp->v_specmountpoint;
	struct buf *ibp;
	struct snap_info *si;
	ufs2_daddr_t numblks, blkno, dblk;
	int error, ns, loc, last;

	si = VFSTOUFS(mp)->um_snapinfo;
	ns = UFS_FSNEEDSWAP(fs);
	mutex_enter(&si->si_snaplock);
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		mutex_enter(&si->si_lock);
		TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		if (TAILQ_FIRST(&si->si_snapshots) != 0) {
			/* Roll back the list of preallocated blocks. */
			xp = TAILQ_LAST(&si->si_snapshots, inodelst);
			si->si_snapblklist = xp->i_snapblklist;
		} else {
			/* Last snapshot: stop copy-on-write entirely. */
			si->si_snapblklist = 0;
			si->si_gen++;
			/* fscow_disestablish may sleep: drop si_lock around it. */
			mutex_exit(&si->si_lock);
			fscow_disestablish(mp, ffs_copyonwrite, devvp);
			mutex_enter(&si->si_lock);
		}
		si->si_gen++;
		mutex_exit(&si->si_lock);
		FREE(ip->i_snapblklist, M_UFSMNT);
		ip->i_snapblklist = NULL;
	}
	mutex_exit(&si->si_snaplock);
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = db_get(ip, blkno);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			db_assign(ip, blkno, 0);
		/* dblk == blkstofrags(fs, blkno): a self-claimed block. */
		else if ((dblk == blkstofrags(fs, blkno) &&
		    ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		    ip->i_number))) {
			DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
			db_assign(ip, blkno, 0);
		}
	}
	/* Same pass over the indirect blocks, one NINDIR chunk at a time. */
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			dblk = idb_get(ip, ibp->b_data, loc);
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				idb_assign(ip, ibp->b_data, loc, 0);
			else if (dblk == blkstofrags(fs, blkno) &&
			    ffs_snapblkfree(fs, ip->i_devvp, dblk,
			    fs->fs_bsize, ip->i_number)) {
				DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
				idb_assign(ip, ibp->b_data, loc, 0);
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}
1533
1534 /*
1535 * Notification that a block is being freed. Return zero if the free
1536 * should be allowed to proceed. Return non-zero if the snapshot file
1537 * wants to claim the block. The block will be claimed if it is an
1538 * uncopied part of one of the snapshots. It will be freed if it is
1539 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
1540 * If a fragment is being freed, then all snapshots that care about
1541 * it must make a copy since a snapshot file can only claim full sized
1542 * blocks. Note that if more than one snapshot file maps the block,
1543 * we can pick one at random to claim it. Since none of the snapshots
1544 * can change, we are assurred that they will all see the same unmodified
1545 * image. When deleting a snapshot file (see ffs_snapremove above), we
1546 * must push any of these claimed blocks to one of the other snapshots
1547 * that maps it. These claimed blocks are easily identified as they will
1548 * have a block number equal to their logical block number within the
1549 * snapshot. A copied block can never have this property because they
1550 * must always have been allocated from a BLK_NOCOPY location.
1551 */
int
ffs_snapblkfree(struct fs *fs, struct vnode *devvp, ufs2_daddr_t bno,
    long size, ino_t inum)
{
	struct mount *mp = devvp->v_specmountpoint;
	struct buf *ibp;
	struct inode *ip;
	struct vnode *vp = NULL;
	struct snap_info *si;
	void *saved_data = NULL;
	ufs_lbn_t lbn;
	ufs2_daddr_t blkno;
	uint32_t gen;
	int indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0;

	si = VFSTOUFS(mp)->um_snapinfo;
	lbn = fragstoblks(fs, bno);
	mutex_enter(&si->si_lock);
retry:
	/*
	 * si_gen is bumped whenever the snapshot list changes; any time
	 * si_lock had to be dropped we recheck it and restart the scan
	 * from the head of the list if it moved.
	 */
	gen = si->si_gen;
	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		if (snapshot_locked == 0) {
			/*
			 * si_snaplock ranks above si_lock, so on contention
			 * drop si_lock, block on si_snaplock, then revalidate.
			 */
			if (!mutex_tryenter(&si->si_snaplock)) {
				mutex_exit(&si->si_lock);
				mutex_enter(&si->si_snaplock);
				mutex_enter(&si->si_lock);
			}
			snapshot_locked = 1;
			if (gen != si->si_gen)
				goto retry;
		}
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			mutex_exit(&si->si_lock);
			error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			if (error) {
				mutex_enter(&si->si_lock);
				break;
			}
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = idb_get(ip, ibp->b_data, indiroff);
			mutex_enter(&si->si_lock);
			if (gen != si->si_gen) {
				brelse(ibp, 0);
				goto retry;
			}
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			if (lbn < NDADDR) {
				db_assign(ip, lbn, BLK_NOCOPY);
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else {
				idb_assign(ip, ibp->b_data, indiroff,
				    BLK_NOCOPY);
				mutex_exit(&si->si_lock);
				/* Linked snapshots persist: write synchronously. */
				if (ip->i_ffs_effnlink > 0)
					bwrite(ibp);
				else
					bdwrite(ibp);
				mutex_enter(&si->si_lock);
				if (gen != si->si_gen)
					goto retry;
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= NDADDR)
				brelse(ibp, 0);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %llu lbn %" PRId64
				    "from inum %llu\n",
				    "Grabonremove: snapino",
				    (unsigned long long)ip->i_number,
				    lbn, (unsigned long long)inum);
#endif
			mutex_exit(&si->si_lock);
			if (lbn < NDADDR) {
				db_assign(ip, lbn, bno);
			} else {
				idb_assign(ip, ibp->b_data, indiroff, bno);
				if (ip->i_ffs_effnlink > 0)
					bwrite(ibp);
				else
					bdwrite(ibp);
			}
			DIP_ADD(ip, blocks, btodb(size));
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (ip->i_ffs_effnlink > 0 && mp->mnt_wapbl)
				error = syncsnap(vp);
			else
				error = 0;
			mutex_exit(&si->si_snaplock);
			/* Non-zero return tells the caller we claimed it. */
			return (error == 0);
		}
		if (lbn >= NDADDR)
			brelse(ibp, 0);
#ifdef DEBUG
		if (snapdebug)
			printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n",
			    "Copyonremove: snapino ",
			    (unsigned long long)ip->i_number,
			    lbn, "for inum", (unsigned long long)inum, size);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		mutex_exit(&si->si_lock);
		if (saved_data == NULL) {
			/* Read the old contents once; reuse for every snapshot. */
			saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
			error = rwfsblk(vp, B_READ, saved_data, lbn);
			if (error) {
				free(saved_data, M_UFSMNT);
				saved_data = NULL;
				mutex_enter(&si->si_lock);
				break;
			}
		}
		error = wrsnapblk(vp, saved_data, lbn);
		if (error == 0 && ip->i_ffs_effnlink > 0 && mp->mnt_wapbl)
			error = syncsnap(vp);
		mutex_enter(&si->si_lock);
		if (error)
			break;
		if (gen != si->si_gen)
			goto retry;
	}
	mutex_exit(&si->si_lock);
	if (saved_data)
		free(saved_data, M_UFSMNT);
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	if (snapshot_locked)
		mutex_exit(&si->si_snaplock);
	return (error);
}
1732
1733 /*
1734 * Associate snapshot files when mounting.
1735 */
void
ffs_snapshot_mount(struct mount *mp)
{
	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct lwp *l = curlwp;
	struct vnode *vp;
	struct inode *ip, *xp;
	struct snap_info *si;
	ufs2_daddr_t snaplistsize, *snapblklist;
	int i, error, ns, snaploc, loc;

	/*
	 * No persistent snapshots on apple ufs file systems.
	 */
	if (UFS_MPISAPPLEUFS(VFSTOUFS(mp)))
		return;

	si = VFSTOUFS(mp)->um_snapinfo;
	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * XXX The following needs to be set before ffs_truncate or
	 * VOP_READ can be called.
	 */
	mp->mnt_stat.f_iosize = fs->fs_bsize;
	/*
	 * Process each snapshot listed in the superblock.
	 */
	vp = NULL;
	mutex_enter(&si->si_lock);
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		if (fs->fs_snapinum[snaploc] == 0)
			break;
		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
		    &vp)) != 0) {
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		if ((ip->i_flags & SF_SNAPSHOT) == 0) {
			/*
			 * Stale superblock entry: drop it and compact the
			 * list, then re-examine this slot (snaploc--).
			 */
			printf("ffs_snapshot_mount: non-snapshot inode %d\n",
			    fs->fs_snapinum[snaploc]);
			vput(vp);
			vp = NULL;
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			snaploc--;
			continue;
		}

		/*
		 * Read the block hints list. Use an empty list on
		 * read errors.
		 * The list is stored just past the end of the file system
		 * data; its first element holds the list length.
		 */
		error = vn_rdwr(UIO_READ, vp,
		    (void *)&snaplistsize, sizeof(snaplistsize),
		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
		    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT,
		    l->l_cred, NULL, NULL);
		if (error) {
			printf("ffs_snapshot_mount: read_1 failed %d\n", error);
			snaplistsize = 1;
		} else
			snaplistsize = ufs_rw64(snaplistsize, ns);
		snapblklist = malloc(
		    snaplistsize * sizeof(ufs2_daddr_t), M_UFSMNT, M_WAITOK);
		if (error)
			/* Degenerate one-element list: length only, no hints. */
			snapblklist[0] = 1;
		else {
			error = vn_rdwr(UIO_READ, vp, (void *)snapblklist,
			    snaplistsize * sizeof(ufs2_daddr_t),
			    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
			    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT,
			    l->l_cred, NULL, NULL);
			for (i = 0; i < snaplistsize; i++)
				snapblklist[i] = ufs_rw64(snapblklist[i], ns);
			if (error) {
				printf("ffs_snapshot_mount: read_2 failed %d\n",
				    error);
				snapblklist[0] = 1;
			}
		}
		ip->i_snapblklist = &snapblklist[0];

		/*
		 * Link it onto the active snapshot list.
		 */
		if (ip->i_nextsnap.tqe_prev != 0)
			panic("ffs_snapshot_mount: %llu already on list",
			    (unsigned long long)ip->i_number);
		else
			TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
		vp->v_vflag |= VV_SYSTEM;
		VOP_UNLOCK(vp, 0);
	}
	/*
	 * No usable snapshots found.
	 */
	if (vp == NULL) {
		mutex_exit(&si->si_lock);
		return;
	}
	/*
	 * Attach the block hints list. We always want to
	 * use the list from the newest snapshot.
	 */
	xp = TAILQ_LAST(&si->si_snapshots, inodelst);
	si->si_snapblklist = xp->i_snapblklist;
	fscow_establish(mp, ffs_copyonwrite, devvp);
	si->si_gen++;
	mutex_exit(&si->si_lock);
}
1852
1853 /*
1854 * Disassociate snapshot files when unmounting.
1855 */
void
ffs_snapshot_unmount(struct mount *mp)
{
	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
	struct inode *xp;
	struct vnode *vp = NULL;
	struct snap_info *si;

	si = VFSTOUFS(mp)->um_snapinfo;
	mutex_enter(&si->si_lock);
	while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) {
		vp = ITOV(xp);
		/* Revert to the vnode's own lock. */
		vp->v_vnlock = &vp->v_lock;
		TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap);
		xp->i_nextsnap.tqe_prev = 0;
		if (xp->i_snapblklist == si->si_snapblklist)
			si->si_snapblklist = NULL;
		FREE(xp->i_snapblklist, M_UFSMNT);
		if (xp->i_ffs_effnlink > 0) {
			/* vrele may sleep; drop si_lock across it. */
			si->si_gen++;
			mutex_exit(&si->si_lock);
			vrele(vp);
			mutex_enter(&si->si_lock);
		}
	}
	/* vp != NULL iff at least one snapshot was registered. */
	if (vp)
		fscow_disestablish(mp, ffs_copyonwrite, devvp);
	si->si_gen++;
	mutex_exit(&si->si_lock);
}
1886
1887 /*
1888 * Lookup a snapshots data block address.
1889 * Simpler than UFS_BALLOC() as we know all metadata is already allocated
1890 * and safe even for the pagedaemon where we cannot bread().
1891 */
/*
 * Lookup a snapshots data block address.
 * Simpler than UFS_BALLOC() as we know all metadata is already allocated
 * and safe even for the pagedaemon where we cannot bread().
 *
 * On success *res receives the block address (which may be 0/BLK_NOCOPY/
 * BLK_SNAP just like the on-disk pointer). Returns 0 or an error;
 * ENOMEM when called from the pagedaemon and the needed indirect block
 * is not already cached with valid contents.
 */
static int
snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res)
{
	struct indir indirs[NIADDR + 2];
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct buf *bp;
	int error, num;

	KASSERT(lbn >= 0);

	if (lbn < NDADDR) {
		/* Direct block: read the pointer straight from the inode. */
		*res = db_get(ip, lbn);
		return 0;
	}
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return error;
	if (curlwp == uvm.pagedaemon_lwp) {
		/*
		 * Pagedaemon must not sleep in bread(); only use the
		 * last-level indirect block if it is already in core.
		 */
		mutex_enter(&bufcache_lock);
		bp = incore(vp, indirs[num-1].in_lbn);
		if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) {
			*res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
			error = 0;
		} else
			error = ENOMEM;
		mutex_exit(&bufcache_lock);
		return error;
	}
	error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, NOCRED, 0, &bp);
	if (error == 0)
		*res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
	/*
	 * NOTE(review): bp is released even on bread error; assumes this
	 * kernel's bread() always returns a buffer — confirm against bread(9).
	 */
	brelse(bp, 0);

	return error;
}
1927
1928 /*
1929 * Check for need to copy block that is about to be written,
1930 * copying the block if necessary.
1931 */
/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 * Registered via fscow_establish(); called for writes through the
 * device vnode. data_valid means bp->b_data already holds the old
 * contents, letting us skip the read when it is a full block.
 */
static int
ffs_copyonwrite(void *v, struct buf *bp, bool data_valid)
{
	struct fs *fs;
	struct inode *ip;
	struct vnode *devvp = v, *vp = NULL;
	struct mount *mp = devvp->v_specmountpoint;
	struct snap_info *si;
	void *saved_data = NULL;
	ufs2_daddr_t lbn, blkno, *snapblklist;
	uint32_t gen;
	int lower, upper, mid, ns, snapshot_locked = 0, error = 0;

	/*
	 * Check for valid snapshots.
	 */
	si = VFSTOUFS(mp)->um_snapinfo;
	mutex_enter(&si->si_lock);
	ip = TAILQ_FIRST(&si->si_snapshots);
	if (ip == NULL) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * First check to see if it is after the file system or
	 * in the preallocated list.
	 * By doing this check we avoid several potential deadlocks.
	 */
	fs = ip->i_fs;
	ns = UFS_FSNEEDSWAP(fs);
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	if (bp->b_blkno >= fsbtodb(fs, fs->fs_size)) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * Binary search the sorted hint list; element 0 holds its
	 * length. Blocks on the list are already mapped by a snapshot
	 * and need no copy.
	 */
	snapblklist = si->si_snapblklist;
	upper = si->si_snapblklist[0] - 1;
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	if (lower <= upper) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
	if (data_valid && bp->b_bcount == fs->fs_bsize)
		saved_data = bp->b_data;
retry:
	/* si_gen changes force a rescan after any si_lock drop. */
	gen = si->si_gen;
	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in ffs_balloc.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied.
		 */
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			mutex_exit(&si->si_lock);
			if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
				mutex_enter(&si->si_lock);
				break;
			}
			mutex_enter(&si->si_lock);
			if (gen != si->si_gen)
				goto retry;
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		/* Non-zero pointer: already copied or deliberately ignored. */
		if (blkno != 0)
			continue;

		if (curlwp == uvm.pagedaemon_lwp) {
			/* Pagedaemon may not sleep for the copy; give up. */
			error = ENOMEM;
			break;
		}

		if (snapshot_locked == 0) {
			/*
			 * si_snaplock ranks above si_lock; on contention
			 * drop si_lock, block, then revalidate via si_gen.
			 */
			if (!mutex_tryenter(&si->si_snaplock)) {
				mutex_exit(&si->si_lock);
				mutex_enter(&si->si_snaplock);
				mutex_enter(&si->si_lock);
			}
			snapshot_locked = 1;
			if (gen != si->si_gen)
				goto retry;

			/* Check again if block still needs to be copied */
			if (lbn < NDADDR) {
				blkno = db_get(ip, lbn);
			} else {
				mutex_exit(&si->si_lock);
				if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
					mutex_enter(&si->si_lock);
					break;
				}
				mutex_enter(&si->si_lock);
				if (gen != si->si_gen)
					goto retry;
			}

			if (blkno != 0)
				continue;
		}
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
			    (unsigned long long)ip->i_number, lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %llu", (unsigned long long)
				    VTOI(bp->b_vp)->i_number);
			printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		mutex_exit(&si->si_lock);
		if (saved_data == NULL) {
			/* Read the old contents once; reuse for every snapshot. */
			saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
			error = rwfsblk(vp, B_READ, saved_data, lbn);
			if (error) {
				free(saved_data, M_UFSMNT);
				saved_data = NULL;
				mutex_enter(&si->si_lock);
				break;
			}
		}
		error = wrsnapblk(vp, saved_data, lbn);
		if (error == 0 && ip->i_ffs_effnlink > 0 && mp->mnt_wapbl)
			error = syncsnap(vp);
		mutex_enter(&si->si_lock);
		if (error)
			break;
		if (gen != si->si_gen)
			goto retry;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	mutex_exit(&si->si_lock);
	/* Only free the buffer if we allocated it (not the caller's data). */
	if (saved_data && saved_data != bp->b_data)
		free(saved_data, M_UFSMNT);
	if (snapshot_locked)
		mutex_exit(&si->si_snaplock);
	return error;
}
2115
2116 /*
2117 * Read from a snapshot.
2118 */
2119 int
2120 ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag)
2121 {
2122 struct inode *ip = VTOI(vp);
2123 struct fs *fs = ip->i_fs;
2124 struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo;
2125 struct buf *bp;
2126 daddr_t lbn, nextlbn;
2127 off_t bytesinfile;
2128 long size, xfersize, blkoffset;
2129 int error;
2130
2131 fstrans_start(vp->v_mount, FSTRANS_SHARED);
2132 mutex_enter(&si->si_snaplock);
2133
2134 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
2135 bytesinfile = ip->i_size - uio->uio_offset;
2136 if (bytesinfile <= 0)
2137 break;
2138 lbn = lblkno(fs, uio->uio_offset);
2139 nextlbn = lbn + 1;
2140 size = blksize(fs, ip, lbn);
2141 blkoffset = blkoff(fs, uio->uio_offset);
2142 xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
2143 bytesinfile);
2144
2145 if (lblktosize(fs, nextlbn) >= ip->i_size)
2146 error = bread(vp, lbn, size, NOCRED, 0, &bp);
2147 else {
2148 int nextsize = blksize(fs, ip, nextlbn);
2149 error = breadn(vp, lbn,
2150 size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp);
2151 }
2152 if (error)
2153 break;
2154
2155 /*
2156 * We should only get non-zero b_resid when an I/O error
2157 * has occurred, which should cause us to break above.
2158 * However, if the short read did not cause an error,
2159 * then we want to ensure that we do not uiomove bad
2160 * or uninitialized data.
2161 */
2162 size -= bp->b_resid;
2163 if (size < xfersize) {
2164 if (size == 0)
2165 break;
2166 xfersize = size;
2167 }
2168 error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
2169 if (error)
2170 break;
2171 brelse(bp, BC_AGE);
2172 }
2173 if (bp != NULL)
2174 brelse(bp, BC_AGE);
2175
2176 mutex_exit(&si->si_snaplock);
2177 fstrans_done(vp->v_mount);
2178 return error;
2179 }
2180
2181 /*
2182 * Read or write the specified block of the filesystem vp resides on
2183 * from or to the disk bypassing the buffer cache.
2184 */
2185 static int
2186 rwfsblk(struct vnode *vp, int flags, void *data, ufs2_daddr_t lbn)
2187 {
2188 int error;
2189 struct inode *ip = VTOI(vp);
2190 struct fs *fs = ip->i_fs;
2191 struct buf *nbp;
2192
2193 nbp = getiobuf(NULL, true);
2194 nbp->b_flags = flags;
2195 nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize;
2196 nbp->b_error = 0;
2197 nbp->b_data = data;
2198 nbp->b_blkno = nbp->b_rawblkno = fsbtodb(fs, blkstofrags(fs, lbn));
2199 nbp->b_proc = NULL;
2200 nbp->b_dev = ip->i_devvp->v_rdev;
2201 SET(nbp->b_cflags, BC_BUSY); /* mark buffer busy */
2202
2203 bdev_strategy(nbp);
2204
2205 error = biowait(nbp);
2206
2207 putiobuf(nbp);
2208
2209 return error;
2210 }
2211
2212 /*
2213 * Write all dirty buffers to disk and invalidate them.
2214 */
2215 static int
2216 syncsnap(struct vnode *vp)
2217 {
2218 int error;
2219 buf_t *bp;
2220 struct fs *fs = VTOI(vp)->i_fs;
2221
2222 mutex_enter(&bufcache_lock);
2223 while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) {
2224 KASSERT((bp->b_cflags & BC_BUSY) == 0);
2225 KASSERT(bp->b_bcount == fs->fs_bsize);
2226 bp->b_cflags |= BC_BUSY;
2227 mutex_exit(&bufcache_lock);
2228 error = rwfsblk(vp, B_WRITE, bp->b_data,
2229 fragstoblks(fs, dbtofsb(fs, bp->b_blkno)));
2230 brelse(bp, BC_INVAL | BC_VFLUSH);
2231 if (error)
2232 return error;
2233 mutex_enter(&bufcache_lock);
2234 }
2235 mutex_exit(&bufcache_lock);
2236
2237 return 0;
2238 }
2239
2240 /*
2241 * Write the specified block to a snapshot.
2242 */
2243 static int
2244 wrsnapblk(struct vnode *vp, void *data, ufs2_daddr_t lbn)
2245 {
2246 struct inode *ip = VTOI(vp);
2247 struct fs *fs = ip->i_fs;
2248 struct buf *bp;
2249 int error;
2250
2251 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize,
2252 KERNCRED, (ip->i_ffs_effnlink > 0 ? B_SYNC : 0), &bp);
2253 if (error)
2254 return error;
2255 bcopy(data, bp->b_data, fs->fs_bsize);
2256 if (ip->i_ffs_effnlink > 0)
2257 error = bwrite(bp);
2258 else
2259 bawrite(bp);
2260
2261 return error;
2262 }
2263
2264 /*
2265 * Get/Put direct block from inode or buffer containing disk addresses. Take
2266 * care for fs type (UFS1/UFS2) and byte swapping. These functions should go
2267 * into a global include.
2268 */
2269 static inline ufs2_daddr_t
2270 db_get(struct inode *ip, int loc)
2271 {
2272 if (ip->i_ump->um_fstype == UFS1)
2273 return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip));
2274 else
2275 return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip));
2276 }
2277
2278 static inline void
2279 db_assign(struct inode *ip, int loc, ufs2_daddr_t val)
2280 {
2281 if (ip->i_ump->um_fstype == UFS1)
2282 ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2283 else
2284 ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2285 }
2286
2287 static inline ufs2_daddr_t
2288 idb_get(struct inode *ip, void *bf, int loc)
2289 {
2290 if (ip->i_ump->um_fstype == UFS1)
2291 return ufs_rw32(((ufs1_daddr_t *)(bf))[loc],
2292 UFS_IPNEEDSWAP(ip));
2293 else
2294 return ufs_rw64(((ufs2_daddr_t *)(bf))[loc],
2295 UFS_IPNEEDSWAP(ip));
2296 }
2297
2298 static inline void
2299 idb_assign(struct inode *ip, void *bf, int loc, ufs2_daddr_t val)
2300 {
2301 if (ip->i_ump->um_fstype == UFS1)
2302 ((ufs1_daddr_t *)(bf))[loc] =
2303 ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2304 else
2305 ((ufs2_daddr_t *)(bf))[loc] =
2306 ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2307 }
2308