/*	$NetBSD: ffs_snapshot.c,v 1.105 2011/02/18 14:48:54 bouyer Exp $	*/

/*
 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
 *
 * Further information about snapshots can be obtained from:
 *
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
 *	1614 Oxford Street		mckusick@mckusick.com
 *	Berkeley, CA 94709-1608		+1-510-843-9542
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
 *
 *	from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.105 2011/02/18 14:48:54 bouyer Exp $");

#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
#endif

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/sched.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/kauth.h>
#include <sys/fstrans.h>
#include <sys/wapbl.h>

#include <miscfs/specfs/specdev.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_wapbl.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#include <uvm/uvm.h>

struct snap_info {
	kmutex_t si_lock;			/* Lock this snapinfo */
	kmutex_t si_snaplock;			/* Snapshot vnode common lock */
	TAILQ_HEAD(inodelst, inode) si_snapshots; /* List of active snapshots */
	daddr_t *si_snapblklist;		/* Snapshot block hints list */
	uint32_t si_gen;			/* Incremented on change */
};
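
/*
 * One snap_info hangs off each ufsmount as um_snapinfo.  si_lock guards
 * the snapshot list, the hint list pointer and si_gen; si_snaplock
 * serializes copy-on-write handling across all snapshots on the mount.
 * As the retry loops below show, when both locks are needed si_snaplock
 * is taken first and si_lock second.
 */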

#if !defined(FFS_NO_SNAPSHOT)
typedef int (*acctfunc_t)
    (struct vnode *, void *, int, int, struct fs *, daddr_t, int);

static int snapshot_setup(struct mount *, struct vnode *);
static int snapshot_copyfs(struct mount *, struct vnode *, void **);
static int snapshot_expunge(struct mount *, struct vnode *,
    struct fs *, daddr_t *, daddr_t **);
static int snapshot_expunge_snap(struct mount *, struct vnode *,
    struct fs *, daddr_t);
static int snapshot_writefs(struct mount *, struct vnode *, void *);
static int cgaccount(struct vnode *, int, int *);
static int cgaccount1(int, struct vnode *, void *, int);
static int expunge(struct vnode *, struct inode *, struct fs *,
    acctfunc_t, int);
static int indiracct(struct vnode *, struct vnode *, int, daddr_t,
    daddr_t, daddr_t, daddr_t, daddr_t, struct fs *, acctfunc_t, int);
static int fullacct(struct vnode *, void *, int, int, struct fs *,
    daddr_t, int);
static int snapacct(struct vnode *, void *, int, int, struct fs *,
    daddr_t, int);
static int mapacct(struct vnode *, void *, int, int, struct fs *,
    daddr_t, int);
#endif /* !defined(FFS_NO_SNAPSHOT) */

static int ffs_copyonwrite(void *, struct buf *, bool);
static int snapblkaddr(struct vnode *, daddr_t, daddr_t *);
static int rwfsblk(struct vnode *, int, void *, daddr_t);
static int syncsnap(struct vnode *);
static int wrsnapblk(struct vnode *, void *, daddr_t);
static int blocks_in_journal(struct fs *);

static inline bool is_active_snapshot(struct snap_info *, struct inode *);
static inline daddr_t db_get(struct inode *, int);
static inline void db_assign(struct inode *, int, daddr_t);
static inline daddr_t ib_get(struct inode *, int);
static inline void ib_assign(struct inode *, int, daddr_t);
static inline daddr_t idb_get(struct inode *, void *, int);
static inline void idb_assign(struct inode *, void *, int, daddr_t);

#ifdef DEBUG
static int snapdebug = 0;
#endif
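
/*
 * Throughout this file a snapshot's block pointers may hold, besides a
 * real disk address, one of three special values: 0 means the block has
 * not been copied yet and must be handled on write, BLK_NOCOPY marks a
 * block that was free when the snapshot was taken and never needs
 * copying, and BLK_SNAP marks a block claimed by another snapshot.  The
 * sentinels themselves are small reserved addresses defined with the
 * other FFS constants, distinguishable from any real data block.
 */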

int
ffs_snapshot_init(struct ufsmount *ump)
{
	struct snap_info *si;

	si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP);
	if (si == NULL)
		return ENOMEM;

	TAILQ_INIT(&si->si_snapshots);
	mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE);
	si->si_gen = 0;
	si->si_snapblklist = NULL;

	return 0;
}

void
ffs_snapshot_fini(struct ufsmount *ump)
{
	struct snap_info *si;

	si = ump->um_snapinfo;
	ump->um_snapinfo = NULL;

	KASSERT(TAILQ_EMPTY(&si->si_snapshots));
	mutex_destroy(&si->si_lock);
	mutex_destroy(&si->si_snaplock);
	KASSERT(si->si_snapblklist == NULL);
	kmem_free(si, sizeof(*si));
}

/*
 * Create a snapshot file and initialize it for the filesystem.
 * Vnode is locked on entry and return.
 */
int
ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime)
{
#if defined(FFS_NO_SNAPSHOT)
	return EOPNOTSUPP;
}
#else /* defined(FFS_NO_SNAPSHOT) */
	bool suspended = false;
	bool snapshot_locked = false;
	int error, redo = 0, snaploc;
	void *sbbuf = NULL;
	daddr_t *snaplist = NULL, snaplistsize = 0;
	struct buf *bp, *nbp;
	struct fs *copy_fs = NULL;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct inode *ip = VTOI(vp);
	struct lwp *l = curlwp;
	struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
	struct timespec ts;
	struct timeval starttime;
#ifdef DEBUG
	struct timeval endtime;
#endif
	struct vnode *devvp = ip->i_devvp;

	/*
	 * If the vnode already is a snapshot, return.
	 */
	if (VTOI(vp)->i_flags & SF_SNAPSHOT) {
		if (ctime) {
			ctime->tv_sec = DIP(VTOI(vp), mtime);
			ctime->tv_nsec = DIP(VTOI(vp), mtimensec);
		}
		return 0;
	}
	/*
	 * Check for free snapshot slot in the superblock.
	 */
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == 0)
			break;
	if (snaploc == FSMAXSNAP)
		return (ENOSPC);
	/*
	 * Prepare the vnode to become a snapshot.
	 */
	error = snapshot_setup(mp, vp);
	if (error)
		goto out;
	/*
	 * Change inode to snapshot type file.
	 */
	ip->i_flags |= SF_SNAPSHOT;
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Copy all the cylinder group maps. Although the
	 * filesystem is still active, we hope that only a few
	 * cylinder groups will change between now and when we
	 * suspend operations. Thus, we will be able to quickly
	 * touch up the few cylinder groups that changed during
	 * the suspension period.
	 */
	error = cgaccount(vp, 1, NULL);
	if (error)
		goto out;
	/*
	 * Ensure that the snapshot is completely on disk.
	 * Since we have marked it as a snapshot it is safe to
	 * unlock it as no process will be allowed to write to it.
	 */
	error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
	if (error)
		goto out;
	VOP_UNLOCK(vp);
	/*
	 * All allocations are done, so we can now suspend the filesystem.
	 */
	error = vfs_suspend(vp->v_mount, 0);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (error)
		goto out;
	suspended = true;
	getmicrotime(&starttime);
	/*
	 * First, copy all the cylinder group maps that have changed.
	 */
	error = cgaccount(vp, 2, &redo);
	if (error)
		goto out;
	/*
	 * Create a copy of the superblock and its summary information.
	 */
	error = snapshot_copyfs(mp, vp, &sbbuf);
	copy_fs = (struct fs *)((char *)sbbuf + blkoff(fs, fs->fs_sblockloc));
	if (error)
		goto out;
	/*
	 * Expunge unlinked files from our view.
	 */
	error = snapshot_expunge(mp, vp, copy_fs, &snaplistsize, &snaplist);
	if (error)
		goto out;
	/*
	 * Acquire the snapshot lock.
	 */
	mutex_enter(&si->si_snaplock);
	snapshot_locked = true;
	/*
	 * Record snapshot inode. Since this is the newest snapshot,
	 * it must be placed at the end of the list.
	 */
	fs->fs_snapinum[snaploc] = ip->i_number;

	mutex_enter(&si->si_lock);
	if (is_active_snapshot(si, ip))
		panic("ffs_snapshot: %"PRIu64" already on list", ip->i_number);
	TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
	if (TAILQ_FIRST(&si->si_snapshots) == ip) {
		/*
		 * If this is the first snapshot on this filesystem, put the
		 * preliminary list in place and establish the cow handler.
		 */
		si->si_snapblklist = snaplist;
		fscow_establish(mp, ffs_copyonwrite, devvp);
	}
	si->si_gen++;
	mutex_exit(&si->si_lock);

	vp->v_vflag |= VV_SYSTEM;
	/*
	 * Set the mtime to the time the snapshot has been taken.
	 */
	TIMEVAL_TO_TIMESPEC(&starttime, &ts);
	if (ctime)
		*ctime = ts;
	DIP_ASSIGN(ip, mtime, ts.tv_sec);
	DIP_ASSIGN(ip, mtimensec, ts.tv_nsec);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Copy allocation information from all snapshots and then
	 * expunge them from our view.
	 */
	error = snapshot_expunge_snap(mp, vp, copy_fs, snaplistsize);
	if (error)
		goto out;
	/*
	 * Write the superblock and its summary information to the snapshot.
	 */
	error = snapshot_writefs(mp, vp, sbbuf);
	if (error)
		goto out;
	/*
	 * We're nearly done, ensure that the snapshot is completely on disk.
	 */
	error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
	if (error)
		goto out;
	/*
	 * Invalidate and free all pages on the snapshot vnode.
	 * We will read and write through the buffercache.
	 */
	mutex_enter(&vp->v_interlock);
	error = VOP_PUTPAGES(vp, 0, 0,
	    PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO | PGO_FREE);
	if (error)
		goto out;
	/*
	 * Invalidate short ( < fs_bsize ) buffers. We will always read
	 * full size buffers later.
	 */
	mutex_enter(&bufcache_lock);
	KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		KASSERT((bp->b_cflags & BC_BUSY) == 0);
		if (bp->b_bcount < fs->fs_bsize) {
			bp->b_cflags |= BC_BUSY;
			brelsel(bp, BC_INVAL | BC_VFLUSH);
		}
	}
	mutex_exit(&bufcache_lock);

out:
	if (sbbuf != NULL) {
		free(copy_fs->fs_csp, M_UFSMNT);
		free(sbbuf, M_UFSMNT);
	}
	if (fs->fs_active != NULL) {
		free(fs->fs_active, M_DEVBUF);
		fs->fs_active = NULL;
	}

	mutex_enter(&si->si_lock);
	if (snaplist != NULL) {
		if (si->si_snapblklist == snaplist)
			si->si_snapblklist = NULL;
		free(snaplist, M_UFSMNT);
	}
	if (error) {
		fs->fs_snapinum[snaploc] = 0;
	} else {
		/*
		 * As this is the newest list, it is the most inclusive, so
		 * should replace the previous list.
		 */
		si->si_snapblklist = ip->i_snapblklist;
	}
	si->si_gen++;
	mutex_exit(&si->si_lock);

	if (snapshot_locked)
		mutex_exit(&si->si_snaplock);
	if (suspended) {
		vfs_resume(vp->v_mount);
#ifdef DEBUG
		getmicrotime(&endtime);
		timersub(&endtime, &starttime, &endtime);
		printf("%s: suspended %lld.%03d sec, redo %d of %d\n",
		    mp->mnt_stat.f_mntonname, (long long)endtime.tv_sec,
		    endtime.tv_usec / 1000, redo, fs->fs_ncg);
#endif
	}
	if (error) {
		if (!UFS_WAPBL_BEGIN(mp)) {
			(void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
			UFS_WAPBL_END(mp);
		}
	} else
		vref(vp);
	return (error);
}

/*
 * Prepare vnode to become a snapshot.
 */
static int
snapshot_setup(struct mount *mp, struct vnode *vp)
{
	int error, n, len, loc;
	daddr_t blkno, numblks;
	struct buf *ibp, *nbp;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct lwp *l = curlwp;
	const int wbreak = blocks_in_journal(fs)/8;
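	/*
	 * blocks_in_journal() reports the size of the WAPBL journal in
	 * filesystem blocks; an eighth of that is used below as a batch
	 * limit, ending and restarting the journal transaction every
	 * wbreak blocks so a single transaction cannot grow to fill
	 * the log.
	 */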

	/*
	 * Check mount, exclusive reference and owner.
	 */
	if (vp->v_mount != mp)
		return EXDEV;
	if (vp->v_usecount != 1 || vp->v_writecount != 0)
		return EBUSY;
	if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
	    NULL) != 0 &&
	    VTOI(vp)->i_uid != kauth_cred_geteuid(l->l_cred))
		return EACCES;

	if (vp->v_size != 0) {
		error = ffs_truncate(vp, 0, 0, NOCRED);
		if (error)
			return error;
	}
	/*
	 * Write an empty list of preallocated blocks to the end of
	 * the snapshot to set size to at least that of the filesystem.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	blkno = 1;
	blkno = ufs_rw64(blkno, UFS_FSNEEDSWAP(fs));
	error = vn_rdwr(UIO_WRITE, vp,
	    (void *)&blkno, sizeof(blkno), lblktosize(fs, (off_t)numblks),
	    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL);
	if (error)
		return error;
	/*
	 * Preallocate critical data structures so that we can copy
	 * them in without further allocation after we suspend all
	 * operations on the filesystem. We would like to just release
	 * the allocated buffers without writing them since they will
	 * be filled in below once we are ready to go, but this upsets
	 * the soft update code, so we go ahead and write the new buffers.
	 *
	 * Allocate all indirect blocks and mark all of them as not
	 * needing to be copied.
	 */
	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	for (blkno = NDADDR, n = 0; blkno < numblks; blkno += NINDIR(fs)) {
		error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
		if (error)
			goto out;
		brelse(ibp, 0);
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}
	/*
	 * Allocate copies for the superblock and its summary information.
	 */
	error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, l->l_cred,
	    0, &nbp);
	if (error)
		goto out;
	bawrite(nbp);
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++) {
		error = ffs_balloc(vp, lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, l->l_cred, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}

out:
	UFS_WAPBL_END(mp);
	return error;
}

/*
 * Create a copy of the superblock and its summary information.
 * It is up to the caller to free copyfs and copy_fs->fs_csp.
 */
static int
snapshot_copyfs(struct mount *mp, struct vnode *vp, void **sbbuf)
{
	int error, i, len, loc, size;
	void *space;
	int32_t *lp;
	struct buf *bp;
	struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
	struct lwp *l = curlwp;
	struct vnode *devvp = VTOI(vp)->i_devvp;

	/*
	 * Grab a copy of the superblock and its summary information.
	 * We delay writing it until the suspension is released below.
	 */
	*sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
	loc = blkoff(fs, fs->fs_sblockloc);
	if (loc > 0)
		memset(*sbbuf, 0, loc);
	copyfs = (struct fs *)((char *)(*sbbuf) + loc);
	memcpy(copyfs, fs, fs->fs_sbsize);
	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
	if (fs->fs_sbsize < size)
		memset((char *)(*sbbuf) + loc + fs->fs_sbsize, 0,
		    size - fs->fs_sbsize);
	size = blkroundup(fs, fs->fs_cssize);
	if (fs->fs_contigsumsize > 0)
		size += fs->fs_ncg * sizeof(int32_t);
	space = malloc(size, M_UFSMNT, M_WAITOK);
	copyfs->fs_csp = space;
	memcpy(copyfs->fs_csp, fs->fs_csp, fs->fs_cssize);
	space = (char *)space + fs->fs_cssize;
	loc = howmany(fs->fs_cssize, fs->fs_fsize);
	i = fs->fs_frag - loc % fs->fs_frag;
	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
	if (len > 0) {
		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
		    len, l->l_cred, 0, &bp)) != 0) {
			brelse(bp, 0);
			free(copyfs->fs_csp, M_UFSMNT);
			free(*sbbuf, M_UFSMNT);
			*sbbuf = NULL;
			return error;
		}
		memcpy(space, bp->b_data, (u_int)len);
		space = (char *)space + len;
		brelse(bp, BC_INVAL | BC_NOCACHE);
	}
	if (fs->fs_contigsumsize > 0) {
		copyfs->fs_maxcluster = lp = space;
		for (i = 0; i < fs->fs_ncg; i++)
			*lp++ = fs->fs_contigsumsize;
	}
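	/*
	 * Clear FS_DOWAPBL in the copy below: the frozen image carries
	 * no active journal of its own, so a later fsck or mount of the
	 * snapshot must not try to replay the log of the live filesystem.
	 */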
	if (mp->mnt_wapbl)
		copyfs->fs_flags &= ~FS_DOWAPBL;
	return 0;
}

/*
 * We must check for active files that have been unlinked (e.g., with a zero
 * link count). We have to expunge all trace of these files from the snapshot
 * so that they are not reclaimed prematurely by fsck or unnecessarily dumped.
 * Note that we skip unlinked snapshot files as they will be handled separately.
 * Calculate the snapshot list size and create a preliminary list.
 */
static int
snapshot_expunge(struct mount *mp, struct vnode *vp, struct fs *copy_fs,
    daddr_t *snaplistsize, daddr_t **snaplist)
{
	int cg, error = 0, len, loc;
	daddr_t blkno, *blkp;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct inode *xp;
	struct lwp *l = curlwp;
	struct vattr vat;
	struct vnode *logvp = NULL, *mvp = NULL, *xvp;

	*snaplist = NULL;
	/*
	 * Get the log inode if any.
	 */
	if ((fs->fs_flags & FS_DOWAPBL) &&
	    fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
		error = VFS_VGET(mp,
		    fs->fs_journallocs[UFS_WAPBL_INFS_INO], &logvp);
		if (error)
			goto out;
	}
	/*
	 * Allocate a marker vnode.
	 */
	if ((mvp = vnalloc(mp)) == NULL) {
		error = ENOMEM;
		goto out;
	}
	/*
	 * We also calculate the needed size for the snapshot list.
	 */
	*snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
	    FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
	mutex_enter(&mntvnode_lock);
	/*
	 * NOTE: not using the TAILQ_FOREACH here since in this loop vgone()
	 * and vclean() can be called indirectly
	 */
	for (xvp = TAILQ_FIRST(&mp->mnt_vnodelist); xvp; xvp = vunmark(mvp)) {
		vmark(mvp, xvp);
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (xvp->v_mount != mp || vismarker(xvp))
			continue;
		mutex_enter(&xvp->v_interlock);
		if ((xvp->v_iflag & VI_XLOCK) ||
		    xvp->v_usecount == 0 || xvp->v_type == VNON ||
		    VTOI(xvp) == NULL ||
		    (VTOI(xvp)->i_flags & SF_SNAPSHOT)) {
			mutex_exit(&xvp->v_interlock);
			continue;
		}
		mutex_exit(&mntvnode_lock);
		/*
		 * XXXAD should increase vnode ref count to prevent it
		 * disappearing or being recycled.
		 */
		mutex_exit(&xvp->v_interlock);
#ifdef DEBUG
		if (snapdebug)
			vprint("ffs_snapshot: busy vnode", xvp);
#endif
		xp = VTOI(xvp);
		if (xvp != logvp) {
			if (VOP_GETATTR(xvp, &vat, l->l_cred) == 0 &&
			    vat.va_nlink > 0) {
				mutex_enter(&mntvnode_lock);
				continue;
			}
			if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
				mutex_enter(&mntvnode_lock);
				continue;
			}
		}
		/*
		 * If there is a fragment, clear it here.
		 */
		blkno = 0;
		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
		if (loc < NDADDR) {
			len = fragroundup(fs, blkoff(fs, xp->i_size));
			if (len > 0 && len < fs->fs_bsize) {
				error = UFS_WAPBL_BEGIN(mp);
				if (error) {
					(void)vunmark(mvp);
					goto out;
				}
				ffs_blkfree_snap(copy_fs, vp, db_get(xp, loc),
				    len, xp->i_number);
				blkno = db_get(xp, loc);
				db_assign(xp, loc, 0);
				UFS_WAPBL_END(mp);
			}
		}
		*snaplistsize += 1;
		error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY);
		if (blkno)
			db_assign(xp, loc, blkno);
		if (!error) {
			error = UFS_WAPBL_BEGIN(mp);
			if (!error) {
				error = ffs_freefile_snap(copy_fs, vp,
				    xp->i_number, xp->i_mode);
				UFS_WAPBL_END(mp);
			}
		}
		if (error) {
			(void)vunmark(mvp);
			goto out;
		}
		mutex_enter(&mntvnode_lock);
	}
	mutex_exit(&mntvnode_lock);
	/*
	 * Create a preliminary list of preallocated snapshot blocks.
	 */
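	/*
	 * Layout, as built below: element 0 receives the entry count;
	 * elements from 1 on hold the logical block numbers of the
	 * superblock, the cylinder group maps and the cylinder summary
	 * area, merged in ascending order so that ffs_copyonwrite() can
	 * binary-search the list.
	 */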
	*snaplist = malloc(*snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
	blkp = &(*snaplist)[1];
	*blkp++ = lblkno(fs, fs->fs_sblockloc);
	blkno = fragstoblks(fs, fs->fs_csaddr);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if (fragstoblks(fs, cgtod(fs, cg)) > blkno)
			break;
		*blkp++ = fragstoblks(fs, cgtod(fs, cg));
	}
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++)
		*blkp++ = blkno + loc;
	for (; cg < fs->fs_ncg; cg++)
		*blkp++ = fragstoblks(fs, cgtod(fs, cg));
	(*snaplist)[0] = blkp - &(*snaplist)[0];

out:
	if (mvp != NULL)
		vnfree(mvp);
	if (logvp != NULL)
		vput(logvp);
	if (error && *snaplist != NULL) {
		free(*snaplist, M_UFSMNT);
		*snaplist = NULL;
	}

	return error;
}

/*
 * Copy allocation information from all the snapshots in this snapshot and
 * then expunge them from its view. Also, collect the list of allocated
 * blocks in i_snapblklist.
 */
static int
snapshot_expunge_snap(struct mount *mp, struct vnode *vp,
    struct fs *copy_fs, daddr_t snaplistsize)
{
	int error = 0, i;
	daddr_t numblks, *snaplist = NULL;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct inode *ip = VTOI(vp), *xp;
	struct lwp *l = curlwp;
	struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;

	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) {
		if (xp == ip)
			break;
		error = expunge(vp, xp, fs, snapacct, BLK_SNAP);
		if (error)
			break;
		if (xp->i_nlink != 0)
			continue;
		error = UFS_WAPBL_BEGIN(mp);
		if (error)
			break;
		error = ffs_freefile_snap(copy_fs, vp, xp->i_number, xp->i_mode);
		UFS_WAPBL_END(mp);
		if (error)
			break;
	}
	if (error)
		goto out;
	/*
	 * Allocate space for the full list of preallocated snapshot blocks.
	 */
	snaplist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
	ip->i_snapblklist = &snaplist[1];
	/*
	 * Expunge the blocks used by the snapshots from the set of
	 * blocks marked as used in the snapshot bitmaps. Also, collect
	 * the list of allocated blocks in i_snapblklist.
	 */
	error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP);
	if (error)
		goto out;
	if (snaplistsize < ip->i_snapblklist - snaplist)
		panic("ffs_snapshot: list too small");
	snaplistsize = ip->i_snapblklist - snaplist;
	snaplist[0] = snaplistsize;
	ip->i_snapblklist = &snaplist[0];
	/*
	 * Write out the list of allocated blocks to the end of the snapshot.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	for (i = 0; i < snaplistsize; i++)
		snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
	error = vn_rdwr(UIO_WRITE, vp, (void *)snaplist,
	    snaplistsize * sizeof(daddr_t), lblktosize(fs, (off_t)numblks),
	    UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT, l->l_cred, NULL, NULL);
	for (i = 0; i < snaplistsize; i++)
		snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
out:
	if (error && snaplist != NULL) {
		free(snaplist, M_UFSMNT);
		ip->i_snapblklist = NULL;
	}
	return error;
}

/*
 * Write the superblock and its summary information to the snapshot.
 * Make sure the first NDADDR blocks get copied to the snapshot.
 */
static int
snapshot_writefs(struct mount *mp, struct vnode *vp, void *sbbuf)
{
	int error, len, loc;
	void *space;
	daddr_t blkno;
	struct buf *bp;
	struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
	struct inode *ip = VTOI(vp);
	struct lwp *l = curlwp;

	copyfs = (struct fs *)((char *)sbbuf + blkoff(fs, fs->fs_sblockloc));

	/*
	 * Write the superblock and its summary information
	 * to the snapshot.
	 */
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	space = copyfs->fs_csp;
#ifdef FFS_EI
	if (UFS_FSNEEDSWAP(fs)) {
		ffs_sb_swap(copyfs, copyfs);
		ffs_csum_swap(space, space, fs->fs_cssize);
	}
#endif
	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	for (loc = 0; loc < len; loc++) {
		error = bread(vp, blkno + loc, fs->fs_bsize, l->l_cred,
		    B_MODIFY, &bp);
		if (error) {
			brelse(bp, 0);
			break;
		}
		memcpy(bp->b_data, space, fs->fs_bsize);
		space = (char *)space + fs->fs_bsize;
		bawrite(bp);
	}
	if (error)
		goto out;
	error = bread(vp, lblkno(fs, fs->fs_sblockloc),
	    fs->fs_bsize, l->l_cred, B_MODIFY, &bp);
	if (error) {
		brelse(bp, 0);
		goto out;
	} else {
		memcpy(bp->b_data, sbbuf, fs->fs_bsize);
		bawrite(bp);
	}
	/*
	 * Copy the first NDADDR blocks to the snapshot so ffs_copyonwrite()
	 * and ffs_snapblkfree() will always work on indirect blocks.
	 */
	for (loc = 0; loc < NDADDR; loc++) {
		if (db_get(ip, loc) != 0)
			continue;
		error = ffs_balloc(vp, lblktosize(fs, (off_t)loc),
		    fs->fs_bsize, l->l_cred, 0, &bp);
		if (error)
			break;
		error = rwfsblk(vp, B_READ, bp->b_data, loc);
		if (error) {
			brelse(bp, 0);
			break;
		}
		bawrite(bp);
	}

out:
	UFS_WAPBL_END(mp);
	return error;
}

/*
 * Copy all cylinder group maps.
 */
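/*
 * fs_active, allocated on pass 1, has one bit per cylinder group.
 * cgaccount1() sets a group's bit once that group has been copied; the
 * allocation code elsewhere in FFS clears the bit again when a group is
 * modified, so pass 2, running with the filesystem suspended, only
 * re-copies the groups whose bit is no longer set.
 */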
static int
cgaccount(struct vnode *vp, int passno, int *redo)
{
	int cg, error = 0;
	struct buf *nbp;
	struct fs *fs = VTOI(vp)->i_fs;

	if (redo != NULL)
		*redo = 0;
	if (passno == 1)
		fs->fs_active = malloc(howmany(fs->fs_ncg, NBBY),
		    M_DEVBUF, M_WAITOK | M_ZERO);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if (passno == 2 && ACTIVECG_ISSET(fs, cg))
			continue;

		if (redo != NULL)
			*redo += 1;
		error = UFS_WAPBL_BEGIN(vp->v_mount);
		if (error)
			return error;
		error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, curlwp->l_cred, 0, &nbp);
		if (error) {
			UFS_WAPBL_END(vp->v_mount);
			break;
		}
		error = cgaccount1(cg, vp, nbp->b_data, passno);
		bawrite(nbp);
		UFS_WAPBL_END(vp->v_mount);
		if (error)
			break;
	}
	return error;
}

/*
 * Copy a cylinder group map. All the unallocated blocks are marked
 * BLK_NOCOPY so that the snapshot knows that it need not copy them
 * if they are later written. If passno is 1, then this is a first
 * pass, so only setting needs to be done. If passno is 2, then this
 * is a revision to a previous pass which must be undone as the
 * replacement pass is done.
 */
static int
cgaccount1(int cg, struct vnode *vp, void *data, int passno)
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	struct lwp *l = curlwp;
	daddr_t base, numblks;
	int error, len, loc, ns, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	ns = UFS_FSNEEDSWAP(fs);
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, l->l_cred, 0, &bp);
	if (error) {
		brelse(bp, 0);
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, ns)) {
		brelse(bp, 0);
		return (EIO);
	}
	ACTIVECG_SET(fs, cg);

	memcpy(data, bp->b_data, fs->fs_cgsize);
	brelse(bp, 0);
	if (fs->fs_cgsize < fs->fs_bsize)
		memset((char *)data + fs->fs_cgsize, 0,
		    fs->fs_bsize - fs->fs_cgsize);
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cg * fs->fs_fpg / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	if (base < NDADDR) {
		for ( ; loc < NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
				db_assign(ip, loc, BLK_NOCOPY);
			else if (db_get(ip, loc) == BLK_NOCOPY) {
				if (passno == 2)
					db_assign(ip, loc, 0);
				else if (passno == 1)
					panic("ffs_snapshot: lost direct block");
			}
		}
	}
	if ((error = ffs_balloc(vp, lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
		return (error);
	indiroff = (base + loc - NDADDR) % NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= NINDIR(fs)) {
			bawrite(ibp);
			if ((error = ffs_balloc(vp,
			    lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
				return (error);
			indiroff = 0;
		}
		if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
			idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY);
		else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) {
			if (passno == 2)
				idb_assign(ip, ibp->b_data, indiroff, 0);
			else if (passno == 1)
				panic("ffs_snapshot: lost indirect block");
		}
	}
	bdwrite(ibp);
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed.
 */
static int
expunge(struct vnode *snapvp, struct inode *cancelip, struct fs *fs,
    acctfunc_t acctfunc, int expungetype)
{
	int i, error, ns;
	daddr_t lbn, rlbn;
	daddr_t len, blkno, numblks, blksperindir;
	struct ufs1_dinode *dip1;
	struct ufs2_dinode *dip2;
	struct lwp *l = curlwp;
	void *bap;
	struct buf *bp;
	struct mount *mp;

	ns = UFS_FSNEEDSWAP(fs);
	mp = snapvp->v_mount;

	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	error = snapblkaddr(snapvp, lbn, &blkno);
	if (error)
		return error;
	if (blkno != 0) {
		error = bread(snapvp, lbn, fs->fs_bsize, l->l_cred,
		    B_MODIFY, &bp);
	} else {
		error = ffs_balloc(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, l->l_cred, 0, &bp);
		if (! error)
			error = rwfsblk(snapvp, B_READ, bp->b_data, lbn);
	}
	if (error) {
		UFS_WAPBL_END(mp);
		return error;
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * or unlinked snapshots to be completely unallocated.
	 */
	if (fs->fs_magic == FS_UFS1_MAGIC) {
		dip1 = (struct ufs1_dinode *)bp->b_data +
		    ino_to_fsbo(fs, cancelip->i_number);
		if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
			dip1->di_mode = 0;
		dip1->di_size = 0;
		dip1->di_blocks = 0;
		dip1->di_flags =
		    ufs_rw32(ufs_rw32(dip1->di_flags, ns) & ~SF_SNAPSHOT, ns);
		memset(&dip1->di_db[0], 0, (NDADDR + NIADDR) * sizeof(int32_t));
	} else {
		dip2 = (struct ufs2_dinode *)bp->b_data +
		    ino_to_fsbo(fs, cancelip->i_number);
		if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
			dip2->di_mode = 0;
		dip2->di_size = 0;
		dip2->di_blocks = 0;
		dip2->di_flags =
		    ufs_rw32(ufs_rw32(dip2->di_flags, ns) & ~SF_SNAPSHOT, ns);
		memset(&dip2->di_db[0], 0, (NDADDR + NIADDR) * sizeof(int64_t));
	}
	bdwrite(bp);
	UFS_WAPBL_END(mp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	if (fs->fs_magic == FS_UFS1_MAGIC)
		bap = &cancelip->i_ffs1_db[0];
	else
		bap = &cancelip->i_ffs2_db[0];
	error = (*acctfunc)(snapvp, bap, 0, NDADDR, fs, 0, expungetype);
	if (error)
		return (error);
	if (fs->fs_magic == FS_UFS1_MAGIC)
		bap = &cancelip->i_ffs1_ib[0];
	else
		bap = &cancelip->i_ffs2_ib[0];
	error = (*acctfunc)(snapvp, bap, 0, NIADDR, fs, -1, expungetype);
	if (error)
		return (error);
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct(snapvp, ITOV(cancelip), i,
		    ib_get(cancelip, i), lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct(struct vnode *snapvp, struct vnode *cancelvp, int level,
    daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks,
    daddr_t blksperindir, struct fs *fs, acctfunc_t acctfunc, int expungetype)
{
	int error, num, i;
	daddr_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	daddr_t last;
	void *bap;
	struct buf *bp;

	if (blkno == 0) {
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct: missing indir");
	}
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	error = ffs_getblk(cancelvp, lbn, fsbtodb(fs, blkno), fs->fs_bsize,
	    false, &bp);
	if (error)
		return error;
	if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error =
	    rwfsblk(bp->b_vp, B_READ, bp->b_data, fragstoblks(fs, blkno)))) {
		brelse(bp, 0);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK | M_ZERO);
	memcpy((void *)bap, bp->b_data, fs->fs_bsize);
	brelse(bp, 0);
	error = (*acctfunc)(snapvp, bap, 0, last,
	    fs, level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct(snapvp, cancelvp, level,
		    idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks,
		    subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	free(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
    struct fs *fs, daddr_t lblkno,
    int exptype /* BLK_SNAP or BLK_NOCOPY */)
{
	int error;

	if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
    struct fs *fs, daddr_t lblkno,
    int expungetype /* BLK_SNAP or BLK_NOCOPY */)
{
	struct inode *ip = VTOI(vp);
	struct lwp *l = curlwp;
	struct mount *mp = vp->v_mount;
	daddr_t blkno;
	daddr_t lbn;
	struct buf *ibp;
	int error, n;
	const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;

	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	for (n = 0; oldblkp < lastblkp; oldblkp++) {
		blkno = idb_get(ip, bap, oldblkp);
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
			if (error)
				break;
			blkno = idb_get(ip, ibp->b_data,
			    (lbn - NDADDR) % NINDIR(fs));
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp, 0);
		} else {
			if (blkno != 0)
				panic("snapacct: bad block");
			if (lbn < NDADDR)
				db_assign(ip, lbn, expungetype);
			else {
				idb_assign(ip, ibp->b_data,
				    (lbn - NDADDR) % NINDIR(fs), expungetype);
				bdwrite(ibp);
			}
		}
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}
	UFS_WAPBL_END(mp);
	return error;
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
    struct fs *fs, daddr_t lblkno, int expungetype)
{
	daddr_t blkno;
	struct inode *ip;
	struct mount *mp = vp->v_mount;
	ino_t inum;
	int acctit, error, n;
	const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;

	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	ip = VTOI(vp);
	inum = ip->i_number;
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for (n = 0; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = idb_get(ip, bap, oldblkp);
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree_snap(fs, vp, blkno, fs->fs_bsize, inum);
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}
	UFS_WAPBL_END(mp);
	return (0);
}
#endif /* defined(FFS_NO_SNAPSHOT) */

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(struct inode *ip)
{
	struct mount *mp = ip->i_devvp->v_specmountpoint;
	struct inode *xp;
	struct fs *fs;
	struct snap_info *si;
	int snaploc;

	si = VFSTOUFS(mp)->um_snapinfo;

	/*
	 * Find snapshot in incore list.
	 */
	mutex_enter(&si->si_lock);
	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	mutex_exit(&si->si_lock);
	if (xp != NULL)
		vrele(ITOV(ip));
#ifdef DEBUG
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %llu\n",
		    (unsigned long long)ip->i_number);
#endif
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	mutex_enter(&si->si_lock);
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
	si->si_gen++;
	mutex_exit(&si->si_lock);
}

/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(struct vnode *vp)
{
	struct inode *ip = VTOI(vp), *xp;
	struct vnode *devvp = ip->i_devvp;
	struct fs *fs = ip->i_fs;
	struct mount *mp = devvp->v_specmountpoint;
	struct buf *ibp;
	struct snap_info *si;
	struct lwp *l = curlwp;
	daddr_t numblks, blkno, dblk;
	int error, loc, last, n;
	const int wbreak = blocks_in_journal(fs)/8;

	si = VFSTOUFS(mp)->um_snapinfo;
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	mutex_enter(&si->si_lock);
	if (is_active_snapshot(si, ip)) {
		TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap);
		if (TAILQ_FIRST(&si->si_snapshots) != 0) {
			/* Roll back the list of preallocated blocks. */
			xp = TAILQ_LAST(&si->si_snapshots, inodelst);
			si->si_snapblklist = xp->i_snapblklist;
			si->si_gen++;
			mutex_exit(&si->si_lock);
		} else {
			si->si_snapblklist = 0;
			si->si_gen++;
			mutex_exit(&si->si_lock);
			fscow_disestablish(mp, ffs_copyonwrite, devvp);
		}
		if (ip->i_snapblklist != NULL) {
			free(ip->i_snapblklist, M_UFSMNT);
			ip->i_snapblklist = NULL;
		}
	} else
		mutex_exit(&si->si_lock);
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = db_get(ip, blkno);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			db_assign(ip, blkno, 0);
		else if ((dblk == blkstofrags(fs, blkno) &&
		    ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		    ip->i_number))) {
			DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
			db_assign(ip, blkno, 0);
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR, n = 0; blkno < numblks; blkno += NINDIR(fs)) {
		error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			if (wbreak > 0 && (++n % wbreak) == 0) {
				UFS_WAPBL_END(mp);
				error = UFS_WAPBL_BEGIN(mp);
				if (error)
					panic("UFS_WAPBL_BEGIN failed");
			}
			dblk = idb_get(ip, ibp->b_data, loc);
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				idb_assign(ip, ibp->b_data, loc, 0);
			else if (dblk == blkstofrags(fs, blkno) &&
			    ffs_snapblkfree(fs, ip->i_devvp, dblk,
			    fs->fs_bsize, ip->i_number)) {
				DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
				idb_assign(ip, ibp->b_data, loc, 0);
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
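/*
 * In code terms: a claimed block is one where db_get(ip, lbn) ==
 * blkstofrags(fs, lbn); this is exactly the test ffs_snapremove()
 * above applies before offering a claim to the remaining snapshots
 * through this function.
 */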
int
ffs_snapblkfree(struct fs *fs, struct vnode *devvp, daddr_t bno,
    long size, ino_t inum)
{
	struct mount *mp = devvp->v_specmountpoint;
	struct buf *ibp;
	struct inode *ip;
	struct vnode *vp = NULL;
	struct snap_info *si;
	void *saved_data = NULL;
	daddr_t lbn;
	daddr_t blkno;
	uint32_t gen;
	int indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0;

	si = VFSTOUFS(mp)->um_snapinfo;
	lbn = fragstoblks(fs, bno);
	mutex_enter(&si->si_lock);
retry:
	gen = si->si_gen;
	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		if (snapshot_locked == 0) {
			if (!mutex_tryenter(&si->si_snaplock)) {
				mutex_exit(&si->si_lock);
				mutex_enter(&si->si_snaplock);
				mutex_enter(&si->si_lock);
			}
			snapshot_locked = 1;
			if (gen != si->si_gen)
				goto retry;
		}
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			mutex_exit(&si->si_lock);
			error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, FSCRED, B_METAONLY, &ibp);
			if (error) {
				mutex_enter(&si->si_lock);
				break;
			}
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = idb_get(ip, ibp->b_data, indiroff);
			mutex_enter(&si->si_lock);
			if (gen != si->si_gen) {
				brelse(ibp, 0);
				goto retry;
			}
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			if (lbn < NDADDR) {
				db_assign(ip, lbn, BLK_NOCOPY);
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else {
				idb_assign(ip, ibp->b_data, indiroff,
				    BLK_NOCOPY);
				mutex_exit(&si->si_lock);
				if (ip->i_nlink > 0)
					bwrite(ibp);
				else
					bdwrite(ibp);
				mutex_enter(&si->si_lock);
				if (gen != si->si_gen)
					goto retry;
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= NDADDR)
				brelse(ibp, 0);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %llu lbn %" PRId64
1566 "from inum %llu\n",
1567 "Grabonremove: snapino",
1568 (unsigned long long)ip->i_number,
1569 lbn, (unsigned long long)inum);
1570 #endif
1571 mutex_exit(&si->si_lock);
1572 if (lbn < NDADDR) {
1573 db_assign(ip, lbn, bno);
1574 } else {
1575 idb_assign(ip, ibp->b_data, indiroff, bno);
1576 if (ip->i_nlink > 0)
1577 bwrite(ibp);
1578 else
1579 bdwrite(ibp);
1580 }
1581 DIP_ADD(ip, blocks, btodb(size));
1582 ip->i_flag |= IN_CHANGE | IN_UPDATE;
1583 if (ip->i_nlink > 0 && mp->mnt_wapbl)
1584 error = syncsnap(vp);
1585 else
1586 error = 0;
1587 mutex_exit(&si->si_snaplock);
1588 return (error == 0);
1589 }
1590 if (lbn >= NDADDR)
1591 brelse(ibp, 0);
1592 #ifdef DEBUG
1593 if (snapdebug)
1594 printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n",
1595 "Copyonremove: snapino ",
1596 (unsigned long long)ip->i_number,
1597 lbn, "for inum", (unsigned long long)inum, size);
1598 #endif
1599 /*
1600 * If we have already read the old block contents, then
1601 * simply copy them to the new block. Note that we need
1602 * to synchronously write snapshots that have not been
1603 * unlinked, and hence will be visible after a crash,
1604 * to ensure their integrity.
1605 */
1606 mutex_exit(&si->si_lock);
1607 if (saved_data == NULL) {
1608 saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
1609 error = rwfsblk(vp, B_READ, saved_data, lbn);
1610 if (error) {
1611 free(saved_data, M_UFSMNT);
1612 saved_data = NULL;
1613 mutex_enter(&si->si_lock);
1614 break;
1615 }
1616 }
1617 error = wrsnapblk(vp, saved_data, lbn);
1618 if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
1619 error = syncsnap(vp);
1620 mutex_enter(&si->si_lock);
1621 if (error)
1622 break;
1623 if (gen != si->si_gen)
1624 goto retry;
1625 }
1626 mutex_exit(&si->si_lock);
1627 if (saved_data)
1628 free(saved_data, M_UFSMNT);
1629 /*
1630 * If we have been unable to allocate a block in which to do
1631 * the copy, then return non-zero so that the fragment will
1632 * not be freed. Although space will be lost, the snapshot
1633 * will stay consistent.
1634 */
1635 if (snapshot_locked)
1636 mutex_exit(&si->si_snaplock);
1637 return (error);
1638 }
1639
1640 /*
1641 * Associate snapshot files when mounting.
1642 */
1643 void
1644 ffs_snapshot_mount(struct mount *mp)
1645 {
1646 struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
1647 struct fs *fs = VFSTOUFS(mp)->um_fs;
1648 struct lwp *l = curlwp;
1649 struct vnode *vp;
1650 struct inode *ip, *xp;
1651 struct snap_info *si;
1652 daddr_t snaplistsize, *snapblklist;
1653 int i, error, ns, snaploc, loc;
1654
1655 /*
1656 * No persistent snapshots on apple ufs file systems.
1657 */
1658 if (UFS_MPISAPPLEUFS(VFSTOUFS(mp)))
1659 return;
1660
1661 si = VFSTOUFS(mp)->um_snapinfo;
1662 ns = UFS_FSNEEDSWAP(fs);
1663 /*
1664 * XXX The following needs to be set before ffs_truncate or
1665 * VOP_READ can be called.
1666 */
1667 mp->mnt_stat.f_iosize = fs->fs_bsize;
1668 /*
1669 * Process each snapshot listed in the superblock.
1670 */
1671 vp = NULL;
1672 mutex_enter(&si->si_lock);
1673 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
1674 if (fs->fs_snapinum[snaploc] == 0)
1675 break;
1676 if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
1677 &vp)) != 0) {
1678 printf("ffs_snapshot_mount: vget failed %d\n", error);
1679 continue;
1680 }
1681 ip = VTOI(vp);
1682 if ((ip->i_flags & SF_SNAPSHOT) == 0) {
1683 printf("ffs_snapshot_mount: non-snapshot inode %d\n",
1684 fs->fs_snapinum[snaploc]);
1685 vput(vp);
1686 vp = NULL;
1687 for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
1688 if (fs->fs_snapinum[loc] == 0)
1689 break;
1690 fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
1691 }
1692 fs->fs_snapinum[loc - 1] = 0;
1693 snaploc--;
1694 continue;
1695 }
1696
1697 /*
1698 * Read the block hints list. Use an empty list on
1699 * read errors.
1700 */
1701 error = vn_rdwr(UIO_READ, vp,
1702 (void *)&snaplistsize, sizeof(snaplistsize),
1703 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
1704 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
1705 l->l_cred, NULL, NULL);
1706 if (error) {
1707 printf("ffs_snapshot_mount: read_1 failed %d\n", error);
1708 snaplistsize = 1;
1709 } else
1710 snaplistsize = ufs_rw64(snaplistsize, ns);
1711 snapblklist = malloc(
1712 snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
1713 if (error)
1714 snapblklist[0] = 1;
1715 else {
1716 error = vn_rdwr(UIO_READ, vp, (void *)snapblklist,
1717 snaplistsize * sizeof(daddr_t),
1718 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
1719 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
1720 l->l_cred, NULL, NULL);
1721 for (i = 0; i < snaplistsize; i++)
1722 snapblklist[i] = ufs_rw64(snapblklist[i], ns);
1723 if (error) {
1724 printf("ffs_snapshot_mount: read_2 failed %d\n",
1725 error);
1726 snapblklist[0] = 1;
1727 }
1728 }
1729 ip->i_snapblklist = &snapblklist[0];
1730
1731 /*
1732 * Link it onto the active snapshot list.
1733 */
1734 if (is_active_snapshot(si, ip))
1735 panic("ffs_snapshot_mount: %"PRIu64" already on list",
1736 ip->i_number);
1737 else
1738 TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
1739 vp->v_vflag |= VV_SYSTEM;
1740 VOP_UNLOCK(vp);
1741 }
1742 /*
1743 * No usable snapshots found.
1744 */
1745 if (vp == NULL) {
1746 mutex_exit(&si->si_lock);
1747 return;
1748 }
1749 /*
1750 * Attach the block hints list. We always want to
1751 * use the list from the newest snapshot.
1752 */
1753 xp = TAILQ_LAST(&si->si_snapshots, inodelst);
1754 si->si_snapblklist = xp->i_snapblklist;
1755 fscow_establish(mp, ffs_copyonwrite, devvp);
1756 si->si_gen++;
1757 mutex_exit(&si->si_lock);
1758 }
1759
1760 /*
1761 * Disassociate snapshot files when unmounting.
1762 */
1763 void
1764 ffs_snapshot_unmount(struct mount *mp)
1765 {
1766 struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
1767 struct inode *xp;
1768 struct vnode *vp = NULL;
1769 struct snap_info *si;
1770
1771 si = VFSTOUFS(mp)->um_snapinfo;
1772 mutex_enter(&si->si_lock);
1773 while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) {
1774 vp = ITOV(xp);
1775 TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap);
1776 if (xp->i_snapblklist == si->si_snapblklist)
1777 si->si_snapblklist = NULL;
1778 free(xp->i_snapblklist, M_UFSMNT);
1779 if (xp->i_nlink > 0) {
1780 si->si_gen++;
1781 mutex_exit(&si->si_lock);
1782 vrele(vp);
1783 mutex_enter(&si->si_lock);
1784 }
1785 }
1786 si->si_gen++;
1787 mutex_exit(&si->si_lock);
1788 if (vp)
1789 fscow_disestablish(mp, ffs_copyonwrite, devvp);
1790 }
1791
1792 /*
1793 * Check for need to copy block that is about to be written,
1794 * copying the block if necessary.
1795 */
1796 static int
1797 ffs_copyonwrite(void *v, struct buf *bp, bool data_valid)
1798 {
1799 struct fs *fs;
1800 struct inode *ip;
1801 struct vnode *devvp = v, *vp = NULL;
1802 struct mount *mp = devvp->v_specmountpoint;
1803 struct snap_info *si;
1804 void *saved_data = NULL;
1805 daddr_t lbn, blkno, *snapblklist;
1806 uint32_t gen;
1807 int lower, upper, mid, snapshot_locked = 0, error = 0;
1808
1809 /*
1810 * Check for valid snapshots.
1811 */
1812 si = VFSTOUFS(mp)->um_snapinfo;
1813 mutex_enter(&si->si_lock);
1814 ip = TAILQ_FIRST(&si->si_snapshots);
1815 if (ip == NULL) {
1816 mutex_exit(&si->si_lock);
1817 return 0;
1818 }
1819 /*
1820 * First check to see if it is after the file system or
1821 * in the preallocated list.
1822 * By doing this check we avoid several potential deadlocks.
1823 */
1824 fs = ip->i_fs;
1825 lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
1826 if (bp->b_blkno >= fsbtodb(fs, fs->fs_size)) {
1827 mutex_exit(&si->si_lock);
1828 return 0;
1829 }
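	/*
	 * The hint list built by snapshot_expunge() is sorted and keeps
	 * its length in element 0, so elements 1 .. snapblklist[0] - 1
	 * can be binary-searched; a hit means every snapshot already
	 * preallocated this block and no copy is needed.
	 */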
1830 snapblklist = si->si_snapblklist;
1831 upper = (snapblklist != NULL ? snapblklist[0] - 1 : 0);
1832 lower = 1;
1833 while (lower <= upper) {
1834 mid = (lower + upper) / 2;
1835 if (snapblklist[mid] == lbn)
1836 break;
1837 if (snapblklist[mid] < lbn)
1838 lower = mid + 1;
1839 else
1840 upper = mid - 1;
1841 }
1842 if (lower <= upper) {
1843 mutex_exit(&si->si_lock);
1844 return 0;
1845 }
1846 /*
1847 * Not in the precomputed list, so check the snapshots.
1848 */
1849 if (data_valid && bp->b_bcount == fs->fs_bsize)
1850 saved_data = bp->b_data;
1851 retry:
1852 gen = si->si_gen;
1853 TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
1854 vp = ITOV(ip);
1855 /*
1856 * We ensure that everything of our own that needs to be
1857 * copied will be done at the time that ffs_snapshot is
1858 * called. Thus we can skip the check here which can
1859 * deadlock in doing the lookup in ffs_balloc.
1860 */
                if (bp->b_vp == vp)
                        continue;
                /*
                 * Check to see if block needs to be copied.
                 */
                if (lbn < NDADDR) {
                        blkno = db_get(ip, lbn);
                } else {
                        mutex_exit(&si->si_lock);
                        if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
                                mutex_enter(&si->si_lock);
                                break;
                        }
                        mutex_enter(&si->si_lock);
                        if (gen != si->si_gen)
                                goto retry;
                }
#ifdef DIAGNOSTIC
                if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
                        panic("ffs_copyonwrite: bad copy block");
#endif
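                /*
                 * A zero address means this snapshot holds no copy of
                 * the block yet.  Anything else, including the special
                 * BLK_NOCOPY and BLK_SNAP values, means no copy is
                 * needed for this snapshot.
                 */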
                if (blkno != 0)
                        continue;

                if (curlwp == uvm.pagedaemon_lwp) {
                        error = ENOMEM;
                        break;
                }

                if (snapshot_locked == 0) {
                        if (!mutex_tryenter(&si->si_snaplock)) {
                                mutex_exit(&si->si_lock);
                                mutex_enter(&si->si_snaplock);
                                mutex_enter(&si->si_lock);
                        }
                        snapshot_locked = 1;
                        if (gen != si->si_gen)
                                goto retry;

                        /* Check again if block still needs to be copied */
                        if (lbn < NDADDR) {
                                blkno = db_get(ip, lbn);
                        } else {
                                mutex_exit(&si->si_lock);
                                if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
                                        mutex_enter(&si->si_lock);
                                        break;
                                }
                                mutex_enter(&si->si_lock);
                                if (gen != si->si_gen)
                                        goto retry;
                        }

                        if (blkno != 0)
                                continue;
                }
                /*
                 * Allocate the block into which to do the copy. Since
                 * multiple processes may all try to copy the same block,
                 * we have to recheck our need to do a copy if we sleep
                 * waiting for the lock.
                 *
                 * Because all snapshots on a filesystem share a single
                 * lock, we ensure that we will never be in competition
                 * with another process to allocate a block.
                 */
#ifdef DEBUG
                if (snapdebug) {
                        printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
                            (unsigned long long)ip->i_number, lbn);
                        if (bp->b_vp == devvp)
                                printf("fs metadata");
                        else
                                printf("inum %llu", (unsigned long long)
                                    VTOI(bp->b_vp)->i_number);
                        printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
                }
#endif
                /*
                 * If we have already read the old block contents, then
                 * simply copy them to the new block. Note that we need
                 * to synchronously write snapshots that have not been
                 * unlinked, and hence will be visible after a crash,
                 * to ensure their integrity.
                 */
                mutex_exit(&si->si_lock);
                if (saved_data == NULL) {
                        saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
                        error = rwfsblk(vp, B_READ, saved_data, lbn);
                        if (error) {
                                free(saved_data, M_UFSMNT);
                                saved_data = NULL;
                                mutex_enter(&si->si_lock);
                                break;
                        }
                }
                error = wrsnapblk(vp, saved_data, lbn);
                if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
                        error = syncsnap(vp);
                mutex_enter(&si->si_lock);
                if (error)
                        break;
                if (gen != si->si_gen)
                        goto retry;
        }
        /*
         * Release the copy buffer, if we allocated one, and the
         * snapshot lock before returning.
         */
        mutex_exit(&si->si_lock);
        if (saved_data && saved_data != bp->b_data)
                free(saved_data, M_UFSMNT);
        if (snapshot_locked)
                mutex_exit(&si->si_snaplock);
        return error;
}
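
/*
 * Editor's sketch (not compiled): how the handler above is driven.  It
 * is registered with fscow_establish() and removed with
 * fscow_disestablish() above; a writer to the underlying device is
 * expected to call fscow_run(9), which invokes every registered
 * handler -- here ffs_copyonwrite() -- before the write reaches the
 * disk.  The helper name below is hypothetical.
 */
#if 0
static int
example_cow_before_write(struct buf *bp)
{
        int error;

        /* Give every snapshot a chance to copy the old contents... */
        error = fscow_run(bp, false);
        if (error)
                return error;
        /* ...then send the write down to the device. */
        bdev_strategy(bp);
        return biowait(bp);
}
#endif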

/*
 * Read from a snapshot.
 */
int
ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag)
{
        struct inode *ip = VTOI(vp);
        struct fs *fs = ip->i_fs;
        struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo;
        struct buf *bp;
        daddr_t lbn, nextlbn;
        off_t fsbytes, bytesinfile;
        long size, xfersize, blkoffset;
        int error;

        fstrans_start(vp->v_mount, FSTRANS_SHARED);
        mutex_enter(&si->si_snaplock);
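        /*
         * The snapshot lock taken above serializes this read against
         * the copy phase of ffs_copyonwrite(), so the snapshot presents
         * a stable image while it is being read.
         */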

        if (ioflag & IO_ALTSEMANTICS)
                fsbytes = ip->i_size;
        else
                fsbytes = lfragtosize(fs, fs->fs_size);
        for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
                bytesinfile = fsbytes - uio->uio_offset;
                if (bytesinfile <= 0)
                        break;
                lbn = lblkno(fs, uio->uio_offset);
                nextlbn = lbn + 1;
                size = fs->fs_bsize;
                blkoffset = blkoff(fs, uio->uio_offset);
                xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
                    bytesinfile);

                if (lblktosize(fs, nextlbn + 1) >= fsbytes) {
                        if (lblktosize(fs, lbn) + size > fsbytes)
                                size = fragroundup(fs,
                                    fsbytes - lblktosize(fs, lbn));
                        error = bread(vp, lbn, size, NOCRED, 0, &bp);
                } else {
                        int nextsize = fs->fs_bsize;
                        error = breadn(vp, lbn,
                            size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp);
                }
                if (error)
                        break;

                /*
                 * We should only get non-zero b_resid when an I/O error
                 * has occurred, which should cause us to break above.
                 * However, if the short read did not cause an error,
                 * then we want to ensure that we do not uiomove bad
                 * or uninitialized data.
                 */
                size -= bp->b_resid;
                if (size < blkoffset + xfersize) {
                        xfersize = size - blkoffset;
                        if (xfersize <= 0)
                                break;
                }
                error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
                if (error)
                        break;
                brelse(bp, BC_AGE);
        }
        if (bp != NULL)
                brelse(bp, BC_AGE);

        mutex_exit(&si->si_snaplock);
        fstrans_done(vp->v_mount);
        return error;
}

/*
 * Look up a snapshot's data block address.
 * Simpler than UFS_BALLOC() because we know all metadata is already
 * allocated; safe even for the pagedaemon, where we cannot bread().
 */
static int
snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res)
{
        struct indir indirs[NIADDR + 2];
        struct inode *ip = VTOI(vp);
        struct fs *fs = ip->i_fs;
        struct buf *bp;
        int error, num;

        KASSERT(lbn >= 0);

        if (lbn < NDADDR) {
                *res = db_get(ip, lbn);
                return 0;
        }
        if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
                return error;
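        /*
         * The pagedaemon may not wait for buffer memory, so use an
         * indirect block only if it is already resident and valid in
         * the buffer cache; otherwise fail with ENOMEM and let the
         * caller back off (see the pagedaemon check in
         * ffs_copyonwrite() above).
         */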
        if (curlwp == uvm.pagedaemon_lwp) {
                mutex_enter(&bufcache_lock);
                bp = incore(vp, indirs[num-1].in_lbn);
                if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) {
                        *res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
                        error = 0;
                } else
                        error = ENOMEM;
                mutex_exit(&bufcache_lock);
                return error;
        }
        error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, NOCRED, 0, &bp);
        if (error == 0)
                *res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
        brelse(bp, 0);

        return error;
}

/*
 * Read or write the specified block of the filesystem that vp resides
 * on, directly from or to the disk, bypassing the buffer cache.
 */
static int
rwfsblk(struct vnode *vp, int flags, void *data, daddr_t lbn)
{
        int error;
        struct inode *ip = VTOI(vp);
        struct fs *fs = ip->i_fs;
        struct buf *nbp;

        nbp = getiobuf(NULL, true);
        nbp->b_flags = flags;
        nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize;
        nbp->b_error = 0;
        nbp->b_data = data;
        nbp->b_blkno = nbp->b_rawblkno = fsbtodb(fs, blkstofrags(fs, lbn));
        nbp->b_proc = NULL;
        nbp->b_dev = ip->i_devvp->v_rdev;
        SET(nbp->b_cflags, BC_BUSY);    /* mark buffer busy */

        bdev_strategy(nbp);

        error = biowait(nbp);

        putiobuf(nbp);

        return error;
}
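
/*
 * Editor's sketch (not compiled): reading one file-system block raw,
 * the way ffs_copyonwrite() above fetches the old contents of a block
 * before it is overwritten.  The helper name is hypothetical; the
 * caller frees the returned buffer with free(data, M_UFSMNT).
 */
#if 0
static int
example_read_fsblock(struct vnode *vp, daddr_t lbn, void **datap)
{
        struct fs *fs = VTOI(vp)->i_fs;
        void *data;
        int error;

        data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
        error = rwfsblk(vp, B_READ, data, lbn);
        if (error) {
                free(data, M_UFSMNT);
                return error;
        }
        *datap = data;
        return 0;
}
#endif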

/*
 * Write all dirty buffers to disk and invalidate them.
 */
static int
syncsnap(struct vnode *vp)
{
        int error;
        buf_t *bp;
        struct fs *fs = VTOI(vp)->i_fs;

        mutex_enter(&bufcache_lock);
        while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) {
                error = bbusy(bp, false, 0, NULL);
                if (error == EPASSTHROUGH)
                        continue;
                else if (error != 0) {
                        mutex_exit(&bufcache_lock);
                        return error;
                }
                KASSERT(bp->b_bcount == fs->fs_bsize);
                mutex_exit(&bufcache_lock);
                error = rwfsblk(vp, B_WRITE, bp->b_data,
                    fragstoblks(fs, dbtofsb(fs, bp->b_blkno)));
                brelse(bp, BC_INVAL | BC_VFLUSH);
                if (error)
                        return error;
                mutex_enter(&bufcache_lock);
        }
        mutex_exit(&bufcache_lock);

        return 0;
}

/*
 * Write the specified block to a snapshot.
 */
static int
wrsnapblk(struct vnode *vp, void *data, daddr_t lbn)
{
        struct inode *ip = VTOI(vp);
        struct fs *fs = ip->i_fs;
        struct buf *bp;
        int error;

        error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize,
            FSCRED, (ip->i_nlink > 0 ? B_SYNC : 0), &bp);
        if (error)
                return error;
        memcpy(bp->b_data, data, fs->fs_bsize);
        if (ip->i_nlink > 0)
                error = bwrite(bp);
        else
                bawrite(bp);

        return error;
}

/*
 * Check if this inode is present on the active snapshot list.
 * Must be called with snapinfo locked.
 */
static inline bool
is_active_snapshot(struct snap_info *si, struct inode *ip)
{
        struct inode *xp;

        KASSERT(mutex_owned(&si->si_lock));

        TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
                if (xp == ip)
                        return true;
        return false;
}

/*
 * Return the number of blocks that fit into the journal, or zero if
 * the file system is not logging.
 */
static int
blocks_in_journal(struct fs *fs)
{
        off_t bpj;

        if ((fs->fs_flags & FS_DOWAPBL) == 0)
                return 0;
        bpj = 1;
        if (fs->fs_journal_version == UFS_WAPBL_VERSION) {
                switch (fs->fs_journal_location) {
                case UFS_WAPBL_JOURNALLOC_END_PARTITION:
                        bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]*
                            fs->fs_journallocs[UFS_WAPBL_EPART_COUNT];
                        break;
                case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
                        bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]*
                            fs->fs_journallocs[UFS_WAPBL_INFS_COUNT];
                        break;
                }
        }
        bpj /= fs->fs_bsize;
        return (bpj > 0 ? bpj : 1);
}
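
/*
 * Worked example (editor's note): an in-file-system journal described
 * by fs_journallocs with a journal block size of 512 bytes and a count
 * of 32768 blocks holds 16 MB; on a file system with 16 kB blocks the
 * function returns 16 MB / 16 kB = 1024.
 */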

/*
 * Get/put a block address from an inode or from a buffer containing
 * disk addresses, taking care of the fs type (UFS1/UFS2) and byte
 * swapping. These functions should go into a global include.
 */
static inline daddr_t
db_get(struct inode *ip, int loc)
{
        if (ip->i_ump->um_fstype == UFS1)
                return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip));
        else
                return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip));
}

static inline void
db_assign(struct inode *ip, int loc, daddr_t val)
{
        if (ip->i_ump->um_fstype == UFS1)
                ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
        else
                ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
}

static inline daddr_t
ib_get(struct inode *ip, int loc)
{
        if (ip->i_ump->um_fstype == UFS1)
                return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip));
        else
                return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip));
}

static inline void
ib_assign(struct inode *ip, int loc, daddr_t val)
{
        if (ip->i_ump->um_fstype == UFS1)
                ip->i_ffs1_ib[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
        else
                ip->i_ffs2_ib[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
}

static inline daddr_t
idb_get(struct inode *ip, void *bf, int loc)
{
        if (ip->i_ump->um_fstype == UFS1)
                return ufs_rw32(((int32_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
        else
                return ufs_rw64(((int64_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
}

static inline void
idb_assign(struct inode *ip, void *bf, int loc, daddr_t val)
{
        if (ip->i_ump->um_fstype == UFS1)
                ((int32_t *)(bf))[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
        else
                ((int64_t *)(bf))[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
}
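
/*
 * Editor's sketch (not compiled): the accessors above let callers
 * manipulate block pointers without knowing whether the inode is UFS1
 * (32-bit i_ffs1_db) or UFS2 (64-bit i_ffs2_db), or whether the file
 * system is byte-swapped.  The helper name is hypothetical.
 */
#if 0
static void
example_clear_direct_block(struct inode *ip, int loc)
{
        daddr_t blkno;

        blkno = db_get(ip, loc);        /* fetched in host byte order */
        if (blkno != 0)
                db_assign(ip, loc, 0);  /* stored back, swapped if needed */
}
#endif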