/*	$NetBSD: ffs_snapshot.c,v 1.136 2014/07/10 06:02:40 dholland Exp $	*/

/*
 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
 *
 * Further information about snapshots can be obtained from:
 *
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
 *	1614 Oxford Street		mckusick@mckusick.com
 *	Berkeley, CA 94709-1608		+1-510-843-9542
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
 *
 *	from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.136 2014/07/10 06:02:40 dholland Exp $");

#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
#include "opt_quota.h"
#endif

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/sched.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/kauth.h>
#include <sys/fstrans.h>
#include <sys/wapbl.h>

#include <miscfs/specfs/specdev.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_wapbl.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#include <uvm/uvm.h>

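/*
 * One snap_info exists per ffs mount.  A sketch of the locking scheme,
 * as inferred from the code below (not an authoritative description):
 * si_lock protects the snapshot list, si_snapblklist and si_gen;
 * si_snaplock serializes the block copying done by ffs_copyonwrite()
 * and ffs_snapblkfree(), with si_owner recording the holder so that a
 * thread already copying does not try to take the lock again.
 */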
struct snap_info {
	kmutex_t si_lock;			/* Lock this snapinfo */
	kmutex_t si_snaplock;			/* Snapshot vnode common lock */
	lwp_t *si_owner;			/* Snaplock owner */
	TAILQ_HEAD(inodelst, inode) si_snapshots; /* List of active snapshots */
	daddr_t *si_snapblklist;		/* Snapshot block hints list */
	uint32_t si_gen;			/* Incremented on change */
};

#if !defined(FFS_NO_SNAPSHOT)
typedef int (*acctfunc_t)
    (struct vnode *, void *, int, int, struct fs *, daddr_t, int);

static int snapshot_setup(struct mount *, struct vnode *);
static int snapshot_copyfs(struct mount *, struct vnode *, void **);
static int snapshot_expunge(struct mount *, struct vnode *,
    struct fs *, daddr_t *, daddr_t **);
static int snapshot_expunge_snap(struct mount *, struct vnode *,
    struct fs *, daddr_t);
static int snapshot_writefs(struct mount *, struct vnode *, void *);
static int cgaccount(struct vnode *, int, int *);
static int cgaccount1(int, struct vnode *, void *, int);
static int expunge(struct vnode *, struct inode *, struct fs *,
    acctfunc_t, int);
static int indiracct(struct vnode *, struct vnode *, int, daddr_t,
    daddr_t, daddr_t, daddr_t, daddr_t, struct fs *, acctfunc_t, int);
static int fullacct(struct vnode *, void *, int, int, struct fs *,
    daddr_t, int);
static int snapacct(struct vnode *, void *, int, int, struct fs *,
    daddr_t, int);
static int mapacct(struct vnode *, void *, int, int, struct fs *,
    daddr_t, int);
#endif /* !defined(FFS_NO_SNAPSHOT) */

static int ffs_copyonwrite(void *, struct buf *, bool);
static int snapblkaddr(struct vnode *, daddr_t, daddr_t *);
static int rwfsblk(struct vnode *, int, void *, daddr_t);
static int syncsnap(struct vnode *);
static int wrsnapblk(struct vnode *, void *, daddr_t);
#if !defined(FFS_NO_SNAPSHOT)
static int blocks_in_journal(struct fs *);
#endif

static inline bool is_active_snapshot(struct snap_info *, struct inode *);
static inline daddr_t db_get(struct inode *, int);
static inline void db_assign(struct inode *, int, daddr_t);
static inline daddr_t ib_get(struct inode *, int);
static inline daddr_t idb_get(struct inode *, void *, int);
static inline void idb_assign(struct inode *, void *, int, daddr_t);

#ifdef DEBUG
static int snapdebug = 0;
#endif
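
/*
 * Besides real block addresses, a snapshot's block pointers use two
 * special values (see the comments in cgaccount1() and ffs_snapblkfree()
 * below): BLK_NOCOPY marks a block that was unallocated when the
 * snapshot was taken and thus never needs to be copied, and BLK_SNAP
 * marks a block claimed from another snapshot whose contents must be
 * preserved.
 */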

int
ffs_snapshot_init(struct ufsmount *ump)
{
	struct snap_info *si;

	si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP);
	if (si == NULL)
		return ENOMEM;

	TAILQ_INIT(&si->si_snapshots);
	mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE);
	si->si_owner = NULL;
	si->si_gen = 0;
	si->si_snapblklist = NULL;

	return 0;
}

void
ffs_snapshot_fini(struct ufsmount *ump)
{
	struct snap_info *si;

	si = ump->um_snapinfo;
	ump->um_snapinfo = NULL;

	KASSERT(TAILQ_EMPTY(&si->si_snapshots));
	mutex_destroy(&si->si_lock);
	mutex_destroy(&si->si_snaplock);
	KASSERT(si->si_snapblklist == NULL);
	kmem_free(si, sizeof(*si));
}

/*
 * Create a snapshot file and initialize it for the filesystem.
 * Vnode is locked on entry and return.
 */
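/*
 * Outline of the procedure below: prepare the inode (snapshot_setup),
 * copy all cylinder group maps, suspend the filesystem, re-copy the
 * maps that changed meanwhile, copy the superblock and summary
 * information (snapshot_copyfs), expunge unlinked files and other
 * snapshots from the copy, write it out (snapshot_writefs) and resume
 * the filesystem.
 *
 * Caller sketch (hypothetical, for illustration only; the real callers
 * live outside this file):
 *
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	error = ffs_snapshot(mp, vp, &ctime);
 *	VOP_UNLOCK(vp);
 */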
int
ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime)
{
#if defined(FFS_NO_SNAPSHOT)
	return EOPNOTSUPP;
}
#else /* defined(FFS_NO_SNAPSHOT) */
	bool suspended = false;
	int error, redo = 0, snaploc;
	void *sbbuf = NULL;
	daddr_t *snaplist = NULL, snaplistsize = 0;
	struct buf *bp, *nbp;
	struct fs *copy_fs = NULL;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct inode *ip = VTOI(vp);
	struct lwp *l = curlwp;
	struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
	struct timespec ts;
	struct timeval starttime;
#ifdef DEBUG
	struct timeval endtime;
#endif
	struct vnode *devvp = ip->i_devvp;

	/*
	 * If the vnode already is a snapshot, return.
	 */
	if ((VTOI(vp)->i_flags & SF_SNAPSHOT)) {
		if ((VTOI(vp)->i_flags & SF_SNAPINVAL))
			return EINVAL;
		if (ctime) {
			ctime->tv_sec = DIP(VTOI(vp), mtime);
			ctime->tv_nsec = DIP(VTOI(vp), mtimensec);
		}
		return 0;
	}
	/*
	 * Check for free snapshot slot in the superblock.
	 */
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == 0)
			break;
	if (snaploc == FSMAXSNAP)
		return (ENOSPC);
	/*
	 * Prepare the vnode to become a snapshot.
	 */
	error = snapshot_setup(mp, vp);
	if (error)
		goto out;

	/*
	 * Copy all the cylinder group maps. Although the
	 * filesystem is still active, we hope that only a few
	 * cylinder groups will change between now and when we
	 * suspend operations. Thus, we will be able to quickly
	 * touch up the few cylinder groups that changed during
	 * the suspension period.
	 */
	error = cgaccount(vp, 1, NULL);
	if (error)
		goto out;

	/*
	 * snapshot is now valid
	 */
	ip->i_flags &= ~SF_SNAPINVAL;
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;

	/*
	 * Ensure that the snapshot is completely on disk.
	 * Since we have marked it as a snapshot it is safe to
	 * unlock it as no process will be allowed to write to it.
	 */
	error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
	if (error)
		goto out;
	VOP_UNLOCK(vp);
	/*
	 * All allocations are done, so we can now suspend the filesystem.
	 */
	error = vfs_suspend(vp->v_mount, 0);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (error)
		goto out;
	suspended = true;
	getmicrotime(&starttime);
	/*
	 * First, copy all the cylinder group maps that have changed.
	 */
	error = cgaccount(vp, 2, &redo);
	if (error)
		goto out;
	/*
	 * Create a copy of the superblock and its summary information.
	 */
	error = snapshot_copyfs(mp, vp, &sbbuf);
	copy_fs = (struct fs *)((char *)sbbuf + ffs_blkoff(fs, fs->fs_sblockloc));
	if (error)
		goto out;
	/*
	 * Expunge unlinked files from our view.
	 */
	error = snapshot_expunge(mp, vp, copy_fs, &snaplistsize, &snaplist);
	if (error)
		goto out;
	/*
	 * Record snapshot inode. Since this is the newest snapshot,
	 * it must be placed at the end of the list.
	 */
	if (ip->i_nlink > 0)
		fs->fs_snapinum[snaploc] = ip->i_number;

	mutex_enter(&si->si_lock);
	if (is_active_snapshot(si, ip))
		panic("ffs_snapshot: %"PRIu64" already on list", ip->i_number);
	TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
	if (TAILQ_FIRST(&si->si_snapshots) == ip) {
		/*
		 * If this is the first snapshot on this filesystem, put the
		 * preliminary list in place and establish the cow handler.
		 */
		si->si_snapblklist = snaplist;
		fscow_establish(mp, ffs_copyonwrite, devvp);
	}
	si->si_gen++;
	mutex_exit(&si->si_lock);

	vp->v_vflag |= VV_SYSTEM;
	/*
	 * Set the mtime to the time the snapshot has been taken.
	 */
	TIMEVAL_TO_TIMESPEC(&starttime, &ts);
	if (ctime)
		*ctime = ts;
	DIP_ASSIGN(ip, mtime, ts.tv_sec);
	DIP_ASSIGN(ip, mtimensec, ts.tv_nsec);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Copy allocation information from all snapshots and then
	 * expunge them from our view.
	 */
	error = snapshot_expunge_snap(mp, vp, copy_fs, snaplistsize);
	if (error)
		goto out;
	/*
	 * Write the superblock and its summary information to the snapshot.
	 */
	error = snapshot_writefs(mp, vp, sbbuf);
	if (error)
		goto out;
	/*
	 * We're nearly done, ensure that the snapshot is completely on disk.
	 */
	error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
	if (error)
		goto out;
	/*
	 * Invalidate and free all pages on the snapshot vnode.
	 * We will read and write through the buffercache.
	 */
	mutex_enter(vp->v_interlock);
	error = VOP_PUTPAGES(vp, 0, 0,
	    PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO | PGO_FREE);
	if (error)
		goto out;
	/*
	 * Invalidate short ( < fs_bsize ) buffers. We will always read
	 * full size buffers later.
	 */
	mutex_enter(&bufcache_lock);
	KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		if (bp->b_bcount == fs->fs_bsize)
			continue;
		error = bbusy(bp, false, 0, NULL);
		if (error != 0) {
			if (error == EPASSTHROUGH) {
				nbp = LIST_FIRST(&vp->v_cleanblkhd);
				continue;
			}
			break;
		}
		brelsel(bp, BC_INVAL | BC_VFLUSH);
	}
	mutex_exit(&bufcache_lock);

out:
	if (sbbuf != NULL) {
		free(copy_fs->fs_csp, M_UFSMNT);
		free(sbbuf, M_UFSMNT);
	}
	if (fs->fs_active != NULL) {
		free(fs->fs_active, M_DEVBUF);
		fs->fs_active = NULL;
	}

	mutex_enter(&si->si_lock);
	if (snaplist != NULL) {
		if (si->si_snapblklist == snaplist)
			si->si_snapblklist = NULL;
		free(snaplist, M_UFSMNT);
	}
	if (error) {
		fs->fs_snapinum[snaploc] = 0;
	} else {
		/*
		 * As this is the newest list, it is the most inclusive, so
		 * should replace the previous list.
		 */
		si->si_snapblklist = ip->i_snapblklist;
	}
	si->si_gen++;
	mutex_exit(&si->si_lock);

	if (suspended) {
		VOP_UNLOCK(vp);
		vfs_resume(vp->v_mount);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef DEBUG
		getmicrotime(&endtime);
		timersub(&endtime, &starttime, &endtime);
		printf("%s: suspended %lld.%03d sec, redo %d of %d\n",
		    mp->mnt_stat.f_mntonname, (long long)endtime.tv_sec,
		    endtime.tv_usec / 1000, redo, fs->fs_ncg);
#endif
	}
	if (error) {
		if (UFS_WAPBL_BEGIN(mp) == 0) {
			(void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
			UFS_WAPBL_END(mp);
		}
	} else if (ip->i_nlink > 0)
		vref(vp);
	return (error);
}

/*
 * Prepare vnode to become a snapshot.
 */
static int
snapshot_setup(struct mount *mp, struct vnode *vp)
{
	int error, n, len, loc, cg;
	daddr_t blkno, numblks;
	struct buf *ibp, *nbp;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct lwp *l = curlwp;
	const int wbreak = blocks_in_journal(fs)/8;
	struct inode *ip = VTOI(vp);

	/*
	 * Check mount, readonly reference and owner.
	 */
	if (vp->v_mount != mp)
		return EXDEV;
	if (vp->v_writecount != 0)
		return EBUSY;
	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_SNAPSHOT,
	    0, mp, vp, NULL);
	if (error)
		return EACCES;

	if (vp->v_size != 0) {
		error = ffs_truncate(vp, 0, 0, NOCRED);
		if (error)
			return error;
	}

	/* Change inode to snapshot type file. */
	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
#if defined(QUOTA) || defined(QUOTA2)
	/* snapshot inodes are not accounted in quotas */
	chkiq(ip, -1, l->l_cred, 0);
#endif
	ip->i_flags |= (SF_SNAPSHOT | SF_SNAPINVAL);
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	ffs_update(vp, NULL, NULL, UPDATE_WAIT);
	UFS_WAPBL_END(mp);

	KASSERT(ip->i_flags & SF_SNAPSHOT);
	/*
	 * Write an empty list of preallocated blocks to the end of
	 * the snapshot to set size to at least that of the filesystem.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	blkno = 1;
	blkno = ufs_rw64(blkno, UFS_FSNEEDSWAP(fs));
	error = vn_rdwr(UIO_WRITE, vp,
	    (void *)&blkno, sizeof(blkno), ffs_lblktosize(fs, (off_t)numblks),
	    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL);
	if (error)
		return error;
	/*
	 * Preallocate critical data structures so that we can copy
	 * them in without further allocation after we suspend all
	 * operations on the filesystem. We would like to just release
	 * the allocated buffers without writing them since they will
	 * be filled in below once we are ready to go, but this upsets
	 * the soft update code, so we go ahead and write the new buffers.
	 *
	 * Allocate all indirect blocks and mark all of them as not
	 * needing to be copied.
	 */
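	/*
	 * Note on wbreak (computed above as blocks_in_journal(fs)/8): the
	 * loops below end and restart the WAPBL transaction every wbreak
	 * allocations so that no single transaction can grow large enough
	 * to fill the journal.  snapacct() and mapacct() use the same
	 * technique.
	 */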
	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	for (blkno = UFS_NDADDR, n = 0; blkno < numblks; blkno += FFS_NINDIR(fs)) {
		error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
		if (error)
			goto out;
		brelse(ibp, 0);
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}
	/*
	 * Allocate copies for the superblock and its summary information.
	 */
	error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, l->l_cred,
	    0, &nbp);
	if (error)
		goto out;
	bawrite(nbp);
	blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++) {
		error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, l->l_cred, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}
	/*
	 * Allocate all cylinder group blocks.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = ffs_balloc(vp, ffs_lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, l->l_cred, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}

out:
	UFS_WAPBL_END(mp);
	return error;
}

/*
 * Create a copy of the superblock and its summary information.
 * It is up to the caller to free copyfs and copy_fs->fs_csp.
 */
static int
snapshot_copyfs(struct mount *mp, struct vnode *vp, void **sbbuf)
{
	int error, i, len, loc, size;
	void *space;
	int32_t *lp;
	struct buf *bp;
	struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
	struct lwp *l = curlwp;
	struct vnode *devvp = VTOI(vp)->i_devvp;

	/*
	 * Grab a copy of the superblock and its summary information.
	 * We delay writing it until the suspension is released below.
	 */
	*sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
	loc = ffs_blkoff(fs, fs->fs_sblockloc);
	if (loc > 0)
		memset(*sbbuf, 0, loc);
	copyfs = (struct fs *)((char *)(*sbbuf) + loc);
	memcpy(copyfs, fs, fs->fs_sbsize);
	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
	if (fs->fs_sbsize < size)
		memset((char *)(*sbbuf) + loc + fs->fs_sbsize, 0,
		    size - fs->fs_sbsize);
	size = ffs_blkroundup(fs, fs->fs_cssize);
	if (fs->fs_contigsumsize > 0)
		size += fs->fs_ncg * sizeof(int32_t);
	space = malloc(size, M_UFSMNT, M_WAITOK);
	copyfs->fs_csp = space;
	memcpy(copyfs->fs_csp, fs->fs_csp, fs->fs_cssize);
	space = (char *)space + fs->fs_cssize;
	loc = howmany(fs->fs_cssize, fs->fs_fsize);
	i = fs->fs_frag - loc % fs->fs_frag;
	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
	if (len > 0) {
		if ((error = bread(devvp, FFS_FSBTODB(fs, fs->fs_csaddr + loc),
		    len, l->l_cred, 0, &bp)) != 0) {
			free(copyfs->fs_csp, M_UFSMNT);
			free(*sbbuf, M_UFSMNT);
			*sbbuf = NULL;
			return error;
		}
		memcpy(space, bp->b_data, (u_int)len);
		space = (char *)space + len;
		brelse(bp, BC_INVAL | BC_NOCACHE);
	}
	if (fs->fs_contigsumsize > 0) {
		copyfs->fs_maxcluster = lp = space;
		for (i = 0; i < fs->fs_ncg; i++)
			*lp++ = fs->fs_contigsumsize;
	}
	if (mp->mnt_wapbl)
		copyfs->fs_flags &= ~FS_DOWAPBL;
	return 0;
}

struct snapshot_expunge_ctx {
	struct vnode *logvp;
	struct lwp *l;
	struct vnode *vp;
	struct fs *copy_fs;
};

static bool
snapshot_expunge_selector(void *cl, struct vnode *xvp)
{
	struct vattr vat;
	struct snapshot_expunge_ctx *c = cl;
	struct inode *xp;

	xp = VTOI(xvp);
	if (xvp->v_type == VNON || VTOI(xvp) == NULL ||
	    (xp->i_flags & SF_SNAPSHOT))
		return false;
#ifdef DEBUG
	if (snapdebug)
		vprint("ffs_snapshot: busy vnode", xvp);
#endif

	if (xvp == c->logvp)
		return true;

	if (VOP_GETATTR(xvp, &vat, c->l->l_cred) == 0 &&
	    vat.va_nlink > 0)
		return false;

	if (ffs_checkfreefile(c->copy_fs, c->vp, xp->i_number))
		return false;

	return true;
}

/*
 * We must check for active files that have been unlinked (e.g., with a zero
 * link count). We have to expunge all trace of these files from the snapshot
 * so that they are not reclaimed prematurely by fsck or unnecessarily dumped.
 * Note that we skip unlinked snapshot files as they will be handled separately.
 * Calculate the snapshot list size and create a preliminary list.
 */
static int
snapshot_expunge(struct mount *mp, struct vnode *vp, struct fs *copy_fs,
    daddr_t *snaplistsize, daddr_t **snaplist)
{
	int cg, error = 0, len, loc;
	daddr_t blkno, *blkp;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct inode *xp;
	struct lwp *l = curlwp;
	struct vnode *logvp = NULL, *xvp;
	struct vnode_iterator *marker;
	struct snapshot_expunge_ctx ctx;

	*snaplist = NULL;
	/*
	 * Get the log inode if any.
	 */
	if ((fs->fs_flags & FS_DOWAPBL) &&
	    fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
		error = VFS_VGET(mp,
		    fs->fs_journallocs[UFS_WAPBL_INFS_INO], &logvp);
		if (error)
			goto out;
	}
	/*
	 * We also calculate the needed size for the snapshot list.
	 */
	*snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
	    FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;

	vfs_vnode_iterator_init(mp, &marker);
	ctx.logvp = logvp;
	ctx.l = l;
	ctx.vp = vp;
	ctx.copy_fs = copy_fs;
	while ((xvp = vfs_vnode_iterator_next(marker, snapshot_expunge_selector,
	    &ctx)))
	{
		/*
		 * If there is a fragment, clear it here.
		 */
		xp = VTOI(xvp);
		blkno = 0;
		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
		if (loc < UFS_NDADDR) {
			len = ffs_fragroundup(fs, ffs_blkoff(fs, xp->i_size));
			if (len > 0 && len < fs->fs_bsize) {
				error = UFS_WAPBL_BEGIN(mp);
				if (error) {
					vrele(xvp);
					vfs_vnode_iterator_destroy(marker);
					goto out;
				}
				ffs_blkfree_snap(copy_fs, vp, db_get(xp, loc),
				    len, xp->i_number);
				blkno = db_get(xp, loc);
				db_assign(xp, loc, 0);
				UFS_WAPBL_END(mp);
			}
		}
		*snaplistsize += 1;
		error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY);
		if (blkno)
			db_assign(xp, loc, blkno);
		if (!error) {
			error = UFS_WAPBL_BEGIN(mp);
			if (!error) {
				error = ffs_freefile_snap(copy_fs, vp,
				    xp->i_number, xp->i_mode);
				UFS_WAPBL_END(mp);
			}
		}
		vrele(xvp);
		if (error) {
			vfs_vnode_iterator_destroy(marker);
			goto out;
		}
	}
	vfs_vnode_iterator_destroy(marker);

	/*
	 * Create a preliminary list of preallocated snapshot blocks.
	 */
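	/*
	 * Layout of the list as built below: entry [0] holds the number
	 * of entries (filled in last), entry [1] the logical block of the
	 * superblock copy, and the remaining entries the cylinder group
	 * maps and cylinder summary blocks, all in ascending logical
	 * block order.
	 */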
	*snaplist = malloc(*snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
	blkp = &(*snaplist)[1];
	*blkp++ = ffs_lblkno(fs, fs->fs_sblockloc);
	blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if (ffs_fragstoblks(fs, cgtod(fs, cg)) > blkno)
			break;
		*blkp++ = ffs_fragstoblks(fs, cgtod(fs, cg));
	}
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++)
		*blkp++ = blkno + loc;
	for (; cg < fs->fs_ncg; cg++)
		*blkp++ = ffs_fragstoblks(fs, cgtod(fs, cg));
	(*snaplist)[0] = blkp - &(*snaplist)[0];

out:
	if (logvp != NULL)
		vput(logvp);
	if (error && *snaplist != NULL) {
		free(*snaplist, M_UFSMNT);
		*snaplist = NULL;
	}

	return error;
}

/*
 * Copy allocation information from all the snapshots in this filesystem
 * and then expunge them from this snapshot's view. Also, collect the
 * list of allocated blocks in i_snapblklist.
 */
static int
snapshot_expunge_snap(struct mount *mp, struct vnode *vp,
    struct fs *copy_fs, daddr_t snaplistsize)
{
	int error = 0, i;
	daddr_t numblks, *snaplist = NULL;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct inode *ip = VTOI(vp), *xp;
	struct lwp *l = curlwp;
	struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;

	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) {
		if (xp != ip) {
			error = expunge(vp, xp, fs, snapacct, BLK_SNAP);
			if (error)
				break;
		}
		if (xp->i_nlink != 0)
			continue;
		error = UFS_WAPBL_BEGIN(mp);
		if (error)
			break;
		error = ffs_freefile_snap(copy_fs, vp, xp->i_number, xp->i_mode);
		UFS_WAPBL_END(mp);
		if (error)
			break;
	}
	if (error)
		goto out;
	/*
	 * Allocate space for the full list of preallocated snapshot blocks.
	 */
	snaplist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
	ip->i_snapblklist = &snaplist[1];
	/*
	 * Expunge the blocks used by the snapshots from the set of
	 * blocks marked as used in the snapshot bitmaps. Also, collect
	 * the list of allocated blocks in i_snapblklist.
	 */
	error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP);
	if (error)
		goto out;
	if (snaplistsize < ip->i_snapblklist - snaplist)
		panic("ffs_snapshot: list too small");
	snaplistsize = ip->i_snapblklist - snaplist;
	snaplist[0] = snaplistsize;
	ip->i_snapblklist = &snaplist[0];
	/*
	 * Write out the list of allocated blocks to the end of the snapshot.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	for (i = 0; i < snaplistsize; i++)
		snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
	error = vn_rdwr(UIO_WRITE, vp, (void *)snaplist,
	    snaplistsize * sizeof(daddr_t), ffs_lblktosize(fs, (off_t)numblks),
	    UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT, l->l_cred, NULL, NULL);
	for (i = 0; i < snaplistsize; i++)
		snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
out:
	if (error && snaplist != NULL) {
		free(snaplist, M_UFSMNT);
		ip->i_snapblklist = NULL;
	}
	return error;
}

/*
 * Write the superblock and its summary information to the snapshot.
 * Make sure the first UFS_NDADDR blocks get copied to the snapshot.
 */
static int
snapshot_writefs(struct mount *mp, struct vnode *vp, void *sbbuf)
{
	int error, len, loc;
	void *space;
	daddr_t blkno;
	struct buf *bp;
	struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
	struct inode *ip = VTOI(vp);
	struct lwp *l = curlwp;

	copyfs = (struct fs *)((char *)sbbuf + ffs_blkoff(fs, fs->fs_sblockloc));

	/*
	 * Write the superblock and its summary information
	 * to the snapshot.
	 */
	blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	space = copyfs->fs_csp;
#ifdef FFS_EI
	if (UFS_FSNEEDSWAP(fs)) {
		ffs_sb_swap(copyfs, copyfs);
		ffs_csum_swap(space, space, fs->fs_cssize);
	}
#endif
	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	for (loc = 0; loc < len; loc++) {
		error = bread(vp, blkno + loc, fs->fs_bsize, l->l_cred,
		    B_MODIFY, &bp);
		if (error) {
			break;
		}
		memcpy(bp->b_data, space, fs->fs_bsize);
		space = (char *)space + fs->fs_bsize;
		bawrite(bp);
	}
	if (error)
		goto out;
	error = bread(vp, ffs_lblkno(fs, fs->fs_sblockloc),
	    fs->fs_bsize, l->l_cred, B_MODIFY, &bp);
	if (error) {
		goto out;
	} else {
		memcpy(bp->b_data, sbbuf, fs->fs_bsize);
		bawrite(bp);
	}
	/*
	 * Copy the first UFS_NDADDR blocks to the snapshot so
	 * ffs_copyonwrite() and ffs_snapblkfree() will always work on
	 * indirect blocks.
	 */
	for (loc = 0; loc < UFS_NDADDR; loc++) {
		if (db_get(ip, loc) != 0)
			continue;
		error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)loc),
		    fs->fs_bsize, l->l_cred, 0, &bp);
		if (error)
			break;
		error = rwfsblk(vp, B_READ, bp->b_data, loc);
		if (error) {
			brelse(bp, 0);
			break;
		}
		bawrite(bp);
	}

out:
	UFS_WAPBL_END(mp);
	return error;
}

/*
 * Copy all cylinder group maps.  Pass 1 copies every map and marks it
 * in fs_active; pass 2 re-copies only the maps whose fs_active bit has
 * been cleared again, i.e. whose cylinder group changed in the
 * meantime.
 */
static int
cgaccount(struct vnode *vp, int passno, int *redo)
{
	int cg, error = 0;
	struct buf *nbp;
	struct fs *fs = VTOI(vp)->i_fs;

	if (redo != NULL)
		*redo = 0;
	if (passno == 1)
		fs->fs_active = malloc(howmany(fs->fs_ncg, NBBY),
		    M_DEVBUF, M_WAITOK | M_ZERO);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if (passno == 2 && ACTIVECG_ISSET(fs, cg))
			continue;

		if (redo != NULL)
			*redo += 1;
		error = UFS_WAPBL_BEGIN(vp->v_mount);
		if (error)
			return error;
		error = ffs_balloc(vp, ffs_lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, curlwp->l_cred, 0, &nbp);
		if (error) {
			UFS_WAPBL_END(vp->v_mount);
			break;
		}
		error = cgaccount1(cg, vp, nbp->b_data, passno);
		bawrite(nbp);
		UFS_WAPBL_END(vp->v_mount);
		if (error)
			break;
	}
	return error;
}

/*
 * Copy a cylinder group map. All the unallocated blocks are marked
 * BLK_NOCOPY so that the snapshot knows that it need not copy them
 * if they are later written. If passno is one, then this is a first
 * pass, so only setting needs to be done. If passno is 2, then this
 * is a revision to a previous pass which must be undone as the
 * replacement pass is done.
 */
static int
cgaccount1(int cg, struct vnode *vp, void *data, int passno)
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	struct lwp *l = curlwp;
	daddr_t base, numblks;
	int error, len, loc, ns __unused, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	ns = UFS_FSNEEDSWAP(fs);
	error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, l->l_cred, 0, &bp);
	if (error) {
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, ns)) {
		brelse(bp, 0);
		return (EIO);
	}
	ACTIVECG_SET(fs, cg);

	memcpy(data, bp->b_data, fs->fs_cgsize);
	brelse(bp, 0);
	if (fs->fs_cgsize < fs->fs_bsize)
		memset((char *)data + fs->fs_cgsize, 0,
		    fs->fs_bsize - fs->fs_cgsize);
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cg * fs->fs_fpg / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	if (base < UFS_NDADDR) {
		for ( ; loc < UFS_NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
				db_assign(ip, loc, BLK_NOCOPY);
			else if (db_get(ip, loc) == BLK_NOCOPY) {
				if (passno == 2)
					db_assign(ip, loc, 0);
				else if (passno == 1)
					panic("ffs_snapshot: lost direct block");
			}
		}
	}
	if ((error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
		return (error);
	indiroff = (base + loc - UFS_NDADDR) % FFS_NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= FFS_NINDIR(fs)) {
			bawrite(ibp);
			if ((error = ffs_balloc(vp,
			    ffs_lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
				return (error);
			indiroff = 0;
		}
		if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
			idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY);
		else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) {
			if (passno == 2)
				idb_assign(ip, ibp->b_data, indiroff, 0);
			else if (passno == 1)
				panic("ffs_snapshot: lost indirect block");
		}
	}
	bdwrite(ibp);
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed.
 */
static int
expunge(struct vnode *snapvp, struct inode *cancelip, struct fs *fs,
    acctfunc_t acctfunc, int expungetype)
{
	int i, error, ns __unused;
	daddr_t lbn, rlbn;
	daddr_t len, blkno, numblks, blksperindir;
	struct ufs1_dinode *dip1;
	struct ufs2_dinode *dip2;
	struct lwp *l = curlwp;
	void *bap;
	struct buf *bp;
	struct mount *mp;

	ns = UFS_FSNEEDSWAP(fs);
	mp = snapvp->v_mount;

	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = ffs_fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	error = snapblkaddr(snapvp, lbn, &blkno);
	if (error) {
		UFS_WAPBL_END(mp);
		return error;
	}
	if (blkno != 0) {
		error = bread(snapvp, lbn, fs->fs_bsize, l->l_cred,
		    B_MODIFY, &bp);
	} else {
		error = ffs_balloc(snapvp, ffs_lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, l->l_cred, 0, &bp);
		if (! error)
			error = rwfsblk(snapvp, B_READ, bp->b_data, lbn);
	}
	if (error) {
		UFS_WAPBL_END(mp);
		return error;
	}
	/*
	 * Set a snapshot inode to be a zero length file; set regular
	 * files or unlinked snapshots to be completely unallocated.
	 */
	if (fs->fs_magic == FS_UFS1_MAGIC) {
		dip1 = (struct ufs1_dinode *)bp->b_data +
		    ino_to_fsbo(fs, cancelip->i_number);
		if (cancelip->i_flags & SF_SNAPSHOT) {
			dip1->di_flags =
			    ufs_rw32(ufs_rw32(dip1->di_flags, ns) |
			    SF_SNAPINVAL, ns);
		}
		if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
			dip1->di_mode = 0;
		dip1->di_size = 0;
		dip1->di_blocks = 0;
		memset(&dip1->di_db[0], 0, (UFS_NDADDR + UFS_NIADDR) * sizeof(int32_t));
	} else {
		dip2 = (struct ufs2_dinode *)bp->b_data +
		    ino_to_fsbo(fs, cancelip->i_number);
		if (cancelip->i_flags & SF_SNAPSHOT) {
			dip2->di_flags =
			    ufs_rw32(ufs_rw32(dip2->di_flags, ns) |
			    SF_SNAPINVAL, ns);
		}
		if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
			dip2->di_mode = 0;
		dip2->di_size = 0;
		dip2->di_blocks = 0;
		memset(&dip2->di_db[0], 0, (UFS_NDADDR + UFS_NIADDR) * sizeof(int64_t));
	}
	bdwrite(bp);
	UFS_WAPBL_END(mp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	if (fs->fs_magic == FS_UFS1_MAGIC)
		bap = &cancelip->i_ffs1_db[0];
	else
		bap = &cancelip->i_ffs2_db[0];
	error = (*acctfunc)(snapvp, bap, 0, UFS_NDADDR, fs, 0, expungetype);
	if (error)
		return (error);
	if (fs->fs_magic == FS_UFS1_MAGIC)
		bap = &cancelip->i_ffs1_ib[0];
	else
		bap = &cancelip->i_ffs2_ib[0];
	error = (*acctfunc)(snapvp, bap, 0, UFS_NIADDR, fs, -1, expungetype);
	if (error)
		return (error);
	blksperindir = 1;
	lbn = -UFS_NDADDR;
	len = numblks - UFS_NDADDR;
	rlbn = UFS_NDADDR;
	for (i = 0; len > 0 && i < UFS_NIADDR; i++) {
		error = indiracct(snapvp, ITOV(cancelip), i,
		    ib_get(cancelip, i), lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= FFS_NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
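/*
 * Note that indirect blocks live at negative logical block numbers in
 * the usual UFS metadata addressing scheme (see ufs_getlbns()), which
 * is why expunge() walks lbn downward from -UFS_NDADDR while rlbn, the
 * first data block covered by each indirect block, walks upward.
 */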
static int
indiracct(struct vnode *snapvp, struct vnode *cancelvp, int level,
    daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks,
    daddr_t blksperindir, struct fs *fs, acctfunc_t acctfunc, int expungetype)
{
	int error, num, i;
	daddr_t subblksperindir;
	struct indir indirs[UFS_NIADDR + 2];
	daddr_t last;
	void *bap;
	struct buf *bp;

	if (blkno == 0) {
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct: missing indir");
	}
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	error = ffs_getblk(cancelvp, lbn, FFS_FSBTODB(fs, blkno), fs->fs_bsize,
	    false, &bp);
	if (error)
		return error;
	if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error =
	    rwfsblk(bp->b_vp, B_READ, bp->b_data, ffs_fragstoblks(fs, blkno)))) {
		brelse(bp, 0);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > FFS_NINDIR(fs))
		last = FFS_NINDIR(fs);
	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK | M_ZERO);
	memcpy((void *)bap, bp->b_data, fs->fs_bsize);
	brelse(bp, 0);
	error = (*acctfunc)(snapvp, bap, 0, last,
	    fs, level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / FFS_NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct(snapvp, cancelvp, level,
		    idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks,
		    subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	free(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
    struct fs *fs, daddr_t lblkno,
    int exptype /* BLK_SNAP or BLK_NOCOPY */)
{
	int error;

	if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
    struct fs *fs, daddr_t lblkno,
    int expungetype /* BLK_SNAP or BLK_NOCOPY */)
{
	struct inode *ip = VTOI(vp);
	struct lwp *l = curlwp;
	struct mount *mp = vp->v_mount;
	daddr_t blkno;
	daddr_t lbn;
	struct buf *ibp;
	int error, n;
	const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;

	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	for ( n = 0; oldblkp < lastblkp; oldblkp++) {
		blkno = idb_get(ip, bap, oldblkp);
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = ffs_fragstoblks(fs, blkno);
		if (lbn < UFS_NDADDR) {
			blkno = db_get(ip, lbn);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
			if (error)
				break;
			blkno = idb_get(ip, ibp->b_data,
			    (lbn - UFS_NDADDR) % FFS_NINDIR(fs));
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
			if (lbn >= UFS_NDADDR)
				brelse(ibp, 0);
		} else {
			if (blkno != 0)
				panic("snapacct: bad block");
			if (lbn < UFS_NDADDR)
				db_assign(ip, lbn, expungetype);
			else {
				idb_assign(ip, ibp->b_data,
				    (lbn - UFS_NDADDR) % FFS_NINDIR(fs), expungetype);
				bdwrite(ibp);
			}
		}
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}
	UFS_WAPBL_END(mp);
	return error;
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
    struct fs *fs, daddr_t lblkno, int expungetype)
{
	daddr_t blkno;
	struct inode *ip;
	struct mount *mp = vp->v_mount;
	ino_t inum;
	int acctit, error, n;
	const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;

	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	ip = VTOI(vp);
	inum = ip->i_number;
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( n = 0; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = idb_get(ip, bap, oldblkp);
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = ffs_blkstofrags(fs, lblkno);
		ffs_blkfree_snap(fs, vp, blkno, fs->fs_bsize, inum);
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}
	UFS_WAPBL_END(mp);
	return (0);
}

/*
 * Number of blocks that fit into the journal or zero if not logging.
 */
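/*
 * Worked example (illustrative numbers only): an in-filesystem journal
 * of 4096 records of 16 kilobytes on a filesystem with an fs_bsize of
 * 16 kilobytes gives bpj = 4096, so callers that use
 * blocks_in_journal(fs)/8 as their transaction break point flush every
 * 512 block allocations.
 */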
static int
blocks_in_journal(struct fs *fs)
{
	off_t bpj;

	if ((fs->fs_flags & FS_DOWAPBL) == 0)
		return 0;
	bpj = 1;
	if (fs->fs_journal_version == UFS_WAPBL_VERSION) {
		switch (fs->fs_journal_location) {
		case UFS_WAPBL_JOURNALLOC_END_PARTITION:
			bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]*
			    fs->fs_journallocs[UFS_WAPBL_EPART_COUNT];
			break;
		case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
			bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]*
			    fs->fs_journallocs[UFS_WAPBL_INFS_COUNT];
			break;
		}
	}
	bpj /= fs->fs_bsize;
	return (bpj > 0 ? bpj : 1);
}
#endif /* defined(FFS_NO_SNAPSHOT) */

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(struct vnode *vp)
{
	struct inode *xp, *ip = VTOI(vp);
	struct mount *mp = spec_node_getmountedfs(ip->i_devvp);
	struct fs *fs;
	struct snap_info *si;
	int snaploc;

	si = VFSTOUFS(mp)->um_snapinfo;

	/*
	 * Find snapshot in incore list.
	 */
	mutex_enter(&si->si_lock);
	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	mutex_exit(&si->si_lock);
	if (xp != NULL)
		vrele(ITOV(ip));
#ifdef DEBUG
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %llu\n",
		    (unsigned long long)ip->i_number);
#endif
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	mutex_enter(&si->si_lock);
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
	si->si_gen++;
	mutex_exit(&si->si_lock);
}

/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(struct vnode *vp)
{
	struct inode *ip = VTOI(vp), *xp;
	struct vnode *devvp = ip->i_devvp;
	struct fs *fs = ip->i_fs;
	struct mount *mp = spec_node_getmountedfs(devvp);
	struct buf *ibp;
	struct snap_info *si;
	struct lwp *l = curlwp;
	daddr_t numblks, blkno, dblk;
	int error, loc, last;

	si = VFSTOUFS(mp)->um_snapinfo;
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	mutex_enter(&si->si_snaplock);
	mutex_enter(&si->si_lock);
	if (is_active_snapshot(si, ip)) {
		TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap);
		if (TAILQ_FIRST(&si->si_snapshots) != 0) {
			/* Roll back the list of preallocated blocks. */
			xp = TAILQ_LAST(&si->si_snapshots, inodelst);
			si->si_snapblklist = xp->i_snapblklist;
			si->si_gen++;
			mutex_exit(&si->si_lock);
			mutex_exit(&si->si_snaplock);
		} else {
			si->si_snapblklist = 0;
			si->si_gen++;
			mutex_exit(&si->si_lock);
			mutex_exit(&si->si_snaplock);
			fscow_disestablish(mp, ffs_copyonwrite, devvp);
		}
		if (ip->i_snapblklist != NULL) {
			free(ip->i_snapblklist, M_UFSMNT);
			ip->i_snapblklist = NULL;
		}
	} else {
		mutex_exit(&si->si_lock);
		mutex_exit(&si->si_snaplock);
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	for (blkno = 1; blkno < UFS_NDADDR; blkno++) {
		dblk = db_get(ip, blkno);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			db_assign(ip, blkno, 0);
		else if ((dblk == ffs_blkstofrags(fs, blkno) &&
		    ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		    ip->i_number))) {
			DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
			db_assign(ip, blkno, 0);
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = UFS_NDADDR; blkno < numblks; blkno += FFS_NINDIR(fs)) {
		error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > FFS_NINDIR(fs))
			last = FFS_NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			dblk = idb_get(ip, ibp->b_data, loc);
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				idb_assign(ip, ibp->b_data, loc, 0);
			else if (dblk == ffs_blkstofrags(fs, blkno) &&
			    ffs_snapblkfree(fs, ip->i_devvp, dblk,
			    fs->fs_bsize, ip->i_number)) {
				DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
				idb_assign(ip, ibp->b_data, loc, 0);
			}
		}
		bawrite(ibp);
		UFS_WAPBL_END(mp);
		error = UFS_WAPBL_BEGIN(mp);
		KASSERT(error == 0);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~(SF_SNAPSHOT | SF_SNAPINVAL);
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
#if defined(QUOTA) || defined(QUOTA2)
	chkdq(ip, DIP(ip, blocks), l->l_cred, FORCE);
	chkiq(ip, 1, l->l_cred, FORCE);
#endif
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(struct fs *fs, struct vnode *devvp, daddr_t bno,
    long size, ino_t inum)
{
	struct mount *mp = spec_node_getmountedfs(devvp);
	struct buf *ibp;
	struct inode *ip;
	struct vnode *vp = NULL;
	struct snap_info *si;
	void *saved_data = NULL;
	daddr_t lbn;
	daddr_t blkno;
	uint32_t gen;
	int indiroff = 0, error = 0, claimedblk = 0;

	si = VFSTOUFS(mp)->um_snapinfo;
	lbn = ffs_fragstoblks(fs, bno);
	mutex_enter(&si->si_snaplock);
	mutex_enter(&si->si_lock);
	si->si_owner = curlwp;

retry:
	gen = si->si_gen;
	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < UFS_NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			mutex_exit(&si->si_lock);
			error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, FSCRED, B_METAONLY, &ibp);
			if (error) {
				mutex_enter(&si->si_lock);
				break;
			}
			indiroff = (lbn - UFS_NDADDR) % FFS_NINDIR(fs);
			blkno = idb_get(ip, ibp->b_data, indiroff);
			mutex_enter(&si->si_lock);
			if (gen != si->si_gen) {
				brelse(ibp, 0);
				goto retry;
			}
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			if (lbn < UFS_NDADDR) {
				db_assign(ip, lbn, BLK_NOCOPY);
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else {
				idb_assign(ip, ibp->b_data, indiroff,
				    BLK_NOCOPY);
				mutex_exit(&si->si_lock);
				if (ip->i_nlink > 0)
					bwrite(ibp);
				else
					bdwrite(ibp);
				mutex_enter(&si->si_lock);
				if (gen != si->si_gen)
					goto retry;
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= UFS_NDADDR)
				brelse(ibp, 0);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %llu lbn %" PRId64
1640 "from inum %llu\n",
1641 "Grabonremove: snapino",
1642 (unsigned long long)ip->i_number,
1643 lbn, (unsigned long long)inum);
1644 #endif
1645 mutex_exit(&si->si_lock);
1646 if (lbn < UFS_NDADDR) {
1647 db_assign(ip, lbn, bno);
1648 } else {
1649 idb_assign(ip, ibp->b_data, indiroff, bno);
1650 if (ip->i_nlink > 0)
1651 bwrite(ibp);
1652 else
1653 bdwrite(ibp);
1654 }
1655 DIP_ADD(ip, blocks, btodb(size));
1656 ip->i_flag |= IN_CHANGE | IN_UPDATE;
1657 if (ip->i_nlink > 0 && mp->mnt_wapbl)
1658 error = syncsnap(vp);
1659 else
1660 error = 0;
1661 mutex_enter(&si->si_lock);
1662 si->si_owner = NULL;
1663 mutex_exit(&si->si_lock);
1664 mutex_exit(&si->si_snaplock);
1665 return (error == 0);
1666 }
1667 if (lbn >= UFS_NDADDR)
1668 brelse(ibp, 0);
1669 #ifdef DEBUG
1670 if (snapdebug)
1671 printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n",
1672 "Copyonremove: snapino ",
1673 (unsigned long long)ip->i_number,
1674 lbn, "for inum", (unsigned long long)inum, size);
1675 #endif
1676 /*
1677 * If we have already read the old block contents, then
1678 * simply copy them to the new block. Note that we need
1679 * to synchronously write snapshots that have not been
1680 * unlinked, and hence will be visible after a crash,
1681 * to ensure their integrity.
1682 */
1683 mutex_exit(&si->si_lock);
1684 if (saved_data == NULL) {
1685 saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
1686 error = rwfsblk(vp, B_READ, saved_data, lbn);
1687 if (error) {
1688 free(saved_data, M_UFSMNT);
1689 saved_data = NULL;
1690 mutex_enter(&si->si_lock);
1691 break;
1692 }
1693 }
1694 error = wrsnapblk(vp, saved_data, lbn);
1695 if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
1696 error = syncsnap(vp);
1697 mutex_enter(&si->si_lock);
1698 if (error)
1699 break;
1700 if (gen != si->si_gen)
1701 goto retry;
1702 }
1703 si->si_owner = NULL;
1704 mutex_exit(&si->si_lock);
1705 mutex_exit(&si->si_snaplock);
1706 if (saved_data)
1707 free(saved_data, M_UFSMNT);
1708 /*
1709 * If we have been unable to allocate a block in which to do
1710 * the copy, then return non-zero so that the fragment will
1711 * not be freed. Although space will be lost, the snapshot
1712 * will stay consistent.
1713 */
1714 return (error);
1715 }
1716
1717 /*
1718 * Associate snapshot files when mounting.
1719 */
1720 void
1721 ffs_snapshot_mount(struct mount *mp)
1722 {
1723 struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
1724 struct fs *fs = VFSTOUFS(mp)->um_fs;
1725 struct lwp *l = curlwp;
1726 struct vnode *vp;
1727 struct inode *ip, *xp;
1728 struct snap_info *si;
1729 daddr_t snaplistsize, *snapblklist;
1730 int i, error, ns __unused, snaploc, loc;
1731
1732 /*
1733 * No persistent snapshots on apple ufs file systems.
1734 */
1735 if (UFS_MPISAPPLEUFS(VFSTOUFS(mp)))
1736 return;
1737
1738 si = VFSTOUFS(mp)->um_snapinfo;
1739 ns = UFS_FSNEEDSWAP(fs);
1740 /*
1741 * XXX The following needs to be set before ffs_truncate or
1742 * VOP_READ can be called.
1743 */
1744 mp->mnt_stat.f_iosize = fs->fs_bsize;
1745 /*
1746 * Process each snapshot listed in the superblock.
1747 */
1748 vp = NULL;
1749 mutex_enter(&si->si_lock);
1750 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
1751 if (fs->fs_snapinum[snaploc] == 0)
1752 break;
1753 if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
1754 &vp)) != 0) {
1755 printf("ffs_snapshot_mount: vget failed %d\n", error);
1756 continue;
1757 }
1758 ip = VTOI(vp);
1759 if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) !=
1760 SF_SNAPSHOT) {
1761 printf("ffs_snapshot_mount: non-snapshot inode %d\n",
1762 fs->fs_snapinum[snaploc]);
1763 vput(vp);
1764 vp = NULL;
1765 for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
1766 if (fs->fs_snapinum[loc] == 0)
1767 break;
1768 fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
1769 }
1770 fs->fs_snapinum[loc - 1] = 0;
1771 snaploc--;
1772 continue;
1773 }
1774
1775 /*
1776 * Read the block hints list. Use an empty list on
1777 * read errors.
1778 */
1779 error = vn_rdwr(UIO_READ, vp,
1780 (void *)&snaplistsize, sizeof(snaplistsize),
1781 ffs_lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
1782 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
1783 l->l_cred, NULL, NULL);
1784 if (error) {
1785 printf("ffs_snapshot_mount: read_1 failed %d\n", error);
1786 snaplistsize = 1;
1787 } else
1788 snaplistsize = ufs_rw64(snaplistsize, ns);
1789 snapblklist = malloc(
1790 snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
1791 if (error)
1792 snapblklist[0] = 1;
1793 else {
1794 error = vn_rdwr(UIO_READ, vp, (void *)snapblklist,
1795 snaplistsize * sizeof(daddr_t),
1796 ffs_lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
1797 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
1798 l->l_cred, NULL, NULL);
1799 for (i = 0; i < snaplistsize; i++)
1800 snapblklist[i] = ufs_rw64(snapblklist[i], ns);
1801 if (error) {
1802 printf("ffs_snapshot_mount: read_2 failed %d\n",
1803 error);
1804 snapblklist[0] = 1;
1805 }
1806 }
1807 ip->i_snapblklist = &snapblklist[0];
1808
1809 /*
1810 * Link it onto the active snapshot list.
1811 */
1812 if (is_active_snapshot(si, ip))
1813 panic("ffs_snapshot_mount: %"PRIu64" already on list",
1814 ip->i_number);
1815 else
1816 TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
1817 vp->v_vflag |= VV_SYSTEM;
1818 VOP_UNLOCK(vp);
1819 }
1820 /*
1821 * No usable snapshots found.
1822 */
1823 if (vp == NULL) {
1824 mutex_exit(&si->si_lock);
1825 return;
1826 }
1827 /*
1828 * Attach the block hints list. We always want to
1829 * use the list from the newest snapshot.
1830 */
1831 xp = TAILQ_LAST(&si->si_snapshots, inodelst);
1832 si->si_snapblklist = xp->i_snapblklist;
1833 fscow_establish(mp, ffs_copyonwrite, devvp);
1834 si->si_gen++;
1835 mutex_exit(&si->si_lock);
1836 }
1837
1838 /*
1839 * Disassociate snapshot files when unmounting.
1840 */
1841 void
1842 ffs_snapshot_unmount(struct mount *mp)
1843 {
1844 struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
1845 struct inode *xp;
1846 struct vnode *vp = NULL;
1847 struct snap_info *si;
1848
1849 si = VFSTOUFS(mp)->um_snapinfo;
1850 mutex_enter(&si->si_lock);
1851 while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) {
1852 vp = ITOV(xp);
1853 TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap);
1854 if (xp->i_snapblklist == si->si_snapblklist)
1855 si->si_snapblklist = NULL;
1856 free(xp->i_snapblklist, M_UFSMNT);
1857 if (xp->i_nlink > 0) {
1858 si->si_gen++;
1859 mutex_exit(&si->si_lock);
1860 vrele(vp);
1861 mutex_enter(&si->si_lock);
1862 }
1863 }
1864 si->si_gen++;
1865 mutex_exit(&si->si_lock);
1866 if (vp)
1867 fscow_disestablish(mp, ffs_copyonwrite, devvp);
1868 }
1869
1870 /*
1871 * Check for need to copy block that is about to be written,
1872 * copying the block if necessary.
1873 */
1874 static int
1875 ffs_copyonwrite(void *v, struct buf *bp, bool data_valid)
1876 {
1877 struct fs *fs;
1878 struct inode *ip;
1879 struct vnode *devvp = v, *vp = NULL;
1880 struct mount *mp = spec_node_getmountedfs(devvp);
1881 struct snap_info *si;
1882 void *saved_data = NULL;
1883 daddr_t lbn, blkno, *snapblklist;
1884 uint32_t gen;
1885 int lower, upper, mid, snapshot_locked = 0, error = 0;
1886
1887 /*
1888 * Check for valid snapshots.
1889 */
1890 si = VFSTOUFS(mp)->um_snapinfo;
1891 mutex_enter(&si->si_lock);
1892 ip = TAILQ_FIRST(&si->si_snapshots);
1893 if (ip == NULL) {
1894 mutex_exit(&si->si_lock);
1895 return 0;
1896 }
1897 /*
1898 	 * First check whether the block lies beyond the file system,
1899 	 * in the journal, or in the preallocated list.
1900 * By doing these checks we avoid several potential deadlocks.
1901 */
1902 fs = ip->i_fs;
1903 lbn = ffs_fragstoblks(fs, FFS_DBTOFSB(fs, bp->b_blkno));
1904 if (bp->b_blkno >= FFS_FSBTODB(fs, fs->fs_size)) {
1905 mutex_exit(&si->si_lock);
1906 return 0;
1907 }
1908 if ((fs->fs_flags & FS_DOWAPBL) &&
1909 fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
1910 off_t blk_off, log_start, log_end;
1911
1912 log_start = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] *
1913 fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
1914 log_end = log_start + fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] *
1915 fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
1916 blk_off = dbtob(bp->b_blkno);
1917 if (blk_off >= log_start && blk_off < log_end) {
1918 mutex_exit(&si->si_lock);
1919 return 0;
1920 }
1921 }
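	/*
	 * Binary search the sorted hints list; usable entries start
	 * at index 1 because element 0 holds the list length.
	 */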
1922 snapblklist = si->si_snapblklist;
1923 upper = (snapblklist != NULL ? snapblklist[0] - 1 : 0);
1924 lower = 1;
1925 while (lower <= upper) {
1926 mid = (lower + upper) / 2;
1927 if (snapblklist[mid] == lbn)
1928 break;
1929 if (snapblklist[mid] < lbn)
1930 lower = mid + 1;
1931 else
1932 upper = mid - 1;
1933 }
1934 if (lower <= upper) {
1935 mutex_exit(&si->si_lock);
1936 return 0;
1937 }
1938 /*
1939 * Not in the precomputed list, so check the snapshots.
1940 */
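	/*
	 * The lock order is si_snaplock before si_lock, hence the
	 * trylock/drop/retake dance used to acquire the common
	 * snapshot lock here.
	 */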
1941 if (si->si_owner != curlwp) {
1942 if (!mutex_tryenter(&si->si_snaplock)) {
1943 mutex_exit(&si->si_lock);
1944 mutex_enter(&si->si_snaplock);
1945 mutex_enter(&si->si_lock);
1946 }
1947 si->si_owner = curlwp;
1948 snapshot_locked = 1;
1949 }
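	/*
	 * If the caller handed us a full block of valid data, use it
	 * for the snapshot copy and skip rereading the block from disk.
	 */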
1950 if (data_valid && bp->b_bcount == fs->fs_bsize)
1951 saved_data = bp->b_data;
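	/*
	 * si_gen is bumped whenever the snapshot list changes, so
	 * restart the walk if it changed while we slept.
	 */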
1952 retry:
1953 gen = si->si_gen;
1954 TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
1955 vp = ITOV(ip);
1956 /*
1957 * We ensure that everything of our own that needs to be
1958 * copied will be done at the time that ffs_snapshot is
1959 * called. Thus we can skip the check here which can
1960 * deadlock in doing the lookup in ffs_balloc.
1961 */
1962 if (bp->b_vp == vp)
1963 continue;
1964 /*
1965 * Check to see if block needs to be copied.
1966 */
1967 if (lbn < UFS_NDADDR) {
1968 blkno = db_get(ip, lbn);
1969 } else {
1970 mutex_exit(&si->si_lock);
1971 blkno = 0; /* XXX: GCC */
1972 if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
1973 mutex_enter(&si->si_lock);
1974 break;
1975 }
1976 mutex_enter(&si->si_lock);
1977 if (gen != si->si_gen)
1978 goto retry;
1979 }
1980 #ifdef DIAGNOSTIC
1981 if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
1982 panic("ffs_copyonwrite: bad copy block");
1983 #endif
1984 if (blkno != 0)
1985 continue;
1986
1987 if (curlwp == uvm.pagedaemon_lwp) {
1988 error = ENOMEM;
1989 break;
1990 }
1991 /* Only one level of recursion allowed. */
1992 KASSERT(snapshot_locked);
1993 /*
1994 * Allocate the block into which to do the copy. Since
1995 * multiple processes may all try to copy the same block,
1996 * we have to recheck our need to do a copy if we sleep
1997 * waiting for the lock.
1998 *
1999 * Because all snapshots on a filesystem share a single
2000 * lock, we ensure that we will never be in competition
2001 * with another process to allocate a block.
2002 */
2003 #ifdef DEBUG
2004 if (snapdebug) {
2005 printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
2006 (unsigned long long)ip->i_number, lbn);
2007 if (bp->b_vp == devvp)
2008 printf("fs metadata");
2009 else
2010 printf("inum %llu", (unsigned long long)
2011 VTOI(bp->b_vp)->i_number);
2012 printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
2013 }
2014 #endif
2015 /*
2016 * If we have already read the old block contents, then
2017 * simply copy them to the new block. Note that we need
2018 * to synchronously write snapshots that have not been
2019 * unlinked, and hence will be visible after a crash,
2020 * to ensure their integrity.
2021 */
2022 mutex_exit(&si->si_lock);
2023 if (saved_data == NULL) {
2024 saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
2025 error = rwfsblk(vp, B_READ, saved_data, lbn);
2026 if (error) {
2027 free(saved_data, M_UFSMNT);
2028 saved_data = NULL;
2029 mutex_enter(&si->si_lock);
2030 break;
2031 }
2032 }
2033 error = wrsnapblk(vp, saved_data, lbn);
2034 if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
2035 error = syncsnap(vp);
2036 mutex_enter(&si->si_lock);
2037 if (error)
2038 break;
2039 if (gen != si->si_gen)
2040 goto retry;
2041 }
2042 	/*
2043 	 * Done with the copy loop.  Release the common snapshot
2044 	 * lock if we took it, then drop the snapinfo lock and free
2045 	 * any temporary copy buffer.
2046 	 */
2047 if (snapshot_locked) {
2048 si->si_owner = NULL;
2049 mutex_exit(&si->si_lock);
2050 mutex_exit(&si->si_snaplock);
2051 } else
2052 mutex_exit(&si->si_lock);
2053 if (saved_data && saved_data != bp->b_data)
2054 free(saved_data, M_UFSMNT);
2055 return error;
2056 }
2057
2058 /*
2059 * Read from a snapshot.
2060 */
2061 int
2062 ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag)
2063 {
2064 struct inode *ip = VTOI(vp);
2065 struct fs *fs = ip->i_fs;
2066 struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo;
2067 struct buf *bp;
2068 daddr_t lbn, nextlbn;
2069 off_t fsbytes, bytesinfile;
2070 long size, xfersize, blkoffset;
2071 int error;
2072
2073 fstrans_start(vp->v_mount, FSTRANS_SHARED);
2074 mutex_enter(&si->si_snaplock);
2075
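	/*
	 * With IO_ALTSEMANTICS the read is bounded by the snapshot
	 * inode's size; otherwise by the size of the file system image.
	 */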
2076 if (ioflag & IO_ALTSEMANTICS)
2077 fsbytes = ip->i_size;
2078 else
2079 fsbytes = ffs_lfragtosize(fs, fs->fs_size);
2080 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
2081 bytesinfile = fsbytes - uio->uio_offset;
2082 if (bytesinfile <= 0)
2083 break;
2084 lbn = ffs_lblkno(fs, uio->uio_offset);
2085 nextlbn = lbn + 1;
2086 size = fs->fs_bsize;
2087 blkoffset = ffs_blkoff(fs, uio->uio_offset);
2088 xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
2089 bytesinfile);
2090
2091 if (ffs_lblktosize(fs, nextlbn + 1) >= fsbytes) {
2092 if (ffs_lblktosize(fs, lbn) + size > fsbytes)
2093 size = ffs_fragroundup(fs,
2094 fsbytes - ffs_lblktosize(fs, lbn));
2095 error = bread(vp, lbn, size, NOCRED, 0, &bp);
2096 } else {
2097 int nextsize = fs->fs_bsize;
2098 error = breadn(vp, lbn,
2099 size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp);
2100 }
2101 if (error)
2102 break;
2103
2104 /*
2105 * We should only get non-zero b_resid when an I/O error
2106 * has occurred, which should cause us to break above.
2107 * However, if the short read did not cause an error,
2108 * then we want to ensure that we do not uiomove bad
2109 * or uninitialized data.
2110 */
2111 size -= bp->b_resid;
2112 if (size < blkoffset + xfersize) {
2113 xfersize = size - blkoffset;
2114 if (xfersize <= 0)
2115 break;
2116 }
2117 error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
2118 if (error)
2119 break;
2120 brelse(bp, BC_AGE);
2121 }
2122 if (bp != NULL)
2123 brelse(bp, BC_AGE);
2124
2125 mutex_exit(&si->si_snaplock);
2126 fstrans_done(vp->v_mount);
2127 return error;
2128 }
2129
2130 /*
2131  * Look up a snapshot's data block address.
2132  * Simpler than UFS_BALLOC() as we know all metadata is already allocated,
2133  * and safe even for the pagedaemon, where we cannot bread().
2134  */
2135 static int
2136 snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res)
2137 {
2138 struct indir indirs[UFS_NIADDR + 2];
2139 struct inode *ip = VTOI(vp);
2140 struct fs *fs = ip->i_fs;
2141 struct buf *bp;
2142 int error, num;
2143
2144 KASSERT(lbn >= 0);
2145
2146 if (lbn < UFS_NDADDR) {
2147 *res = db_get(ip, lbn);
2148 return 0;
2149 }
2150 if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
2151 return error;
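	/*
	 * The pagedaemon may not sleep in bread(); use the indirect
	 * block only if it is already resident and valid in the
	 * buffer cache, and fail with ENOMEM otherwise.
	 */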
2152 if (curlwp == uvm.pagedaemon_lwp) {
2153 mutex_enter(&bufcache_lock);
2154 bp = incore(vp, indirs[num-1].in_lbn);
2155 if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) {
2156 *res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
2157 error = 0;
2158 } else
2159 error = ENOMEM;
2160 mutex_exit(&bufcache_lock);
2161 return error;
2162 }
2163 error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, NOCRED, 0, &bp);
2164 if (error == 0) {
2165 *res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
2166 brelse(bp, 0);
2167 }
2168
2169 return error;
2170 }
2171
2172 /*
2173 * Read or write the specified block of the filesystem vp resides on
2174  * from or to the disk, bypassing the buffer cache.
2175 */
2176 static int
2177 rwfsblk(struct vnode *vp, int flags, void *data, daddr_t lbn)
2178 {
2179 int error;
2180 struct inode *ip = VTOI(vp);
2181 struct fs *fs = ip->i_fs;
2182 struct buf *nbp;
2183
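	/*
	 * Build a raw I/O buffer by hand and pass it straight to the
	 * device driver; biowait() makes the transfer synchronous.
	 */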
2184 nbp = getiobuf(NULL, true);
2185 nbp->b_flags = flags;
2186 nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize;
2187 nbp->b_error = 0;
2188 nbp->b_data = data;
2189 nbp->b_blkno = nbp->b_rawblkno = FFS_FSBTODB(fs, ffs_blkstofrags(fs, lbn));
2190 nbp->b_proc = NULL;
2191 nbp->b_dev = ip->i_devvp->v_rdev;
2192 SET(nbp->b_cflags, BC_BUSY); /* mark buffer busy */
2193
2194 bdev_strategy(nbp);
2195
2196 error = biowait(nbp);
2197
2198 putiobuf(nbp);
2199
2200 return error;
2201 }
2202
2203 /*
2204 * Write all dirty buffers to disk and invalidate them.
2205 */
2206 static int
2207 syncsnap(struct vnode *vp)
2208 {
2209 int error;
2210 buf_t *bp;
2211 struct fs *fs = VTOI(vp)->i_fs;
2212
2213 mutex_enter(&bufcache_lock);
2214 while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) {
2215 error = bbusy(bp, false, 0, NULL);
2216 		if (error == EPASSTHROUGH)
2217 			continue;	/* lost the buffer while sleeping; rescan */
2218 else if (error != 0) {
2219 mutex_exit(&bufcache_lock);
2220 return error;
2221 }
2222 KASSERT(bp->b_bcount == fs->fs_bsize);
2223 mutex_exit(&bufcache_lock);
2224 error = rwfsblk(vp, B_WRITE, bp->b_data,
2225 ffs_fragstoblks(fs, FFS_DBTOFSB(fs, bp->b_blkno)));
2226 brelse(bp, BC_INVAL | BC_VFLUSH);
2227 if (error)
2228 return error;
2229 mutex_enter(&bufcache_lock);
2230 }
2231 mutex_exit(&bufcache_lock);
2232
2233 return 0;
2234 }
2235
2236 /*
2237 * Write the specified block to a snapshot.
2238 */
2239 static int
2240 wrsnapblk(struct vnode *vp, void *data, daddr_t lbn)
2241 {
2242 struct inode *ip = VTOI(vp);
2243 struct fs *fs = ip->i_fs;
2244 struct buf *bp;
2245 int error;
2246
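	/*
	 * A persistent snapshot (i_nlink > 0) is written synchronously
	 * so that it remains intact across a crash; an unlinked one
	 * may be written asynchronously.
	 */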
2247 error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn), fs->fs_bsize,
2248 FSCRED, (ip->i_nlink > 0 ? B_SYNC : 0), &bp);
2249 if (error)
2250 return error;
2251 memcpy(bp->b_data, data, fs->fs_bsize);
2252 if (ip->i_nlink > 0)
2253 error = bwrite(bp);
2254 else
2255 bawrite(bp);
2256
2257 return error;
2258 }
2259
2260 /*
2261 * Check if this inode is present on the active snapshot list.
2262 * Must be called with snapinfo locked.
2263 */
2264 static inline bool
2265 is_active_snapshot(struct snap_info *si, struct inode *ip)
2266 {
2267 struct inode *xp;
2268
2269 KASSERT(mutex_owned(&si->si_lock));
2270
2271 TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
2272 if (xp == ip)
2273 return true;
2274 return false;
2275 }
2276
2277 /*
2278  * Get/put a disk address in an inode or in a buffer of disk addresses,
2279  * taking care of fs type (UFS1/UFS2) and byte swapping. These functions
2280  * should go into a global include.
2281 */
2282 static inline daddr_t
2283 db_get(struct inode *ip, int loc)
2284 {
2285 if (ip->i_ump->um_fstype == UFS1)
2286 return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip));
2287 else
2288 return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip));
2289 }
2290
2291 static inline void
2292 db_assign(struct inode *ip, int loc, daddr_t val)
2293 {
2294 if (ip->i_ump->um_fstype == UFS1)
2295 ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2296 else
2297 ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2298 }
2299
2300 __unused static inline daddr_t
2301 ib_get(struct inode *ip, int loc)
2302 {
2303 if (ip->i_ump->um_fstype == UFS1)
2304 return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip));
2305 else
2306 return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip));
2307 }
2308
2309 static inline daddr_t
2310 idb_get(struct inode *ip, void *bf, int loc)
2311 {
2312 if (ip->i_ump->um_fstype == UFS1)
2313 return ufs_rw32(((int32_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
2314 else
2315 return ufs_rw64(((int64_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
2316 }
2317
2318 static inline void
2319 idb_assign(struct inode *ip, void *bf, int loc, daddr_t val)
2320 {
2321 if (ip->i_ump->um_fstype == UFS1)
2322 ((int32_t *)(bf))[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2323 else
2324 ((int64_t *)(bf))[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2325 }
2326