/*	$NetBSD: ffs_snapshot.c,v 1.144 2017/02/17 08:29:11 hannken Exp $	*/
2
3 /*
4 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
5 *
6 * Further information about snapshots can be obtained from:
7 *
8 * Marshall Kirk McKusick http://www.mckusick.com/softdep/
9 * 1614 Oxford Street mckusick (at) mckusick.com
10 * Berkeley, CA 94709-1608 +1-510-843-9542
11 * USA
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 *
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 *
23 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
24 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
25 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
26 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
27 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00
36 *
37 * from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp
38 */
39
40 #include <sys/cdefs.h>
41 __KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.144 2017/02/17 08:29:11 hannken Exp $");
42
43 #if defined(_KERNEL_OPT)
44 #include "opt_ffs.h"
45 #include "opt_quota.h"
46 #endif
47
48 #include <sys/param.h>
49 #include <sys/kernel.h>
50 #include <sys/systm.h>
51 #include <sys/conf.h>
52 #include <sys/buf.h>
53 #include <sys/proc.h>
54 #include <sys/namei.h>
55 #include <sys/sched.h>
56 #include <sys/stat.h>
57 #include <sys/malloc.h>
58 #include <sys/mount.h>
59 #include <sys/resource.h>
60 #include <sys/resourcevar.h>
61 #include <sys/vnode.h>
62 #include <sys/kauth.h>
63 #include <sys/fstrans.h>
64 #include <sys/wapbl.h>
65
66 #include <miscfs/specfs/specdev.h>
67
68 #include <ufs/ufs/quota.h>
69 #include <ufs/ufs/ufsmount.h>
70 #include <ufs/ufs/inode.h>
71 #include <ufs/ufs/ufs_extern.h>
72 #include <ufs/ufs/ufs_bswap.h>
73 #include <ufs/ufs/ufs_wapbl.h>
74
75 #include <ufs/ffs/fs.h>
76 #include <ufs/ffs/ffs_extern.h>
77
78 #include <uvm/uvm.h>
79
80 TAILQ_HEAD(inodelst, inode); /* List of active snapshots */
81
/*
 * Per-mount snapshot state; allocated in ffs_snapshot_init() and hung
 * off the ufsmount (ump->um_snapinfo).
 */
struct snap_info {
	kmutex_t si_lock;			/* Lock this snapinfo */
	kmutex_t si_snaplock;			/* Snapshot vnode common lock */
	lwp_t *si_owner;			/* Snaplock owner */
	struct inodelst si_snapshots;		/* List of active snapshots */
	daddr_t *si_snapblklist;		/* Snapshot block hints list */
	uint32_t si_gen;			/* Incremented on change */
};
90
91 #if !defined(FFS_NO_SNAPSHOT)
92 typedef int (*acctfunc_t)
93 (struct vnode *, void *, int, int, struct fs *, daddr_t, int);
94
95 static int snapshot_setup(struct mount *, struct vnode *);
96 static int snapshot_copyfs(struct mount *, struct vnode *, void **);
97 static int snapshot_expunge(struct mount *, struct vnode *,
98 struct fs *, daddr_t *, daddr_t **);
99 static int snapshot_expunge_snap(struct mount *, struct vnode *,
100 struct fs *, daddr_t);
101 static int snapshot_writefs(struct mount *, struct vnode *, void *);
102 static int cgaccount(struct vnode *, int, int *);
103 static int cgaccount1(int, struct vnode *, void *, int);
104 static int expunge(struct vnode *, struct inode *, struct fs *,
105 acctfunc_t, int);
106 static int indiracct(struct vnode *, struct vnode *, int, daddr_t,
107 daddr_t, daddr_t, daddr_t, daddr_t, struct fs *, acctfunc_t, int);
108 static int fullacct(struct vnode *, void *, int, int, struct fs *,
109 daddr_t, int);
110 static int snapacct(struct vnode *, void *, int, int, struct fs *,
111 daddr_t, int);
112 static int mapacct(struct vnode *, void *, int, int, struct fs *,
113 daddr_t, int);
114 #endif /* !defined(FFS_NO_SNAPSHOT) */
115
116 static int ffs_copyonwrite(void *, struct buf *, bool);
117 static int snapblkaddr(struct vnode *, daddr_t, daddr_t *);
118 static int rwfsblk(struct vnode *, int, void *, daddr_t);
119 static int syncsnap(struct vnode *);
120 static int wrsnapblk(struct vnode *, void *, daddr_t);
121 #if !defined(FFS_NO_SNAPSHOT)
122 static int blocks_in_journal(struct fs *);
123 #endif
124
125 static inline bool is_active_snapshot(struct snap_info *, struct inode *);
126 static inline daddr_t db_get(struct inode *, int);
127 static inline void db_assign(struct inode *, int, daddr_t);
128 static inline daddr_t ib_get(struct inode *, int);
129 static inline daddr_t idb_get(struct inode *, void *, int);
130 static inline void idb_assign(struct inode *, void *, int, daddr_t);
131
132 #ifdef DEBUG
133 static int snapdebug = 0;
134 #endif
135
136 int
137 ffs_snapshot_init(struct ufsmount *ump)
138 {
139 struct snap_info *si;
140
141 si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP);
142 if (si == NULL)
143 return ENOMEM;
144
145 TAILQ_INIT(&si->si_snapshots);
146 mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE);
147 mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE);
148 si->si_owner = NULL;
149 si->si_gen = 0;
150 si->si_snapblklist = NULL;
151
152 return 0;
153 }
154
155 void
156 ffs_snapshot_fini(struct ufsmount *ump)
157 {
158 struct snap_info *si;
159
160 si = ump->um_snapinfo;
161 ump->um_snapinfo = NULL;
162
163 KASSERT(TAILQ_EMPTY(&si->si_snapshots));
164 mutex_destroy(&si->si_lock);
165 mutex_destroy(&si->si_snaplock);
166 KASSERT(si->si_snapblklist == NULL);
167 kmem_free(si, sizeof(*si));
168 }
169
170 /*
171 * Create a snapshot file and initialize it for the filesystem.
172 * Vnode is locked on entry and return.
173 */
int
ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime)
{
#if defined(FFS_NO_SNAPSHOT)
	return EOPNOTSUPP;
}
#else /* defined(FFS_NO_SNAPSHOT) */
	bool suspended = false;
	int error, redo = 0, snaploc;
	void *sbbuf = NULL;
	daddr_t *snaplist = NULL, snaplistsize = 0;
	struct buf *bp, *nbp;
	struct fs *copy_fs = NULL;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct inode *ip = VTOI(vp);
	struct lwp *l = curlwp;
	struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
	struct timespec ts;
	struct timeval starttime;
#ifdef DEBUG
	struct timeval endtime;
#endif
	struct vnode *devvp = ip->i_devvp;

	/*
	 * If the vnode already is a snapshot, return.
	 */
	if ((ip->i_flags & SF_SNAPSHOT)) {
		if ((ip->i_flags & SF_SNAPINVAL))
			return EINVAL;
		if (ctime) {
			ctime->tv_sec = DIP(ip, mtime);
			ctime->tv_nsec = DIP(ip, mtimensec);
		}
		return 0;
	}
	/*
	 * Check for free snapshot slot in the superblock.
	 */
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == 0)
			break;
	if (snaploc == FSMAXSNAP)
		return (ENOSPC);
	/*
	 * Prepare the vnode to become a snapshot.
	 */
	error = snapshot_setup(mp, vp);
	if (error)
		goto out;

	/*
	 * Copy all the cylinder group maps. Although the
	 * filesystem is still active, we hope that only a few
	 * cylinder groups will change between now and when we
	 * suspend operations. Thus, we will be able to quickly
	 * touch up the few cylinder groups that changed during
	 * the suspension period.
	 */
	error = cgaccount(vp, 1, NULL);
	if (error)
		goto out;

	/*
	 * snapshot is now valid
	 */
	ip->i_flags &= ~SF_SNAPINVAL;
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;

	/*
	 * Ensure that the snapshot is completely on disk.
	 * Since we have marked it as a snapshot it is safe to
	 * unlock it as no process will be allowed to write to it.
	 */
	error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
	if (error)
		goto out;
	VOP_UNLOCK(vp);
	/*
	 * All allocations are done, so we can now suspend the filesystem.
	 */
	error = vfs_suspend(vp->v_mount, 0);
	if (error == 0) {
		suspended = true;
		error = VFS_SYNC(vp->v_mount, MNT_WAIT, curlwp->l_cred);
	}
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (error)
		goto out;
	getmicrotime(&starttime);
	/*
	 * First, copy all the cylinder group maps that have changed.
	 */
	error = cgaccount(vp, 2, &redo);
	if (error)
		goto out;
	/*
	 * Create a copy of the superblock and its summary information.
	 */
	error = snapshot_copyfs(mp, vp, &sbbuf);
	if (error)
		goto out;
	copy_fs = (struct fs *)((char *)sbbuf + ffs_blkoff(fs, fs->fs_sblockloc));
	/*
	 * Expunge unlinked files from our view.
	 */
	error = snapshot_expunge(mp, vp, copy_fs, &snaplistsize, &snaplist);
	if (error)
		goto out;
	/*
	 * Record snapshot inode. Since this is the newest snapshot,
	 * it must be placed at the end of the list.
	 */
	if (ip->i_nlink > 0)
		fs->fs_snapinum[snaploc] = ip->i_number;

	mutex_enter(&si->si_lock);
	if (is_active_snapshot(si, ip))
		panic("ffs_snapshot: %"PRIu64" already on list", ip->i_number);
	TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
	if (TAILQ_FIRST(&si->si_snapshots) == ip) {
		/*
		 * If this is the first snapshot on this filesystem, put the
		 * preliminary list in place and establish the cow handler.
		 */
		si->si_snapblklist = snaplist;
		fscow_establish(mp, ffs_copyonwrite, devvp);
	}
	si->si_gen++;
	mutex_exit(&si->si_lock);

	vp->v_vflag |= VV_SYSTEM;
	/*
	 * Set the mtime to the time the snapshot has been taken.
	 */
	TIMEVAL_TO_TIMESPEC(&starttime, &ts);
	if (ctime)
		*ctime = ts;
	DIP_ASSIGN(ip, mtime, ts.tv_sec);
	DIP_ASSIGN(ip, mtimensec, ts.tv_nsec);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Copy allocation information from all snapshots and then
	 * expunge them from our view.
	 */
	error = snapshot_expunge_snap(mp, vp, copy_fs, snaplistsize);
	if (error)
		goto out;
	/*
	 * Write the superblock and its summary information to the snapshot.
	 */
	error = snapshot_writefs(mp, vp, sbbuf);
	if (error)
		goto out;
	/*
	 * We're nearly done, ensure that the snapshot is completely on disk.
	 */
	error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
	if (error)
		goto out;
	/*
	 * Invalidate and free all pages on the snapshot vnode.
	 * We will read and write through the buffercache.
	 * NOTE(review): relies on VOP_PUTPAGES() releasing v_interlock
	 * on all paths, including errors -- confirm with genfs_putpages().
	 */
	mutex_enter(vp->v_interlock);
	error = VOP_PUTPAGES(vp, 0, 0,
	    PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO | PGO_FREE);
	if (error)
		goto out;
	/*
	 * Invalidate short ( < fs_bsize ) buffers. We will always read
	 * full size buffers later.
	 */
	mutex_enter(&bufcache_lock);
	KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		if (bp->b_bcount == fs->fs_bsize)
			continue;
		error = bbusy(bp, false, 0, NULL);
		if (error != 0) {
			if (error == EPASSTHROUGH) {
				/*
				 * Buffer changed identity while we slept;
				 * restart the scan from the list head.
				 */
				nbp = LIST_FIRST(&vp->v_cleanblkhd);
				continue;
			}
			break;
		}
		brelsel(bp, BC_INVAL | BC_VFLUSH);
	}
	mutex_exit(&bufcache_lock);

	/*
	 * Common cleanup, reached on both the success and failure paths.
	 */
out:
	if (sbbuf != NULL) {
		free(copy_fs->fs_csp, M_UFSMNT);
		free(sbbuf, M_UFSMNT);
	}
	if (fs->fs_active != NULL) {
		free(fs->fs_active, M_DEVBUF);
		fs->fs_active = NULL;
	}

	mutex_enter(&si->si_lock);
	if (snaplist != NULL) {
		if (si->si_snapblklist == snaplist)
			si->si_snapblklist = NULL;
		free(snaplist, M_UFSMNT);
	}
	if (error) {
		/* Release the superblock slot reserved above. */
		fs->fs_snapinum[snaploc] = 0;
	} else {
		/*
		 * As this is the newest list, it is the most inclusive, so
		 * should replace the previous list.
		 */
		si->si_snapblklist = ip->i_snapblklist;
	}
	si->si_gen++;
	mutex_exit(&si->si_lock);

	if (suspended) {
		VOP_UNLOCK(vp);
		vfs_resume(vp->v_mount);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef DEBUG
		getmicrotime(&endtime);
		timersub(&endtime, &starttime, &endtime);
		printf("%s: suspended %lld.%03d sec, redo %d of %d\n",
		    mp->mnt_stat.f_mntonname, (long long)endtime.tv_sec,
		    endtime.tv_usec / 1000, redo, fs->fs_ncg);
#endif
	}
	if (error) {
		if (UFS_WAPBL_BEGIN(mp) == 0) {
			/*
			 * We depend on ffs_truncate() to call ffs_snapremove()
			 * before it may return an error. On failed
			 * ffs_truncate() we have normal file with leaked
			 * (meta-) data, but no snapshot to use.
			 */
			(void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
			UFS_WAPBL_END(mp);
		}
	} else if (ip->i_nlink > 0)
		vref(vp);
	return (error);
}
421
422 /*
423 * Prepare vnode to become a snapshot.
424 */
static int
snapshot_setup(struct mount *mp, struct vnode *vp)
{
	int error, n, len, loc, cg;
	daddr_t blkno, numblks;
	struct buf *ibp, *nbp;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct lwp *l = curlwp;
	/*
	 * Split the preallocation loops below into WAPBL transactions of
	 * roughly 1/8 of the journal each, so a single transaction cannot
	 * overflow the journal.
	 */
	const int wbreak = blocks_in_journal(fs)/8;
	struct inode *ip = VTOI(vp);

	/*
	 * Check mount, readonly reference and owner.
	 */
	if (vp->v_mount != mp)
		return EXDEV;
	if (vp->v_writecount != 0)
		return EBUSY;
	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_SNAPSHOT,
	    0, mp, vp, NULL);
	if (error)
		return EACCES;

	if (vp->v_size != 0) {
		/*
		 * Must completely truncate the file here. Allocated
		 * blocks on a snapshot mean that block has been copied
		 * on write, see ffs_copyonwrite() testing "blkno != 0"
		 */
		error = ufs_truncate_retry(vp, 0, NOCRED);
		if (error)
			return error;
	}

	/* Change inode to snapshot type file. */
	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
#if defined(QUOTA) || defined(QUOTA2)
	/* snapshot inodes are not accounted in quotas */
	chkiq(ip, -1, l->l_cred, 0);
#endif
	ip->i_flags |= (SF_SNAPSHOT | SF_SNAPINVAL);
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	ffs_update(vp, NULL, NULL, UPDATE_WAIT);
	UFS_WAPBL_END(mp);

	KASSERT(ip->i_flags & SF_SNAPSHOT);
	/*
	 * Write an empty list of preallocated blocks to the end of
	 * the snapshot to set size to at least that of the filesystem.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	blkno = 1;
	blkno = ufs_rw64(blkno, UFS_FSNEEDSWAP(fs));
	error = vn_rdwr(UIO_WRITE, vp,
	    (void *)&blkno, sizeof(blkno), ffs_lblktosize(fs, (off_t)numblks),
	    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL);
	if (error)
		return error;
	/*
	 * Preallocate critical data structures so that we can copy
	 * them in without further allocation after we suspend all
	 * operations on the filesystem. We would like to just release
	 * the allocated buffers without writing them since they will
	 * be filled in below once we are ready to go, but this upsets
	 * the soft update code, so we go ahead and write the new buffers.
	 *
	 * Allocate all indirect blocks and mark all of them as not
	 * needing to be copied.
	 */
	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	for (blkno = UFS_NDADDR, n = 0; blkno < numblks; blkno += FFS_NINDIR(fs)) {
		error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
		if (error)
			goto out;
		brelse(ibp, 0);
		/* Commit and restart the transaction every wbreak blocks. */
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}
	/*
	 * Allocate copies for the superblock and its summary information.
	 */
	error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, l->l_cred,
	    0, &nbp);
	if (error)
		goto out;
	bawrite(nbp);
	blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++) {
		error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, l->l_cred, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}
	/*
	 * Allocate all cylinder group blocks.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = ffs_balloc(vp, ffs_lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, l->l_cred, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}

out:
	UFS_WAPBL_END(mp);
	return error;
}
557
558 /*
559 * Create a copy of the superblock and its summary information.
560 * It is up to the caller to free copyfs and copy_fs->fs_csp.
561 */
static int
snapshot_copyfs(struct mount *mp, struct vnode *vp, void **sbbuf)
{
	int error, i, len, loc, size;
	void *space;
	int32_t *lp;
	struct buf *bp;
	struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
	struct vnode *devvp = VTOI(vp)->i_devvp;

	/*
	 * Grab a copy of the superblock and its summary information.
	 * We delay writing it until the suspension is released below.
	 */
	*sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
	loc = ffs_blkoff(fs, fs->fs_sblockloc);
	if (loc > 0)
		memset(*sbbuf, 0, loc);
	copyfs = (struct fs *)((char *)(*sbbuf) + loc);
	memcpy(copyfs, fs, fs->fs_sbsize);
	/* Zero the tail of the buffer beyond the superblock proper. */
	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
	if (fs->fs_sbsize < size)
		memset((char *)(*sbbuf) + loc + fs->fs_sbsize, 0,
		    size - fs->fs_sbsize);
	size = ffs_blkroundup(fs, fs->fs_cssize);
	if (fs->fs_contigsumsize > 0)
		size += fs->fs_ncg * sizeof(int32_t);
	space = malloc(size, M_UFSMNT, M_WAITOK);
	copyfs->fs_csp = space;
	memcpy(copyfs->fs_csp, fs->fs_csp, fs->fs_cssize);
	space = (char *)space + fs->fs_cssize;
	/*
	 * If the summary information does not end on a block boundary,
	 * read the remaining fragments of the last block from disk.
	 */
	loc = howmany(fs->fs_cssize, fs->fs_fsize);
	i = fs->fs_frag - loc % fs->fs_frag;
	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
	if (len > 0) {
		if ((error = bread(devvp, FFS_FSBTODB(fs, fs->fs_csaddr + loc),
		    len, 0, &bp)) != 0) {
			free(copyfs->fs_csp, M_UFSMNT);
			free(*sbbuf, M_UFSMNT);
			*sbbuf = NULL;
			return error;
		}
		memcpy(space, bp->b_data, (u_int)len);
		space = (char *)space + len;
		brelse(bp, BC_INVAL | BC_NOCACHE);
	}
	/* Reset the per-cg cluster summary hints in the copy. */
	if (fs->fs_contigsumsize > 0) {
		copyfs->fs_maxcluster = lp = space;
		for (i = 0; i < fs->fs_ncg; i++)
			*lp++ = fs->fs_contigsumsize;
	}
	/* The copied superblock must not advertise an active journal. */
	if (mp->mnt_wapbl)
		copyfs->fs_flags &= ~FS_DOWAPBL;
	return 0;
}
617
618 struct snapshot_expunge_ctx {
619 struct vnode *logvp;
620 struct lwp *l;
621 struct vnode *vp;
622 struct fs *copy_fs;
623 };
624
625 static bool
626 snapshot_expunge_selector(void *cl, struct vnode *xvp)
627 {
628 struct vattr vat;
629 struct snapshot_expunge_ctx *c = cl;
630 struct inode *xp;
631
632 xp = VTOI(xvp);
633 if (xvp->v_type == VNON || VTOI(xvp) == NULL ||
634 (xp->i_flags & SF_SNAPSHOT))
635 return false;
636 #ifdef DEBUG
637 if (snapdebug)
638 vprint("ffs_snapshot: busy vnode", xvp);
639 #endif
640
641 if (xvp == c->logvp)
642 return true;
643
644 if (VOP_GETATTR(xvp, &vat, c->l->l_cred) == 0 &&
645 vat.va_nlink > 0)
646 return false;
647
648 if (ffs_checkfreefile(c->copy_fs, c->vp, xp->i_number))
649 return false;
650
651 return true;
652 }
653
654 /*
655 * We must check for active files that have been unlinked (e.g., with a zero
656 * link count). We have to expunge all trace of these files from the snapshot
657 * so that they are not reclaimed prematurely by fsck or unnecessarily dumped.
658 * Note that we skip unlinked snapshot files as they will be handled separately.
659 * Calculate the snapshot list size and create a preliminary list.
660 */
static int
snapshot_expunge(struct mount *mp, struct vnode *vp, struct fs *copy_fs,
    daddr_t *snaplistsize, daddr_t **snaplist)
{
	int cg, error = 0, len, loc;
	daddr_t blkno, *blkp;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct inode *xp;
	struct lwp *l = curlwp;
	struct vnode *logvp = NULL, *xvp;
	struct vnode_iterator *marker;
	struct snapshot_expunge_ctx ctx;

	*snaplist = NULL;
	/*
	 * Get the log inode if any.
	 */
	if ((fs->fs_flags & FS_DOWAPBL) &&
	    fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
		error = VFS_VGET(mp,
		    fs->fs_journallocs[UFS_WAPBL_INFS_INO], &logvp);
		if (error)
			goto out;
	}
	/*
	 * We also calculate the needed size for the snapshot list.
	 */
	*snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
	    FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;

	vfs_vnode_iterator_init(mp, &marker);
	ctx.logvp = logvp;
	ctx.l = l;
	ctx.vp = vp;
	ctx.copy_fs = copy_fs;
	while ((xvp = vfs_vnode_iterator_next(marker, snapshot_expunge_selector,
	    &ctx)))
	{
		/*
		 * If there is a fragment, clear it here.
		 */
		xp = VTOI(xvp);
		blkno = 0;
		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
		if (loc < UFS_NDADDR) {
			len = ffs_fragroundup(fs, ffs_blkoff(fs, xp->i_size));
			if (len > 0 && len < fs->fs_bsize) {
				error = UFS_WAPBL_BEGIN(mp);
				if (error) {
					vrele(xvp);
					vfs_vnode_iterator_destroy(marker);
					goto out;
				}
				ffs_blkfree_snap(copy_fs, vp, db_get(xp, loc),
				    len, xp->i_number);
				/*
				 * Remember the fragment address so it can be
				 * restored after the expunge below.
				 */
				blkno = db_get(xp, loc);
				db_assign(xp, loc, 0);
				UFS_WAPBL_END(mp);
			}
		}
		*snaplistsize += 1;
		error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY);
		if (blkno)
			db_assign(xp, loc, blkno);
		if (!error) {
			error = UFS_WAPBL_BEGIN(mp);
			if (!error) {
				error = ffs_freefile_snap(copy_fs, vp,
				    xp->i_number, xp->i_mode);
				UFS_WAPBL_END(mp);
			}
		}
		vrele(xvp);
		if (error) {
			vfs_vnode_iterator_destroy(marker);
			goto out;
		}
	}
	vfs_vnode_iterator_destroy(marker);

	/*
	 * Create a preliminary list of preallocated snapshot blocks.
	 * Slot [0] holds the list length, filled in last.
	 */
	*snaplist = malloc(*snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
	blkp = &(*snaplist)[1];
	*blkp++ = ffs_lblkno(fs, fs->fs_sblockloc);
	blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
	/* Cylinder groups located before the summary area ... */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if (ffs_fragstoblks(fs, cgtod(fs, cg)) > blkno)
			break;
		*blkp++ = ffs_fragstoblks(fs, cgtod(fs, cg));
	}
	/* ... the summary blocks themselves ... */
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++)
		*blkp++ = blkno + loc;
	/* ... and the remaining cylinder groups. */
	for (; cg < fs->fs_ncg; cg++)
		*blkp++ = ffs_fragstoblks(fs, cgtod(fs, cg));
	(*snaplist)[0] = blkp - &(*snaplist)[0];

out:
	if (logvp != NULL)
		vput(logvp);
	if (error && *snaplist != NULL) {
		free(*snaplist, M_UFSMNT);
		*snaplist = NULL;
	}

	return error;
}
770
771 /*
772 * Copy allocation information from all the snapshots in this snapshot and
773 * then expunge them from its view. Also, collect the list of allocated
774 * blocks in i_snapblklist.
775 */
static int
snapshot_expunge_snap(struct mount *mp, struct vnode *vp,
    struct fs *copy_fs, daddr_t snaplistsize)
{
	int error = 0, i;
	daddr_t numblks, *snaplist = NULL;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct inode *ip = VTOI(vp), *xp;
	struct lwp *l = curlwp;
	struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;

	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) {
		if (xp != ip) {
			error = expunge(vp, xp, fs, snapacct, BLK_SNAP);
			if (error)
				break;
		}
		/* Unlinked snapshots are freed from our view as well. */
		if (xp->i_nlink != 0)
			continue;
		error = UFS_WAPBL_BEGIN(mp);
		if (error)
			break;
		error = ffs_freefile_snap(copy_fs, vp, xp->i_number, xp->i_mode);
		UFS_WAPBL_END(mp);
		if (error)
			break;
	}
	if (error)
		goto out;
	/*
	 * Allocate space for the full list of preallocated snapshot blocks.
	 * mapacct() (via expunge below) appends through i_snapblklist,
	 * which starts at slot [1]; slot [0] receives the final length.
	 */
	snaplist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
	ip->i_snapblklist = &snaplist[1];
	/*
	 * Expunge the blocks used by the snapshots from the set of
	 * blocks marked as used in the snapshot bitmaps. Also, collect
	 * the list of allocated blocks in i_snapblklist.
	 */
	error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP);
	if (error)
		goto out;
	if (snaplistsize < ip->i_snapblklist - snaplist)
		panic("ffs_snapshot: list too small");
	snaplistsize = ip->i_snapblklist - snaplist;
	snaplist[0] = snaplistsize;
	ip->i_snapblklist = &snaplist[0];
	/*
	 * Write out the list of allocated blocks to the end of the snapshot.
	 * The list is byte-swapped in place for the write and swapped back
	 * afterwards, since it stays in use as the in-core hint list.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	for (i = 0; i < snaplistsize; i++)
		snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
	error = vn_rdwr(UIO_WRITE, vp, (void *)snaplist,
	    snaplistsize * sizeof(daddr_t), ffs_lblktosize(fs, (off_t)numblks),
	    UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT, l->l_cred, NULL, NULL);
	for (i = 0; i < snaplistsize; i++)
		snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
out:
	if (error && snaplist != NULL) {
		free(snaplist, M_UFSMNT);
		ip->i_snapblklist = NULL;
	}
	return error;
}
841
842 /*
843 * Write the superblock and its summary information to the snapshot.
844 * Make sure, the first UFS_NDADDR blocks get copied to the snapshot.
845 */
static int
snapshot_writefs(struct mount *mp, struct vnode *vp, void *sbbuf)
{
	int error, len, loc;
	void *space;
	daddr_t blkno;
	struct buf *bp;
	struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
	struct inode *ip = VTOI(vp);
	struct lwp *l = curlwp;

	copyfs = (struct fs *)((char *)sbbuf + ffs_blkoff(fs, fs->fs_sblockloc));

	/*
	 * Write the superblock and its summary information
	 * to the snapshot.
	 */
	blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	space = copyfs->fs_csp;
#ifdef FFS_EI
	/* Swap the copies in place to on-disk byte order if needed. */
	if (UFS_FSNEEDSWAP(fs)) {
		ffs_sb_swap(copyfs, copyfs);
		ffs_csum_swap(space, space, fs->fs_cssize);
	}
#endif
	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	/* Copy the summary blocks into the snapshot. */
	for (loc = 0; loc < len; loc++) {
		error = bread(vp, blkno + loc, fs->fs_bsize,
		    B_MODIFY, &bp);
		if (error) {
			break;
		}
		memcpy(bp->b_data, space, fs->fs_bsize);
		space = (char *)space + fs->fs_bsize;
		bawrite(bp);
	}
	if (error)
		goto out;
	/* Copy the superblock into the snapshot. */
	error = bread(vp, ffs_lblkno(fs, fs->fs_sblockloc),
	    fs->fs_bsize, B_MODIFY, &bp);
	if (error) {
		goto out;
	} else {
		memcpy(bp->b_data, sbbuf, fs->fs_bsize);
		bawrite(bp);
	}
	/*
	 * Copy the first UFS_NDADDR blocks to the snapshot so
	 * ffs_copyonwrite() and ffs_snapblkfree() will always work on
	 * indirect blocks.
	 */
	for (loc = 0; loc < UFS_NDADDR; loc++) {
		if (db_get(ip, loc) != 0)
			continue;
		error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)loc),
		    fs->fs_bsize, l->l_cred, 0, &bp);
		if (error)
			break;
		error = rwfsblk(vp, B_READ, bp->b_data, loc);
		if (error) {
			brelse(bp, 0);
			break;
		}
		bawrite(bp);
	}

out:
	UFS_WAPBL_END(mp);
	return error;
}
919
920 /*
921 * Copy all cylinder group maps.
922 */
static int
cgaccount(struct vnode *vp, int passno, int *redo)
{
	int cg, error = 0;
	struct buf *nbp;
	struct fs *fs = VTOI(vp)->i_fs;

	if (redo != NULL)
		*redo = 0;
	/* Pass 1 allocates the bitmap of cylinder groups already copied. */
	if (passno == 1)
		fs->fs_active = malloc(howmany(fs->fs_ncg, NBBY),
		    M_DEVBUF, M_WAITOK | M_ZERO);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		/* On pass 2 only revisit groups that changed since pass 1. */
		if (passno == 2 && ACTIVECG_ISSET(fs, cg))
			continue;

		if (redo != NULL)
			*redo += 1;
		/* One WAPBL transaction per cylinder group copy. */
		error = UFS_WAPBL_BEGIN(vp->v_mount);
		if (error)
			return error;
		error = ffs_balloc(vp, ffs_lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, curlwp->l_cred, 0, &nbp);
		if (error) {
			UFS_WAPBL_END(vp->v_mount);
			break;
		}
		error = cgaccount1(cg, vp, nbp->b_data, passno);
		bawrite(nbp);
		UFS_WAPBL_END(vp->v_mount);
		if (error)
			break;
	}
	return error;
}
958
959 /*
960 * Copy a cylinder group map. All the unallocated blocks are marked
961 * BLK_NOCOPY so that the snapshot knows that it need not copy them
962 * if they are later written. If passno is one, then this is a first
963 * pass, so only setting needs to be done. If passno is 2, then this
964 * is a revision to a previous pass which must be undone as the
965 * replacement pass is done.
966 */
static int
cgaccount1(int cg, struct vnode *vp, void *data, int passno)
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	struct lwp *l = curlwp;
	daddr_t base, numblks;
	int error, len, loc, ns __unused, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	ns = UFS_FSNEEDSWAP(fs);
	error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, 0, &bp);
	if (error) {
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, ns)) {
		brelse(bp, 0);
		return (EIO);
	}
	/* Mark this cylinder group as copied, so pass 2 can skip it. */
	ACTIVECG_SET(fs, cg);

	memcpy(data, bp->b_data, fs->fs_cgsize);
	brelse(bp, 0);
	/*
	 * NOTE(review): cgp still points into bp->b_data after the
	 * brelse() above and is dereferenced via cg_blksfree() below --
	 * looks like it relies on the buffer staying mapped; confirm
	 * buffer cache lifetime guarantees here.
	 */
	if (fs->fs_cgsize < fs->fs_bsize)
		memset((char *)data + fs->fs_cgsize, 0,
		    fs->fs_bsize - fs->fs_cgsize);
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cg * fs->fs_fpg / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	/* Handle the part of this group mapped by direct blocks. */
	if (base < UFS_NDADDR) {
		for ( ; loc < UFS_NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
				db_assign(ip, loc, BLK_NOCOPY);
			else if (db_get(ip, loc) == BLK_NOCOPY) {
				if (passno == 2)
					db_assign(ip, loc, 0);
				else if (passno == 1)
					panic("ffs_snapshot: lost direct block");
			}
		}
	}
	/* The rest is mapped through indirect blocks. */
	if ((error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
		return (error);
	indiroff = (base + loc - UFS_NDADDR) % FFS_NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= FFS_NINDIR(fs)) {
			/* Advance to the next indirect block. */
			bawrite(ibp);
			if ((error = ffs_balloc(vp,
			    ffs_lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
				return (error);
			indiroff = 0;
		}
		if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
			idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY);
		else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) {
			if (passno == 2)
				idb_assign(ip, ibp->b_data, indiroff, 0);
			else if (passno == 1)
				panic("ffs_snapshot: lost indirect block");
		}
	}
	bdwrite(ibp);
	return (0);
}
1041
1042 /*
1043 * Before expunging a snapshot inode, note all the
1044 * blocks that it claims with BLK_SNAP so that fsck will
1045 * be able to account for those blocks properly and so
1046 * that this snapshot knows that it need not copy them
1047 * if the other snapshot holding them is freed.
1048 */
1049 static int
1050 expunge(struct vnode *snapvp, struct inode *cancelip, struct fs *fs,
1051 acctfunc_t acctfunc, int expungetype)
1052 {
1053 int i, error, ns __unused;
1054 daddr_t lbn, rlbn;
1055 daddr_t len, blkno, numblks, blksperindir;
1056 struct ufs1_dinode *dip1;
1057 struct ufs2_dinode *dip2;
1058 struct lwp *l = curlwp;
1059 void *bap;
1060 struct buf *bp;
1061 struct mount *mp;
1062
1063 ns = UFS_FSNEEDSWAP(fs);
1064 mp = snapvp->v_mount;
1065
1066 error = UFS_WAPBL_BEGIN(mp);
1067 if (error)
1068 return error;
1069 /*
1070 * Prepare to expunge the inode. If its inode block has not
1071 * yet been copied, then allocate and fill the copy.
1072 */
1073 lbn = ffs_fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
1074 error = snapblkaddr(snapvp, lbn, &blkno);
1075 if (error)
1076 return error;
1077 if (blkno != 0) {
1078 error = bread(snapvp, lbn, fs->fs_bsize,
1079 B_MODIFY, &bp);
1080 } else {
1081 error = ffs_balloc(snapvp, ffs_lblktosize(fs, (off_t)lbn),
1082 fs->fs_bsize, l->l_cred, 0, &bp);
1083 if (! error)
1084 error = rwfsblk(snapvp, B_READ, bp->b_data, lbn);
1085 }
1086 if (error) {
1087 UFS_WAPBL_END(mp);
1088 return error;
1089 }
1090 /*
1091 * Set a snapshot inode to be a zero length file, regular files
1092 * or unlinked snapshots to be completely unallocated.
1093 */
1094 if (fs->fs_magic == FS_UFS1_MAGIC) {
1095 dip1 = (struct ufs1_dinode *)bp->b_data +
1096 ino_to_fsbo(fs, cancelip->i_number);
1097 if (cancelip->i_flags & SF_SNAPSHOT) {
1098 dip1->di_flags =
1099 ufs_rw32(ufs_rw32(dip1->di_flags, ns) |
1100 SF_SNAPINVAL, ns);
1101 }
1102 if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
1103 dip1->di_mode = 0;
1104 dip1->di_size = 0;
1105 dip1->di_blocks = 0;
1106 memset(&dip1->di_db[0], 0, (UFS_NDADDR + UFS_NIADDR) * sizeof(int32_t));
1107 } else {
1108 dip2 = (struct ufs2_dinode *)bp->b_data +
1109 ino_to_fsbo(fs, cancelip->i_number);
1110 if (cancelip->i_flags & SF_SNAPSHOT) {
1111 dip2->di_flags =
1112 ufs_rw32(ufs_rw32(dip2->di_flags, ns) |
1113 SF_SNAPINVAL, ns);
1114 }
1115 if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
1116 dip2->di_mode = 0;
1117 dip2->di_size = 0;
1118 dip2->di_blocks = 0;
1119 memset(&dip2->di_db[0], 0, (UFS_NDADDR + UFS_NIADDR) * sizeof(int64_t));
1120 }
1121 bdwrite(bp);
1122 UFS_WAPBL_END(mp);
1123 /*
1124 * Now go through and expunge all the blocks in the file
1125 * using the function requested.
1126 */
1127 numblks = howmany(cancelip->i_size, fs->fs_bsize);
1128 if (fs->fs_magic == FS_UFS1_MAGIC)
1129 bap = &cancelip->i_ffs1_db[0];
1130 else
1131 bap = &cancelip->i_ffs2_db[0];
1132 error = (*acctfunc)(snapvp, bap, 0, UFS_NDADDR, fs, 0, expungetype);
1133 if (error)
1134 return (error);
1135 if (fs->fs_magic == FS_UFS1_MAGIC)
1136 bap = &cancelip->i_ffs1_ib[0];
1137 else
1138 bap = &cancelip->i_ffs2_ib[0];
1139 error = (*acctfunc)(snapvp, bap, 0, UFS_NIADDR, fs, -1, expungetype);
1140 if (error)
1141 return (error);
1142 blksperindir = 1;
1143 lbn = -UFS_NDADDR;
1144 len = numblks - UFS_NDADDR;
1145 rlbn = UFS_NDADDR;
1146 for (i = 0; len > 0 && i < UFS_NIADDR; i++) {
1147 error = indiracct(snapvp, ITOV(cancelip), i,
1148 ib_get(cancelip, i), lbn, rlbn, len,
1149 blksperindir, fs, acctfunc, expungetype);
1150 if (error)
1151 return (error);
1152 blksperindir *= FFS_NINDIR(fs);
1153 lbn -= blksperindir + 1;
1154 len -= blksperindir;
1155 rlbn += blksperindir;
1156 }
1157 return (0);
1158 }
1159
1160 /*
1161 * Descend an indirect block chain for vnode cancelvp accounting for all
1162 * its indirect blocks in snapvp.
1163 */
static int
indiracct(struct vnode *snapvp, struct vnode *cancelvp, int level,
    daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks,
    daddr_t blksperindir, struct fs *fs, acctfunc_t acctfunc, int expungetype)
{
	/*
	 * level        - remaining levels of indirection below this block
	 * blkno        - disk address of the indirect block to process
	 * lbn          - (negative) metadata logical block number
	 * rlbn         - first data logical block covered by this subtree
	 * remblks      - data blocks remaining to account for
	 * blksperindir - data blocks covered per pointer at this level
	 */
	int error, num, i;
	daddr_t subblksperindir;
	struct indir indirs[UFS_NIADDR + 2];
	daddr_t last;
	void *bap;
	struct buf *bp;

	/*
	 * An unallocated indirect block is only legal when expunging a
	 * never-copied (BLK_NOCOPY) file; otherwise the tree is corrupt.
	 */
	if (blkno == 0) {
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct: missing indir");
	}
	/* Sanity-check the caller's lbn against the computed indir path. */
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	error = ffs_getblk(cancelvp, lbn, FFS_FSBTODB(fs, blkno), fs->fs_bsize,
	    false, &bp);
	if (error)
		return error;
	/* Buffer not valid in cache: read its contents from the device. */
	if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error =
	    rwfsblk(bp->b_vp, B_READ, bp->b_data, ffs_fragstoblks(fs, blkno)))) {
		brelse(bp, 0);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > FFS_NINDIR(fs))
		last = FFS_NINDIR(fs);
	/*
	 * Work on a private copy of the pointers so the buffer can be
	 * released before the (possibly long) accounting pass.
	 */
	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK | M_ZERO);
	memcpy((void *)bap, bp->b_data, fs->fs_bsize);
	brelse(bp, 0);
	error = (*acctfunc)(snapvp, bap, 0, last,
	    fs, level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / FFS_NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct(snapvp, cancelvp, level,
		    idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks,
		    subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		/* Advance the lbn windows past this child's subtree. */
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	free(bap, M_DEVBUF);
	return (error);
}
1230
1231 /*
1232 * Do both snap accounting and map accounting.
1233 */
1234 static int
1235 fullacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
1236 struct fs *fs, daddr_t lblkno,
1237 int exptype /* BLK_SNAP or BLK_NOCOPY */)
1238 {
1239 int error;
1240
1241 if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)))
1242 return (error);
1243 return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype));
1244 }
1245
1246 /*
1247 * Identify a set of blocks allocated in a snapshot inode.
1248 */
static int
snapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
    struct fs *fs, daddr_t lblkno,
    int expungetype /* BLK_SNAP or BLK_NOCOPY */)
{
	/*
	 * Walk block pointers bap[oldblkp..lastblkp) of a snapshot being
	 * expunged and mark the corresponding blocks in snapshot vp with
	 * expungetype.  lblkno is unused here (kept for acctfunc_t).
	 */
	struct inode *ip = VTOI(vp);
	struct lwp *l = curlwp;
	struct mount *mp = vp->v_mount;
	daddr_t blkno;
	daddr_t lbn;
	struct buf *ibp;
	int error, n;
	/* Journal-transaction chunk size: 1/8 journal, 0 if not logging. */
	const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;

	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	for ( n = 0; oldblkp < lastblkp; oldblkp++) {
		blkno = idb_get(ip, bap, oldblkp);
		/* Skip holes and blocks already marked as snapshot claims. */
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = ffs_fragstoblks(fs, blkno);
		if (lbn < UFS_NDADDR) {
			blkno = db_get(ip, lbn);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			/* Fetch the indirect block holding this pointer. */
			error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
			if (error)
				break;
			blkno = idb_get(ip, ibp->b_data,
			    (lbn - UFS_NDADDR) % FFS_NINDIR(fs));
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
			if (lbn >= UFS_NDADDR)
				brelse(ibp, 0);
		} else {
			/* The slot must be unclaimed before we mark it. */
			if (blkno != 0)
				panic("snapacct: bad block");
			if (lbn < UFS_NDADDR)
				db_assign(ip, lbn, expungetype);
			else {
				idb_assign(ip, ibp->b_data,
				    (lbn - UFS_NDADDR) % FFS_NINDIR(fs), expungetype);
				bdwrite(ibp);
			}
		}
		/*
		 * Periodically cycle the journal transaction so a long
		 * accounting pass cannot overflow the journal.
		 */
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}
	UFS_WAPBL_END(mp);
	return error;
}
1312
1313 /*
1314 * Account for a set of blocks allocated in a snapshot inode.
1315 */
static int
mapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
    struct fs *fs, daddr_t lblkno, int expungetype)
{
	/*
	 * Free the blocks referenced by bap[oldblkp..lastblkp) back to
	 * the cylinder-group maps on behalf of snapshot vp.  lblkno is
	 * the logical block of the first pointer, or -1 when accounting
	 * the indirect-pointer array itself (then no hints are recorded).
	 */
	daddr_t blkno;
	struct inode *ip;
	struct mount *mp = vp->v_mount;
	ino_t inum;
	int acctit, error, n;
	/* Journal-transaction chunk size: 1/8 journal, 0 if not logging. */
	const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;

	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	ip = VTOI(vp);
	inum = ip->i_number;
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( n = 0; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = idb_get(ip, bap, oldblkp);
		/* Holes and don't-care blocks need no freeing. */
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		/* Record a hint for the preallocated-block list (side
		 * effect: advances ip->i_snapblklist). */
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		/* A self-claimed block lives at its own logical address. */
		if (blkno == BLK_SNAP)
			blkno = ffs_blkstofrags(fs, lblkno);
		ffs_blkfree_snap(fs, vp, blkno, fs->fs_bsize, inum);
		/* Cycle the journal transaction to bound its size. */
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}
	UFS_WAPBL_END(mp);
	return (0);
}
1355
1356 /*
1357 * Number of blocks that fit into the journal or zero if not logging.
1358 */
1359 static int
1360 blocks_in_journal(struct fs *fs)
1361 {
1362 off_t bpj;
1363
1364 if ((fs->fs_flags & FS_DOWAPBL) == 0)
1365 return 0;
1366 bpj = 1;
1367 if (fs->fs_journal_version == UFS_WAPBL_VERSION) {
1368 switch (fs->fs_journal_location) {
1369 case UFS_WAPBL_JOURNALLOC_END_PARTITION:
1370 bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]*
1371 fs->fs_journallocs[UFS_WAPBL_EPART_COUNT];
1372 break;
1373 case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
1374 bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]*
1375 fs->fs_journallocs[UFS_WAPBL_INFS_COUNT];
1376 break;
1377 }
1378 }
1379 bpj /= fs->fs_bsize;
1380 return (bpj > 0 ? bpj : 1);
1381 }
1382 #endif /* defined(FFS_NO_SNAPSHOT) */
1383
1384 /*
1385 * Decrement extra reference on snapshot when last name is removed.
1386 * It will not be freed until the last open reference goes away.
1387 */
void
ffs_snapgone(struct vnode *vp)
{
	struct inode *xp, *ip = VTOI(vp);
	struct mount *mp = spec_node_getmountedfs(ip->i_devvp);
	struct fs *fs;
	struct snap_info *si;
	int snaploc;

	si = VFSTOUFS(mp)->um_snapinfo;

	/*
	 * Find snapshot in incore list.
	 */
	mutex_enter(&si->si_lock);
	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	mutex_exit(&si->si_lock);
	/* Drop the extra reference held while the snapshot was active.
	 * (vrele can sleep, so it is done with si_lock released.) */
	if (xp != NULL)
		vrele(ITOV(ip));
#ifdef DEBUG
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %llu\n",
		    (unsigned long long)ip->i_number);
#endif
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	mutex_enter(&si->si_lock);
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		/* Shift the remaining entries down over the removed slot. */
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
	/* Bump the generation so in-flight lookups restart. */
	si->si_gen++;
	mutex_exit(&si->si_lock);
}
1433
1434 /*
1435 * Prepare a snapshot file for being removed.
1436 */
void
ffs_snapremove(struct vnode *vp)
{
	struct inode *ip = VTOI(vp), *xp;
	struct vnode *devvp = ip->i_devvp;
	struct fs *fs = ip->i_fs;
	struct mount *mp = spec_node_getmountedfs(devvp);
	struct buf *ibp;
	struct snap_info *si;
	struct lwp *l = curlwp;
	daddr_t numblks, blkno, dblk;
	int error, loc, last;

	si = VFSTOUFS(mp)->um_snapinfo;
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	mutex_enter(&si->si_snaplock);
	mutex_enter(&si->si_lock);
	if (is_active_snapshot(si, ip)) {
		TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap);
		if (TAILQ_FIRST(&si->si_snapshots) != 0) {
			/* Roll back the list of preallocated blocks. */
			xp = TAILQ_LAST(&si->si_snapshots, inodelst);
			si->si_snapblklist = xp->i_snapblklist;
			si->si_gen++;
			mutex_exit(&si->si_lock);
			mutex_exit(&si->si_snaplock);
		} else {
			/* Last snapshot: tear down copy-on-write handling. */
			si->si_snapblklist = 0;
			si->si_gen++;
			mutex_exit(&si->si_lock);
			mutex_exit(&si->si_snaplock);
			fscow_disestablish(mp, ffs_copyonwrite, devvp);
		}
		if (ip->i_snapblklist != NULL) {
			free(ip->i_snapblklist, M_UFSMNT);
			ip->i_snapblklist = NULL;
		}
	} else {
		mutex_exit(&si->si_lock);
		mutex_exit(&si->si_snaplock);
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	/* First the direct blocks (lbn 0 is the snapshot's own metadata). */
	for (blkno = 1; blkno < UFS_NDADDR; blkno++) {
		dblk = db_get(ip, blkno);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			db_assign(ip, blkno, 0);
		/* A self-claimed block (address == own lbn) is offered to
		 * the remaining snapshots; drop it if one takes it over. */
		else if ((dblk == ffs_blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		     ip->i_number))) {
			DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
			db_assign(ip, blkno, 0);
		}
	}
	/* Then every pointer reachable through the indirect blocks. */
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = UFS_NDADDR; blkno < numblks; blkno += FFS_NINDIR(fs)) {
		error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > FFS_NINDIR(fs))
			last = FFS_NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			dblk = idb_get(ip, ibp->b_data, loc);
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				idb_assign(ip, ibp->b_data, loc, 0);
			else if (dblk == ffs_blkstofrags(fs, blkno) &&
			    ffs_snapblkfree(fs, ip->i_devvp, dblk,
			    fs->fs_bsize, ip->i_number)) {
				DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
				idb_assign(ip, ibp->b_data, loc, 0);
			}
		}
		bawrite(ibp);
		/* Cycle the caller's journal transaction per indirect
		 * block so the journal cannot overflow. */
		UFS_WAPBL_END(mp);
		error = UFS_WAPBL_BEGIN(mp);
		KASSERT(error == 0);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~(SF_SNAPSHOT | SF_SNAPINVAL);
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
#if defined(QUOTA) || defined(QUOTA2)
	/* Re-charge the file's blocks and inode to the owner's quota. */
	chkdq(ip, DIP(ip, blocks), l->l_cred, FORCE);
	chkiq(ip, 1, l->l_cred, FORCE);
#endif
}
1536
1537 /*
1538 * Notification that a block is being freed. Return zero if the free
1539 * should be allowed to proceed. Return non-zero if the snapshot file
1540 * wants to claim the block. The block will be claimed if it is an
1541 * uncopied part of one of the snapshots. It will be freed if it is
1542 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
1543 * If a fragment is being freed, then all snapshots that care about
1544 * it must make a copy since a snapshot file can only claim full sized
1545 * blocks. Note that if more than one snapshot file maps the block,
1546 * we can pick one at random to claim it. Since none of the snapshots
1547 * can change, we are assurred that they will all see the same unmodified
1548 * image. When deleting a snapshot file (see ffs_snapremove above), we
1549 * must push any of these claimed blocks to one of the other snapshots
1550 * that maps it. These claimed blocks are easily identified as they will
1551 * have a block number equal to their logical block number within the
1552 * snapshot. A copied block can never have this property because they
1553 * must always have been allocated from a BLK_NOCOPY location.
1554 */
int
ffs_snapblkfree(struct fs *fs, struct vnode *devvp, daddr_t bno,
    long size, ino_t inum)
{
	/*
	 * bno/size - block being freed; inum - inode freeing it.
	 * Returns non-zero if a snapshot claimed the block (caller must
	 * then NOT free it), zero if the free may proceed.
	 */
	struct mount *mp = spec_node_getmountedfs(devvp);
	struct buf *ibp;
	struct inode *ip;
	struct vnode *vp = NULL;
	struct snap_info *si;
	void *saved_data = NULL;
	daddr_t lbn;
	daddr_t blkno;
	uint32_t gen;
	int indiroff = 0, error = 0, claimedblk = 0;

	si = VFSTOUFS(mp)->um_snapinfo;
	lbn = ffs_fragstoblks(fs, bno);
	/* si_snaplock serializes copy operations; si_owner marks us as
	 * holder so ffs_copyonwrite recursion does not self-deadlock. */
	mutex_enter(&si->si_snaplock);
	mutex_enter(&si->si_lock);
	si->si_owner = curlwp;

retry:
	/* si_gen detects list changes made while si_lock was dropped. */
	gen = si->si_gen;
	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < UFS_NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			mutex_exit(&si->si_lock);
			error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, FSCRED, B_METAONLY, &ibp);
			if (error) {
				mutex_enter(&si->si_lock);
				break;
			}
			indiroff = (lbn - UFS_NDADDR) % FFS_NINDIR(fs);
			blkno = idb_get(ip, ibp->b_data, indiroff);
			mutex_enter(&si->si_lock);
			if (gen != si->si_gen) {
				brelse(ibp, 0);
				goto retry;
			}
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			if (lbn < UFS_NDADDR) {
				db_assign(ip, lbn, BLK_NOCOPY);
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else {
				idb_assign(ip, ibp->b_data, indiroff,
				    BLK_NOCOPY);
				mutex_exit(&si->si_lock);
				/* Linked snapshots survive a crash, so
				 * their metadata is written synchronously. */
				if (ip->i_nlink > 0)
					bwrite(ibp);
				else
					bdwrite(ibp);
				mutex_enter(&si->si_lock);
				if (gen != si->si_gen)
					goto retry;
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= UFS_NDADDR)
				brelse(ibp, 0);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %llu lbn %" PRId64
				    "from inum %llu\n",
				    "Grabonremove: snapino",
				    (unsigned long long)ip->i_number,
				    lbn, (unsigned long long)inum);
#endif
			mutex_exit(&si->si_lock);
			if (lbn < UFS_NDADDR) {
				db_assign(ip, lbn, bno);
			} else {
				idb_assign(ip, ibp->b_data, indiroff, bno);
				if (ip->i_nlink > 0)
					bwrite(ibp);
				else
					bdwrite(ibp);
			}
			DIP_ADD(ip, blocks, btodb(size));
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (ip->i_nlink > 0 && mp->mnt_wapbl)
				error = syncsnap(vp);
			else
				error = 0;
			mutex_enter(&si->si_lock);
			si->si_owner = NULL;
			mutex_exit(&si->si_lock);
			mutex_exit(&si->si_snaplock);
			/* Claimed: non-zero return stops the free. */
			return (error == 0);
		}
		if (lbn >= UFS_NDADDR)
			brelse(ibp, 0);
#ifdef DEBUG
		if (snapdebug)
			printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n",
			    "Copyonremove: snapino ",
			    (unsigned long long)ip->i_number,
			    lbn, "for inum", (unsigned long long)inum, size);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		mutex_exit(&si->si_lock);
		if (saved_data == NULL) {
			/* Read the old contents once; reuse the buffer for
			 * every snapshot that needs a copy. */
			saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
			error = rwfsblk(vp, B_READ, saved_data, lbn);
			if (error) {
				free(saved_data, M_UFSMNT);
				saved_data = NULL;
				mutex_enter(&si->si_lock);
				break;
			}
		}
		error = wrsnapblk(vp, saved_data, lbn);
		if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
			error = syncsnap(vp);
		mutex_enter(&si->si_lock);
		if (error)
			break;
		if (gen != si->si_gen)
			goto retry;
	}
	si->si_owner = NULL;
	mutex_exit(&si->si_lock);
	mutex_exit(&si->si_snaplock);
	if (saved_data)
		free(saved_data, M_UFSMNT);
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	return (error);
}
1731
1732 /*
1733 * Associate snapshot files when mounting.
1734 */
void
ffs_snapshot_mount(struct mount *mp)
{
	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct lwp *l = curlwp;
	struct vnode *vp;
	struct inode *ip, *xp;
	struct snap_info *si;
	daddr_t snaplistsize, *snapblklist;
	int i, error, ns __unused, snaploc, loc;

	/*
	 * No persistent snapshots on apple ufs file systems.
	 */
	if (UFS_MPISAPPLEUFS(VFSTOUFS(mp)))
		return;

	si = VFSTOUFS(mp)->um_snapinfo;
	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * XXX The following needs to be set before ffs_truncate or
	 * VOP_READ can be called.
	 */
	mp->mnt_stat.f_iosize = fs->fs_bsize;
	/*
	 * Process each snapshot listed in the superblock.
	 */
	vp = NULL;
	mutex_enter(&si->si_lock);
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		if (fs->fs_snapinum[snaploc] == 0)
			break;
		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
		    &vp)) != 0) {
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		/* A valid entry must be a snapshot and not marked invalid. */
		if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) !=
		    SF_SNAPSHOT) {
			printf("ffs_snapshot_mount: non-snapshot inode %d\n",
			    fs->fs_snapinum[snaploc]);
			vput(vp);
			vp = NULL;
			/* Compact the superblock list over the bad entry
			 * and re-examine the current slot. */
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			snaploc--;
			continue;
		}

		/*
		 * Read the block hints list. Use an empty list on
		 * read errors.
		 */
		/* The hint list is stored just past the last data block
		 * of the snapshot file; element 0 is the list length. */
		error = vn_rdwr(UIO_READ, vp,
		    (void *)&snaplistsize, sizeof(snaplistsize),
		    ffs_lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
		    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
		    l->l_cred, NULL, NULL);
		if (error) {
			printf("ffs_snapshot_mount: read_1 failed %d\n", error);
			snaplistsize = 1;
		} else
			snaplistsize = ufs_rw64(snaplistsize, ns);
		snapblklist = malloc(
		    snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
		if (error)
			snapblklist[0] = 1;
		else {
			error = vn_rdwr(UIO_READ, vp, (void *)snapblklist,
			    snaplistsize * sizeof(daddr_t),
			    ffs_lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
			    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
			    l->l_cred, NULL, NULL);
			for (i = 0; i < snaplistsize; i++)
				snapblklist[i] = ufs_rw64(snapblklist[i], ns);
			if (error) {
				printf("ffs_snapshot_mount: read_2 failed %d\n",
				    error);
				snapblklist[0] = 1;
			}
		}
		ip->i_snapblklist = &snapblklist[0];

		/*
		 * Link it onto the active snapshot list.
		 */
		if (is_active_snapshot(si, ip))
			panic("ffs_snapshot_mount: %"PRIu64" already on list",
			    ip->i_number);
		else
			TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
		vp->v_vflag |= VV_SYSTEM;
		/* Keep the reference from VFS_VGET; only drop the lock. */
		VOP_UNLOCK(vp);
	}
	/*
	 * No usable snapshots found.
	 */
	if (vp == NULL) {
		mutex_exit(&si->si_lock);
		return;
	}
	/*
	 * Attach the block hints list. We always want to
	 * use the list from the newest snapshot.
	 */
	xp = TAILQ_LAST(&si->si_snapshots, inodelst);
	si->si_snapblklist = xp->i_snapblklist;
	fscow_establish(mp, ffs_copyonwrite, devvp);
	si->si_gen++;
	mutex_exit(&si->si_lock);
}
1852
1853 /*
1854 * Disassociate snapshot files when unmounting.
1855 */
void
ffs_snapshot_unmount(struct mount *mp)
{
	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
	struct inode *xp;
	struct vnode *vp = NULL;
	struct snap_info *si;

	si = VFSTOUFS(mp)->um_snapinfo;
	mutex_enter(&si->si_lock);
	while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) {
		vp = ITOV(xp);
		TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap);
		/* The shared hint list belongs to one snapshot; detach
		 * it before that snapshot's copy is freed. */
		if (xp->i_snapblklist == si->si_snapblklist)
			si->si_snapblklist = NULL;
		free(xp->i_snapblklist, M_UFSMNT);
		if (xp->i_nlink > 0) {
			/* Drop the active-snapshot reference; vrele can
			 * sleep, so release si_lock around it. */
			si->si_gen++;
			mutex_exit(&si->si_lock);
			vrele(vp);
			mutex_enter(&si->si_lock);
		}
	}
	si->si_gen++;
	mutex_exit(&si->si_lock);
	/* If any snapshot was present, copy-on-write was established. */
	if (vp)
		fscow_disestablish(mp, ffs_copyonwrite, devvp);
}
1884
1885 /*
1886 * Check for need to copy block that is about to be written,
1887 * copying the block if necessary.
1888 */
static int
ffs_copyonwrite(void *v, struct buf *bp, bool data_valid)
{
	/*
	 * fscow hook called before bp is written to the device (v is the
	 * device vnode).  data_valid means bp already holds the current
	 * block contents, so a separate read can be avoided.
	 */
	struct fs *fs;
	struct inode *ip;
	struct vnode *devvp = v, *vp = NULL;
	struct mount *mp = spec_node_getmountedfs(devvp);
	struct snap_info *si;
	void *saved_data = NULL;
	daddr_t lbn, blkno, *snapblklist;
	uint32_t gen;
	int lower, upper, mid, snapshot_locked = 0, error = 0;

	/*
	 * Check for valid snapshots.
	 */
	si = VFSTOUFS(mp)->um_snapinfo;
	mutex_enter(&si->si_lock);
	ip = TAILQ_FIRST(&si->si_snapshots);
	if (ip == NULL) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * First check to see if it is after the file system,
	 * in the journal or in the preallocated list.
	 * By doing these checks we avoid several potential deadlocks.
	 */
	fs = ip->i_fs;
	lbn = ffs_fragstoblks(fs, FFS_DBTOFSB(fs, bp->b_blkno));
	if (bp->b_blkno >= FFS_FSBTODB(fs, fs->fs_size)) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/* Journal blocks are never snapshotted. */
	if ((fs->fs_flags & FS_DOWAPBL) &&
	    fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
		off_t blk_off, log_start, log_end;

		log_start = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] *
		    fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
		log_end = log_start + fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] *
		    fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
		blk_off = dbtob(bp->b_blkno);
		if (blk_off >= log_start && blk_off < log_end) {
			mutex_exit(&si->si_lock);
			return 0;
		}
	}
	/* Binary search the sorted hint list (element 0 is its length);
	 * blocks found there are already handled by every snapshot. */
	snapblklist = si->si_snapblklist;
	upper = (snapblklist != NULL ? snapblklist[0] - 1 : 0);
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	if (lower <= upper) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
	/* Take si_snaplock unless this lwp already owns it (recursion
	 * from ffs_snapblkfree); respect lock order snaplock -> si_lock. */
	if (si->si_owner != curlwp) {
		if (!mutex_tryenter(&si->si_snaplock)) {
			mutex_exit(&si->si_lock);
			mutex_enter(&si->si_snaplock);
			mutex_enter(&si->si_lock);
		}
		si->si_owner = curlwp;
		snapshot_locked = 1;
	}
	/* Reuse the caller's data when it is a full, valid block. */
	if (data_valid && bp->b_bcount == fs->fs_bsize)
		saved_data = bp->b_data;
retry:
	/* si_gen detects list changes made while si_lock was dropped. */
	gen = si->si_gen;
	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in ffs_balloc.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied.
		 */
		if (lbn < UFS_NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			mutex_exit(&si->si_lock);
			blkno = 0; /* XXX: GCC */
			if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
				mutex_enter(&si->si_lock);
				break;
			}
			mutex_enter(&si->si_lock);
			if (gen != si->si_gen)
				goto retry;
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		/* Non-zero means already copied or deliberately ignored. */
		if (blkno != 0)
			continue;

		/* The pagedaemon must not sleep on allocation below. */
		if (curlwp == uvm.pagedaemon_lwp) {
			error = ENOMEM;
			break;
		}
		/* Only one level of recursion allowed. */
		KASSERT(snapshot_locked);
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
			    (unsigned long long)ip->i_number, lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %llu", (unsigned long long)
				    VTOI(bp->b_vp)->i_number);
			printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		mutex_exit(&si->si_lock);
		if (saved_data == NULL) {
			saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
			error = rwfsblk(vp, B_READ, saved_data, lbn);
			if (error) {
				free(saved_data, M_UFSMNT);
				saved_data = NULL;
				mutex_enter(&si->si_lock);
				break;
			}
		}
		error = wrsnapblk(vp, saved_data, lbn);
		if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
			error = syncsnap(vp);
		mutex_enter(&si->si_lock);
		if (error)
			break;
		if (gen != si->si_gen)
			goto retry;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (snapshot_locked) {
		si->si_owner = NULL;
		mutex_exit(&si->si_lock);
		mutex_exit(&si->si_snaplock);
	} else
		mutex_exit(&si->si_lock);
	/* Only free the scratch buffer, never the caller's bp->b_data. */
	if (saved_data && saved_data != bp->b_data)
		free(saved_data, M_UFSMNT);
	return error;
}
2072
2073 /*
2074 * Read from a snapshot.
2075 */
int
ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo;
	struct buf *bp;
	daddr_t lbn, nextlbn;
	off_t fsbytes, bytesinfile;
	long size, xfersize, blkoffset;
	int error;

	/* Hold si_snaplock so the snapshot is stable while we read it. */
	fstrans_start(vp->v_mount, FSTRANS_SHARED);
	mutex_enter(&si->si_snaplock);

	/* IO_ALTSEMANTICS limits the read to the inode size (hint-list
	 * access); otherwise reads cover the whole file system image. */
	if (ioflag & IO_ALTSEMANTICS)
		fsbytes = ip->i_size;
	else
		fsbytes = ffs_lfragtosize(fs, fs->fs_size);
	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		bytesinfile = fsbytes - uio->uio_offset;
		if (bytesinfile <= 0)
			break;
		lbn = ffs_lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;
		size = fs->fs_bsize;
		blkoffset = ffs_blkoff(fs, uio->uio_offset);
		xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
		    bytesinfile);

		/* Near the end: shrink the last block and skip readahead. */
		if (ffs_lblktosize(fs, nextlbn + 1) >= fsbytes) {
			if (ffs_lblktosize(fs, lbn) + size > fsbytes)
				size = ffs_fragroundup(fs,
				    fsbytes - ffs_lblktosize(fs, lbn));
			error = bread(vp, lbn, size, 0, &bp);
		} else {
			int nextsize = fs->fs_bsize;
			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, 0, &bp);
		}
		if (error)
			break;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < blkoffset + xfersize) {
			xfersize = size - blkoffset;
			if (xfersize <= 0)
				break;
		}
		error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
		if (error)
			break;
		/* BC_AGE: snapshot blocks are unlikely to be re-read soon. */
		brelse(bp, BC_AGE);
	}
	if (bp != NULL)
		brelse(bp, BC_AGE);

	mutex_exit(&si->si_snaplock);
	fstrans_done(vp->v_mount);
	return error;
}
2144
/*
 * Lookup a snapshots data block address.
 * Simpler than UFS_BALLOC() as we know all metadata is already allocated
 * and safe even for the pagedaemon where we cannot bread().
 *
 * On success *res is set to the block address (possibly zero for an
 * unallocated hole).  Returns 0 or an error number.
 */
static int
snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res)
{
	struct indir indirs[UFS_NIADDR + 2];
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct buf *bp;
	int error, num;

	KASSERT(lbn >= 0);

	/* Direct block: the address lives in the inode itself. */
	if (lbn < UFS_NDADDR) {
		*res = db_get(ip, lbn);
		return 0;
	}
	/* Indirect block: compute the chain of indirect-block offsets. */
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return error;
	if (curlwp == uvm.pagedaemon_lwp) {
		/*
		 * The pagedaemon must not sleep for buffer memory, so
		 * only use the last-level indirect block if it is already
		 * in core with valid contents (BO_DONE or BO_DELWRI);
		 * otherwise fail with ENOMEM and let the caller cope.
		 */
		mutex_enter(&bufcache_lock);
		bp = incore(vp, indirs[num-1].in_lbn);
		if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) {
			*res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
			error = 0;
		} else
			error = ENOMEM;
		mutex_exit(&bufcache_lock);
		return error;
	}
	/* Normal path: read the last-level indirect block. */
	error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, 0, &bp);
	if (error == 0) {
		*res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
		brelse(bp, 0);
	}

	return error;
}
2186
2187 /*
2188 * Read or write the specified block of the filesystem vp resides on
2189 * from or to the disk bypassing the buffer cache.
2190 */
2191 static int
2192 rwfsblk(struct vnode *vp, int flags, void *data, daddr_t lbn)
2193 {
2194 int error;
2195 struct inode *ip = VTOI(vp);
2196 struct fs *fs = ip->i_fs;
2197 struct buf *nbp;
2198
2199 nbp = getiobuf(NULL, true);
2200 nbp->b_flags = flags;
2201 nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize;
2202 nbp->b_error = 0;
2203 nbp->b_data = data;
2204 nbp->b_blkno = nbp->b_rawblkno = FFS_FSBTODB(fs, ffs_blkstofrags(fs, lbn));
2205 nbp->b_proc = NULL;
2206 nbp->b_dev = ip->i_devvp->v_rdev;
2207 SET(nbp->b_cflags, BC_BUSY); /* mark buffer busy */
2208
2209 bdev_strategy(nbp);
2210
2211 error = biowait(nbp);
2212
2213 putiobuf(nbp);
2214
2215 return error;
2216 }
2217
/*
 * Write all dirty buffers to disk and invalidate them.
 *
 * Each dirty buffer of the snapshot vnode is written synchronously
 * straight to the device via rwfsblk() and then invalidated.
 * Returns 0 on success or the first error from bbusy()/rwfsblk().
 */
static int
syncsnap(struct vnode *vp)
{
	int error;
	buf_t *bp;
	struct fs *fs = VTOI(vp)->i_fs;

	mutex_enter(&bufcache_lock);
	while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) {
		/*
		 * Gain exclusive use of the buffer.  EPASSTHROUGH
		 * indicates the buffer changed while we waited, so
		 * rescan the dirty list from the start.
		 */
		error = bbusy(bp, false, 0, NULL);
		if (error == EPASSTHROUGH)
			continue;
		else if (error != 0) {
			mutex_exit(&bufcache_lock);
			return error;
		}
		/* Snapshot buffers are always full file system blocks. */
		KASSERT(bp->b_bcount == fs->fs_bsize);
		mutex_exit(&bufcache_lock);
		/* Write the block directly to disk, ... */
		error = rwfsblk(vp, B_WRITE, bp->b_data,
		    ffs_fragstoblks(fs, FFS_DBTOFSB(fs, bp->b_blkno)));
		/* ... then drop and invalidate the cached copy. */
		brelse(bp, BC_INVAL | BC_VFLUSH);
		if (error)
			return error;
		mutex_enter(&bufcache_lock);
	}
	mutex_exit(&bufcache_lock);

	return 0;
}
2250
2251 /*
2252 * Write the specified block to a snapshot.
2253 */
2254 static int
2255 wrsnapblk(struct vnode *vp, void *data, daddr_t lbn)
2256 {
2257 struct inode *ip = VTOI(vp);
2258 struct fs *fs = ip->i_fs;
2259 struct buf *bp;
2260 int error;
2261
2262 error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn), fs->fs_bsize,
2263 FSCRED, (ip->i_nlink > 0 ? B_SYNC : 0), &bp);
2264 if (error)
2265 return error;
2266 memcpy(bp->b_data, data, fs->fs_bsize);
2267 if (ip->i_nlink > 0)
2268 error = bwrite(bp);
2269 else
2270 bawrite(bp);
2271
2272 return error;
2273 }
2274
2275 /*
2276 * Check if this inode is present on the active snapshot list.
2277 * Must be called with snapinfo locked.
2278 */
2279 static inline bool
2280 is_active_snapshot(struct snap_info *si, struct inode *ip)
2281 {
2282 struct inode *xp;
2283
2284 KASSERT(mutex_owned(&si->si_lock));
2285
2286 TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
2287 if (xp == ip)
2288 return true;
2289 return false;
2290 }
2291
2292 /*
2293 * Get/Put direct block from inode or buffer containing disk addresses. Take
2294 * care for fs type (UFS1/UFS2) and byte swapping. These functions should go
2295 * into a global include.
2296 */
2297 static inline daddr_t
2298 db_get(struct inode *ip, int loc)
2299 {
2300 if (ip->i_ump->um_fstype == UFS1)
2301 return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip));
2302 else
2303 return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip));
2304 }
2305
2306 static inline void
2307 db_assign(struct inode *ip, int loc, daddr_t val)
2308 {
2309 if (ip->i_ump->um_fstype == UFS1)
2310 ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2311 else
2312 ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2313 }
2314
2315 __unused static inline daddr_t
2316 ib_get(struct inode *ip, int loc)
2317 {
2318 if (ip->i_ump->um_fstype == UFS1)
2319 return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip));
2320 else
2321 return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip));
2322 }
2323
2324 static inline daddr_t
2325 idb_get(struct inode *ip, void *bf, int loc)
2326 {
2327 if (ip->i_ump->um_fstype == UFS1)
2328 return ufs_rw32(((int32_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
2329 else
2330 return ufs_rw64(((int64_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
2331 }
2332
2333 static inline void
2334 idb_assign(struct inode *ip, void *bf, int loc, daddr_t val)
2335 {
2336 if (ip->i_ump->um_fstype == UFS1)
2337 ((int32_t *)(bf))[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2338 else
2339 ((int64_t *)(bf))[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2340 }
2341