/*	$NetBSD: ffs_snapshot.c,v 1.120 2012/12/20 08:03:44 hannken Exp $	*/

/*
 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
 *
 * Further information about snapshots can be obtained from:
 *
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
 *	1614 Oxford Street		mckusick@mckusick.com
 *	Berkeley, CA 94709-1608		+1-510-843-9542
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
 *
 *	from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.120 2012/12/20 08:03:44 hannken Exp $");

#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
#include "opt_quota.h"
#endif

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/sched.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/kauth.h>
#include <sys/fstrans.h>
#include <sys/wapbl.h>

#include <miscfs/specfs/specdev.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_wapbl.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#include <uvm/uvm.h>

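/*
 * Lock order: si_snaplock is taken before si_lock.  ffs_snapremove()
 * and ffs_snapblkfree() acquire both in that order; ffs_copyonwrite()
 * drops si_lock before sleeping on si_snaplock for the same reason.
 */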
struct snap_info {
	kmutex_t si_lock;			/* Lock this snapinfo */
	kmutex_t si_snaplock;			/* Snapshot vnode common lock */
	lwp_t *si_owner;			/* Snaplock owner */
	TAILQ_HEAD(inodelst, inode) si_snapshots; /* List of active snapshots */
	daddr_t *si_snapblklist;		/* Snapshot block hints list */
	uint32_t si_gen;			/* Incremented on change */
};

#if !defined(FFS_NO_SNAPSHOT)
typedef int (*acctfunc_t)
    (struct vnode *, void *, int, int, struct fs *, daddr_t, int);

static int snapshot_setup(struct mount *, struct vnode *);
static int snapshot_copyfs(struct mount *, struct vnode *, void **);
static int snapshot_expunge(struct mount *, struct vnode *,
    struct fs *, daddr_t *, daddr_t **);
static int snapshot_expunge_snap(struct mount *, struct vnode *,
    struct fs *, daddr_t);
static int snapshot_writefs(struct mount *, struct vnode *, void *);
static int cgaccount(struct vnode *, int, int *);
static int cgaccount1(int, struct vnode *, void *, int);
static int expunge(struct vnode *, struct inode *, struct fs *,
    acctfunc_t, int);
static int indiracct(struct vnode *, struct vnode *, int, daddr_t,
    daddr_t, daddr_t, daddr_t, daddr_t, struct fs *, acctfunc_t, int);
static int fullacct(struct vnode *, void *, int, int, struct fs *,
    daddr_t, int);
static int snapacct(struct vnode *, void *, int, int, struct fs *,
    daddr_t, int);
static int mapacct(struct vnode *, void *, int, int, struct fs *,
    daddr_t, int);
#endif /* !defined(FFS_NO_SNAPSHOT) */

static int ffs_copyonwrite(void *, struct buf *, bool);
static int snapblkaddr(struct vnode *, daddr_t, daddr_t *);
static int rwfsblk(struct vnode *, int, void *, daddr_t);
static int syncsnap(struct vnode *);
static int wrsnapblk(struct vnode *, void *, daddr_t);
#if !defined(FFS_NO_SNAPSHOT)
static int blocks_in_journal(struct fs *);
#endif

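/*
 * is_active_snapshot() tests list membership; the db/ib/idb helpers
 * below hide the UFS1/UFS2 on-disk differences (32-bit vs. 64-bit block
 * pointers and byte order) behind a common daddr_t interface.
 */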
static inline bool is_active_snapshot(struct snap_info *, struct inode *);
static inline daddr_t db_get(struct inode *, int);
static inline void db_assign(struct inode *, int, daddr_t);
static inline daddr_t ib_get(struct inode *, int);
static inline void ib_assign(struct inode *, int, daddr_t);
static inline daddr_t idb_get(struct inode *, void *, int);
static inline void idb_assign(struct inode *, void *, int, daddr_t);

#ifdef DEBUG
static int snapdebug = 0;
#endif

int
ffs_snapshot_init(struct ufsmount *ump)
{
	struct snap_info *si;

	si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP);
	if (si == NULL)
		return ENOMEM;

	TAILQ_INIT(&si->si_snapshots);
	mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE);
	si->si_owner = NULL;
	si->si_gen = 0;
	si->si_snapblklist = NULL;

	return 0;
}

void
ffs_snapshot_fini(struct ufsmount *ump)
{
	struct snap_info *si;

	si = ump->um_snapinfo;
	ump->um_snapinfo = NULL;

	KASSERT(TAILQ_EMPTY(&si->si_snapshots));
	mutex_destroy(&si->si_lock);
	mutex_destroy(&si->si_snaplock);
	KASSERT(si->si_snapblklist == NULL);
	kmem_free(si, sizeof(*si));
}

/*
 * Create a snapshot file and initialize it for the filesystem.
 * Vnode is locked on entry and return.
 */
int
ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime)
{
#if defined(FFS_NO_SNAPSHOT)
	return EOPNOTSUPP;
}
#else /* defined(FFS_NO_SNAPSHOT) */
	bool suspended = false;
	int error, redo = 0, snaploc;
	void *sbbuf = NULL;
	daddr_t *snaplist = NULL, snaplistsize = 0;
	struct buf *bp, *nbp;
	struct fs *copy_fs = NULL;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct inode *ip = VTOI(vp);
	struct lwp *l = curlwp;
	struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
	struct timespec ts;
	struct timeval starttime;
#ifdef DEBUG
	struct timeval endtime;
#endif
	struct vnode *devvp = ip->i_devvp;

	/*
	 * If the vnode already is a snapshot, return.
	 */
	if ((VTOI(vp)->i_flags & SF_SNAPSHOT)) {
		if ((VTOI(vp)->i_flags & SF_SNAPINVAL))
			return EINVAL;
		if (ctime) {
			ctime->tv_sec = DIP(VTOI(vp), mtime);
			ctime->tv_nsec = DIP(VTOI(vp), mtimensec);
		}
		return 0;
	}
	/*
	 * Check for free snapshot slot in the superblock.
	 */
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == 0)
			break;
	if (snaploc == FSMAXSNAP)
		return (ENOSPC);
	/*
	 * Prepare the vnode to become a snapshot.
	 */
	error = snapshot_setup(mp, vp);
	if (error)
		goto out;

	/*
	 * Copy all the cylinder group maps. Although the
	 * filesystem is still active, we hope that only a few
	 * cylinder groups will change between now and when we
	 * suspend operations. Thus, we will be able to quickly
	 * touch up the few cylinder groups that changed during
	 * the suspension period.
	 */
	error = cgaccount(vp, 1, NULL);
	if (error)
		goto out;

	/*
	 * snapshot is now valid
	 */
	ip->i_flags &= ~SF_SNAPINVAL;
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;

	/*
	 * Ensure that the snapshot is completely on disk.
	 * Since we have marked it as a snapshot it is safe to
	 * unlock it as no process will be allowed to write to it.
	 */
	error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
	if (error)
		goto out;
	VOP_UNLOCK(vp);
	/*
	 * All allocations are done, so we can now suspend the filesystem.
	 */
	error = vfs_suspend(vp->v_mount, 0);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (error)
		goto out;
	suspended = true;
	getmicrotime(&starttime);
	/*
	 * First, copy all the cylinder group maps that have changed.
	 */
	error = cgaccount(vp, 2, &redo);
	if (error)
		goto out;
	/*
	 * Create a copy of the superblock and its summary information.
	 */
	error = snapshot_copyfs(mp, vp, &sbbuf);
	copy_fs = (struct fs *)((char *)sbbuf + blkoff(fs, fs->fs_sblockloc));
	if (error)
		goto out;
	/*
	 * Expunge unlinked files from our view.
	 */
	error = snapshot_expunge(mp, vp, copy_fs, &snaplistsize, &snaplist);
	if (error)
		goto out;
	/*
	 * Record snapshot inode. Since this is the newest snapshot,
	 * it must be placed at the end of the list.
	 */
	if (ip->i_nlink > 0)
		fs->fs_snapinum[snaploc] = ip->i_number;

	mutex_enter(&si->si_lock);
	if (is_active_snapshot(si, ip))
		panic("ffs_snapshot: %"PRIu64" already on list", ip->i_number);
	TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
	if (TAILQ_FIRST(&si->si_snapshots) == ip) {
		/*
		 * If this is the first snapshot on this filesystem, put the
		 * preliminary list in place and establish the cow handler.
		 */
		si->si_snapblklist = snaplist;
		fscow_establish(mp, ffs_copyonwrite, devvp);
	}
	si->si_gen++;
	mutex_exit(&si->si_lock);

	vp->v_vflag |= VV_SYSTEM;
	/*
	 * Set the mtime to the time the snapshot has been taken.
	 */
	TIMEVAL_TO_TIMESPEC(&starttime, &ts);
	if (ctime)
		*ctime = ts;
	DIP_ASSIGN(ip, mtime, ts.tv_sec);
	DIP_ASSIGN(ip, mtimensec, ts.tv_nsec);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Copy allocation information from all snapshots and then
	 * expunge them from our view.
	 */
	error = snapshot_expunge_snap(mp, vp, copy_fs, snaplistsize);
	if (error)
		goto out;
	/*
	 * Write the superblock and its summary information to the snapshot.
	 */
	error = snapshot_writefs(mp, vp, sbbuf);
	if (error)
		goto out;
	/*
	 * We're nearly done, ensure that the snapshot is completely on disk.
	 */
	error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
	if (error)
		goto out;
	/*
	 * Invalidate and free all pages on the snapshot vnode.
	 * We will read and write through the buffercache.
	 */
	mutex_enter(vp->v_interlock);
	error = VOP_PUTPAGES(vp, 0, 0,
	    PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO | PGO_FREE);
	if (error)
		goto out;
	/*
	 * Invalidate short (< fs_bsize) buffers. We will always read
	 * full size buffers later.
	 */
	mutex_enter(&bufcache_lock);
	KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		KASSERT((bp->b_cflags & BC_BUSY) == 0);
		if (bp->b_bcount < fs->fs_bsize) {
			bp->b_cflags |= BC_BUSY;
			brelsel(bp, BC_INVAL | BC_VFLUSH);
		}
	}
	mutex_exit(&bufcache_lock);

out:
	if (sbbuf != NULL) {
		free(copy_fs->fs_csp, M_UFSMNT);
		free(sbbuf, M_UFSMNT);
	}
	if (fs->fs_active != NULL) {
		free(fs->fs_active, M_DEVBUF);
		fs->fs_active = NULL;
	}

	mutex_enter(&si->si_lock);
	if (snaplist != NULL) {
		if (si->si_snapblklist == snaplist)
			si->si_snapblklist = NULL;
		free(snaplist, M_UFSMNT);
	}
	if (error) {
		fs->fs_snapinum[snaploc] = 0;
	} else {
		/*
		 * As this is the newest list, it is the most inclusive, so
		 * should replace the previous list.
		 */
		si->si_snapblklist = ip->i_snapblklist;
	}
	si->si_gen++;
	mutex_exit(&si->si_lock);

	if (suspended) {
		VOP_UNLOCK(vp);
		vfs_resume(vp->v_mount);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef DEBUG
		getmicrotime(&endtime);
		timersub(&endtime, &starttime, &endtime);
		printf("%s: suspended %lld.%03d sec, redo %d of %d\n",
		    mp->mnt_stat.f_mntonname, (long long)endtime.tv_sec,
		    endtime.tv_usec / 1000, redo, fs->fs_ncg);
#endif
	}
	if (error) {
		if (!UFS_WAPBL_BEGIN(mp)) {
			(void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
			UFS_WAPBL_END(mp);
		}
	} else if (ip->i_nlink > 0)
		vref(vp);
	return (error);
}

/*
 * Prepare vnode to become a snapshot.
 */
static int
snapshot_setup(struct mount *mp, struct vnode *vp)
{
	int error, n, len, loc, cg;
	daddr_t blkno, numblks;
	struct buf *ibp, *nbp;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct lwp *l = curlwp;
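	/*
	 * Heuristic: flush the current WAPBL transaction and start a new
	 * one every wbreak block allocations, i.e. after roughly 1/8 of
	 * the journal capacity, so the preallocation loops below cannot
	 * fill the log.
	 */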
	const int wbreak = blocks_in_journal(fs)/8;
	struct inode *ip = VTOI(vp);

	/*
	 * Check mount, exclusive reference and owner.
	 */
	if (vp->v_mount != mp)
		return EXDEV;
	if (vp->v_usecount != 1 || vp->v_writecount != 0)
		return EBUSY;
	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_SNAPSHOT,
	    0, mp, vp, NULL);
	if (error)
		return EACCES;

	if (vp->v_size != 0) {
		error = ffs_truncate(vp, 0, 0, NOCRED);
		if (error)
			return error;
	}

	/* Change inode to snapshot type file. */
	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
#if defined(QUOTA) || defined(QUOTA2)
	/* snapshot inodes are not accounted in quotas */
	chkiq(ip, -1, l->l_cred, 0);
#endif
	ip->i_flags |= (SF_SNAPSHOT | SF_SNAPINVAL);
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	ffs_update(vp, NULL, NULL, UPDATE_WAIT);
	UFS_WAPBL_END(mp);

	KASSERT(ip->i_flags & SF_SNAPSHOT);
	/*
	 * Write an empty list of preallocated blocks to the end of
	 * the snapshot to set size to at least that of the filesystem.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	blkno = 1;
	blkno = ufs_rw64(blkno, UFS_FSNEEDSWAP(fs));
	error = vn_rdwr(UIO_WRITE, vp,
	    (void *)&blkno, sizeof(blkno), lblktosize(fs, (off_t)numblks),
	    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL);
	if (error)
		return error;
	/*
	 * Preallocate critical data structures so that we can copy
	 * them in without further allocation after we suspend all
	 * operations on the filesystem. We would like to just release
	 * the allocated buffers without writing them since they will
	 * be filled in below once we are ready to go, but this upsets
	 * the soft update code, so we go ahead and write the new buffers.
	 *
	 * Allocate all indirect blocks and mark all of them as not
	 * needing to be copied.
	 */
	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	for (blkno = NDADDR, n = 0; blkno < numblks; blkno += NINDIR(fs)) {
		error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
		if (error)
			goto out;
		brelse(ibp, 0);
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}
	/*
	 * Allocate copies for the superblock and its summary information.
	 */
	error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, l->l_cred,
	    0, &nbp);
	if (error)
		goto out;
	bawrite(nbp);
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++) {
		error = ffs_balloc(vp, lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, l->l_cred, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}
	/*
	 * Allocate all cylinder group blocks.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, l->l_cred, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}

out:
	UFS_WAPBL_END(mp);
	return error;
}

/*
 * Create a copy of the superblock and its summary information.
 * It is up to the caller to free *sbbuf and copyfs->fs_csp.
 */
static int
snapshot_copyfs(struct mount *mp, struct vnode *vp, void **sbbuf)
{
	int error, i, len, loc, size;
	void *space;
	int32_t *lp;
	struct buf *bp;
	struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
	struct lwp *l = curlwp;
	struct vnode *devvp = VTOI(vp)->i_devvp;

	/*
	 * Grab a copy of the superblock and its summary information.
	 * We delay writing it until the suspension is released below.
	 */
	*sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
	loc = blkoff(fs, fs->fs_sblockloc);
	if (loc > 0)
		memset(*sbbuf, 0, loc);
	copyfs = (struct fs *)((char *)(*sbbuf) + loc);
	memcpy(copyfs, fs, fs->fs_sbsize);
	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
	if (fs->fs_sbsize < size)
		memset((char *)(*sbbuf) + loc + fs->fs_sbsize, 0,
		    size - fs->fs_sbsize);
	size = blkroundup(fs, fs->fs_cssize);
	if (fs->fs_contigsumsize > 0)
		size += fs->fs_ncg * sizeof(int32_t);
	space = malloc(size, M_UFSMNT, M_WAITOK);
	copyfs->fs_csp = space;
	memcpy(copyfs->fs_csp, fs->fs_csp, fs->fs_cssize);
	space = (char *)space + fs->fs_cssize;
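	/*
	 * The summary area may end in mid-block; if so, read the trailing
	 * fragments of that last block straight from the device so the
	 * in-core copy covers whole blocks.
	 */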
	loc = howmany(fs->fs_cssize, fs->fs_fsize);
	i = fs->fs_frag - loc % fs->fs_frag;
	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
	if (len > 0) {
		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
		    len, l->l_cred, 0, &bp)) != 0) {
			free(copyfs->fs_csp, M_UFSMNT);
			free(*sbbuf, M_UFSMNT);
			*sbbuf = NULL;
			return error;
		}
		memcpy(space, bp->b_data, (u_int)len);
		space = (char *)space + len;
		brelse(bp, BC_INVAL | BC_NOCACHE);
	}
	if (fs->fs_contigsumsize > 0) {
		copyfs->fs_maxcluster = lp = space;
		for (i = 0; i < fs->fs_ncg; i++)
			*lp++ = fs->fs_contigsumsize;
	}
	if (mp->mnt_wapbl)
		copyfs->fs_flags &= ~FS_DOWAPBL;
	return 0;
}

/*
 * We must check for active files that have been unlinked (i.e., with a zero
 * link count). We have to expunge all trace of these files from the snapshot
 * so that they are not reclaimed prematurely by fsck or unnecessarily dumped.
 * Note that we skip unlinked snapshot files as they will be handled separately.
 * Calculate the snapshot list size and create a preliminary list.
 */
static int
snapshot_expunge(struct mount *mp, struct vnode *vp, struct fs *copy_fs,
    daddr_t *snaplistsize, daddr_t **snaplist)
{
	int cg, error = 0, len, loc;
	daddr_t blkno, *blkp;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct inode *xp;
	struct lwp *l = curlwp;
	struct vattr vat;
	struct vnode *logvp = NULL, *mvp = NULL, *xvp;

	*snaplist = NULL;
	/*
	 * Get the log inode if any.
	 */
	if ((fs->fs_flags & FS_DOWAPBL) &&
	    fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
		error = VFS_VGET(mp,
		    fs->fs_journallocs[UFS_WAPBL_INFS_INO], &logvp);
		if (error)
			goto out;
	}
	/*
	 * Allocate a marker vnode.
	 */
	mvp = vnalloc(mp);
	/*
	 * We also calculate the needed size for the snapshot list.
	 */
	*snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
	    FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
	mutex_enter(&mntvnode_lock);
	/*
	 * NOTE: not using the TAILQ_FOREACH here since in this loop vgone()
	 * and vclean() can be called indirectly
	 */
	for (xvp = TAILQ_FIRST(&mp->mnt_vnodelist); xvp; xvp = vunmark(mvp)) {
		vmark(mvp, xvp);
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (xvp->v_mount != mp || vismarker(xvp))
			continue;
		mutex_enter(xvp->v_interlock);
		if ((xvp->v_iflag & VI_XLOCK) ||
		    xvp->v_usecount == 0 || xvp->v_type == VNON ||
		    VTOI(xvp) == NULL ||
		    (VTOI(xvp)->i_flags & SF_SNAPSHOT)) {
			mutex_exit(xvp->v_interlock);
			continue;
		}
		mutex_exit(&mntvnode_lock);
		/*
		 * XXXAD should increase vnode ref count to prevent it
		 * disappearing or being recycled.
		 */
		mutex_exit(xvp->v_interlock);
#ifdef DEBUG
		if (snapdebug)
			vprint("ffs_snapshot: busy vnode", xvp);
#endif
		xp = VTOI(xvp);
		if (xvp != logvp) {
			if (VOP_GETATTR(xvp, &vat, l->l_cred) == 0 &&
			    vat.va_nlink > 0) {
				mutex_enter(&mntvnode_lock);
				continue;
			}
			if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
				mutex_enter(&mntvnode_lock);
				continue;
			}
		}
		/*
		 * If there is a fragment, clear it here.
		 */
		blkno = 0;
		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
		if (loc < NDADDR) {
			len = fragroundup(fs, blkoff(fs, xp->i_size));
			if (len > 0 && len < fs->fs_bsize) {
				error = UFS_WAPBL_BEGIN(mp);
				if (error) {
					(void)vunmark(mvp);
					goto out;
				}
				ffs_blkfree_snap(copy_fs, vp, db_get(xp, loc),
				    len, xp->i_number);
				blkno = db_get(xp, loc);
				db_assign(xp, loc, 0);
				UFS_WAPBL_END(mp);
			}
		}
		*snaplistsize += 1;
		error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY);
		if (blkno)
			db_assign(xp, loc, blkno);
		if (!error) {
			error = UFS_WAPBL_BEGIN(mp);
			if (!error) {
				error = ffs_freefile_snap(copy_fs, vp,
				    xp->i_number, xp->i_mode);
				UFS_WAPBL_END(mp);
			}
		}
		if (error) {
			(void)vunmark(mvp);
			goto out;
		}
		mutex_enter(&mntvnode_lock);
	}
	mutex_exit(&mntvnode_lock);
	/*
	 * Create a preliminary list of preallocated snapshot blocks.
	 */
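	/* Entry 0 is reserved for the list length and is filled in last. */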
	*snaplist = malloc(*snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
	blkp = &(*snaplist)[1];
	*blkp++ = lblkno(fs, fs->fs_sblockloc);
	blkno = fragstoblks(fs, fs->fs_csaddr);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if (fragstoblks(fs, cgtod(fs, cg)) > blkno)
			break;
		*blkp++ = fragstoblks(fs, cgtod(fs, cg));
	}
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++)
		*blkp++ = blkno + loc;
	for (; cg < fs->fs_ncg; cg++)
		*blkp++ = fragstoblks(fs, cgtod(fs, cg));
	(*snaplist)[0] = blkp - &(*snaplist)[0];

out:
	if (mvp != NULL)
		vnfree(mvp);
	if (logvp != NULL)
		vput(logvp);
	if (error && *snaplist != NULL) {
		free(*snaplist, M_UFSMNT);
		*snaplist = NULL;
	}

	return error;
}

/*
 * Copy allocation information from all the snapshots in this snapshot and
 * then expunge them from its view. Also, collect the list of allocated
 * blocks in i_snapblklist.
 */
static int
snapshot_expunge_snap(struct mount *mp, struct vnode *vp,
    struct fs *copy_fs, daddr_t snaplistsize)
{
	int error = 0, i;
	daddr_t numblks, *snaplist = NULL;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct inode *ip = VTOI(vp), *xp;
	struct lwp *l = curlwp;
	struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;

	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) {
		if (xp != ip) {
			error = expunge(vp, xp, fs, snapacct, BLK_SNAP);
			if (error)
				break;
		}
		if (xp->i_nlink != 0)
			continue;
		error = UFS_WAPBL_BEGIN(mp);
		if (error)
			break;
		error = ffs_freefile_snap(copy_fs, vp, xp->i_number, xp->i_mode);
		UFS_WAPBL_END(mp);
		if (error)
			break;
	}
	if (error)
		goto out;
	/*
	 * Allocate space for the full list of preallocated snapshot blocks.
	 */
	snaplist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
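	/*
	 * i_snapblklist points just past the length slot; mapacct()
	 * appends the logical block numbers through it during the
	 * expunge below.
	 */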
	ip->i_snapblklist = &snaplist[1];
	/*
	 * Expunge the blocks used by the snapshots from the set of
	 * blocks marked as used in the snapshot bitmaps. Also, collect
	 * the list of allocated blocks in i_snapblklist.
	 */
	error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP);
	if (error)
		goto out;
	if (snaplistsize < ip->i_snapblklist - snaplist)
		panic("ffs_snapshot: list too small");
	snaplistsize = ip->i_snapblklist - snaplist;
	snaplist[0] = snaplistsize;
	ip->i_snapblklist = &snaplist[0];
	/*
	 * Write out the list of allocated blocks to the end of the snapshot.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	for (i = 0; i < snaplistsize; i++)
		snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
	error = vn_rdwr(UIO_WRITE, vp, (void *)snaplist,
	    snaplistsize * sizeof(daddr_t), lblktosize(fs, (off_t)numblks),
	    UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT, l->l_cred, NULL, NULL);
	for (i = 0; i < snaplistsize; i++)
		snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
out:
	if (error && snaplist != NULL) {
		free(snaplist, M_UFSMNT);
		ip->i_snapblklist = NULL;
	}
	return error;
}

/*
 * Write the superblock and its summary information to the snapshot.
 * Make sure the first NDADDR blocks get copied to the snapshot.
 */
static int
snapshot_writefs(struct mount *mp, struct vnode *vp, void *sbbuf)
{
	int error, len, loc;
	void *space;
	daddr_t blkno;
	struct buf *bp;
	struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
	struct inode *ip = VTOI(vp);
	struct lwp *l = curlwp;

	copyfs = (struct fs *)((char *)sbbuf + blkoff(fs, fs->fs_sblockloc));

	/*
	 * Write the superblock and its summary information
	 * to the snapshot.
	 */
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	space = copyfs->fs_csp;
#ifdef FFS_EI
	if (UFS_FSNEEDSWAP(fs)) {
		ffs_sb_swap(copyfs, copyfs);
		ffs_csum_swap(space, space, fs->fs_cssize);
	}
#endif
	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	for (loc = 0; loc < len; loc++) {
		error = bread(vp, blkno + loc, fs->fs_bsize, l->l_cred,
		    B_MODIFY, &bp);
		if (error) {
			break;
		}
		memcpy(bp->b_data, space, fs->fs_bsize);
		space = (char *)space + fs->fs_bsize;
		bawrite(bp);
	}
	if (error)
		goto out;
	error = bread(vp, lblkno(fs, fs->fs_sblockloc),
	    fs->fs_bsize, l->l_cred, B_MODIFY, &bp);
	if (error) {
		goto out;
	} else {
		memcpy(bp->b_data, sbbuf, fs->fs_bsize);
		bawrite(bp);
	}
	/*
	 * Copy the first NDADDR blocks to the snapshot so ffs_copyonwrite()
	 * and ffs_snapblkfree() will always work on indirect blocks.
	 */
	for (loc = 0; loc < NDADDR; loc++) {
		if (db_get(ip, loc) != 0)
			continue;
		error = ffs_balloc(vp, lblktosize(fs, (off_t)loc),
		    fs->fs_bsize, l->l_cred, 0, &bp);
		if (error)
			break;
		error = rwfsblk(vp, B_READ, bp->b_data, loc);
		if (error) {
			brelse(bp, 0);
			break;
		}
		bawrite(bp);
	}

out:
	UFS_WAPBL_END(mp);
	return error;
}

/*
 * Copy all cylinder group maps.
 */
static int
cgaccount(struct vnode *vp, int passno, int *redo)
{
	int cg, error = 0;
	struct buf *nbp;
	struct fs *fs = VTOI(vp)->i_fs;

	if (redo != NULL)
		*redo = 0;
	if (passno == 1)
		fs->fs_active = malloc(howmany(fs->fs_ncg, NBBY),
		    M_DEVBUF, M_WAITOK | M_ZERO);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if (passno == 2 && ACTIVECG_ISSET(fs, cg))
			continue;

		if (redo != NULL)
			*redo += 1;
		error = UFS_WAPBL_BEGIN(vp->v_mount);
		if (error)
			return error;
		error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, curlwp->l_cred, 0, &nbp);
		if (error) {
			UFS_WAPBL_END(vp->v_mount);
			break;
		}
		error = cgaccount1(cg, vp, nbp->b_data, passno);
		bawrite(nbp);
		UFS_WAPBL_END(vp->v_mount);
		if (error)
			break;
	}
	return error;
}

/*
 * Copy a cylinder group map. All the unallocated blocks are marked
 * BLK_NOCOPY so that the snapshot knows that it need not copy them
 * if they are later written. If passno is 1, then this is a first
 * pass, so only setting needs to be done. If passno is 2, then this
 * is a revision to a previous pass which must be undone as the
 * replacement pass is done.
 */
static int
cgaccount1(int cg, struct vnode *vp, void *data, int passno)
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	struct lwp *l = curlwp;
	daddr_t base, numblks;
	int error, len, loc, ns, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	ns = UFS_FSNEEDSWAP(fs);
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, l->l_cred, 0, &bp);
	if (error) {
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, ns)) {
		brelse(bp, 0);
		return (EIO);
	}
	ACTIVECG_SET(fs, cg);

	memcpy(data, bp->b_data, fs->fs_cgsize);
	brelse(bp, 0);
	if (fs->fs_cgsize < fs->fs_bsize)
		memset((char *)data + fs->fs_cgsize, 0,
		    fs->fs_bsize - fs->fs_cgsize);
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cg * fs->fs_fpg / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	if (base < NDADDR) {
		for ( ; loc < NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
				db_assign(ip, loc, BLK_NOCOPY);
			else if (db_get(ip, loc) == BLK_NOCOPY) {
				if (passno == 2)
					db_assign(ip, loc, 0);
				else if (passno == 1)
					panic("ffs_snapshot: lost direct block");
			}
		}
	}
	if ((error = ffs_balloc(vp, lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
		return (error);
	indiroff = (base + loc - NDADDR) % NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= NINDIR(fs)) {
			bawrite(ibp);
			if ((error = ffs_balloc(vp,
			    lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
				return (error);
			indiroff = 0;
		}
		if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
			idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY);
		else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) {
			if (passno == 2)
				idb_assign(ip, ibp->b_data, indiroff, 0);
			else if (passno == 1)
				panic("ffs_snapshot: lost indirect block");
		}
	}
	bdwrite(ibp);
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed.
 */
static int
expunge(struct vnode *snapvp, struct inode *cancelip, struct fs *fs,
    acctfunc_t acctfunc, int expungetype)
{
	int i, error, ns;
	daddr_t lbn, rlbn;
	daddr_t len, blkno, numblks, blksperindir;
	struct ufs1_dinode *dip1;
	struct ufs2_dinode *dip2;
	struct lwp *l = curlwp;
	void *bap;
	struct buf *bp;
	struct mount *mp;

	ns = UFS_FSNEEDSWAP(fs);
	mp = snapvp->v_mount;

	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	error = snapblkaddr(snapvp, lbn, &blkno);
	if (error)
		return error;
	if (blkno != 0) {
		error = bread(snapvp, lbn, fs->fs_bsize, l->l_cred,
		    B_MODIFY, &bp);
	} else {
		error = ffs_balloc(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, l->l_cred, 0, &bp);
		if (! error)
			error = rwfsblk(snapvp, B_READ, bp->b_data, lbn);
	}
	if (error) {
		UFS_WAPBL_END(mp);
		return error;
	}
	/*
	 * Set a snapshot inode to be a zero-length file; regular files
	 * or unlinked snapshots become completely unallocated.
	 */
	if (fs->fs_magic == FS_UFS1_MAGIC) {
		dip1 = (struct ufs1_dinode *)bp->b_data +
		    ino_to_fsbo(fs, cancelip->i_number);
		if (cancelip->i_flags & SF_SNAPSHOT) {
			dip1->di_flags =
			    ufs_rw32(ufs_rw32(dip1->di_flags, ns) |
			    SF_SNAPINVAL, ns);
		}
		if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
			dip1->di_mode = 0;
		dip1->di_size = 0;
		dip1->di_blocks = 0;
		memset(&dip1->di_db[0], 0, (NDADDR + NIADDR) * sizeof(int32_t));
	} else {
		dip2 = (struct ufs2_dinode *)bp->b_data +
		    ino_to_fsbo(fs, cancelip->i_number);
		if (cancelip->i_flags & SF_SNAPSHOT) {
			dip2->di_flags =
			    ufs_rw32(ufs_rw32(dip2->di_flags, ns) |
			    SF_SNAPINVAL, ns);
		}
		if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
			dip2->di_mode = 0;
		dip2->di_size = 0;
		dip2->di_blocks = 0;
		memset(&dip2->di_db[0], 0, (NDADDR + NIADDR) * sizeof(int64_t));
	}
	bdwrite(bp);
	UFS_WAPBL_END(mp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	if (fs->fs_magic == FS_UFS1_MAGIC)
		bap = &cancelip->i_ffs1_db[0];
	else
		bap = &cancelip->i_ffs2_db[0];
	error = (*acctfunc)(snapvp, bap, 0, NDADDR, fs, 0, expungetype);
	if (error)
		return (error);
	if (fs->fs_magic == FS_UFS1_MAGIC)
		bap = &cancelip->i_ffs1_ib[0];
	else
		bap = &cancelip->i_ffs2_ib[0];
	error = (*acctfunc)(snapvp, bap, 0, NIADDR, fs, -1, expungetype);
	if (error)
		return (error);
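	/*
	 * Walk the indirect block hierarchy.  Indirect blocks live at
	 * negative logical block numbers (the ufs_getlbns() convention),
	 * so lbn counts down while the data block address rlbn counts up.
	 */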
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct(snapvp, ITOV(cancelip), i,
		    ib_get(cancelip, i), lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct(struct vnode *snapvp, struct vnode *cancelvp, int level,
    daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks,
    daddr_t blksperindir, struct fs *fs, acctfunc_t acctfunc, int expungetype)
{
	int error, num, i;
	daddr_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	daddr_t last;
	void *bap;
	struct buf *bp;

	if (blkno == 0) {
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct: missing indir");
	}
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	error = ffs_getblk(cancelvp, lbn, fsbtodb(fs, blkno), fs->fs_bsize,
	    false, &bp);
	if (error)
		return error;
	if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error =
	    rwfsblk(bp->b_vp, B_READ, bp->b_data, fragstoblks(fs, blkno)))) {
		brelse(bp, 0);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK | M_ZERO);
	memcpy((void *)bap, bp->b_data, fs->fs_bsize);
	brelse(bp, 0);
	error = (*acctfunc)(snapvp, bap, 0, last,
	    fs, level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct(snapvp, cancelvp, level,
		    idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks,
		    subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	free(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
    struct fs *fs, daddr_t lblkno,
    int exptype /* BLK_SNAP or BLK_NOCOPY */)
{
	int error;

	if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
    struct fs *fs, daddr_t lblkno,
    int expungetype /* BLK_SNAP or BLK_NOCOPY */)
{
	struct inode *ip = VTOI(vp);
	struct lwp *l = curlwp;
	struct mount *mp = vp->v_mount;
	daddr_t blkno;
	daddr_t lbn;
	struct buf *ibp;
	int error, n;
	const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;

	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	for ( n = 0; oldblkp < lastblkp; oldblkp++) {
		blkno = idb_get(ip, bap, oldblkp);
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
			if (error)
				break;
			blkno = idb_get(ip, ibp->b_data,
			    (lbn - NDADDR) % NINDIR(fs));
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp, 0);
		} else {
			if (blkno != 0)
				panic("snapacct: bad block");
			if (lbn < NDADDR)
				db_assign(ip, lbn, expungetype);
			else {
				idb_assign(ip, ibp->b_data,
				    (lbn - NDADDR) % NINDIR(fs), expungetype);
				bdwrite(ibp);
			}
		}
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}
	UFS_WAPBL_END(mp);
	return error;
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
    struct fs *fs, daddr_t lblkno, int expungetype)
{
	daddr_t blkno;
	struct inode *ip;
	struct mount *mp = vp->v_mount;
	ino_t inum;
	int acctit, error, n;
	const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;

	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		return error;
	ip = VTOI(vp);
	inum = ip->i_number;
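	/*
	 * A starting lblkno of -1 marks a range of indirect block
	 * pointers; these are never entered into the hint list of
	 * preallocated snapshot blocks.
	 */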
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( n = 0; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = idb_get(ip, bap, oldblkp);
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree_snap(fs, vp, blkno, fs->fs_bsize, inum);
		if (wbreak > 0 && (++n % wbreak) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				return error;
		}
	}
	UFS_WAPBL_END(mp);
	return (0);
}

/*
 * Number of blocks that fit into the journal or zero if not logging.
 */
static int
blocks_in_journal(struct fs *fs)
{
	off_t bpj;

	if ((fs->fs_flags & FS_DOWAPBL) == 0)
		return 0;
	bpj = 1;
	if (fs->fs_journal_version == UFS_WAPBL_VERSION) {
		switch (fs->fs_journal_location) {
		case UFS_WAPBL_JOURNALLOC_END_PARTITION:
			bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]*
			    fs->fs_journallocs[UFS_WAPBL_EPART_COUNT];
			break;
		case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
			bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]*
			    fs->fs_journallocs[UFS_WAPBL_INFS_COUNT];
			break;
		}
	}
	bpj /= fs->fs_bsize;
	return (bpj > 0 ? bpj : 1);
}
#endif /* defined(FFS_NO_SNAPSHOT) */

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(struct inode *ip)
{
	struct mount *mp = ip->i_devvp->v_specmountpoint;
	struct inode *xp;
	struct fs *fs;
	struct snap_info *si;
	int snaploc;

	si = VFSTOUFS(mp)->um_snapinfo;

	/*
	 * Find snapshot in incore list.
	 */
	mutex_enter(&si->si_lock);
	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	mutex_exit(&si->si_lock);
	if (xp != NULL)
		vrele(ITOV(ip));
#ifdef DEBUG
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %llu\n",
		    (unsigned long long)ip->i_number);
#endif
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	mutex_enter(&si->si_lock);
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
	si->si_gen++;
	mutex_exit(&si->si_lock);
}

/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(struct vnode *vp)
{
	struct inode *ip = VTOI(vp), *xp;
	struct vnode *devvp = ip->i_devvp;
	struct fs *fs = ip->i_fs;
	struct mount *mp = devvp->v_specmountpoint;
	struct buf *ibp;
	struct snap_info *si;
	struct lwp *l = curlwp;
	daddr_t numblks, blkno, dblk;
	int error, loc, last;

	si = VFSTOUFS(mp)->um_snapinfo;
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	mutex_enter(&si->si_snaplock);
	mutex_enter(&si->si_lock);
	if (is_active_snapshot(si, ip)) {
		TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap);
		if (TAILQ_FIRST(&si->si_snapshots) != 0) {
			/* Roll back the list of preallocated blocks. */
			xp = TAILQ_LAST(&si->si_snapshots, inodelst);
			si->si_snapblklist = xp->i_snapblklist;
			si->si_gen++;
			mutex_exit(&si->si_lock);
			mutex_exit(&si->si_snaplock);
		} else {
			si->si_snapblklist = 0;
			si->si_gen++;
			mutex_exit(&si->si_lock);
			mutex_exit(&si->si_snaplock);
			fscow_disestablish(mp, ffs_copyonwrite, devvp);
		}
		if (ip->i_snapblklist != NULL) {
			free(ip->i_snapblklist, M_UFSMNT);
			ip->i_snapblklist = NULL;
		}
	} else {
		mutex_exit(&si->si_lock);
		mutex_exit(&si->si_snaplock);
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = db_get(ip, blkno);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			db_assign(ip, blkno, 0);
		else if ((dblk == blkstofrags(fs, blkno) &&
		    ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		    ip->i_number))) {
			DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
			db_assign(ip, blkno, 0);
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			dblk = idb_get(ip, ibp->b_data, loc);
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				idb_assign(ip, ibp->b_data, loc, 0);
			else if (dblk == blkstofrags(fs, blkno) &&
			    ffs_snapblkfree(fs, ip->i_devvp, dblk,
			    fs->fs_bsize, ip->i_number)) {
				DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
				idb_assign(ip, ibp->b_data, loc, 0);
			}
		}
		bawrite(ibp);
		UFS_WAPBL_END(mp);
		error = UFS_WAPBL_BEGIN(mp);
		KASSERT(error == 0);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~(SF_SNAPSHOT | SF_SNAPINVAL);
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
#if defined(QUOTA) || defined(QUOTA2)
	chkdq(ip, DIP(ip, blocks), l->l_cred, FORCE);
	chkiq(ip, 1, l->l_cred, FORCE);
#endif
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(struct fs *fs, struct vnode *devvp, daddr_t bno,
    long size, ino_t inum)
{
	struct mount *mp = devvp->v_specmountpoint;
	struct buf *ibp;
	struct inode *ip;
	struct vnode *vp = NULL;
	struct snap_info *si;
	void *saved_data = NULL;
	daddr_t lbn;
	daddr_t blkno;
	uint32_t gen;
	int indiroff = 0, error = 0, claimedblk = 0;

	si = VFSTOUFS(mp)->um_snapinfo;
	lbn = fragstoblks(fs, bno);
	mutex_enter(&si->si_snaplock);
	mutex_enter(&si->si_lock);
	si->si_owner = curlwp;

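	/*
	 * si_gen is bumped on every change to the snapshot list.  If it
	 * moved while si_lock was dropped, the list may have changed
	 * under us, so restart the scan from the top.
	 */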
retry:
	gen = si->si_gen;
	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			mutex_exit(&si->si_lock);
			error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, FSCRED, B_METAONLY, &ibp);
			if (error) {
				mutex_enter(&si->si_lock);
				break;
			}
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = idb_get(ip, ibp->b_data, indiroff);
			mutex_enter(&si->si_lock);
			if (gen != si->si_gen) {
				brelse(ibp, 0);
				goto retry;
			}
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			if (lbn < NDADDR) {
				db_assign(ip, lbn, BLK_NOCOPY);
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else {
				idb_assign(ip, ibp->b_data, indiroff,
				    BLK_NOCOPY);
				mutex_exit(&si->si_lock);
				if (ip->i_nlink > 0)
					bwrite(ibp);
				else
					bdwrite(ibp);
				mutex_enter(&si->si_lock);
				if (gen != si->si_gen)
					goto retry;
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= NDADDR)
				brelse(ibp, 0);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %llu lbn %" PRId64
				    " from inum %llu\n",
				    "Grabonremove: snapino",
				    (unsigned long long)ip->i_number,
				    lbn, (unsigned long long)inum);
#endif
1639 mutex_exit(&si->si_lock);
1640 if (lbn < NDADDR) {
1641 db_assign(ip, lbn, bno);
1642 } else {
1643 idb_assign(ip, ibp->b_data, indiroff, bno);
1644 if (ip->i_nlink > 0)
1645 bwrite(ibp);
1646 else
1647 bdwrite(ibp);
1648 }
1649 DIP_ADD(ip, blocks, btodb(size));
1650 ip->i_flag |= IN_CHANGE | IN_UPDATE;
1651 if (ip->i_nlink > 0 && mp->mnt_wapbl)
1652 error = syncsnap(vp);
1653 else
1654 error = 0;
1655 mutex_enter(&si->si_lock);
1656 si->si_owner = NULL;
1657 mutex_exit(&si->si_lock);
1658 mutex_exit(&si->si_snaplock);
1659 return (error == 0);
1660 }
1661 if (lbn >= NDADDR)
1662 brelse(ibp, 0);
1663 #ifdef DEBUG
1664 if (snapdebug)
1665 printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n",
1666 "Copyonremove: snapino ",
1667 (unsigned long long)ip->i_number,
1668 lbn, "for inum", (unsigned long long)inum, size);
1669 #endif
1670 /*
1671 * If we have already read the old block contents, then
1672 * simply copy them to the new block. Note that we need
1673 * to synchronously write snapshots that have not been
1674 * unlinked, and hence will be visible after a crash,
1675 * to ensure their integrity.
1676 */
1677 mutex_exit(&si->si_lock);
1678 if (saved_data == NULL) {
1679 saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
1680 error = rwfsblk(vp, B_READ, saved_data, lbn);
1681 if (error) {
1682 free(saved_data, M_UFSMNT);
1683 saved_data = NULL;
1684 mutex_enter(&si->si_lock);
1685 break;
1686 }
1687 }
1688 error = wrsnapblk(vp, saved_data, lbn);
1689 if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
1690 error = syncsnap(vp);
1691 mutex_enter(&si->si_lock);
1692 if (error)
1693 break;
1694 if (gen != si->si_gen)
1695 goto retry;
1696 }
1697 si->si_owner = NULL;
1698 mutex_exit(&si->si_lock);
1699 mutex_exit(&si->si_snaplock);
1700 if (saved_data)
1701 free(saved_data, M_UFSMNT);
1702 /*
1703 * If we have been unable to allocate a block in which to do
1704 * the copy, then return non-zero so that the fragment will
1705 * not be freed. Although space will be lost, the snapshot
1706 * will stay consistent.
1707 */
1708 return (error);
1709 }
1710
1711 /*
1712 * Associate snapshot files when mounting.
1713 */
1714 void
1715 ffs_snapshot_mount(struct mount *mp)
1716 {
1717 struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
1718 struct fs *fs = VFSTOUFS(mp)->um_fs;
1719 struct lwp *l = curlwp;
1720 struct vnode *vp;
1721 struct inode *ip, *xp;
1722 struct snap_info *si;
1723 daddr_t snaplistsize, *snapblklist;
1724 int i, error, ns, snaploc, loc;
1725
1726 /*
1727 * No persistent snapshots on apple ufs file systems.
1728 */
1729 if (UFS_MPISAPPLEUFS(VFSTOUFS(mp)))
1730 return;
1731
1732 si = VFSTOUFS(mp)->um_snapinfo;
1733 ns = UFS_FSNEEDSWAP(fs);
1734 /*
1735 * XXX The following needs to be set before ffs_truncate or
1736 * VOP_READ can be called.
1737 */
1738 mp->mnt_stat.f_iosize = fs->fs_bsize;
1739 /*
1740 * Process each snapshot listed in the superblock.
1741 */
1742 vp = NULL;
1743 mutex_enter(&si->si_lock);
1744 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
1745 if (fs->fs_snapinum[snaploc] == 0)
1746 break;
1747 if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
1748 &vp)) != 0) {
1749 printf("ffs_snapshot_mount: vget failed %d\n", error);
1750 continue;
1751 }
1752 ip = VTOI(vp);
1753 if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) !=
1754 SF_SNAPSHOT) {
1755 printf("ffs_snapshot_mount: non-snapshot inode %d\n",
1756 fs->fs_snapinum[snaploc]);
1757 vput(vp);
1758 vp = NULL;
1759 for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
1760 if (fs->fs_snapinum[loc] == 0)
1761 break;
1762 fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
1763 }
1764 fs->fs_snapinum[loc - 1] = 0;
1765 snaploc--;
1766 continue;
1767 }
1768
1769 /*
1770 * Read the block hints list. Use an empty list on
1771 * read errors.
1772 */
1773 error = vn_rdwr(UIO_READ, vp,
1774 (void *)&snaplistsize, sizeof(snaplistsize),
1775 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
1776 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
1777 l->l_cred, NULL, NULL);
1778 if (error) {
1779 printf("ffs_snapshot_mount: read_1 failed %d\n", error);
1780 snaplistsize = 1;
1781 } else
1782 snaplistsize = ufs_rw64(snaplistsize, ns);
1783 snapblklist = malloc(
1784 snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
1785 if (error)
1786 snapblklist[0] = 1;
1787 else {
1788 error = vn_rdwr(UIO_READ, vp, (void *)snapblklist,
1789 snaplistsize * sizeof(daddr_t),
1790 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
1791 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
1792 l->l_cred, NULL, NULL);
1793 for (i = 0; i < snaplistsize; i++)
1794 snapblklist[i] = ufs_rw64(snapblklist[i], ns);
1795 if (error) {
1796 printf("ffs_snapshot_mount: read_2 failed %d\n",
1797 error);
1798 snapblklist[0] = 1;
1799 }
1800 }
1801 ip->i_snapblklist = &snapblklist[0];
1802
1803 /*
1804 * Link it onto the active snapshot list.
1805 */
1806 if (is_active_snapshot(si, ip))
1807 panic("ffs_snapshot_mount: %"PRIu64" already on list",
1808 ip->i_number);
1809 else
1810 TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
1811 vp->v_vflag |= VV_SYSTEM;
1812 VOP_UNLOCK(vp);
1813 }
1814 /*
1815 * No usable snapshots found.
1816 */
1817 if (vp == NULL) {
1818 mutex_exit(&si->si_lock);
1819 return;
1820 }
1821 /*
1822 * Attach the block hints list. We always want to
1823 * use the list from the newest snapshot.
1824 */
1825 xp = TAILQ_LAST(&si->si_snapshots, inodelst);
1826 si->si_snapblklist = xp->i_snapblklist;
1827 fscow_establish(mp, ffs_copyonwrite, devvp);
1828 si->si_gen++;
1829 mutex_exit(&si->si_lock);
1830 }
1831
1832 /*
1833 * Disassociate snapshot files when unmounting.
1834 */
1835 void
1836 ffs_snapshot_unmount(struct mount *mp)
1837 {
1838 struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
1839 struct inode *xp;
1840 struct vnode *vp = NULL;
1841 struct snap_info *si;
1842
1843 si = VFSTOUFS(mp)->um_snapinfo;
1844 mutex_enter(&si->si_lock);
1845 while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) {
1846 vp = ITOV(xp);
1847 TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap);
1848 if (xp->i_snapblklist == si->si_snapblklist)
1849 si->si_snapblklist = NULL;
1850 free(xp->i_snapblklist, M_UFSMNT);
1851 if (xp->i_nlink > 0) {
1852 si->si_gen++;
1853 mutex_exit(&si->si_lock);
1854 vrele(vp);
1855 mutex_enter(&si->si_lock);
1856 }
1857 }
1858 si->si_gen++;
1859 mutex_exit(&si->si_lock);
1860 if (vp)
1861 fscow_disestablish(mp, ffs_copyonwrite, devvp);
1862 }
1863
1864 /*
1865 * Check for need to copy block that is about to be written,
1866 * copying the block if necessary.
1867 */
static int
ffs_copyonwrite(void *v, struct buf *bp, bool data_valid)
{
	struct fs *fs;
	struct inode *ip;
	struct vnode *devvp = v, *vp = NULL;
	struct mount *mp = devvp->v_specmountpoint;
	struct snap_info *si;
	void *saved_data = NULL;
	daddr_t lbn, blkno, *snapblklist;
	uint32_t gen;
	int lower, upper, mid, snapshot_locked = 0, error = 0;

	/*
	 * Check for valid snapshots.
	 */
	si = VFSTOUFS(mp)->um_snapinfo;
	mutex_enter(&si->si_lock);
	ip = TAILQ_FIRST(&si->si_snapshots);
	if (ip == NULL) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * First check whether the block lies beyond the end of the
	 * file system, inside the journal, or in the preallocated
	 * list.  These checks let us avoid several potential deadlocks.
	 */
	fs = ip->i_fs;
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	if (bp->b_blkno >= fsbtodb(fs, fs->fs_size)) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	if ((fs->fs_flags & FS_DOWAPBL) &&
	    fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
		off_t blk_off, log_start, log_end;

		log_start = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] *
		    fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
		log_end = log_start + fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] *
		    fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
		blk_off = dbtob(bp->b_blkno);
		if (blk_off >= log_start && blk_off < log_end) {
			mutex_exit(&si->si_lock);
			return 0;
		}
	}
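	/*
	 * Binary search the block hints list; it is kept sorted and
	 * snapblklist[0] holds the number of entries (the length word
	 * itself included), so the search runs over
	 * snapblklist[1 .. snapblklist[0] - 1].
	 */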
	snapblklist = si->si_snapblklist;
	upper = (snapblklist != NULL ? snapblklist[0] - 1 : 0);
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	if (lower <= upper) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
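	/*
	 * Unless we are recursing (si_owner == curlwp, i.e. a copy is
	 * already in progress on this thread), take the common
	 * snapshot lock.  We must not sleep on si_snaplock while
	 * holding si_lock, so on contention drop si_lock first and
	 * retake it once si_snaplock is held.
	 */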
	if (si->si_owner != curlwp) {
		if (!mutex_tryenter(&si->si_snaplock)) {
			mutex_exit(&si->si_lock);
			mutex_enter(&si->si_snaplock);
			mutex_enter(&si->si_lock);
		}
		si->si_owner = curlwp;
		snapshot_locked = 1;
	}
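	/*
	 * If the caller supplied valid contents for a full block, use
	 * them directly instead of reading the block back from disk.
	 */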
	if (data_valid && bp->b_bcount == fs->fs_bsize)
		saved_data = bp->b_data;
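	/*
	 * Scan all snapshots for a copy of this block.  si_gen changes
	 * whenever the snapshot list does, so restart the scan if the
	 * list may have changed while we slept.
	 */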
retry:
	gen = si->si_gen;
	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Everything of our own that needs to be copied is
		 * copied when ffs_snapshot() is called, so we can
		 * skip the check here; doing the lookup in
		 * ffs_balloc() could deadlock.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check whether the block needs to be copied.
		 */
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			mutex_exit(&si->si_lock);
			blkno = 0; /* XXX: GCC */
			if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
				mutex_enter(&si->si_lock);
				break;
			}
			mutex_enter(&si->si_lock);
			if (gen != si->si_gen)
				goto retry;
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		if (blkno != 0)
			continue;

		if (curlwp == uvm.pagedaemon_lwp) {
			error = ENOMEM;
			break;
		}
		/* Only one level of recursion allowed. */
		KASSERT(snapshot_locked);
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
			    (unsigned long long)ip->i_number, lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %llu", (unsigned long long)
				    VTOI(bp->b_vp)->i_number);
			printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		mutex_exit(&si->si_lock);
		if (saved_data == NULL) {
			saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
			error = rwfsblk(vp, B_READ, saved_data, lbn);
			if (error) {
				free(saved_data, M_UFSMNT);
				saved_data = NULL;
				mutex_enter(&si->si_lock);
				break;
			}
		}
		error = wrsnapblk(vp, saved_data, lbn);
		if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
			error = syncsnap(vp);
		mutex_enter(&si->si_lock);
		if (error)
			break;
		if (gen != si->si_gen)
			goto retry;
	}
	/*
	 * Release the locks and free the temporary copy buffer, if any.
	 */
	if (snapshot_locked) {
		si->si_owner = NULL;
		mutex_exit(&si->si_lock);
		mutex_exit(&si->si_snaplock);
	} else
		mutex_exit(&si->si_lock);
	if (saved_data && saved_data != bp->b_data)
		free(saved_data, M_UFSMNT);
	return error;
}

/*
 * Read from a snapshot.
 */
int
ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo;
	struct buf *bp;
	daddr_t lbn, nextlbn;
	off_t fsbytes, bytesinfile;
	long size, xfersize, blkoffset;
	int error;

	fstrans_start(vp->v_mount, FSTRANS_SHARED);
	mutex_enter(&si->si_snaplock);

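	/*
	 * With IO_ALTSEMANTICS the read is bounded by the snapshot
	 * inode size, otherwise by the size of the underlying file
	 * system.
	 */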
	if (ioflag & IO_ALTSEMANTICS)
		fsbytes = ip->i_size;
	else
		fsbytes = lfragtosize(fs, fs->fs_size);
	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		bytesinfile = fsbytes - uio->uio_offset;
		if (bytesinfile <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;
		size = fs->fs_bsize;
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
		    bytesinfile);

		if (lblktosize(fs, nextlbn + 1) >= fsbytes) {
			if (lblktosize(fs, lbn) + size > fsbytes)
				size = fragroundup(fs,
				    fsbytes - lblktosize(fs, lbn));
			error = bread(vp, lbn, size, NOCRED, 0, &bp);
		} else {
			int nextsize = fs->fs_bsize;
			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp);
		}
		if (error)
			break;

		/*
		 * We should only get a non-zero b_resid when an I/O
		 * error has occurred, which should cause us to break
		 * above.  However, if the short read did not set an
		 * error, make sure we do not uiomove() bad or
		 * uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < blkoffset + xfersize) {
			xfersize = size - blkoffset;
			if (xfersize <= 0)
				break;
		}
		error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
		if (error)
			break;
		brelse(bp, BC_AGE);
	}
	if (bp != NULL)
		brelse(bp, BC_AGE);

	mutex_exit(&si->si_snaplock);
	fstrans_done(vp->v_mount);
	return error;
}

/*
 * Look up a snapshot's data block address.
 * Simpler than UFS_BALLOC() because we know all metadata is already
 * allocated, and safe even for the pagedaemon, where we cannot bread().
 */
static int
snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res)
{
	struct indir indirs[NIADDR + 2];
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct buf *bp;
	int error, num;

	KASSERT(lbn >= 0);

	if (lbn < NDADDR) {
		*res = db_get(ip, lbn);
		return 0;
	}
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return error;
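	/*
	 * The pagedaemon must not wait for buffer I/O, so use the
	 * indirect block only if it is already resident in the cache.
	 */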
	if (curlwp == uvm.pagedaemon_lwp) {
		mutex_enter(&bufcache_lock);
		bp = incore(vp, indirs[num-1].in_lbn);
		if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) {
			*res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
			error = 0;
		} else
			error = ENOMEM;
		mutex_exit(&bufcache_lock);
		return error;
	}
	error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, NOCRED, 0, &bp);
	if (error == 0) {
		*res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
		brelse(bp, 0);
	}

	return error;
}

/*
 * Read or write the specified block of the filesystem that vp resides
 * on, going directly to the disk and bypassing the buffer cache.
 */
static int
rwfsblk(struct vnode *vp, int flags, void *data, daddr_t lbn)
{
	int error;
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct buf *nbp;

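	/*
	 * Build a private, synchronous I/O request on the underlying
	 * device and wait for it to complete.
	 */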
	nbp = getiobuf(NULL, true);
	nbp->b_flags = flags;
	nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize;
	nbp->b_error = 0;
	nbp->b_data = data;
	nbp->b_blkno = nbp->b_rawblkno = fsbtodb(fs, blkstofrags(fs, lbn));
	nbp->b_proc = NULL;
	nbp->b_dev = ip->i_devvp->v_rdev;
	SET(nbp->b_cflags, BC_BUSY);	/* mark buffer busy */

	bdev_strategy(nbp);

	error = biowait(nbp);

	putiobuf(nbp);

	return error;
}

/*
 * Write all dirty buffers to disk and invalidate them.
 */
static int
syncsnap(struct vnode *vp)
{
	int error;
	buf_t *bp;
	struct fs *fs = VTOI(vp)->i_fs;

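	/*
	 * bbusy() may return EPASSTHROUGH after sleeping, meaning the
	 * buffer changed under us; in that case rescan the dirty list
	 * from the head.
	 */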
	mutex_enter(&bufcache_lock);
	while ((bp = LIST_FIRST(&vp->v_dirtyblkhd)) != NULL) {
		error = bbusy(bp, false, 0, NULL);
		if (error == EPASSTHROUGH)
			continue;
		else if (error != 0) {
			mutex_exit(&bufcache_lock);
			return error;
		}
		KASSERT(bp->b_bcount == fs->fs_bsize);
		mutex_exit(&bufcache_lock);
		error = rwfsblk(vp, B_WRITE, bp->b_data,
		    fragstoblks(fs, dbtofsb(fs, bp->b_blkno)));
		brelse(bp, BC_INVAL | BC_VFLUSH);
		if (error)
			return error;
		mutex_enter(&bufcache_lock);
	}
	mutex_exit(&bufcache_lock);

	return 0;
}

/*
 * Write the specified block to a snapshot.
 */
static int
wrsnapblk(struct vnode *vp, void *data, daddr_t lbn)
{
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct buf *bp;
	int error;

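	/*
	 * A snapshot that is still linked into the file system will be
	 * visible after a crash, so allocate and write its copy
	 * synchronously; unlinked snapshots may be written async.
	 */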
	error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize,
	    FSCRED, (ip->i_nlink > 0 ? B_SYNC : 0), &bp);
	if (error)
		return error;
	memcpy(bp->b_data, data, fs->fs_bsize);
	if (ip->i_nlink > 0)
		error = bwrite(bp);
	else
		bawrite(bp);

	return error;
}

/*
 * Check if this inode is present on the active snapshot list.
 * Must be called with snapinfo locked.
 */
static inline bool
is_active_snapshot(struct snap_info *si, struct inode *ip)
{
	struct inode *xp;

	KASSERT(mutex_owned(&si->si_lock));

	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
		if (xp == ip)
			return true;
	return false;
}

/*
 * Get/put a direct block address from an inode or from a buffer of
 * disk addresses, taking care of the fs type (UFS1/UFS2) and of byte
 * swapping.  These helpers should eventually move into a global include.
 */
static inline daddr_t
db_get(struct inode *ip, int loc)
{
	if (ip->i_ump->um_fstype == UFS1)
		return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip));
	else
		return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip));
}

static inline void
db_assign(struct inode *ip, int loc, daddr_t val)
{
	if (ip->i_ump->um_fstype == UFS1)
		ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
	else
		ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
}

static inline daddr_t
ib_get(struct inode *ip, int loc)
{
	if (ip->i_ump->um_fstype == UFS1)
		return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip));
	else
		return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip));
}

static inline void
ib_assign(struct inode *ip, int loc, daddr_t val)
{
	if (ip->i_ump->um_fstype == UFS1)
		ip->i_ffs1_ib[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
	else
		ip->i_ffs2_ib[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
}

static inline daddr_t
idb_get(struct inode *ip, void *bf, int loc)
{
	if (ip->i_ump->um_fstype == UFS1)
		return ufs_rw32(((int32_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
	else
		return ufs_rw64(((int64_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
}

static inline void
idb_assign(struct inode *ip, void *bf, int loc, daddr_t val)
{
	if (ip->i_ump->um_fstype == UFS1)
		((int32_t *)(bf))[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
	else
		((int64_t *)(bf))[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
}