ffs_snapshot.c revision 1.133 1 /* $NetBSD: ffs_snapshot.c,v 1.133 2014/03/17 09:29:20 hannken Exp $ */
2
3 /*
4 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
5 *
6 * Further information about snapshots can be obtained from:
7 *
8 * Marshall Kirk McKusick http://www.mckusick.com/softdep/
9 * 1614 Oxford Street mckusick (at) mckusick.com
10 * Berkeley, CA 94709-1608 +1-510-843-9542
11 * USA
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 *
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 *
23 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
24 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
25 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
26 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
27 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00
36 *
37 * from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp
38 */
39
40 #include <sys/cdefs.h>
41 __KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.133 2014/03/17 09:29:20 hannken Exp $");
42
43 #if defined(_KERNEL_OPT)
44 #include "opt_ffs.h"
45 #include "opt_quota.h"
46 #endif
47
48 #include <sys/param.h>
49 #include <sys/kernel.h>
50 #include <sys/systm.h>
51 #include <sys/conf.h>
52 #include <sys/buf.h>
53 #include <sys/proc.h>
54 #include <sys/namei.h>
55 #include <sys/sched.h>
56 #include <sys/stat.h>
57 #include <sys/malloc.h>
58 #include <sys/mount.h>
59 #include <sys/resource.h>
60 #include <sys/resourcevar.h>
61 #include <sys/vnode.h>
62 #include <sys/kauth.h>
63 #include <sys/fstrans.h>
64 #include <sys/wapbl.h>
65
66 #include <miscfs/specfs/specdev.h>
67
68 #include <ufs/ufs/quota.h>
69 #include <ufs/ufs/ufsmount.h>
70 #include <ufs/ufs/inode.h>
71 #include <ufs/ufs/ufs_extern.h>
72 #include <ufs/ufs/ufs_bswap.h>
73 #include <ufs/ufs/ufs_wapbl.h>
74
75 #include <ufs/ffs/fs.h>
76 #include <ufs/ffs/ffs_extern.h>
77
78 #include <uvm/uvm.h>
79
80 struct snap_info {
81 kmutex_t si_lock; /* Lock this snapinfo */
82 kmutex_t si_snaplock; /* Snapshot vnode common lock */
83 lwp_t *si_owner; /* Sanplock owner */
84 TAILQ_HEAD(inodelst, inode) si_snapshots; /* List of active snapshots */
85 daddr_t *si_snapblklist; /* Snapshot block hints list */
86 uint32_t si_gen; /* Incremented on change */
87 };
88
89 #if !defined(FFS_NO_SNAPSHOT)
90 typedef int (*acctfunc_t)
91 (struct vnode *, void *, int, int, struct fs *, daddr_t, int);
92
93 static int snapshot_setup(struct mount *, struct vnode *);
94 static int snapshot_copyfs(struct mount *, struct vnode *, void **);
95 static int snapshot_expunge(struct mount *, struct vnode *,
96 struct fs *, daddr_t *, daddr_t **);
97 static int snapshot_expunge_snap(struct mount *, struct vnode *,
98 struct fs *, daddr_t);
99 static int snapshot_writefs(struct mount *, struct vnode *, void *);
100 static int cgaccount(struct vnode *, int, int *);
101 static int cgaccount1(int, struct vnode *, void *, int);
102 static int expunge(struct vnode *, struct inode *, struct fs *,
103 acctfunc_t, int);
104 static int indiracct(struct vnode *, struct vnode *, int, daddr_t,
105 daddr_t, daddr_t, daddr_t, daddr_t, struct fs *, acctfunc_t, int);
106 static int fullacct(struct vnode *, void *, int, int, struct fs *,
107 daddr_t, int);
108 static int snapacct(struct vnode *, void *, int, int, struct fs *,
109 daddr_t, int);
110 static int mapacct(struct vnode *, void *, int, int, struct fs *,
111 daddr_t, int);
112 #endif /* !defined(FFS_NO_SNAPSHOT) */
113
114 static int ffs_copyonwrite(void *, struct buf *, bool);
115 static int snapblkaddr(struct vnode *, daddr_t, daddr_t *);
116 static int rwfsblk(struct vnode *, int, void *, daddr_t);
117 static int syncsnap(struct vnode *);
118 static int wrsnapblk(struct vnode *, void *, daddr_t);
119 #if !defined(FFS_NO_SNAPSHOT)
120 static int blocks_in_journal(struct fs *);
121 #endif
122
123 static inline bool is_active_snapshot(struct snap_info *, struct inode *);
124 static inline daddr_t db_get(struct inode *, int);
125 static inline void db_assign(struct inode *, int, daddr_t);
126 static inline daddr_t ib_get(struct inode *, int);
127 static inline daddr_t idb_get(struct inode *, void *, int);
128 static inline void idb_assign(struct inode *, void *, int, daddr_t);
129
130 #ifdef DEBUG
131 static int snapdebug = 0;
132 #endif
133
134 int
135 ffs_snapshot_init(struct ufsmount *ump)
136 {
137 struct snap_info *si;
138
139 si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP);
140 if (si == NULL)
141 return ENOMEM;
142
143 TAILQ_INIT(&si->si_snapshots);
144 mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE);
145 mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE);
146 si->si_owner = NULL;
147 si->si_gen = 0;
148 si->si_snapblklist = NULL;
149
150 return 0;
151 }
152
153 void
154 ffs_snapshot_fini(struct ufsmount *ump)
155 {
156 struct snap_info *si;
157
158 si = ump->um_snapinfo;
159 ump->um_snapinfo = NULL;
160
161 KASSERT(TAILQ_EMPTY(&si->si_snapshots));
162 mutex_destroy(&si->si_lock);
163 mutex_destroy(&si->si_snaplock);
164 KASSERT(si->si_snapblklist == NULL);
165 kmem_free(si, sizeof(*si));
166 }
167
168 /*
169 * Create a snapshot file and initialize it for the filesystem.
170 * Vnode is locked on entry and return.
171 */
172 int
173 ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime)
174 {
175 #if defined(FFS_NO_SNAPSHOT)
176 return EOPNOTSUPP;
177 }
178 #else /* defined(FFS_NO_SNAPSHOT) */
179 bool suspended = false;
180 int error, redo = 0, snaploc;
181 void *sbbuf = NULL;
182 daddr_t *snaplist = NULL, snaplistsize = 0;
183 struct buf *bp, *nbp;
184 struct fs *copy_fs = NULL;
185 struct fs *fs = VFSTOUFS(mp)->um_fs;
186 struct inode *ip = VTOI(vp);
187 struct lwp *l = curlwp;
188 struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
189 struct timespec ts;
190 struct timeval starttime;
191 #ifdef DEBUG
192 struct timeval endtime;
193 #endif
194 struct vnode *devvp = ip->i_devvp;
195
196 /*
197 * If the vnode already is a snapshot, return.
198 */
199 if ((VTOI(vp)->i_flags & SF_SNAPSHOT)) {
200 if ((VTOI(vp)->i_flags & SF_SNAPINVAL))
201 return EINVAL;
202 if (ctime) {
203 ctime->tv_sec = DIP(VTOI(vp), mtime);
204 ctime->tv_nsec = DIP(VTOI(vp), mtimensec);
205 }
206 return 0;
207 }
208 /*
209 * Check for free snapshot slot in the superblock.
210 */
211 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
212 if (fs->fs_snapinum[snaploc] == 0)
213 break;
214 if (snaploc == FSMAXSNAP)
215 return (ENOSPC);
216 /*
217 * Prepare the vnode to become a snapshot.
218 */
219 error = snapshot_setup(mp, vp);
220 if (error)
221 goto out;
222
223 /*
224 * Copy all the cylinder group maps. Although the
225 * filesystem is still active, we hope that only a few
226 * cylinder groups will change between now and when we
227 * suspend operations. Thus, we will be able to quickly
228 * touch up the few cylinder groups that changed during
229 * the suspension period.
230 */
231 error = cgaccount(vp, 1, NULL);
232 if (error)
233 goto out;
234
235 /*
236 * snapshot is now valid
237 */
238 ip->i_flags &= ~SF_SNAPINVAL;
239 DIP_ASSIGN(ip, flags, ip->i_flags);
240 ip->i_flag |= IN_CHANGE | IN_UPDATE;
241
242 /*
243 * Ensure that the snapshot is completely on disk.
244 * Since we have marked it as a snapshot it is safe to
245 * unlock it as no process will be allowed to write to it.
246 */
247 error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
248 if (error)
249 goto out;
250 VOP_UNLOCK(vp);
251 /*
252 * All allocations are done, so we can now suspend the filesystem.
253 */
254 error = vfs_suspend(vp->v_mount, 0);
255 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
256 if (error)
257 goto out;
258 suspended = true;
259 getmicrotime(&starttime);
260 /*
261 * First, copy all the cylinder group maps that have changed.
262 */
263 error = cgaccount(vp, 2, &redo);
264 if (error)
265 goto out;
266 /*
267 * Create a copy of the superblock and its summary information.
268 */
269 error = snapshot_copyfs(mp, vp, &sbbuf);
270 copy_fs = (struct fs *)((char *)sbbuf + ffs_blkoff(fs, fs->fs_sblockloc));
271 if (error)
272 goto out;
273 /*
274 * Expunge unlinked files from our view.
275 */
276 error = snapshot_expunge(mp, vp, copy_fs, &snaplistsize, &snaplist);
277 if (error)
278 goto out;
279 /*
280 * Record snapshot inode. Since this is the newest snapshot,
281 * it must be placed at the end of the list.
282 */
283 if (ip->i_nlink > 0)
284 fs->fs_snapinum[snaploc] = ip->i_number;
285
286 mutex_enter(&si->si_lock);
287 if (is_active_snapshot(si, ip))
288 panic("ffs_snapshot: %"PRIu64" already on list", ip->i_number);
289 TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
290 if (TAILQ_FIRST(&si->si_snapshots) == ip) {
291 /*
292 * If this is the first snapshot on this filesystem, put the
293 * preliminary list in place and establish the cow handler.
294 */
295 si->si_snapblklist = snaplist;
296 fscow_establish(mp, ffs_copyonwrite, devvp);
297 }
298 si->si_gen++;
299 mutex_exit(&si->si_lock);
300
301 vp->v_vflag |= VV_SYSTEM;
302 /*
303 * Set the mtime to the time the snapshot has been taken.
304 */
305 TIMEVAL_TO_TIMESPEC(&starttime, &ts);
306 if (ctime)
307 *ctime = ts;
308 DIP_ASSIGN(ip, mtime, ts.tv_sec);
309 DIP_ASSIGN(ip, mtimensec, ts.tv_nsec);
310 ip->i_flag |= IN_CHANGE | IN_UPDATE;
311 /*
312 * Copy allocation information from all snapshots and then
313 * expunge them from our view.
314 */
315 error = snapshot_expunge_snap(mp, vp, copy_fs, snaplistsize);
316 if (error)
317 goto out;
318 /*
319 * Write the superblock and its summary information to the snapshot.
320 */
321 error = snapshot_writefs(mp, vp, sbbuf);
322 if (error)
323 goto out;
324 /*
325 * We're nearly done, ensure that the snapshot is completely on disk.
326 */
327 error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
328 if (error)
329 goto out;
330 /*
331 * Invalidate and free all pages on the snapshot vnode.
332 * We will read and write through the buffercache.
333 */
334 mutex_enter(vp->v_interlock);
335 error = VOP_PUTPAGES(vp, 0, 0,
336 PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO | PGO_FREE);
337 if (error)
338 goto out;
339 /*
340 * Invalidate short ( < fs_bsize ) buffers. We will always read
341 * full size buffers later.
342 */
343 mutex_enter(&bufcache_lock);
344 KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
345 for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
346 nbp = LIST_NEXT(bp, b_vnbufs);
347 if (bp->b_bcount == fs->fs_bsize)
348 continue;
349 error = bbusy(bp, false, 0, NULL);
350 if (error != 0) {
351 if (error == EPASSTHROUGH) {
352 nbp = LIST_FIRST(&vp->v_cleanblkhd);
353 continue;
354 }
355 break;
356 }
357 brelsel(bp, BC_INVAL | BC_VFLUSH);
358 }
359 mutex_exit(&bufcache_lock);
360
361 out:
362 if (sbbuf != NULL) {
363 free(copy_fs->fs_csp, M_UFSMNT);
364 free(sbbuf, M_UFSMNT);
365 }
366 if (fs->fs_active != NULL) {
367 free(fs->fs_active, M_DEVBUF);
368 fs->fs_active = NULL;
369 }
370
371 mutex_enter(&si->si_lock);
372 if (snaplist != NULL) {
373 if (si->si_snapblklist == snaplist)
374 si->si_snapblklist = NULL;
375 free(snaplist, M_UFSMNT);
376 }
377 if (error) {
378 fs->fs_snapinum[snaploc] = 0;
379 } else {
380 /*
381 * As this is the newest list, it is the most inclusive, so
382 * should replace the previous list.
383 */
384 si->si_snapblklist = ip->i_snapblklist;
385 }
386 si->si_gen++;
387 mutex_exit(&si->si_lock);
388
389 if (suspended) {
390 VOP_UNLOCK(vp);
391 vfs_resume(vp->v_mount);
392 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
393 #ifdef DEBUG
394 getmicrotime(&endtime);
395 timersub(&endtime, &starttime, &endtime);
396 printf("%s: suspended %lld.%03d sec, redo %d of %d\n",
397 mp->mnt_stat.f_mntonname, (long long)endtime.tv_sec,
398 endtime.tv_usec / 1000, redo, fs->fs_ncg);
399 #endif
400 }
401 if (error) {
402 if (!UFS_WAPBL_BEGIN(mp)) {
403 (void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
404 UFS_WAPBL_END(mp);
405 }
406 } else if (ip->i_nlink > 0)
407 vref(vp);
408 return (error);
409 }
410
411 /*
412 * Prepare vnode to become a snapshot.
413 */
414 static int
415 snapshot_setup(struct mount *mp, struct vnode *vp)
416 {
417 int error, n, len, loc, cg;
418 daddr_t blkno, numblks;
419 struct buf *ibp, *nbp;
420 struct fs *fs = VFSTOUFS(mp)->um_fs;
421 struct lwp *l = curlwp;
422 const int wbreak = blocks_in_journal(fs)/8;
423 struct inode *ip = VTOI(vp);
424
425 /*
426 * Check mount, exclusive reference and owner.
427 */
428 if (vp->v_mount != mp)
429 return EXDEV;
430 if (vp->v_usecount != 1 || vp->v_writecount != 0)
431 return EBUSY;
432 error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_SNAPSHOT,
433 0, mp, vp, NULL);
434 if (error)
435 return EACCES;
436
437 if (vp->v_size != 0) {
438 error = ffs_truncate(vp, 0, 0, NOCRED);
439 if (error)
440 return error;
441 }
442
443 /* Change inode to snapshot type file. */
444 error = UFS_WAPBL_BEGIN(mp);
445 if (error)
446 return error;
447 #if defined(QUOTA) || defined(QUOTA2)
448 /* shapshot inodes are not accounted in quotas */
449 chkiq(ip, -1, l->l_cred, 0);
450 #endif
451 ip->i_flags |= (SF_SNAPSHOT | SF_SNAPINVAL);
452 DIP_ASSIGN(ip, flags, ip->i_flags);
453 ip->i_flag |= IN_CHANGE | IN_UPDATE;
454 ffs_update(vp, NULL, NULL, UPDATE_WAIT);
455 UFS_WAPBL_END(mp);
456
457 KASSERT(ip->i_flags & SF_SNAPSHOT);
458 /*
459 * Write an empty list of preallocated blocks to the end of
460 * the snapshot to set size to at least that of the filesystem.
461 */
462 numblks = howmany(fs->fs_size, fs->fs_frag);
463 blkno = 1;
464 blkno = ufs_rw64(blkno, UFS_FSNEEDSWAP(fs));
465 error = vn_rdwr(UIO_WRITE, vp,
466 (void *)&blkno, sizeof(blkno), ffs_lblktosize(fs, (off_t)numblks),
467 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL);
468 if (error)
469 return error;
470 /*
471 * Preallocate critical data structures so that we can copy
472 * them in without further allocation after we suspend all
473 * operations on the filesystem. We would like to just release
474 * the allocated buffers without writing them since they will
475 * be filled in below once we are ready to go, but this upsets
476 * the soft update code, so we go ahead and write the new buffers.
477 *
478 * Allocate all indirect blocks and mark all of them as not
479 * needing to be copied.
480 */
481 error = UFS_WAPBL_BEGIN(mp);
482 if (error)
483 return error;
484 for (blkno = UFS_NDADDR, n = 0; blkno < numblks; blkno += FFS_NINDIR(fs)) {
485 error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)blkno),
486 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
487 if (error)
488 goto out;
489 brelse(ibp, 0);
490 if (wbreak > 0 && (++n % wbreak) == 0) {
491 UFS_WAPBL_END(mp);
492 error = UFS_WAPBL_BEGIN(mp);
493 if (error)
494 return error;
495 }
496 }
497 /*
498 * Allocate copies for the superblock and its summary information.
499 */
500 error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, l->l_cred,
501 0, &nbp);
502 if (error)
503 goto out;
504 bawrite(nbp);
505 blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
506 len = howmany(fs->fs_cssize, fs->fs_bsize);
507 for (loc = 0; loc < len; loc++) {
508 error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(blkno + loc)),
509 fs->fs_bsize, l->l_cred, 0, &nbp);
510 if (error)
511 goto out;
512 bawrite(nbp);
513 if (wbreak > 0 && (++n % wbreak) == 0) {
514 UFS_WAPBL_END(mp);
515 error = UFS_WAPBL_BEGIN(mp);
516 if (error)
517 return error;
518 }
519 }
520 /*
521 * Allocate all cylinder group blocks.
522 */
523 for (cg = 0; cg < fs->fs_ncg; cg++) {
524 error = ffs_balloc(vp, ffs_lfragtosize(fs, cgtod(fs, cg)),
525 fs->fs_bsize, l->l_cred, 0, &nbp);
526 if (error)
527 goto out;
528 bawrite(nbp);
529 if (wbreak > 0 && (++n % wbreak) == 0) {
530 UFS_WAPBL_END(mp);
531 error = UFS_WAPBL_BEGIN(mp);
532 if (error)
533 return error;
534 }
535 }
536
537 out:
538 UFS_WAPBL_END(mp);
539 return error;
540 }
541
542 /*
543 * Create a copy of the superblock and its summary information.
544 * It is up to the caller to free copyfs and copy_fs->fs_csp.
545 */
546 static int
547 snapshot_copyfs(struct mount *mp, struct vnode *vp, void **sbbuf)
548 {
549 int error, i, len, loc, size;
550 void *space;
551 int32_t *lp;
552 struct buf *bp;
553 struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
554 struct lwp *l = curlwp;
555 struct vnode *devvp = VTOI(vp)->i_devvp;
556
557 /*
558 * Grab a copy of the superblock and its summary information.
559 * We delay writing it until the suspension is released below.
560 */
561 *sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
562 loc = ffs_blkoff(fs, fs->fs_sblockloc);
563 if (loc > 0)
564 memset(*sbbuf, 0, loc);
565 copyfs = (struct fs *)((char *)(*sbbuf) + loc);
566 memcpy(copyfs, fs, fs->fs_sbsize);
567 size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
568 if (fs->fs_sbsize < size)
569 memset((char *)(*sbbuf) + loc + fs->fs_sbsize, 0,
570 size - fs->fs_sbsize);
571 size = ffs_blkroundup(fs, fs->fs_cssize);
572 if (fs->fs_contigsumsize > 0)
573 size += fs->fs_ncg * sizeof(int32_t);
574 space = malloc(size, M_UFSMNT, M_WAITOK);
575 copyfs->fs_csp = space;
576 memcpy(copyfs->fs_csp, fs->fs_csp, fs->fs_cssize);
577 space = (char *)space + fs->fs_cssize;
578 loc = howmany(fs->fs_cssize, fs->fs_fsize);
579 i = fs->fs_frag - loc % fs->fs_frag;
580 len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
581 if (len > 0) {
582 if ((error = bread(devvp, FFS_FSBTODB(fs, fs->fs_csaddr + loc),
583 len, l->l_cred, 0, &bp)) != 0) {
584 free(copyfs->fs_csp, M_UFSMNT);
585 free(*sbbuf, M_UFSMNT);
586 *sbbuf = NULL;
587 return error;
588 }
589 memcpy(space, bp->b_data, (u_int)len);
590 space = (char *)space + len;
591 brelse(bp, BC_INVAL | BC_NOCACHE);
592 }
593 if (fs->fs_contigsumsize > 0) {
594 copyfs->fs_maxcluster = lp = space;
595 for (i = 0; i < fs->fs_ncg; i++)
596 *lp++ = fs->fs_contigsumsize;
597 }
598 if (mp->mnt_wapbl)
599 copyfs->fs_flags &= ~FS_DOWAPBL;
600 return 0;
601 }
602
603 /*
604 * We must check for active files that have been unlinked (e.g., with a zero
605 * link count). We have to expunge all trace of these files from the snapshot
606 * so that they are not reclaimed prematurely by fsck or unnecessarily dumped.
607 * Note that we skip unlinked snapshot files as they will be handled separately.
608 * Calculate the snapshot list size and create a preliminary list.
609 */
610 static int
611 snapshot_expunge(struct mount *mp, struct vnode *vp, struct fs *copy_fs,
612 daddr_t *snaplistsize, daddr_t **snaplist)
613 {
614 int cg, error = 0, len, loc;
615 daddr_t blkno, *blkp;
616 struct fs *fs = VFSTOUFS(mp)->um_fs;
617 struct inode *xp;
618 struct lwp *l = curlwp;
619 struct vattr vat;
620 struct vnode *logvp = NULL, *xvp;
621 struct vnode_iterator *marker;
622
623 *snaplist = NULL;
624 /*
625 * Get the log inode if any.
626 */
627 if ((fs->fs_flags & FS_DOWAPBL) &&
628 fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
629 error = VFS_VGET(mp,
630 fs->fs_journallocs[UFS_WAPBL_INFS_INO], &logvp);
631 if (error)
632 goto out;
633 }
634 /*
635 * We also calculate the needed size for the snapshot list.
636 */
637 *snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
638 FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
639
640 vfs_vnode_iterator_init(mp, &marker);
641 while (vfs_vnode_iterator_next(marker, &xvp)) {
642 if (xvp->v_type == VNON || VTOI(xvp) == NULL ||
643 (VTOI(xvp)->i_flags & SF_SNAPSHOT)) {
644 vrele(xvp);
645 continue;
646 }
647 #ifdef DEBUG
648 if (snapdebug)
649 vprint("ffs_snapshot: busy vnode", xvp);
650 #endif
651 xp = VTOI(xvp);
652 if (xvp != logvp) {
653 if (VOP_GETATTR(xvp, &vat, l->l_cred) == 0 &&
654 vat.va_nlink > 0) {
655 vrele(xvp);
656 continue;
657 }
658 if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
659 vrele(xvp);
660 continue;
661 }
662 }
663 /*
664 * If there is a fragment, clear it here.
665 */
666 blkno = 0;
667 loc = howmany(xp->i_size, fs->fs_bsize) - 1;
668 if (loc < UFS_NDADDR) {
669 len = ffs_fragroundup(fs, ffs_blkoff(fs, xp->i_size));
670 if (len > 0 && len < fs->fs_bsize) {
671 error = UFS_WAPBL_BEGIN(mp);
672 if (error) {
673 vrele(xvp);
674 vfs_vnode_iterator_destroy(marker);
675 goto out;
676 }
677 ffs_blkfree_snap(copy_fs, vp, db_get(xp, loc),
678 len, xp->i_number);
679 blkno = db_get(xp, loc);
680 db_assign(xp, loc, 0);
681 UFS_WAPBL_END(mp);
682 }
683 }
684 *snaplistsize += 1;
685 error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY);
686 if (blkno)
687 db_assign(xp, loc, blkno);
688 if (!error) {
689 error = UFS_WAPBL_BEGIN(mp);
690 if (!error) {
691 error = ffs_freefile_snap(copy_fs, vp,
692 xp->i_number, xp->i_mode);
693 UFS_WAPBL_END(mp);
694 }
695 }
696 vrele(xvp);
697 if (error) {
698 vfs_vnode_iterator_destroy(marker);
699 goto out;
700 }
701 }
702 vfs_vnode_iterator_destroy(marker);
703
704 /*
705 * Create a preliminary list of preallocated snapshot blocks.
706 */
707 *snaplist = malloc(*snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
708 blkp = &(*snaplist)[1];
709 *blkp++ = ffs_lblkno(fs, fs->fs_sblockloc);
710 blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
711 for (cg = 0; cg < fs->fs_ncg; cg++) {
712 if (ffs_fragstoblks(fs, cgtod(fs, cg)) > blkno)
713 break;
714 *blkp++ = ffs_fragstoblks(fs, cgtod(fs, cg));
715 }
716 len = howmany(fs->fs_cssize, fs->fs_bsize);
717 for (loc = 0; loc < len; loc++)
718 *blkp++ = blkno + loc;
719 for (; cg < fs->fs_ncg; cg++)
720 *blkp++ = ffs_fragstoblks(fs, cgtod(fs, cg));
721 (*snaplist)[0] = blkp - &(*snaplist)[0];
722
723 out:
724 if (logvp != NULL)
725 vput(logvp);
726 if (error && *snaplist != NULL) {
727 free(*snaplist, M_UFSMNT);
728 *snaplist = NULL;
729 }
730
731 return error;
732 }
733
734 /*
735 * Copy allocation information from all the snapshots in this snapshot and
736 * then expunge them from its view. Also, collect the list of allocated
737 * blocks in i_snapblklist.
738 */
739 static int
740 snapshot_expunge_snap(struct mount *mp, struct vnode *vp,
741 struct fs *copy_fs, daddr_t snaplistsize)
742 {
743 int error = 0, i;
744 daddr_t numblks, *snaplist = NULL;
745 struct fs *fs = VFSTOUFS(mp)->um_fs;
746 struct inode *ip = VTOI(vp), *xp;
747 struct lwp *l = curlwp;
748 struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
749
750 TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) {
751 if (xp != ip) {
752 error = expunge(vp, xp, fs, snapacct, BLK_SNAP);
753 if (error)
754 break;
755 }
756 if (xp->i_nlink != 0)
757 continue;
758 error = UFS_WAPBL_BEGIN(mp);
759 if (error)
760 break;
761 error = ffs_freefile_snap(copy_fs, vp, xp->i_number, xp->i_mode);
762 UFS_WAPBL_END(mp);
763 if (error)
764 break;
765 }
766 if (error)
767 goto out;
768 /*
769 * Allocate space for the full list of preallocated snapshot blocks.
770 */
771 snaplist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
772 ip->i_snapblklist = &snaplist[1];
773 /*
774 * Expunge the blocks used by the snapshots from the set of
775 * blocks marked as used in the snapshot bitmaps. Also, collect
776 * the list of allocated blocks in i_snapblklist.
777 */
778 error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP);
779 if (error)
780 goto out;
781 if (snaplistsize < ip->i_snapblklist - snaplist)
782 panic("ffs_snapshot: list too small");
783 snaplistsize = ip->i_snapblklist - snaplist;
784 snaplist[0] = snaplistsize;
785 ip->i_snapblklist = &snaplist[0];
786 /*
787 * Write out the list of allocated blocks to the end of the snapshot.
788 */
789 numblks = howmany(fs->fs_size, fs->fs_frag);
790 for (i = 0; i < snaplistsize; i++)
791 snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
792 error = vn_rdwr(UIO_WRITE, vp, (void *)snaplist,
793 snaplistsize * sizeof(daddr_t), ffs_lblktosize(fs, (off_t)numblks),
794 UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT, l->l_cred, NULL, NULL);
795 for (i = 0; i < snaplistsize; i++)
796 snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
797 out:
798 if (error && snaplist != NULL) {
799 free(snaplist, M_UFSMNT);
800 ip->i_snapblklist = NULL;
801 }
802 return error;
803 }
804
805 /*
806 * Write the superblock and its summary information to the snapshot.
807 * Make sure, the first UFS_NDADDR blocks get copied to the snapshot.
808 */
809 static int
810 snapshot_writefs(struct mount *mp, struct vnode *vp, void *sbbuf)
811 {
812 int error, len, loc;
813 void *space;
814 daddr_t blkno;
815 struct buf *bp;
816 struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
817 struct inode *ip = VTOI(vp);
818 struct lwp *l = curlwp;
819
820 copyfs = (struct fs *)((char *)sbbuf + ffs_blkoff(fs, fs->fs_sblockloc));
821
822 /*
823 * Write the superblock and its summary information
824 * to the snapshot.
825 */
826 blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
827 len = howmany(fs->fs_cssize, fs->fs_bsize);
828 space = copyfs->fs_csp;
829 #ifdef FFS_EI
830 if (UFS_FSNEEDSWAP(fs)) {
831 ffs_sb_swap(copyfs, copyfs);
832 ffs_csum_swap(space, space, fs->fs_cssize);
833 }
834 #endif
835 error = UFS_WAPBL_BEGIN(mp);
836 if (error)
837 return error;
838 for (loc = 0; loc < len; loc++) {
839 error = bread(vp, blkno + loc, fs->fs_bsize, l->l_cred,
840 B_MODIFY, &bp);
841 if (error) {
842 break;
843 }
844 memcpy(bp->b_data, space, fs->fs_bsize);
845 space = (char *)space + fs->fs_bsize;
846 bawrite(bp);
847 }
848 if (error)
849 goto out;
850 error = bread(vp, ffs_lblkno(fs, fs->fs_sblockloc),
851 fs->fs_bsize, l->l_cred, B_MODIFY, &bp);
852 if (error) {
853 goto out;
854 } else {
855 memcpy(bp->b_data, sbbuf, fs->fs_bsize);
856 bawrite(bp);
857 }
858 /*
859 * Copy the first UFS_NDADDR blocks to the snapshot so
860 * ffs_copyonwrite() and ffs_snapblkfree() will always work on
861 * indirect blocks.
862 */
863 for (loc = 0; loc < UFS_NDADDR; loc++) {
864 if (db_get(ip, loc) != 0)
865 continue;
866 error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)loc),
867 fs->fs_bsize, l->l_cred, 0, &bp);
868 if (error)
869 break;
870 error = rwfsblk(vp, B_READ, bp->b_data, loc);
871 if (error) {
872 brelse(bp, 0);
873 break;
874 }
875 bawrite(bp);
876 }
877
878 out:
879 UFS_WAPBL_END(mp);
880 return error;
881 }
882
883 /*
884 * Copy all cylinder group maps.
885 */
886 static int
887 cgaccount(struct vnode *vp, int passno, int *redo)
888 {
889 int cg, error = 0;
890 struct buf *nbp;
891 struct fs *fs = VTOI(vp)->i_fs;
892
893 if (redo != NULL)
894 *redo = 0;
895 if (passno == 1)
896 fs->fs_active = malloc(howmany(fs->fs_ncg, NBBY),
897 M_DEVBUF, M_WAITOK | M_ZERO);
898 for (cg = 0; cg < fs->fs_ncg; cg++) {
899 if (passno == 2 && ACTIVECG_ISSET(fs, cg))
900 continue;
901
902 if (redo != NULL)
903 *redo += 1;
904 error = UFS_WAPBL_BEGIN(vp->v_mount);
905 if (error)
906 return error;
907 error = ffs_balloc(vp, ffs_lfragtosize(fs, cgtod(fs, cg)),
908 fs->fs_bsize, curlwp->l_cred, 0, &nbp);
909 if (error) {
910 UFS_WAPBL_END(vp->v_mount);
911 break;
912 }
913 error = cgaccount1(cg, vp, nbp->b_data, passno);
914 bawrite(nbp);
915 UFS_WAPBL_END(vp->v_mount);
916 if (error)
917 break;
918 }
919 return error;
920 }
921
922 /*
923 * Copy a cylinder group map. All the unallocated blocks are marked
924 * BLK_NOCOPY so that the snapshot knows that it need not copy them
925 * if they are later written. If passno is one, then this is a first
926 * pass, so only setting needs to be done. If passno is 2, then this
927 * is a revision to a previous pass which must be undone as the
928 * replacement pass is done.
929 */
930 static int
931 cgaccount1(int cg, struct vnode *vp, void *data, int passno)
932 {
933 struct buf *bp, *ibp;
934 struct inode *ip;
935 struct cg *cgp;
936 struct fs *fs;
937 struct lwp *l = curlwp;
938 daddr_t base, numblks;
939 int error, len, loc, ns __unused, indiroff;
940
941 ip = VTOI(vp);
942 fs = ip->i_fs;
943 ns = UFS_FSNEEDSWAP(fs);
944 error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
945 (int)fs->fs_cgsize, l->l_cred, 0, &bp);
946 if (error) {
947 return (error);
948 }
949 cgp = (struct cg *)bp->b_data;
950 if (!cg_chkmagic(cgp, ns)) {
951 brelse(bp, 0);
952 return (EIO);
953 }
954 ACTIVECG_SET(fs, cg);
955
956 memcpy(data, bp->b_data, fs->fs_cgsize);
957 brelse(bp, 0);
958 if (fs->fs_cgsize < fs->fs_bsize)
959 memset((char *)data + fs->fs_cgsize, 0,
960 fs->fs_bsize - fs->fs_cgsize);
961 numblks = howmany(fs->fs_size, fs->fs_frag);
962 len = howmany(fs->fs_fpg, fs->fs_frag);
963 base = cg * fs->fs_fpg / fs->fs_frag;
964 if (base + len >= numblks)
965 len = numblks - base - 1;
966 loc = 0;
967 if (base < UFS_NDADDR) {
968 for ( ; loc < UFS_NDADDR; loc++) {
969 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
970 db_assign(ip, loc, BLK_NOCOPY);
971 else if (db_get(ip, loc) == BLK_NOCOPY) {
972 if (passno == 2)
973 db_assign(ip, loc, 0);
974 else if (passno == 1)
975 panic("ffs_snapshot: lost direct block");
976 }
977 }
978 }
979 if ((error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(base + loc)),
980 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
981 return (error);
982 indiroff = (base + loc - UFS_NDADDR) % FFS_NINDIR(fs);
983 for ( ; loc < len; loc++, indiroff++) {
984 if (indiroff >= FFS_NINDIR(fs)) {
985 bawrite(ibp);
986 if ((error = ffs_balloc(vp,
987 ffs_lblktosize(fs, (off_t)(base + loc)),
988 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
989 return (error);
990 indiroff = 0;
991 }
992 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
993 idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY);
994 else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) {
995 if (passno == 2)
996 idb_assign(ip, ibp->b_data, indiroff, 0);
997 else if (passno == 1)
998 panic("ffs_snapshot: lost indirect block");
999 }
1000 }
1001 bdwrite(ibp);
1002 return (0);
1003 }
1004
1005 /*
1006 * Before expunging a snapshot inode, note all the
1007 * blocks that it claims with BLK_SNAP so that fsck will
1008 * be able to account for those blocks properly and so
1009 * that this snapshot knows that it need not copy them
1010 * if the other snapshot holding them is freed.
1011 */
1012 static int
1013 expunge(struct vnode *snapvp, struct inode *cancelip, struct fs *fs,
1014 acctfunc_t acctfunc, int expungetype)
1015 {
1016 int i, error, ns __unused;
1017 daddr_t lbn, rlbn;
1018 daddr_t len, blkno, numblks, blksperindir;
1019 struct ufs1_dinode *dip1;
1020 struct ufs2_dinode *dip2;
1021 struct lwp *l = curlwp;
1022 void *bap;
1023 struct buf *bp;
1024 struct mount *mp;
1025
1026 ns = UFS_FSNEEDSWAP(fs);
1027 mp = snapvp->v_mount;
1028
1029 error = UFS_WAPBL_BEGIN(mp);
1030 if (error)
1031 return error;
1032 /*
1033 * Prepare to expunge the inode. If its inode block has not
1034 * yet been copied, then allocate and fill the copy.
1035 */
1036 lbn = ffs_fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
1037 error = snapblkaddr(snapvp, lbn, &blkno);
1038 if (error)
1039 return error;
1040 if (blkno != 0) {
1041 error = bread(snapvp, lbn, fs->fs_bsize, l->l_cred,
1042 B_MODIFY, &bp);
1043 } else {
1044 error = ffs_balloc(snapvp, ffs_lblktosize(fs, (off_t)lbn),
1045 fs->fs_bsize, l->l_cred, 0, &bp);
1046 if (! error)
1047 error = rwfsblk(snapvp, B_READ, bp->b_data, lbn);
1048 }
1049 if (error) {
1050 UFS_WAPBL_END(mp);
1051 return error;
1052 }
1053 /*
1054 * Set a snapshot inode to be a zero length file, regular files
1055 * or unlinked snapshots to be completely unallocated.
1056 */
1057 if (fs->fs_magic == FS_UFS1_MAGIC) {
1058 dip1 = (struct ufs1_dinode *)bp->b_data +
1059 ino_to_fsbo(fs, cancelip->i_number);
1060 if (cancelip->i_flags & SF_SNAPSHOT) {
1061 dip1->di_flags =
1062 ufs_rw32(ufs_rw32(dip1->di_flags, ns) |
1063 SF_SNAPINVAL, ns);
1064 }
1065 if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
1066 dip1->di_mode = 0;
1067 dip1->di_size = 0;
1068 dip1->di_blocks = 0;
1069 memset(&dip1->di_db[0], 0, (UFS_NDADDR + UFS_NIADDR) * sizeof(int32_t));
1070 } else {
1071 dip2 = (struct ufs2_dinode *)bp->b_data +
1072 ino_to_fsbo(fs, cancelip->i_number);
1073 if (cancelip->i_flags & SF_SNAPSHOT) {
1074 dip2->di_flags =
1075 ufs_rw32(ufs_rw32(dip2->di_flags, ns) |
1076 SF_SNAPINVAL, ns);
1077 }
1078 if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
1079 dip2->di_mode = 0;
1080 dip2->di_size = 0;
1081 dip2->di_blocks = 0;
1082 memset(&dip2->di_db[0], 0, (UFS_NDADDR + UFS_NIADDR) * sizeof(int64_t));
1083 }
1084 bdwrite(bp);
1085 UFS_WAPBL_END(mp);
1086 /*
1087 * Now go through and expunge all the blocks in the file
1088 * using the function requested.
1089 */
1090 numblks = howmany(cancelip->i_size, fs->fs_bsize);
1091 if (fs->fs_magic == FS_UFS1_MAGIC)
1092 bap = &cancelip->i_ffs1_db[0];
1093 else
1094 bap = &cancelip->i_ffs2_db[0];
1095 error = (*acctfunc)(snapvp, bap, 0, UFS_NDADDR, fs, 0, expungetype);
1096 if (error)
1097 return (error);
1098 if (fs->fs_magic == FS_UFS1_MAGIC)
1099 bap = &cancelip->i_ffs1_ib[0];
1100 else
1101 bap = &cancelip->i_ffs2_ib[0];
1102 error = (*acctfunc)(snapvp, bap, 0, UFS_NIADDR, fs, -1, expungetype);
1103 if (error)
1104 return (error);
1105 blksperindir = 1;
1106 lbn = -UFS_NDADDR;
1107 len = numblks - UFS_NDADDR;
1108 rlbn = UFS_NDADDR;
1109 for (i = 0; len > 0 && i < UFS_NIADDR; i++) {
1110 error = indiracct(snapvp, ITOV(cancelip), i,
1111 ib_get(cancelip, i), lbn, rlbn, len,
1112 blksperindir, fs, acctfunc, expungetype);
1113 if (error)
1114 return (error);
1115 blksperindir *= FFS_NINDIR(fs);
1116 lbn -= blksperindir + 1;
1117 len -= blksperindir;
1118 rlbn += blksperindir;
1119 }
1120 return (0);
1121 }
1122
1123 /*
1124 * Descend an indirect block chain for vnode cancelvp accounting for all
1125 * its indirect blocks in snapvp.
1126 */
1127 static int
1128 indiracct(struct vnode *snapvp, struct vnode *cancelvp, int level,
1129 daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks,
1130 daddr_t blksperindir, struct fs *fs, acctfunc_t acctfunc, int expungetype)
1131 {
1132 int error, num, i;
1133 daddr_t subblksperindir;
1134 struct indir indirs[UFS_NIADDR + 2];
1135 daddr_t last;
1136 void *bap;
1137 struct buf *bp;
1138
1139 if (blkno == 0) {
1140 if (expungetype == BLK_NOCOPY)
1141 return (0);
1142 panic("indiracct: missing indir");
1143 }
1144 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
1145 return (error);
1146 if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
1147 panic("indiracct: botched params");
1148 /*
1149 * We have to expand bread here since it will deadlock looking
1150 * up the block number for any blocks that are not in the cache.
1151 */
1152 error = ffs_getblk(cancelvp, lbn, FFS_FSBTODB(fs, blkno), fs->fs_bsize,
1153 false, &bp);
1154 if (error)
1155 return error;
1156 if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error =
1157 rwfsblk(bp->b_vp, B_READ, bp->b_data, ffs_fragstoblks(fs, blkno)))) {
1158 brelse(bp, 0);
1159 return (error);
1160 }
1161 /*
1162 * Account for the block pointers in this indirect block.
1163 */
1164 last = howmany(remblks, blksperindir);
1165 if (last > FFS_NINDIR(fs))
1166 last = FFS_NINDIR(fs);
1167 bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK | M_ZERO);
1168 memcpy((void *)bap, bp->b_data, fs->fs_bsize);
1169 brelse(bp, 0);
1170 error = (*acctfunc)(snapvp, bap, 0, last,
1171 fs, level == 0 ? rlbn : -1, expungetype);
1172 if (error || level == 0)
1173 goto out;
1174 /*
1175 * Account for the block pointers in each of the indirect blocks
1176 * in the levels below us.
1177 */
1178 subblksperindir = blksperindir / FFS_NINDIR(fs);
1179 for (lbn++, level--, i = 0; i < last; i++) {
1180 error = indiracct(snapvp, cancelvp, level,
1181 idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks,
1182 subblksperindir, fs, acctfunc, expungetype);
1183 if (error)
1184 goto out;
1185 rlbn += blksperindir;
1186 lbn -= blksperindir;
1187 remblks -= blksperindir;
1188 }
1189 out:
1190 free(bap, M_DEVBUF);
1191 return (error);
1192 }
1193
1194 /*
1195 * Do both snap accounting and map accounting.
1196 */
1197 static int
1198 fullacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
1199 struct fs *fs, daddr_t lblkno,
1200 int exptype /* BLK_SNAP or BLK_NOCOPY */)
1201 {
1202 int error;
1203
1204 if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)))
1205 return (error);
1206 return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype));
1207 }
1208
1209 /*
1210 * Identify a set of blocks allocated in a snapshot inode.
1211 */
1212 static int
1213 snapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
1214 struct fs *fs, daddr_t lblkno,
1215 int expungetype /* BLK_SNAP or BLK_NOCOPY */)
1216 {
1217 struct inode *ip = VTOI(vp);
1218 struct lwp *l = curlwp;
1219 struct mount *mp = vp->v_mount;
1220 daddr_t blkno;
1221 daddr_t lbn;
1222 struct buf *ibp;
1223 int error, n;
1224 const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;
1225
1226 error = UFS_WAPBL_BEGIN(mp);
1227 if (error)
1228 return error;
1229 for ( n = 0; oldblkp < lastblkp; oldblkp++) {
1230 blkno = idb_get(ip, bap, oldblkp);
1231 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
1232 continue;
1233 lbn = ffs_fragstoblks(fs, blkno);
1234 if (lbn < UFS_NDADDR) {
1235 blkno = db_get(ip, lbn);
1236 ip->i_flag |= IN_CHANGE | IN_UPDATE;
1237 } else {
1238 error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn),
1239 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
1240 if (error)
1241 break;
1242 blkno = idb_get(ip, ibp->b_data,
1243 (lbn - UFS_NDADDR) % FFS_NINDIR(fs));
1244 }
1245 /*
1246 * If we are expunging a snapshot vnode and we
1247 * find a block marked BLK_NOCOPY, then it is
1248 * one that has been allocated to this snapshot after
1249 * we took our current snapshot and can be ignored.
1250 */
1251 if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
1252 if (lbn >= UFS_NDADDR)
1253 brelse(ibp, 0);
1254 } else {
1255 if (blkno != 0)
1256 panic("snapacct: bad block");
1257 if (lbn < UFS_NDADDR)
1258 db_assign(ip, lbn, expungetype);
1259 else {
1260 idb_assign(ip, ibp->b_data,
1261 (lbn - UFS_NDADDR) % FFS_NINDIR(fs), expungetype);
1262 bdwrite(ibp);
1263 }
1264 }
1265 if (wbreak > 0 && (++n % wbreak) == 0) {
1266 UFS_WAPBL_END(mp);
1267 error = UFS_WAPBL_BEGIN(mp);
1268 if (error)
1269 return error;
1270 }
1271 }
1272 UFS_WAPBL_END(mp);
1273 return error;
1274 }
1275
1276 /*
1277 * Account for a set of blocks allocated in a snapshot inode.
1278 */
1279 static int
1280 mapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
1281 struct fs *fs, daddr_t lblkno, int expungetype)
1282 {
1283 daddr_t blkno;
1284 struct inode *ip;
1285 struct mount *mp = vp->v_mount;
1286 ino_t inum;
1287 int acctit, error, n;
1288 const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;
1289
1290 error = UFS_WAPBL_BEGIN(mp);
1291 if (error)
1292 return error;
1293 ip = VTOI(vp);
1294 inum = ip->i_number;
1295 if (lblkno == -1)
1296 acctit = 0;
1297 else
1298 acctit = 1;
1299 for ( n = 0; oldblkp < lastblkp; oldblkp++, lblkno++) {
1300 blkno = idb_get(ip, bap, oldblkp);
1301 if (blkno == 0 || blkno == BLK_NOCOPY)
1302 continue;
1303 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
1304 *ip->i_snapblklist++ = lblkno;
1305 if (blkno == BLK_SNAP)
1306 blkno = ffs_blkstofrags(fs, lblkno);
1307 ffs_blkfree_snap(fs, vp, blkno, fs->fs_bsize, inum);
1308 if (wbreak > 0 && (++n % wbreak) == 0) {
1309 UFS_WAPBL_END(mp);
1310 error = UFS_WAPBL_BEGIN(mp);
1311 if (error)
1312 return error;
1313 }
1314 }
1315 UFS_WAPBL_END(mp);
1316 return (0);
1317 }
1318
1319 /*
1320 * Number of blocks that fit into the journal or zero if not logging.
1321 */
1322 static int
1323 blocks_in_journal(struct fs *fs)
1324 {
1325 off_t bpj;
1326
1327 if ((fs->fs_flags & FS_DOWAPBL) == 0)
1328 return 0;
1329 bpj = 1;
1330 if (fs->fs_journal_version == UFS_WAPBL_VERSION) {
1331 switch (fs->fs_journal_location) {
1332 case UFS_WAPBL_JOURNALLOC_END_PARTITION:
1333 bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]*
1334 fs->fs_journallocs[UFS_WAPBL_EPART_COUNT];
1335 break;
1336 case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
1337 bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]*
1338 fs->fs_journallocs[UFS_WAPBL_INFS_COUNT];
1339 break;
1340 }
1341 }
1342 bpj /= fs->fs_bsize;
1343 return (bpj > 0 ? bpj : 1);
1344 }
1345 #endif /* defined(FFS_NO_SNAPSHOT) */
1346
1347 /*
1348 * Decrement extra reference on snapshot when last name is removed.
1349 * It will not be freed until the last open reference goes away.
1350 */
1351 void
1352 ffs_snapgone(struct vnode *vp)
1353 {
1354 struct inode *xp, *ip = VTOI(vp);
1355 struct mount *mp = spec_node_getmountedfs(ip->i_devvp);
1356 struct fs *fs;
1357 struct snap_info *si;
1358 int snaploc;
1359
1360 si = VFSTOUFS(mp)->um_snapinfo;
1361
1362 /*
1363 * Find snapshot in incore list.
1364 */
1365 mutex_enter(&si->si_lock);
1366 TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
1367 if (xp == ip)
1368 break;
1369 mutex_exit(&si->si_lock);
1370 if (xp != NULL)
1371 vrele(ITOV(ip));
1372 #ifdef DEBUG
1373 else if (snapdebug)
1374 printf("ffs_snapgone: lost snapshot vnode %llu\n",
1375 (unsigned long long)ip->i_number);
1376 #endif
1377 /*
1378 * Delete snapshot inode from superblock. Keep list dense.
1379 */
1380 mutex_enter(&si->si_lock);
1381 fs = ip->i_fs;
1382 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
1383 if (fs->fs_snapinum[snaploc] == ip->i_number)
1384 break;
1385 if (snaploc < FSMAXSNAP) {
1386 for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
1387 if (fs->fs_snapinum[snaploc] == 0)
1388 break;
1389 fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
1390 }
1391 fs->fs_snapinum[snaploc - 1] = 0;
1392 }
1393 si->si_gen++;
1394 mutex_exit(&si->si_lock);
1395 }
1396
1397 /*
1398 * Prepare a snapshot file for being removed.
1399 */
1400 void
1401 ffs_snapremove(struct vnode *vp)
1402 {
1403 struct inode *ip = VTOI(vp), *xp;
1404 struct vnode *devvp = ip->i_devvp;
1405 struct fs *fs = ip->i_fs;
1406 struct mount *mp = spec_node_getmountedfs(devvp);
1407 struct buf *ibp;
1408 struct snap_info *si;
1409 struct lwp *l = curlwp;
1410 daddr_t numblks, blkno, dblk;
1411 int error, loc, last;
1412
1413 si = VFSTOUFS(mp)->um_snapinfo;
1414 /*
1415 * If active, delete from incore list (this snapshot may
1416 * already have been in the process of being deleted, so
1417 * would not have been active).
1418 *
1419 * Clear copy-on-write flag if last snapshot.
1420 */
1421 mutex_enter(&si->si_snaplock);
1422 mutex_enter(&si->si_lock);
1423 if (is_active_snapshot(si, ip)) {
1424 TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap);
1425 if (TAILQ_FIRST(&si->si_snapshots) != 0) {
1426 /* Roll back the list of preallocated blocks. */
1427 xp = TAILQ_LAST(&si->si_snapshots, inodelst);
1428 si->si_snapblklist = xp->i_snapblklist;
1429 si->si_gen++;
1430 mutex_exit(&si->si_lock);
1431 mutex_exit(&si->si_snaplock);
1432 } else {
1433 si->si_snapblklist = 0;
1434 si->si_gen++;
1435 mutex_exit(&si->si_lock);
1436 mutex_exit(&si->si_snaplock);
1437 fscow_disestablish(mp, ffs_copyonwrite, devvp);
1438 }
1439 if (ip->i_snapblklist != NULL) {
1440 free(ip->i_snapblklist, M_UFSMNT);
1441 ip->i_snapblklist = NULL;
1442 }
1443 } else {
1444 mutex_exit(&si->si_lock);
1445 mutex_exit(&si->si_snaplock);
1446 }
1447 /*
1448 * Clear all BLK_NOCOPY fields. Pass any block claims to other
1449 * snapshots that want them (see ffs_snapblkfree below).
1450 */
1451 for (blkno = 1; blkno < UFS_NDADDR; blkno++) {
1452 dblk = db_get(ip, blkno);
1453 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
1454 db_assign(ip, blkno, 0);
1455 else if ((dblk == ffs_blkstofrags(fs, blkno) &&
1456 ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
1457 ip->i_number))) {
1458 DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
1459 db_assign(ip, blkno, 0);
1460 }
1461 }
1462 numblks = howmany(ip->i_size, fs->fs_bsize);
1463 for (blkno = UFS_NDADDR; blkno < numblks; blkno += FFS_NINDIR(fs)) {
1464 error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)blkno),
1465 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
1466 if (error)
1467 continue;
1468 if (fs->fs_size - blkno > FFS_NINDIR(fs))
1469 last = FFS_NINDIR(fs);
1470 else
1471 last = fs->fs_size - blkno;
1472 for (loc = 0; loc < last; loc++) {
1473 dblk = idb_get(ip, ibp->b_data, loc);
1474 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
1475 idb_assign(ip, ibp->b_data, loc, 0);
1476 else if (dblk == ffs_blkstofrags(fs, blkno) &&
1477 ffs_snapblkfree(fs, ip->i_devvp, dblk,
1478 fs->fs_bsize, ip->i_number)) {
1479 DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
1480 idb_assign(ip, ibp->b_data, loc, 0);
1481 }
1482 }
1483 bawrite(ibp);
1484 UFS_WAPBL_END(mp);
1485 error = UFS_WAPBL_BEGIN(mp);
1486 KASSERT(error == 0);
1487 }
1488 /*
1489 * Clear snapshot flag and drop reference.
1490 */
1491 ip->i_flags &= ~(SF_SNAPSHOT | SF_SNAPINVAL);
1492 DIP_ASSIGN(ip, flags, ip->i_flags);
1493 ip->i_flag |= IN_CHANGE | IN_UPDATE;
1494 #if defined(QUOTA) || defined(QUOTA2)
1495 chkdq(ip, DIP(ip, blocks), l->l_cred, FORCE);
1496 chkiq(ip, 1, l->l_cred, FORCE);
1497 #endif
1498 }
1499
/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 *
 * fs		superblock of the file system
 * devvp	device vnode, used to find the mount and its snapshot info
 * bno		file system block number being freed
 * size		number of bytes being freed (fs_bsize for a full block)
 * inum		inode the block belonged to (only used for debug output)
 *
 * si_snaplock is held for the whole call.  si_lock is dropped around
 * every operation that does buffer or disk I/O; si_gen is sampled
 * before the list walk and the walk is restarted whenever it changed
 * while si_lock was released.
 */
int
ffs_snapblkfree(struct fs *fs, struct vnode *devvp, daddr_t bno,
    long size, ino_t inum)
{
	struct mount *mp = spec_node_getmountedfs(devvp);
	struct buf *ibp;
	struct inode *ip;
	struct vnode *vp = NULL;
	struct snap_info *si;
	void *saved_data = NULL;
	daddr_t lbn;
	daddr_t blkno;
	uint32_t gen;
	int indiroff = 0, error = 0, claimedblk = 0;

	si = VFSTOUFS(mp)->um_snapinfo;
	lbn = ffs_fragstoblks(fs, bno);
	mutex_enter(&si->si_snaplock);
	mutex_enter(&si->si_lock);
	/* ffs_copyonwrite() checks si_owner before taking si_snaplock. */
	si->si_owner = curlwp;

retry:
	gen = si->si_gen;
	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < UFS_NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			/* ibp stays held until released or written below. */
			mutex_exit(&si->si_lock);
			error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, FSCRED, B_METAONLY, &ibp);
			if (error) {
				mutex_enter(&si->si_lock);
				break;
			}
			indiroff = (lbn - UFS_NDADDR) % FFS_NINDIR(fs);
			blkno = idb_get(ip, ibp->b_data, indiroff);
			mutex_enter(&si->si_lock);
			if (gen != si->si_gen) {
				brelse(ibp, 0);
				goto retry;
			}
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			if (lbn < UFS_NDADDR) {
				db_assign(ip, lbn, BLK_NOCOPY);
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else {
				idb_assign(ip, ibp->b_data, indiroff,
				    BLK_NOCOPY);
				mutex_exit(&si->si_lock);
				/* Synchronous write while still linked. */
				if (ip->i_nlink > 0)
					bwrite(ibp);
				else
					bdwrite(ibp);
				mutex_enter(&si->si_lock);
				if (gen != si->si_gen)
					goto retry;
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= UFS_NDADDR)
				brelse(ibp, 0);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %llu lbn %" PRId64
				    "from inum %llu\n",
				    "Grabonremove: snapino",
				    (unsigned long long)ip->i_number,
				    lbn, (unsigned long long)inum);
#endif
			mutex_exit(&si->si_lock);
			if (lbn < UFS_NDADDR) {
				db_assign(ip, lbn, bno);
			} else {
				idb_assign(ip, ibp->b_data, indiroff, bno);
				if (ip->i_nlink > 0)
					bwrite(ibp);
				else
					bdwrite(ibp);
			}
			DIP_ADD(ip, blocks, btodb(size));
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (ip->i_nlink > 0 && mp->mnt_wapbl)
				error = syncsnap(vp);
			else
				error = 0;
			mutex_enter(&si->si_lock);
			si->si_owner = NULL;
			mutex_exit(&si->si_lock);
			mutex_exit(&si->si_snaplock);
			/* Non-zero (claimed) unless syncsnap() failed. */
			return (error == 0);
		}
		/* Fragment: the indirect block is no longer needed. */
		if (lbn >= UFS_NDADDR)
			brelse(ibp, 0);
#ifdef DEBUG
		if (snapdebug)
			printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n",
			    "Copyonremove: snapino ",
			    (unsigned long long)ip->i_number,
			    lbn, "for inum", (unsigned long long)inum, size);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		mutex_exit(&si->si_lock);
		if (saved_data == NULL) {
			/* Read once, reuse for every further snapshot. */
			saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
			error = rwfsblk(vp, B_READ, saved_data, lbn);
			if (error) {
				free(saved_data, M_UFSMNT);
				saved_data = NULL;
				mutex_enter(&si->si_lock);
				break;
			}
		}
		error = wrsnapblk(vp, saved_data, lbn);
		if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
			error = syncsnap(vp);
		mutex_enter(&si->si_lock);
		if (error)
			break;
		if (gen != si->si_gen)
			goto retry;
	}
	si->si_owner = NULL;
	mutex_exit(&si->si_lock);
	mutex_exit(&si->si_snaplock);
	if (saved_data)
		free(saved_data, M_UFSMNT);
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	return (error);
}
1694
1695 /*
1696 * Associate snapshot files when mounting.
1697 */
1698 void
1699 ffs_snapshot_mount(struct mount *mp)
1700 {
1701 struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
1702 struct fs *fs = VFSTOUFS(mp)->um_fs;
1703 struct lwp *l = curlwp;
1704 struct vnode *vp;
1705 struct inode *ip, *xp;
1706 struct snap_info *si;
1707 daddr_t snaplistsize, *snapblklist;
1708 int i, error, ns __unused, snaploc, loc;
1709
1710 /*
1711 * No persistent snapshots on apple ufs file systems.
1712 */
1713 if (UFS_MPISAPPLEUFS(VFSTOUFS(mp)))
1714 return;
1715
1716 si = VFSTOUFS(mp)->um_snapinfo;
1717 ns = UFS_FSNEEDSWAP(fs);
1718 /*
1719 * XXX The following needs to be set before ffs_truncate or
1720 * VOP_READ can be called.
1721 */
1722 mp->mnt_stat.f_iosize = fs->fs_bsize;
1723 /*
1724 * Process each snapshot listed in the superblock.
1725 */
1726 vp = NULL;
1727 mutex_enter(&si->si_lock);
1728 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
1729 if (fs->fs_snapinum[snaploc] == 0)
1730 break;
1731 if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
1732 &vp)) != 0) {
1733 printf("ffs_snapshot_mount: vget failed %d\n", error);
1734 continue;
1735 }
1736 ip = VTOI(vp);
1737 if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) !=
1738 SF_SNAPSHOT) {
1739 printf("ffs_snapshot_mount: non-snapshot inode %d\n",
1740 fs->fs_snapinum[snaploc]);
1741 vput(vp);
1742 vp = NULL;
1743 for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
1744 if (fs->fs_snapinum[loc] == 0)
1745 break;
1746 fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
1747 }
1748 fs->fs_snapinum[loc - 1] = 0;
1749 snaploc--;
1750 continue;
1751 }
1752
1753 /*
1754 * Read the block hints list. Use an empty list on
1755 * read errors.
1756 */
1757 error = vn_rdwr(UIO_READ, vp,
1758 (void *)&snaplistsize, sizeof(snaplistsize),
1759 ffs_lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
1760 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
1761 l->l_cred, NULL, NULL);
1762 if (error) {
1763 printf("ffs_snapshot_mount: read_1 failed %d\n", error);
1764 snaplistsize = 1;
1765 } else
1766 snaplistsize = ufs_rw64(snaplistsize, ns);
1767 snapblklist = malloc(
1768 snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
1769 if (error)
1770 snapblklist[0] = 1;
1771 else {
1772 error = vn_rdwr(UIO_READ, vp, (void *)snapblklist,
1773 snaplistsize * sizeof(daddr_t),
1774 ffs_lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
1775 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
1776 l->l_cred, NULL, NULL);
1777 for (i = 0; i < snaplistsize; i++)
1778 snapblklist[i] = ufs_rw64(snapblklist[i], ns);
1779 if (error) {
1780 printf("ffs_snapshot_mount: read_2 failed %d\n",
1781 error);
1782 snapblklist[0] = 1;
1783 }
1784 }
1785 ip->i_snapblklist = &snapblklist[0];
1786
1787 /*
1788 * Link it onto the active snapshot list.
1789 */
1790 if (is_active_snapshot(si, ip))
1791 panic("ffs_snapshot_mount: %"PRIu64" already on list",
1792 ip->i_number);
1793 else
1794 TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
1795 vp->v_vflag |= VV_SYSTEM;
1796 VOP_UNLOCK(vp);
1797 }
1798 /*
1799 * No usable snapshots found.
1800 */
1801 if (vp == NULL) {
1802 mutex_exit(&si->si_lock);
1803 return;
1804 }
1805 /*
1806 * Attach the block hints list. We always want to
1807 * use the list from the newest snapshot.
1808 */
1809 xp = TAILQ_LAST(&si->si_snapshots, inodelst);
1810 si->si_snapblklist = xp->i_snapblklist;
1811 fscow_establish(mp, ffs_copyonwrite, devvp);
1812 si->si_gen++;
1813 mutex_exit(&si->si_lock);
1814 }
1815
1816 /*
1817 * Disassociate snapshot files when unmounting.
1818 */
1819 void
1820 ffs_snapshot_unmount(struct mount *mp)
1821 {
1822 struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
1823 struct inode *xp;
1824 struct vnode *vp = NULL;
1825 struct snap_info *si;
1826
1827 si = VFSTOUFS(mp)->um_snapinfo;
1828 mutex_enter(&si->si_lock);
1829 while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) {
1830 vp = ITOV(xp);
1831 TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap);
1832 if (xp->i_snapblklist == si->si_snapblklist)
1833 si->si_snapblklist = NULL;
1834 free(xp->i_snapblklist, M_UFSMNT);
1835 if (xp->i_nlink > 0) {
1836 si->si_gen++;
1837 mutex_exit(&si->si_lock);
1838 vrele(vp);
1839 mutex_enter(&si->si_lock);
1840 }
1841 }
1842 si->si_gen++;
1843 mutex_exit(&si->si_lock);
1844 if (vp)
1845 fscow_disestablish(mp, ffs_copyonwrite, devvp);
1846 }
1847
/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 *
 * v		device vnode (the cookie given to fscow_establish())
 * bp		buffer about to be written
 * data_valid	true if bp->b_data already holds the block's old contents
 *
 * Returns 0 if no copy was needed or all copies succeeded, otherwise
 * the first error.  Returns ENOMEM when running as the pagedaemon and
 * a copy would have been required.
 */
static int
ffs_copyonwrite(void *v, struct buf *bp, bool data_valid)
{
	struct fs *fs;
	struct inode *ip;
	struct vnode *devvp = v, *vp = NULL;
	struct mount *mp = spec_node_getmountedfs(devvp);
	struct snap_info *si;
	void *saved_data = NULL;
	daddr_t lbn, blkno, *snapblklist;
	uint32_t gen;
	int lower, upper, mid, snapshot_locked = 0, error = 0;

	/*
	 * Check for valid snapshots.
	 */
	si = VFSTOUFS(mp)->um_snapinfo;
	mutex_enter(&si->si_lock);
	ip = TAILQ_FIRST(&si->si_snapshots);
	if (ip == NULL) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * First check to see if it is after the file system,
	 * in the journal or in the preallocated list.
	 * By doing these checks we avoid several potential deadlocks.
	 */
	fs = ip->i_fs;
	lbn = ffs_fragstoblks(fs, FFS_DBTOFSB(fs, bp->b_blkno));
	if (bp->b_blkno >= FFS_FSBTODB(fs, fs->fs_size)) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	if ((fs->fs_flags & FS_DOWAPBL) &&
	    fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
		off_t blk_off, log_start, log_end;

		/* Byte range [log_start, log_end) of the in-fs journal. */
		log_start = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] *
		    fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
		log_end = log_start + fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] *
		    fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
		blk_off = dbtob(bp->b_blkno);
		if (blk_off >= log_start && blk_off < log_end) {
			mutex_exit(&si->si_lock);
			return 0;
		}
	}
	/*
	 * Binary search of the hints list; element zero holds the
	 * list length, the entries proper start at index one.
	 */
	snapblklist = si->si_snapblklist;
	upper = (snapblklist != NULL ? snapblklist[0] - 1 : 0);
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	/* The loop only ends with lower <= upper when lbn was found. */
	if (lower <= upper) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
	if (si->si_owner != curlwp) {
		/*
		 * Try to get si_snaplock without dropping si_lock;
		 * otherwise release si_lock and take both in order.
		 */
		if (!mutex_tryenter(&si->si_snaplock)) {
			mutex_exit(&si->si_lock);
			mutex_enter(&si->si_snaplock);
			mutex_enter(&si->si_lock);
		}
		si->si_owner = curlwp;
		snapshot_locked = 1;
	}
	/* A full block of valid data can be copied without reading. */
	if (data_valid && bp->b_bcount == fs->fs_bsize)
		saved_data = bp->b_data;
retry:
	/* Restart the walk whenever si_gen changed while unlocked. */
	gen = si->si_gen;
	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in ffs_balloc.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied.
		 */
		if (lbn < UFS_NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			mutex_exit(&si->si_lock);
			blkno = 0; /* XXX: GCC */
			if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
				mutex_enter(&si->si_lock);
				break;
			}
			mutex_enter(&si->si_lock);
			if (gen != si->si_gen)
				goto retry;
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		/* Non-zero: already copied or marked as not needed. */
		if (blkno != 0)
			continue;

		/* The pagedaemon gets ENOMEM instead of a copy. */
		if (curlwp == uvm.pagedaemon_lwp) {
			error = ENOMEM;
			break;
		}
		/* Only one level of recursion allowed. */
		KASSERT(snapshot_locked);
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
			    (unsigned long long)ip->i_number, lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %llu", (unsigned long long)
				    VTOI(bp->b_vp)->i_number);
			printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		mutex_exit(&si->si_lock);
		if (saved_data == NULL) {
			saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
			error = rwfsblk(vp, B_READ, saved_data, lbn);
			if (error) {
				free(saved_data, M_UFSMNT);
				saved_data = NULL;
				mutex_enter(&si->si_lock);
				break;
			}
		}
		error = wrsnapblk(vp, saved_data, lbn);
		if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
			error = syncsnap(vp);
		mutex_enter(&si->si_lock);
		if (error)
			break;
		if (gen != si->si_gen)
			goto retry;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (snapshot_locked) {
		si->si_owner = NULL;
		mutex_exit(&si->si_lock);
		mutex_exit(&si->si_snaplock);
	} else
		mutex_exit(&si->si_lock);
	/* Only free what was allocated here, never the caller's data. */
	if (saved_data && saved_data != bp->b_data)
		free(saved_data, M_UFSMNT);
	return error;
}
2035
2036 /*
2037 * Read from a snapshot.
2038 */
2039 int
2040 ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag)
2041 {
2042 struct inode *ip = VTOI(vp);
2043 struct fs *fs = ip->i_fs;
2044 struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo;
2045 struct buf *bp;
2046 daddr_t lbn, nextlbn;
2047 off_t fsbytes, bytesinfile;
2048 long size, xfersize, blkoffset;
2049 int error;
2050
2051 fstrans_start(vp->v_mount, FSTRANS_SHARED);
2052 mutex_enter(&si->si_snaplock);
2053
2054 if (ioflag & IO_ALTSEMANTICS)
2055 fsbytes = ip->i_size;
2056 else
2057 fsbytes = ffs_lfragtosize(fs, fs->fs_size);
2058 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
2059 bytesinfile = fsbytes - uio->uio_offset;
2060 if (bytesinfile <= 0)
2061 break;
2062 lbn = ffs_lblkno(fs, uio->uio_offset);
2063 nextlbn = lbn + 1;
2064 size = fs->fs_bsize;
2065 blkoffset = ffs_blkoff(fs, uio->uio_offset);
2066 xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
2067 bytesinfile);
2068
2069 if (ffs_lblktosize(fs, nextlbn + 1) >= fsbytes) {
2070 if (ffs_lblktosize(fs, lbn) + size > fsbytes)
2071 size = ffs_fragroundup(fs,
2072 fsbytes - ffs_lblktosize(fs, lbn));
2073 error = bread(vp, lbn, size, NOCRED, 0, &bp);
2074 } else {
2075 int nextsize = fs->fs_bsize;
2076 error = breadn(vp, lbn,
2077 size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp);
2078 }
2079 if (error)
2080 break;
2081
2082 /*
2083 * We should only get non-zero b_resid when an I/O error
2084 * has occurred, which should cause us to break above.
2085 * However, if the short read did not cause an error,
2086 * then we want to ensure that we do not uiomove bad
2087 * or uninitialized data.
2088 */
2089 size -= bp->b_resid;
2090 if (size < blkoffset + xfersize) {
2091 xfersize = size - blkoffset;
2092 if (xfersize <= 0)
2093 break;
2094 }
2095 error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
2096 if (error)
2097 break;
2098 brelse(bp, BC_AGE);
2099 }
2100 if (bp != NULL)
2101 brelse(bp, BC_AGE);
2102
2103 mutex_exit(&si->si_snaplock);
2104 fstrans_done(vp->v_mount);
2105 return error;
2106 }
2107
2108 /*
2109 * Lookup a snapshots data block address.
2110 * Simpler than UFS_BALLOC() as we know all metadata is already allocated
2111 * and safe even for the pagedaemon where we cannot bread().
2112 */
2113 static int
2114 snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res)
2115 {
2116 struct indir indirs[UFS_NIADDR + 2];
2117 struct inode *ip = VTOI(vp);
2118 struct fs *fs = ip->i_fs;
2119 struct buf *bp;
2120 int error, num;
2121
2122 KASSERT(lbn >= 0);
2123
2124 if (lbn < UFS_NDADDR) {
2125 *res = db_get(ip, lbn);
2126 return 0;
2127 }
2128 if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
2129 return error;
2130 if (curlwp == uvm.pagedaemon_lwp) {
2131 mutex_enter(&bufcache_lock);
2132 bp = incore(vp, indirs[num-1].in_lbn);
2133 if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) {
2134 *res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
2135 error = 0;
2136 } else
2137 error = ENOMEM;
2138 mutex_exit(&bufcache_lock);
2139 return error;
2140 }
2141 error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, NOCRED, 0, &bp);
2142 if (error == 0) {
2143 *res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
2144 brelse(bp, 0);
2145 }
2146
2147 return error;
2148 }
2149
2150 /*
2151 * Read or write the specified block of the filesystem vp resides on
2152 * from or to the disk bypassing the buffer cache.
2153 */
2154 static int
2155 rwfsblk(struct vnode *vp, int flags, void *data, daddr_t lbn)
2156 {
2157 int error;
2158 struct inode *ip = VTOI(vp);
2159 struct fs *fs = ip->i_fs;
2160 struct buf *nbp;
2161
2162 nbp = getiobuf(NULL, true);
2163 nbp->b_flags = flags;
2164 nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize;
2165 nbp->b_error = 0;
2166 nbp->b_data = data;
2167 nbp->b_blkno = nbp->b_rawblkno = FFS_FSBTODB(fs, ffs_blkstofrags(fs, lbn));
2168 nbp->b_proc = NULL;
2169 nbp->b_dev = ip->i_devvp->v_rdev;
2170 SET(nbp->b_cflags, BC_BUSY); /* mark buffer busy */
2171
2172 bdev_strategy(nbp);
2173
2174 error = biowait(nbp);
2175
2176 putiobuf(nbp);
2177
2178 return error;
2179 }
2180
2181 /*
2182 * Write all dirty buffers to disk and invalidate them.
2183 */
2184 static int
2185 syncsnap(struct vnode *vp)
2186 {
2187 int error;
2188 buf_t *bp;
2189 struct fs *fs = VTOI(vp)->i_fs;
2190
2191 mutex_enter(&bufcache_lock);
2192 while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) {
2193 error = bbusy(bp, false, 0, NULL);
2194 if (error == EPASSTHROUGH)
2195 continue;
2196 else if (error != 0) {
2197 mutex_exit(&bufcache_lock);
2198 return error;
2199 }
2200 KASSERT(bp->b_bcount == fs->fs_bsize);
2201 mutex_exit(&bufcache_lock);
2202 error = rwfsblk(vp, B_WRITE, bp->b_data,
2203 ffs_fragstoblks(fs, FFS_DBTOFSB(fs, bp->b_blkno)));
2204 brelse(bp, BC_INVAL | BC_VFLUSH);
2205 if (error)
2206 return error;
2207 mutex_enter(&bufcache_lock);
2208 }
2209 mutex_exit(&bufcache_lock);
2210
2211 return 0;
2212 }
2213
2214 /*
2215 * Write the specified block to a snapshot.
2216 */
2217 static int
2218 wrsnapblk(struct vnode *vp, void *data, daddr_t lbn)
2219 {
2220 struct inode *ip = VTOI(vp);
2221 struct fs *fs = ip->i_fs;
2222 struct buf *bp;
2223 int error;
2224
2225 error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn), fs->fs_bsize,
2226 FSCRED, (ip->i_nlink > 0 ? B_SYNC : 0), &bp);
2227 if (error)
2228 return error;
2229 memcpy(bp->b_data, data, fs->fs_bsize);
2230 if (ip->i_nlink > 0)
2231 error = bwrite(bp);
2232 else
2233 bawrite(bp);
2234
2235 return error;
2236 }
2237
2238 /*
2239 * Check if this inode is present on the active snapshot list.
2240 * Must be called with snapinfo locked.
2241 */
2242 static inline bool
2243 is_active_snapshot(struct snap_info *si, struct inode *ip)
2244 {
2245 struct inode *xp;
2246
2247 KASSERT(mutex_owned(&si->si_lock));
2248
2249 TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
2250 if (xp == ip)
2251 return true;
2252 return false;
2253 }
2254
2255 /*
2256 * Get/Put direct block from inode or buffer containing disk addresses. Take
2257 * care for fs type (UFS1/UFS2) and byte swapping. These functions should go
2258 * into a global include.
2259 */
2260 static inline daddr_t
2261 db_get(struct inode *ip, int loc)
2262 {
2263 if (ip->i_ump->um_fstype == UFS1)
2264 return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip));
2265 else
2266 return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip));
2267 }
2268
2269 static inline void
2270 db_assign(struct inode *ip, int loc, daddr_t val)
2271 {
2272 if (ip->i_ump->um_fstype == UFS1)
2273 ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2274 else
2275 ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2276 }
2277
2278 __unused static inline daddr_t
2279 ib_get(struct inode *ip, int loc)
2280 {
2281 if (ip->i_ump->um_fstype == UFS1)
2282 return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip));
2283 else
2284 return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip));
2285 }
2286
2287 static inline daddr_t
2288 idb_get(struct inode *ip, void *bf, int loc)
2289 {
2290 if (ip->i_ump->um_fstype == UFS1)
2291 return ufs_rw32(((int32_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
2292 else
2293 return ufs_rw64(((int64_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
2294 }
2295
2296 static inline void
2297 idb_assign(struct inode *ip, void *bf, int loc, daddr_t val)
2298 {
2299 if (ip->i_ump->um_fstype == UFS1)
2300 ((int32_t *)(bf))[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2301 else
2302 ((int64_t *)(bf))[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2303 }
2304