/*
 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
 *
 * Further information about snapshots can be obtained from:
 *
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
 *	1614 Oxford Street		mckusick@mckusick.com
 *	Berkeley, CA 94709-1608		+1-510-843-9542
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
 *
 *	from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp
 */
37
38 #include <sys/cdefs.h>
39 __KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.9 2005/02/09 16:05:29 hannken Exp $");
40
41 #if defined(_KERNEL_OPT)
42 #include "opt_ffs.h"
43 #endif
44
45 #include <sys/param.h>
46 #include <sys/kernel.h>
47 #include <sys/systm.h>
48 #include <sys/conf.h>
49 #include <sys/buf.h>
50 #include <sys/proc.h>
51 #include <sys/namei.h>
52 #include <sys/sched.h>
53 #include <sys/stat.h>
54 #include <sys/malloc.h>
55 #include <sys/mount.h>
56 #include <sys/resource.h>
57 #include <sys/resourcevar.h>
58 #include <sys/vnode.h>
59
60 #include <miscfs/specfs/specdev.h>
61
62 #include <ufs/ufs/quota.h>
63 #include <ufs/ufs/ufsmount.h>
64 #include <ufs/ufs/inode.h>
65 #include <ufs/ufs/ufs_extern.h>
66 #include <ufs/ufs/ufs_bswap.h>
67
68 #include <ufs/ffs/fs.h>
69 #include <ufs/ffs/ffs_extern.h>
70
71 /* FreeBSD -> NetBSD conversion */
72 #define KERNCRED proc0.p_ucred
73 #define ufs1_daddr_t int32_t
74 #define ufs2_daddr_t int64_t
75 #define ufs_lbn_t daddr_t
76 #define VI_MTX(v) (&(v)->v_interlock)
77 #define VI_LOCK(v) simple_lock(&(v)->v_interlock)
78 #define VI_UNLOCK(v) simple_unlock(&(v)->v_interlock)
79 #define MNT_ILOCK(v) simple_lock(&mntvnode_slock)
80 #define MNT_IUNLOCK(v) simple_unlock(&mntvnode_slock)
81
82 static int cgaccount(int, struct vnode *, caddr_t, int);
83 static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
84 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
85 ufs_lbn_t, int), int);
86 static int indiracct_ufs1(struct vnode *, struct vnode *, int,
87 ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
88 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
89 ufs_lbn_t, int), int);
90 static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
91 struct fs *, ufs_lbn_t, int);
92 static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
93 struct fs *, ufs_lbn_t, int);
94 static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
95 struct fs *, ufs_lbn_t, int);
96 static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
97 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
98 ufs_lbn_t, int), int);
99 static int indiracct_ufs2(struct vnode *, struct vnode *, int,
100 ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
101 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
102 ufs_lbn_t, int), int);
103 static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
104 struct fs *, ufs_lbn_t, int);
105 static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
106 struct fs *, ufs_lbn_t, int);
107 static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
108 struct fs *, ufs_lbn_t, int);
109 static int ffs_copyonwrite(void *, struct buf *);
110 static int readfsblk(struct vnode *, caddr_t, ufs2_daddr_t);
111 static int readvnblk(struct vnode *, caddr_t, ufs2_daddr_t);
112 static int writevnblk(struct vnode *, caddr_t, ufs2_daddr_t);
113 static inline int cow_enter(void);
114 static inline void cow_leave(int);
115 static inline ufs2_daddr_t db_get(struct inode *, int);
116 static inline void db_assign(struct inode *, int, ufs2_daddr_t);
117 static inline ufs2_daddr_t idb_get(struct inode *, caddr_t, int);
118 static inline void idb_assign(struct inode *, caddr_t, int, ufs2_daddr_t);
119
120 #ifdef DEBUG
121 static int snapdebug = 0;
122 #endif
123
124 /*
125 * Create a snapshot file and initialize it for the filesystem.
126 * Vnode is locked on entry and return.
127 */
128 int
129 ffs_snapshot(mp, vp, ctime)
130 struct mount *mp;
131 struct vnode *vp;
132 struct timespec *ctime;
133 {
134 ufs2_daddr_t numblks, blkno, *blkp, snaplistsize = 0, *snapblklist;
135 int error, ns, cg, snaploc;
136 int i, size, len, loc;
137 int flag = mp->mnt_flag;
138 struct timeval starttime;
139 #ifdef DEBUG
140 struct timeval endtime;
141 #endif
142 struct timespec ts;
143 long redo = 0;
144 int32_t *lp;
145 void *space;
146 caddr_t cgbuf;
147 struct ufsmount *ump = VFSTOUFS(mp);
148 struct fs *copy_fs = NULL, *fs = ump->um_fs;
149 struct proc *p = curproc;
150 struct inode *ip, *xp;
151 struct buf *bp, *ibp;
152 struct vattr vat;
153 struct vnode *xvp, *nvp, *devvp;
154
155 ns = UFS_FSNEEDSWAP(fs);
156 /*
157 * Need to serialize access to snapshot code per filesystem.
158 */
159 /*
160 * If the vnode already is a snapshot, return.
161 */
162 if (VTOI(vp)->i_flags & SF_SNAPSHOT) {
163 if (ctime) {
164 ctime->tv_sec = DIP(VTOI(vp), mtime);
165 ctime->tv_nsec = DIP(VTOI(vp), mtimensec);
166 }
167 return 0;
168 }
169 /*
170 * Check mount, exclusive reference and owner.
171 */
172 if (vp->v_mount != mp)
173 return EXDEV;
174 if (vp->v_usecount != 1 || vp->v_writecount != 0)
175 return EBUSY;
176 if (suser(p->p_ucred, &p->p_acflag) != 0 &&
177 VTOI(vp)->i_uid != p->p_ucred->cr_uid)
178 return EACCES;
179
180 if (vp->v_size != 0) {
181 error = VOP_TRUNCATE(vp, 0, 0, NOCRED, p);
182 if (error)
183 return error;
184 }
185 /*
186 * Assign a snapshot slot in the superblock.
187 */
188 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
189 if (fs->fs_snapinum[snaploc] == 0)
190 break;
191 if (snaploc == FSMAXSNAP)
192 return (ENOSPC);
193 ip = VTOI(vp);
194 devvp = ip->i_devvp;
195 /*
196 * Allocate and copy the last block contents so as to be able
197 * to set size to that of the filesystem.
198 */
199 numblks = howmany(fs->fs_size, fs->fs_frag);
200 cgbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
201 if ((error = readfsblk(vp, cgbuf, numblks - 1)) != 0)
202 goto out;
203 error = vn_rdwr(UIO_WRITE, vp,
204 cgbuf, fs->fs_bsize, lblktosize(fs, (off_t)(numblks - 1)),
205 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, p->p_ucred, NULL, NULL);
206 if (error)
207 goto out;
208 /*
209 * Preallocate critical data structures so that we can copy
210 * them in without further allocation after we suspend all
211 * operations on the filesystem. We would like to just release
212 * the allocated buffers without writing them since they will
213 * be filled in below once we are ready to go, but this upsets
214 * the soft update code, so we go ahead and write the new buffers.
215 *
216 * Allocate all indirect blocks and mark all of them as not
217 * needing to be copied.
218 */
219 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
220 error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
221 fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp);
222 if (error)
223 goto out;
224 bwrite(ibp);
225 }
226 /*
227 * Allocate copies for the superblock and its summary information.
228 */
229 bzero(cgbuf, fs->fs_bsize);
230 blkno = lblkno(fs, fs->fs_sblockloc);
231 for (loc = 0; loc < howmany(fs->fs_sbsize, fs->fs_bsize); loc++)
232 if ((error = writevnblk(vp, cgbuf, blkno + loc)) != 0)
233 goto out;
234 blkno = fragstoblks(fs, fs->fs_csaddr);
235 for (loc = 0; loc < howmany(fs->fs_cssize, fs->fs_bsize); loc++)
236 if ((error = writevnblk(vp, cgbuf, blkno + loc)) != 0)
237 goto out;
238 /*
239 * Allocate all cylinder group blocks.
240 */
241 for (cg = 0; cg < fs->fs_ncg; cg++)
242 if ((error = writevnblk(vp, cgbuf,
243 fragstoblks(fs, cgtod(fs, cg)))) != 0)
244 goto out;
245 /*
246 * Copy all the cylinder group maps. Although the
247 * filesystem is still active, we hope that only a few
248 * cylinder groups will change between now and when we
249 * suspend operations. Thus, we will be able to quickly
250 * touch up the few cylinder groups that changed during
251 * the suspension period.
252 */
253 len = howmany(fs->fs_ncg, NBBY);
254 MALLOC(fs->fs_active, u_char *, len, M_DEVBUF, M_WAITOK | M_ZERO);
255 for (cg = 0; cg < fs->fs_ncg; cg++) {
256 if ((error = cgaccount(cg, vp, cgbuf, 1)) != 0)
257 goto out;
258 if ((error = writevnblk(vp, cgbuf,
259 fragstoblks(fs, cgtod(fs, cg)))) != 0)
260 goto out;
261 }
262 /*
263 * Change inode to snapshot type file.
264 */
265 ip->i_flags |= SF_SNAPSHOT;
266 DIP_ASSIGN(ip, flags, ip->i_flags);
267 ip->i_flag |= IN_CHANGE | IN_UPDATE;
268 /*
269 * Ensure that the snapshot is completely on disk.
270 * Since we have marked it as a snapshot it is safe to
271 * unlock it as no process will be allowed to write to it.
272 */
273 if ((error = VOP_FSYNC(vp, KERNCRED, FSYNC_WAIT, 0, 0, p)) != 0)
274 goto out;
275 VOP_UNLOCK(vp, 0);
276 /*
277 * All allocations are done, so we can now snapshot the system.
278 *
279 * Suspend operation on filesystem.
280 */
281 if ((error = vfs_write_suspend(vp->v_mount, PUSER|PCATCH, 0)) != 0) {
282 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
283 goto out;
284 }
285 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
286 microtime(&starttime);
287 /*
288 * First, copy all the cylinder group maps that have changed.
289 */
290 for (cg = 0; cg < fs->fs_ncg; cg++) {
291 if (ACTIVECG_ISSET(fs, cg))
292 continue;
293 redo++;
294 if ((error = cgaccount(cg, vp, cgbuf, 2)) != 0)
295 goto out1;
296 if ((error = writevnblk(vp, cgbuf,
297 fragstoblks(fs, cgtod(fs, cg)))) != 0)
298 goto out1;
299 }
300 /*
301 * Grab a copy of the superblock and its summary information.
302 * We delay writing it until the suspension is released below.
303 */
304 loc = blkoff(fs, fs->fs_sblockloc);
305 if (loc > 0)
306 bzero(&cgbuf[0], loc);
307 copy_fs = (struct fs *)(cgbuf + loc);
308 bcopy(fs, copy_fs, fs->fs_sbsize);
309 size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
310 if (fs->fs_sbsize < size)
311 bzero(&cgbuf[loc + fs->fs_sbsize], size - fs->fs_sbsize);
312 size = blkroundup(fs, fs->fs_cssize);
313 if (fs->fs_contigsumsize > 0)
314 size += fs->fs_ncg * sizeof(int32_t);
315 space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
316 copy_fs->fs_csp = space;
317 bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
318 (char *)space += fs->fs_cssize;
319 loc = howmany(fs->fs_cssize, fs->fs_fsize);
320 i = fs->fs_frag - loc % fs->fs_frag;
321 len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
322 if (len > 0) {
323 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
324 len, KERNCRED, &bp)) != 0) {
325 brelse(bp);
326 free(copy_fs->fs_csp, M_UFSMNT);
327 goto out1;
328 }
329 bcopy(bp->b_data, space, (u_int)len);
330 (char *)space += len;
331 bp->b_flags |= B_INVAL | B_NOCACHE;
332 brelse(bp);
333 }
334 if (fs->fs_contigsumsize > 0) {
335 copy_fs->fs_maxcluster = lp = space;
336 for (i = 0; i < fs->fs_ncg; i++)
337 *lp++ = fs->fs_contigsumsize;
338 }
339 /*
340 * We must check for active files that have been unlinked
341 * (e.g., with a zero link count). We have to expunge all
342 * trace of these files from the snapshot so that they are
343 * not reclaimed prematurely by fsck or unnecessarily dumped.
344 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
345 * spec_strategy about writing on a suspended filesystem.
346 * Note that we skip unlinked snapshot files as they will
347 * be handled separately below.
348 *
349 * We also calculate the needed size for the snapshot list.
350 */
351 snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
352 FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
353 MNT_ILOCK(mp);
354 loop:
355 for (xvp = LIST_FIRST(&mp->mnt_vnodelist); xvp; xvp = nvp) {
356 /*
357 * Make sure this vnode wasn't reclaimed in getnewvnode().
358 * Start over if it has (it won't be on the list anymore).
359 */
360 if (xvp->v_mount != mp)
361 goto loop;
362 nvp = LIST_NEXT(xvp, v_mntvnodes);
363 VI_LOCK(xvp);
364 MNT_IUNLOCK(mp);
365 if ((xvp->v_flag & VXLOCK) ||
366 xvp->v_usecount == 0 || xvp->v_type == VNON ||
367 (VTOI(xvp)->i_flags & SF_SNAPSHOT)) {
368 VI_UNLOCK(xvp);
369 MNT_ILOCK(mp);
370 continue;
371 }
372 if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) {
373 MNT_ILOCK(mp);
374 goto loop;
375 }
376 #ifdef DEBUG
377 if (snapdebug)
378 vprint("ffs_snapshot: busy vnode", xvp);
379 #endif
380 if (VOP_GETATTR(xvp, &vat, p->p_ucred, p) == 0 &&
381 vat.va_nlink > 0) {
382 VOP_UNLOCK(xvp, 0);
383 MNT_ILOCK(mp);
384 continue;
385 }
386 xp = VTOI(xvp);
387 if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
388 VOP_UNLOCK(xvp, 0);
389 MNT_ILOCK(mp);
390 continue;
391 }
392 /*
393 * If there is a fragment, clear it here.
394 */
395 blkno = 0;
396 loc = howmany(xp->i_size, fs->fs_bsize) - 1;
397 if (loc < NDADDR) {
398 len = fragroundup(fs, blkoff(fs, xp->i_size));
399 if (len > 0 && len < fs->fs_bsize) {
400 ffs_blkfree(copy_fs, vp, db_get(xp, loc),
401 len, xp->i_number);
402 blkno = db_get(xp, loc);
403 db_assign(xp, loc, 0);
404 }
405 }
406 snaplistsize += 1;
407 if (xp->i_ump->um_fstype == UFS1)
408 error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
409 BLK_NOCOPY);
410 else
411 error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
412 BLK_NOCOPY);
413 if (blkno)
414 db_assign(xp, loc, blkno);
415 if (!error)
416 error = ffs_freefile(copy_fs, vp, xp->i_number,
417 xp->i_mode);
418 VOP_UNLOCK(xvp, 0);
419 if (error) {
420 free(copy_fs->fs_csp, M_UFSMNT);
421 goto out1;
422 }
423 MNT_ILOCK(mp);
424 }
425 MNT_IUNLOCK(mp);
426 /*
427 * If there already exist snapshots on this filesystem, grab a
428 * reference to their shared lock. If this is the first snapshot
429 * on this filesystem, we need to allocate a lock for the snapshots
430 * to share. In either case, acquire the snapshot lock and give
431 * up our original private lock.
432 */
433 VI_LOCK(devvp);
434 if ((xp = TAILQ_FIRST(&ump->um_snapshots)) != NULL) {
435 struct lock *lkp;
436
437 lkp = ITOV(xp)->v_vnlock;
438 VI_UNLOCK(devvp);
439 VI_LOCK(vp);
440 vp->v_vnlock = lkp;
441 } else {
442 struct lock *lkp;
443
444 VI_UNLOCK(devvp);
445 MALLOC(lkp, struct lock *, sizeof(struct lock), M_UFSMNT,
446 M_WAITOK);
447 lockinit(lkp, PVFS, "snaplk", 0, LK_CANRECURSE);
448 VI_LOCK(vp);
449 vp->v_vnlock = lkp;
450 }
451 vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY);
452 transferlockers(&vp->v_lock, vp->v_vnlock);
453 lockmgr(&vp->v_lock, LK_RELEASE, NULL);
454 /*
455 * If this is the first snapshot on this filesystem, then we need
456 * to allocate the space for the list of preallocated snapshot blocks.
457 * This list will be refined below, but this preliminary one will
458 * keep us out of deadlock until the full one is ready.
459 */
460 if (xp == NULL) {
461 MALLOC(snapblklist, ufs2_daddr_t *,
462 snaplistsize * sizeof(ufs2_daddr_t), M_UFSMNT, M_WAITOK);
463 blkp = &snapblklist[1];
464 *blkp++ = ufs_rw64(lblkno(fs, fs->fs_sblockloc), ns);
465 blkno = fragstoblks(fs, fs->fs_csaddr);
466 for (cg = 0; cg < fs->fs_ncg; cg++) {
467 if (fragstoblks(fs, cgtod(fs, cg) > blkno))
468 break;
469 *blkp++ = ufs_rw64(fragstoblks(fs, cgtod(fs, cg)), ns);
470 }
471 len = howmany(fs->fs_cssize, fs->fs_bsize);
472 for (loc = 0; loc < len; loc++)
473 *blkp++ = ufs_rw64(blkno + loc, ns);
474 for (; cg < fs->fs_ncg; cg++)
475 *blkp++ = ufs_rw64(fragstoblks(fs, cgtod(fs, cg)), ns);
476 snapblklist[0] = ufs_rw64(blkp - snapblklist, ns);
477 VI_LOCK(devvp);
478 if (ump->um_snapblklist != NULL)
479 panic("ffs_snapshot: non-empty list");
480 ump->um_snapblklist = snapblklist;
481 ump->um_snaplistsize = blkp - snapblklist;
482 VI_UNLOCK(devvp);
483 }
484 /*
485 * Record snapshot inode. Since this is the newest snapshot,
486 * it must be placed at the end of the list.
487 */
488 VI_LOCK(devvp);
489 fs->fs_snapinum[snaploc] = ip->i_number;
490 if (ip->i_nextsnap.tqe_prev != 0)
491 panic("ffs_snapshot: %d already on list", ip->i_number);
492 TAILQ_INSERT_TAIL(&ump->um_snapshots, ip, i_nextsnap);
493 VI_UNLOCK(devvp);
494 if (xp == NULL)
495 vn_cow_establish(devvp, ffs_copyonwrite, devvp);
496 vp->v_flag |= VSYSTEM;
497 out1:
498 /*
499 * Resume operation on filesystem.
500 */
501 vfs_write_resume(vp->v_mount);
502 /*
503 * Set the mtime to the time the snapshot has been taken.
504 */
505 TIMEVAL_TO_TIMESPEC(&starttime, &ts);
506 if (ctime)
507 *ctime = ts;
508 DIP_ASSIGN(ip, mtime, ts.tv_sec);
509 DIP_ASSIGN(ip, mtimensec, ts.tv_nsec);
510 ip->i_flag |= IN_CHANGE | IN_UPDATE;
511
512 #ifdef DEBUG
513 if (starttime.tv_sec > 0) {
514 microtime(&endtime);
515 timersub(&endtime, &starttime, &endtime);
516 printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n",
517 vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec,
518 endtime.tv_usec / 1000, redo, fs->fs_ncg);
519 }
520 #endif
521 if (error)
522 goto out;
523 /*
524 * Copy allocation information from all the snapshots in
525 * this snapshot and then expunge them from its view.
526 */
527 TAILQ_FOREACH(xp, &ump->um_snapshots, i_nextsnap) {
528 if (xp == ip)
529 break;
530 if (xp->i_ump->um_fstype == UFS1)
531 error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
532 BLK_SNAP);
533 else
534 error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
535 BLK_SNAP);
536 if (error) {
537 fs->fs_snapinum[snaploc] = 0;
538 goto done;
539 }
540 }
541 /*
542 * Allocate space for the full list of preallocated snapshot blocks.
543 */
544 MALLOC(snapblklist, ufs2_daddr_t *, snaplistsize * sizeof(ufs2_daddr_t),
545 M_UFSMNT, M_WAITOK);
546 ip->i_snapblklist = &snapblklist[1];
547 /*
548 * Expunge the blocks used by the snapshots from the set of
549 * blocks marked as used in the snapshot bitmaps. Also, collect
550 * the list of allocated blocks in i_snapblklist.
551 */
552 if (ip->i_ump->um_fstype == UFS1)
553 error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP);
554 else
555 error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP);
556 if (error) {
557 fs->fs_snapinum[snaploc] = 0;
558 FREE(snapblklist, M_UFSMNT);
559 goto done;
560 }
561 if (snaplistsize < ip->i_snapblklist - snapblklist)
562 panic("ffs_snapshot: list too small");
563 snaplistsize = ip->i_snapblklist - snapblklist;
564 snapblklist[0] = ufs_rw64(snaplistsize, ns);
565 ip->i_snapblklist = 0;
566 /*
567 * Write out the list of allocated blocks to the end of the snapshot.
568 */
569 error = vn_rdwr(UIO_WRITE, vp,
570 (caddr_t)snapblklist, snaplistsize*sizeof(ufs2_daddr_t), ip->i_size,
571 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, p->p_ucred, NULL, NULL);
572 if (error) {
573 fs->fs_snapinum[snaploc] = 0;
574 FREE(snapblklist, M_UFSMNT);
575 goto done;
576 }
577 /*
578 * Write the superblock and its summary information
579 * to the snapshot.
580 */
581 blkno = fragstoblks(fs, fs->fs_csaddr);
582 len = howmany(fs->fs_cssize, fs->fs_bsize);
583 space = copy_fs->fs_csp;
584 #ifdef FFS_EI
585 if (ns) {
586 ffs_sb_swap(copy_fs, copy_fs);
587 ffs_csum_swap(space, space, fs->fs_cssize);
588 }
589 #endif
590 for (loc = 0; loc < len; loc++) {
591 if ((error = writevnblk(vp, space, blkno + loc)) != 0) {
592 fs->fs_snapinum[snaploc] = 0;
593 FREE(snapblklist, M_UFSMNT);
594 goto done;
595 }
596 space = (char *)space + fs->fs_bsize;
597 }
598 /*
599 * As this is the newest list, it is the most inclusive, so
600 * should replace the previous list.
601 */
602 VI_LOCK(devvp);
603 space = ump->um_snapblklist;
604 ump->um_snapblklist = snapblklist;
605 ump->um_snaplistsize = snaplistsize;
606 VI_UNLOCK(devvp);
607 if (space != NULL)
608 FREE(space, M_UFSMNT);
609 done:
610 free(copy_fs->fs_csp, M_UFSMNT);
611 blkno = lblkno(fs, fs->fs_sblockloc);
612 if (error == 0 && (error = writevnblk(vp, cgbuf, blkno)) != 0)
613 fs->fs_snapinum[snaploc] = 0;
614 out:
615 /*
616 * All block address modifications are done. Invalidate and free
617 * all pages on the snapshot vnode. Those coming from read ahead
618 * are no longer valid.
619 */
620 if (!error) {
621 simple_lock(&vp->v_interlock);
622 error = VOP_PUTPAGES(vp, 0, 0,
623 PGO_ALLPAGES|PGO_CLEANIT|PGO_SYNCIO|PGO_FREE);
624 }
625 if (cgbuf)
626 free(cgbuf, M_UFSMNT);
627 if (fs->fs_active != 0) {
628 FREE(fs->fs_active, M_DEVBUF);
629 fs->fs_active = 0;
630 }
631 mp->mnt_flag = flag;
632 if (error)
633 (void) VOP_TRUNCATE(vp, (off_t)0, 0, NOCRED, p);
634 else
635 vref(vp);
636 return (error);
637 }
638
639 /*
640 * Copy a cylinder group map. All the unallocated blocks are marked
641 * BLK_NOCOPY so that the snapshot knows that it need not copy them
642 * if they are later written. If passno is one, then this is a first
643 * pass, so only setting needs to be done. If passno is 2, then this
644 * is a revision to a previous pass which must be undone as the
645 * replacement pass is done.
646 */
647 static int
648 cgaccount(cg, vp, data, passno)
649 int cg;
650 struct vnode *vp;
651 caddr_t data;
652 int passno;
653 {
654 struct buf *bp, *ibp;
655 struct inode *ip;
656 struct cg *cgp;
657 struct fs *fs;
658 ufs2_daddr_t base, numblks;
659 int error, len, loc, ns, indiroff;
660
661 ip = VTOI(vp);
662 fs = ip->i_fs;
663 ns = UFS_FSNEEDSWAP(fs);
	/* Read this cylinder group from the device and validate its magic. */
664 error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
665 (int)fs->fs_cgsize, KERNCRED, &bp);
666 if (error) {
667 brelse(bp);
668 return (error);
669 }
670 cgp = (struct cg *)bp->b_data;
671 if (!cg_chkmagic(cgp, ns)) {
672 brelse(bp);
673 return (EIO);
674 }
	/* Mark the cg as accounted so the pass-2 loop can skip it. */
675 ACTIVECG_SET(fs, cg);
676
	/* Copy the cg map into the caller's block-sized buffer; zero the tail. */
677 bcopy(bp->b_data, data, fs->fs_cgsize);
678 brelse(bp);
679 if (fs->fs_cgsize < fs->fs_bsize)
680 bzero(&data[fs->fs_cgsize],
681 fs->fs_bsize - fs->fs_cgsize);
	/*
	 * Compute the range of snapshot logical blocks backed by this cg:
	 * "base" is the first, "len" the count, clamped at the fs end.
	 */
682 numblks = howmany(fs->fs_size, fs->fs_frag);
683 len = howmany(fs->fs_fpg, fs->fs_frag);
684 base = cg * fs->fs_fpg / fs->fs_frag;
685 if (base + len >= numblks)
686 len = numblks - base - 1;
687 loc = 0;
688 if (base < NDADDR) {
	/*
	 * Direct-block window (cg 0 only): free blocks get BLK_NOCOPY in
	 * the snapshot inode; on pass 2 a now-allocated block loses its
	 * stale BLK_NOCOPY mark, while on pass 1 that would be a bug.
	 */
689 for ( ; loc < NDADDR; loc++) {
690 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
691 db_assign(ip, loc, BLK_NOCOPY);
692 else if (db_get(ip, loc) == BLK_NOCOPY) {
693 if (passno == 2)
694 db_assign(ip, loc, 0);
695 else if (passno == 1)
696 panic("ffs_snapshot: lost direct block");
697 }
698 }
699 }
	/* Same marking for the remainder, via the snapshot's indirect blocks. */
700 if ((error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
701 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0)
702 return (error);
703 indiroff = (base + loc - NDADDR) % NINDIR(fs);
704 for ( ; loc < len; loc++, indiroff++) {
705 if (indiroff >= NINDIR(fs)) {
	/* Crossed into the next indirect block: flush and fetch it. */
706 bwrite(ibp);
707 if ((error = VOP_BALLOC(vp,
708 lblktosize(fs, (off_t)(base + loc)),
709 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0)
710 return (error);
711 indiroff = 0;
712 }
713 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
714 idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY);
715 else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) {
716 if (passno == 2)
717 idb_assign(ip, ibp->b_data, indiroff, 0);
718 else if (passno == 1)
719 panic("ffs_snapshot: lost indirect block");
720 }
721 }
722 bwrite(ibp);
723 return (0);
724 }
725
726 /*
727 * Before expunging a snapshot inode, note all the
728 * blocks that it claims with BLK_SNAP so that fsck will
729 * be able to account for those blocks properly and so
730 * that this snapshot knows that it need not copy them
731 * if the other snapshot holding them is freed. This code
732 * is reproduced once each for UFS1 and UFS2.
733 */
734 static int
735 expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
736 struct vnode *snapvp;
737 struct inode *cancelip;
738 struct fs *fs;
739 int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
740 struct fs *, ufs_lbn_t, int);
741 int expungetype;
742 {
743 int i, s, error, ns, indiroff;
744 ufs_lbn_t lbn, rlbn;
745 ufs2_daddr_t len, blkno, numblks, blksperindir;
746 struct ufs1_dinode *dip;
747 struct buf *bp;
748 caddr_t buf;
749
750 ns = UFS_FSNEEDSWAP(fs);
751 /*
752 * Prepare to expunge the inode. If its inode block has not
753 * yet been copied, then allocate and fill the copy.
754 */
755 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
756 blkno = 0;
757 if (lbn < NDADDR) {
758 blkno = db_get(VTOI(snapvp), lbn);
759 } else {
	/* Disable copy-on-write while ballocing on the snapshot itself. */
760 s = cow_enter();
761 error = VOP_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
762 fs->fs_bsize, KERNCRED, B_METAONLY, &bp);
763 cow_leave(s);
764 if (error)
765 return (error);
766 indiroff = (lbn - NDADDR) % NINDIR(fs);
767 blkno = idb_get(VTOI(snapvp), bp->b_data, indiroff);
768 brelse(bp);
769 }
	/*
	 * Fetch the inode block: from the snapshot if already copied
	 * (blkno != 0), otherwise directly from the filesystem.
	 */
770 buf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
771 if (blkno != 0)
772 error = readvnblk(snapvp, buf, lbn);
773 else
774 error = readfsblk(snapvp, buf, lbn);
775 if (error) {
776 free(buf, M_UFSMNT);
777 return error;
778 }
779 /*
780 * Set a snapshot inode to be a zero length file, regular files
781 * to be completely unallocated.
782 */
783 dip = (struct ufs1_dinode *)buf + ino_to_fsbo(fs, cancelip->i_number);
784 if (expungetype == BLK_NOCOPY)
785 dip->di_mode = 0;
786 dip->di_size = 0;
787 dip->di_blocks = 0;
	/* Clear SF_SNAPSHOT in the on-disk (possibly byte-swapped) flags. */
788 dip->di_flags =
789 ufs_rw32(ufs_rw32(dip->di_flags, ns) & ~SF_SNAPSHOT, ns);
790 bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t));
791 error = writevnblk(snapvp, buf, lbn);
792 free(buf, M_UFSMNT);
793 if (error)
794 return error;
795 /*
796 * Now go through and expunge all the blocks in the file
797 * using the function requested.
798 */
799 numblks = howmany(cancelip->i_size, fs->fs_bsize);
	/* Direct block pointers first, then the indirect pointers (lbn -1). */
800 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs1_db[0],
801 &cancelip->i_ffs1_db[NDADDR], fs, 0, expungetype)))
802 return (error);
803 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs1_ib[0],
804 &cancelip->i_ffs1_ib[NIADDR], fs, -1, expungetype)))
805 return (error);
	/* Walk each indirection level; lbn goes negative for metadata. */
806 blksperindir = 1;
807 lbn = -NDADDR;
808 len = numblks - NDADDR;
809 rlbn = NDADDR;
810 for (i = 0; len > 0 && i < NIADDR; i++) {
811 error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
812 ufs_rw32(cancelip->i_ffs1_ib[i], ns), lbn, rlbn, len,
813 blksperindir, fs, acctfunc, expungetype);
814 if (error)
815 return (error);
816 blksperindir *= NINDIR(fs);
817 lbn -= blksperindir + 1;
818 len -= blksperindir;
819 rlbn += blksperindir;
820 }
821 return (0);
822 }
823
824 /*
825 * Descend an indirect block chain for vnode cancelvp accounting for all
826 * its indirect blocks in snapvp.
827 */
828 static int
829 indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
830 blksperindir, fs, acctfunc, expungetype)
831 struct vnode *snapvp;
832 struct vnode *cancelvp;
833 int level;
834 ufs1_daddr_t blkno;
835 ufs_lbn_t lbn;
836 ufs_lbn_t rlbn;
837 ufs_lbn_t remblks;
838 ufs_lbn_t blksperindir;
839 struct fs *fs;
840 int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
841 struct fs *, ufs_lbn_t, int);
842 int expungetype;
843 {
844 int error, ns, num, i;
845 ufs_lbn_t subblksperindir;
846 struct indir indirs[NIADDR + 2];
847 ufs1_daddr_t last, *bap;
848 struct buf *bp;
849
850 ns = UFS_FSNEEDSWAP(fs);
851
	/* A missing indirect block is only acceptable when marking NOCOPY. */
852 if (blkno == 0) {
853 if (expungetype == BLK_NOCOPY)
854 return (0);
855 panic("indiracct_ufs1: missing indir");
856 }
857 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
858 return (error);
859 if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
860 panic("indiracct_ufs1: botched params");
861 /*
862 * We have to expand bread here since it will deadlock looking
863 * up the block number for any blocks that are not in the cache.
864 */
865 bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
866 bp->b_blkno = fsbtodb(fs, blkno);
867 if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
868 (error = readfsblk(bp->b_vp, bp->b_data, fragstoblks(fs, blkno)))) {
869 brelse(bp);
870 return (error);
871 }
872 /*
873 * Account for the block pointers in this indirect block.
874 */
875 last = howmany(remblks, blksperindir);
876 if (last > NINDIR(fs))
877 last = NINDIR(fs);
	/* Work on a private copy so the buffer can be released early. */
878 MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
879 bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
880 brelse(bp);
881 error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
882 level == 0 ? rlbn : -1, expungetype);
883 if (error || level == 0)
884 goto out;
885 /*
886 * Account for the block pointers in each of the indirect blocks
887 * in the levels below us.
888 */
889 subblksperindir = blksperindir / NINDIR(fs);
890 for (lbn++, level--, i = 0; i < last; i++) {
891 error = indiracct_ufs1(snapvp, cancelvp, level,
892 ufs_rw32(bap[i], ns), lbn, rlbn, remblks, subblksperindir,
893 fs, acctfunc, expungetype);
894 if (error)
895 goto out;
896 rlbn += blksperindir;
897 lbn -= blksperindir;
898 remblks -= blksperindir;
899 }
900 out:
901 FREE(bap, M_DEVBUF);
902 return (error);
903 }
904
905 /*
906 * Do both snap accounting and map accounting.
907 */
908 static int
909 fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
910 struct vnode *vp;
911 ufs1_daddr_t *oldblkp, *lastblkp;
912 struct fs *fs;
913 ufs_lbn_t lblkno;
914 int exptype; /* BLK_SNAP or BLK_NOCOPY */
915 {
916 int error;
917
918 if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
919 return (error);
920 return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
921 }
922
923 /*
924 * Identify a set of blocks allocated in a snapshot inode.
925 */
926 static int
927 snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
928 struct vnode *vp;
929 ufs1_daddr_t *oldblkp, *lastblkp;
930 struct fs *fs;
931 ufs_lbn_t lblkno;
932 int expungetype; /* BLK_SNAP or BLK_NOCOPY */
933 {
934 struct inode *ip = VTOI(vp);
935 ufs1_daddr_t blkno, *blkp;
936 ufs_lbn_t lbn;
937 struct buf *ibp;
938 int error, ns;
939
940 ns = UFS_FSNEEDSWAP(fs);
941
942 for ( ; oldblkp < lastblkp; oldblkp++) {
943 blkno = ufs_rw32(*oldblkp, ns);
944 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
945 continue;
946 lbn = fragstoblks(fs, blkno);
	/*
	 * Locate the snapshot's own pointer slot for this block:
	 * in the inode for direct blocks, otherwise inside the
	 * indirect block fetched with B_METAONLY.
	 */
947 if (lbn < NDADDR) {
948 blkp = &ip->i_ffs1_db[lbn];
949 ip->i_flag |= IN_CHANGE | IN_UPDATE;
950 } else {
951 error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
952 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
953 if (error)
954 return (error);
955 blkp = &((ufs1_daddr_t *)(ibp->b_data))
956 [(lbn - NDADDR) % NINDIR(fs)];
957 }
958 /*
959 * If we are expunging a snapshot vnode and we
960 * find a block marked BLK_NOCOPY, then it is
961 * one that has been allocated to this snapshot after
962 * we took our current snapshot and can be ignored.
963 */
964 blkno = ufs_rw32(*blkp, ns);
965 if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
966 if (lbn >= NDADDR)
967 brelse(ibp);
968 } else {
969 if (blkno != 0)
970 panic("snapacct_ufs1: bad block");
971 *blkp = ufs_rw32(expungetype, ns);
	/* Indirect-block slots were modified: write the buffer back. */
972 if (lbn >= NDADDR)
973 bwrite(ibp);
974 }
975 }
976 return (0);
977 }
978
979 /*
980 * Account for a set of blocks allocated in a snapshot inode.
981 */
982 static int
983 mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
984 struct vnode *vp;
985 ufs1_daddr_t *oldblkp, *lastblkp;
986 struct fs *fs;
987 ufs_lbn_t lblkno;
988 int expungetype;
989 {
990 ufs1_daddr_t blkno;
991 struct inode *ip;
992 ino_t inum;
993 int acctit, ns;
994
995 ns = UFS_FSNEEDSWAP(fs);
996 ip = VTOI(vp);
997 inum = ip->i_number;
998 if (lblkno == -1)
999 acctit = 0;
1000 else
1001 acctit = 1;
1002 for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
1003 blkno = ufs_rw32(*oldblkp, ns);
1004 if (blkno == 0 || blkno == BLK_NOCOPY)
1005 continue;
1006 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
1007 *ip->i_snapblklist++ = ufs_rw64(lblkno, ns);
1008 if (blkno == BLK_SNAP)
1009 blkno = blkstofrags(fs, lblkno);
1010 ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
1011 }
1012 return (0);
1013 }
1014
/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 */
static int
expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct inode *cancelip;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int i, s, error, ns, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs2_dinode *dip;
	struct buf *bp;
	caddr_t buf;

	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = db_get(VTOI(snapvp), lbn);
	} else {
		/* cow_enter() keeps the copy-on-write hook from recursing. */
		s = cow_enter();
		error = VOP_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		   fs->fs_bsize, KERNCRED, B_METAONLY, &bp);
		cow_leave(s);
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = idb_get(VTOI(snapvp), bp->b_data, indiroff);
		brelse(bp);
	}
	/*
	 * Read the inode block from the snapshot if it has already been
	 * copied there, otherwise straight from the filesystem.
	 */
	buf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
	if (blkno != 0)
		error = readvnblk(snapvp, buf, lbn);
	else
		error = readfsblk(snapvp, buf, lbn);
	if (error) {
		free(buf, M_UFSMNT);
		return error;
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct ufs2_dinode *)buf + ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags =
	    ufs_rw32(ufs_rw32(dip->di_flags, ns) & ~SF_SNAPSHOT, ns);
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t));
	error = writevnblk(snapvp, buf, lbn);
	free(buf, M_UFSMNT);
	if (error)
		return error;
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	/* First the direct blocks, then the indirect block pointers. */
	if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs2_db[0],
	    &cancelip->i_ffs2_db[NDADDR], fs, 0, expungetype)))
		return (error);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs2_ib[0],
	    &cancelip->i_ffs2_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	/*
	 * Finally descend each level of indirect blocks.  lbn tracks the
	 * (negative) logical number of the indirect block, rlbn the first
	 * data block it maps, len the data blocks still unaccounted for.
	 */
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
		    ufs_rw64(cancelip->i_ffs2_ib[i], ns), lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}
1112
/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 *
 * level is the depth of indirection below this block, blkno its
 * physical address, lbn its (negative) logical number, rlbn the first
 * data block it maps and remblks the data blocks still to account for.
 * Each pointer at this level spans blksperindir data blocks.  acctfunc
 * is applied to every pointer; expungetype is passed through to it.
 */
static int
indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
    blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;
	ufs2_daddr_t blkno;
	ufs_lbn_t lbn;
	ufs_lbn_t rlbn;
	ufs_lbn_t remblks;
	ufs_lbn_t blksperindir;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int error, ns, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs2_daddr_t last, *bap;
	struct buf *bp;

	ns = UFS_FSNEEDSWAP(fs);

	if (blkno == 0) {
		/*
		 * A missing indirect block is only legal when the whole
		 * file is being marked BLK_NOCOPY.
		 */
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs2: missing indir");
	}
	/* Cross-check lbn against the path ufs_getlbns() computes. */
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs2: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readfsblk(bp->b_vp, bp->b_data, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	/* Work on a private copy so the buffer can be released early. */
	MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	brelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs2(snapvp, cancelvp, level,
		    ufs_rw64(bap[i], ns), lbn, rlbn, remblks, subblksperindir,
		    fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}
1193
1194 /*
1195 * Do both snap accounting and map accounting.
1196 */
1197 static int
1198 fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)
1199 struct vnode *vp;
1200 ufs2_daddr_t *oldblkp, *lastblkp;
1201 struct fs *fs;
1202 ufs_lbn_t lblkno;
1203 int exptype; /* BLK_SNAP or BLK_NOCOPY */
1204 {
1205 int error;
1206
1207 if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
1208 return (error);
1209 return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
1210 }
1211
/*
 * Identify a set of blocks allocated in a snapshot inode.
 *
 * UFS2 twin of snapacct_ufs1: walk [oldblkp, lastblkp) of the snapshot
 * inode being expunged and stamp the corresponding logical slot in
 * snapshot vp with the sentinel expungetype so vp need never copy it.
 * Returns 0 on success or an errno from VOP_BALLOC.
 */
static int
snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs2_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error, ns;

	ns = UFS_FSNEEDSWAP(fs);

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = ufs_rw64(*oldblkp, ns);
		/* Skip holes and slots already holding a sentinel. */
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			/* Direct block: slot lives in the inode itself. */
			blkp = &ip->i_ffs2_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			/* Indirect: fetch the indirect block holding the slot. */
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs2_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		blkno = ufs_rw64(*blkp, ns);
		if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			/* Slot must be empty before it takes a sentinel. */
			if (blkno != 0)
				panic("snapacct_ufs2: bad block");
			*blkp = ufs_rw64(expungetype, ns);
			if (lbn >= NDADDR)
				bwrite(ibp);
		}
	}
	return (0);
}
1267
1268 /*
1269 * Account for a set of blocks allocated in a snapshot inode.
1270 */
1271 static int
1272 mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
1273 struct vnode *vp;
1274 ufs2_daddr_t *oldblkp, *lastblkp;
1275 struct fs *fs;
1276 ufs_lbn_t lblkno;
1277 int expungetype;
1278 {
1279 ufs2_daddr_t blkno;
1280 struct inode *ip;
1281 ino_t inum;
1282 int acctit, ns;
1283
1284 ns = UFS_FSNEEDSWAP(fs);
1285 ip = VTOI(vp);
1286 inum = ip->i_number;
1287 if (lblkno == -1)
1288 acctit = 0;
1289 else
1290 acctit = 1;
1291 for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
1292 blkno = ufs_rw64(*oldblkp, ns);
1293 if (blkno == 0 || blkno == BLK_NOCOPY)
1294 continue;
1295 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
1296 *ip->i_snapblklist++ = ufs_rw64(lblkno, ns);
1297 if (blkno == BLK_SNAP)
1298 blkno = blkstofrags(fs, lblkno);
1299 ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
1300 }
1301 return (0);
1302 }
1303
1304 /*
1305 * Decrement extra reference on snapshot when last name is removed.
1306 * It will not be freed until the last open reference goes away.
1307 */
1308 void
1309 ffs_snapgone(ip)
1310 struct inode *ip;
1311 {
1312 struct ufsmount *ump = VFSTOUFS(ip->i_devvp->v_specmountpoint);
1313 struct inode *xp;
1314 struct fs *fs;
1315 int snaploc;
1316
1317 /*
1318 * Find snapshot in incore list.
1319 */
1320 TAILQ_FOREACH(xp, &ump->um_snapshots, i_nextsnap)
1321 if (xp == ip)
1322 break;
1323 if (xp != NULL)
1324 vrele(ITOV(ip));
1325 #ifdef DEBUG
1326 else if (snapdebug)
1327 printf("ffs_snapgone: lost snapshot vnode %d\n",
1328 ip->i_number);
1329 #endif
1330 /*
1331 * Delete snapshot inode from superblock. Keep list dense.
1332 */
1333 fs = ip->i_fs;
1334 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
1335 if (fs->fs_snapinum[snaploc] == ip->i_number)
1336 break;
1337 if (snaploc < FSMAXSNAP) {
1338 for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
1339 if (fs->fs_snapinum[snaploc] == 0)
1340 break;
1341 fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
1342 }
1343 fs->fs_snapinum[snaploc - 1] = 0;
1344 }
1345 }
1346
/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(vp)
	struct vnode *vp;
{
	struct inode *ip = VTOI(vp);
	struct vnode *devvp = ip->i_devvp;
	struct fs *fs = ip->i_fs;
	struct ufsmount *ump = VFSTOUFS(devvp->v_specmountpoint);
	struct lock *lkp;
	struct buf *ibp;
	ufs2_daddr_t numblks, blkno, dblk, *snapblklist;
	int error, ns, loc, last;

	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		VI_LOCK(devvp);
		/* LK_INTERLOCK releases the interlock; retake it below. */
		lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE,
		    VI_MTX(devvp));
		VI_LOCK(devvp);
		TAILQ_REMOVE(&ump->um_snapshots, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		/* Revert from the shared snapshot lock to the private one. */
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		lockmgr(lkp, LK_RELEASE, NULL);
		if (TAILQ_FIRST(&ump->um_snapshots) != 0) {
			VI_UNLOCK(devvp);
		} else {
			/*
			 * Last snapshot: tear down the preallocated block
			 * list, the shared lock and the copy-on-write hook.
			 */
			snapblklist = ump->um_snapblklist;
			ump->um_snapblklist = 0;
			ump->um_snaplistsize = 0;
			lockmgr(lkp, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp));
			lockmgr(lkp, LK_RELEASE, NULL);
			vn_cow_disestablish(devvp, ffs_copyonwrite, devvp);
			FREE(lkp, M_UFSMNT);
			FREE(snapblklist, M_UFSMNT);
		}
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = db_get(ip, blkno);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			db_assign(ip, blkno, 0);
		else if ((dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		     ip->i_number))) {
			/* Another snapshot took over our claimed block. */
			DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
			db_assign(ip, blkno, 0);
		}
	}
	/* Same scan for blocks reachable through indirect blocks. */
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			dblk = idb_get(ip, ibp->b_data, loc);
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				idb_assign(ip, ibp->b_data, loc, 0);
			else if (dblk == blkstofrags(fs, blkno) &&
			    ffs_snapblkfree(fs, ip->i_devvp, dblk,
			    fs->fs_bsize, ip->i_number)) {
				DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
				idb_assign(ip, ibp->b_data, loc, 0);
			}
		}
		bwrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}
1439
/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assurred that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(fs, devvp, bno, size, inum)
	struct fs *fs;
	struct vnode *devvp;
	ufs2_daddr_t bno;	/* block being freed, in fragments */
	long size;		/* size of the free, bytes */
	ino_t inum;		/* inode the block is freed from */
{
	struct ufsmount *ump = VFSTOUFS(devvp->v_specmountpoint);
	struct buf *ibp;
	struct inode *ip;
	struct vnode *vp = NULL, *saved_vp = NULL;
	caddr_t saved_data = NULL;
	ufs_lbn_t lbn;
	ufs2_daddr_t blkno;
	int s, indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0;

	lbn = fragstoblks(fs, bno);
retry:
	VI_LOCK(devvp);
	TAILQ_FOREACH(ip, &ump->um_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			/*
			 * Indirect lookup may allocate, so the shared
			 * snapshot lock must be held.  LK_SLEEPFAIL means
			 * a sleep for the lock restarts the whole scan.
			 */
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			    LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			    VI_MTX(devvp)) != 0)
				goto retry;
			snapshot_locked = 1;
			s = cow_enter();
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			cow_leave(s);
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = idb_get(ip, ibp->b_data, indiroff);
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			    LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
			    VI_MTX(devvp)) != 0) {
				/*
				 * Lock busy: wait for it without holding
				 * the interlock, then restart the scan.
				 */
				if (lbn >= NDADDR)
					brelse(ibp);
				vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL);
				goto retry;
			}
			snapshot_locked = 1;
			if (lbn < NDADDR) {
				db_assign(ip, lbn, BLK_NOCOPY);
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else {
				idb_assign(ip, ibp->b_data, indiroff,
				    BLK_NOCOPY);
				bwrite(ibp);
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= NDADDR)
				brelse(ibp);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		    LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
		    VI_MTX(devvp)) != 0) {
			if (lbn >= NDADDR)
				brelse(ibp);
			vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL);
			goto retry;
		}
		snapshot_locked = 1;
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %d lbn %" PRId64 " from inum %d\n",
				    "Grabonremove: snapino", ip->i_number,
				    lbn, inum);
#endif
			/* Claim the block by pointing the snapshot at it. */
			if (lbn < NDADDR) {
				db_assign(ip, lbn, bno);
			} else {
				idb_assign(ip, ibp->b_data, indiroff, bno);
				bwrite(ibp);
			}
			DIP_ADD(ip, blocks, btodb(size));
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			VOP_UNLOCK(vp, 0);
			/* Non-zero: caller must not free the block. */
			return (1);
		}
		if (lbn >= NDADDR)
			brelse(ibp);
#ifdef DEBUG
		if (snapdebug)
			printf("%s%d lbn %" PRId64 " %s %d size %ld\n",
			    "Copyonremove: snapino ", ip->i_number,
			    lbn, "for inum", inum, size);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (saved_data) {
			error = writevnblk(vp, saved_data, lbn);
			if (error)
				break;
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
		saved_vp = vp;
		if ((error = readfsblk(vp, saved_data, lbn)) != 0) {
			free(saved_data, M_UFSMNT);
			saved_data = NULL;
			break;
		}
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (saved_data) {
		error = writevnblk(saved_vp, saved_data, lbn);
		free(saved_data, M_UFSMNT);
	}
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0);
	else
		VI_UNLOCK(devvp);
	return (error);
}
1635
/*
 * Associate snapshot files when mounting.
 */
void
ffs_snapshot_mount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct vnode *devvp = ump->um_devvp;
	struct fs *fs = ump->um_fs;
	struct proc *p = curproc;
	struct vnode *vp;
	struct inode *ip, *xp;
	ufs2_daddr_t snaplistsize, *snapblklist;
	int error, ns, snaploc, loc;

	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * XXX The following needs to be set before VOP_TRUNCATE or
	 * VOP_READ can be called.
	 */
	mp->mnt_stat.f_iosize = fs->fs_bsize;
	/*
	 * Process each snapshot listed in the superblock.
	 */
	vp = NULL;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		if (fs->fs_snapinum[snaploc] == 0)
			break;
		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
		    &vp)) != 0) {
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		if ((ip->i_flags & SF_SNAPSHOT) == 0) {
			/*
			 * Stale superblock entry: drop it and compact
			 * the remaining entries downward.
			 */
			printf("ffs_snapshot_mount: non-snapshot inode %d\n",
			    fs->fs_snapinum[snaploc]);
			vput(vp);
			vp = NULL;
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			/* Re-examine the slot we just refilled. */
			snaploc--;
			continue;
		}
		/*
		 * If there already exist snapshots on this filesystem, grab a
		 * reference to their shared lock. If this is the first snapshot
		 * on this filesystem, we need to allocate a lock for the
		 * snapshots to share. In either case, acquire the snapshot
		 * lock and give up our original private lock.
		 */
		VI_LOCK(devvp);
		if ((xp = TAILQ_FIRST(&ump->um_snapshots)) != NULL) {
			struct lock *lkp;

			lkp = ITOV(xp)->v_vnlock;
			VI_UNLOCK(devvp);
			VI_LOCK(vp);
			vp->v_vnlock = lkp;
		} else {
			struct lock *lkp;

			VI_UNLOCK(devvp);
			MALLOC(lkp, struct lock *, sizeof(struct lock),
			    M_UFSMNT, M_WAITOK);
			lockinit(lkp, PVFS, "snaplk", 0, LK_CANRECURSE);
			VI_LOCK(vp);
			vp->v_vnlock = lkp;
		}
		/* Take the shared lock, then drain waiters off the old one. */
		vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY);
		transferlockers(&vp->v_lock, vp->v_vnlock);
		lockmgr(&vp->v_lock, LK_RELEASE, NULL);
		/*
		 * Link it onto the active snapshot list.
		 */
		VI_LOCK(devvp);
		if (ip->i_nextsnap.tqe_prev != 0)
			panic("ffs_snapshot_mount: %d already on list",
			    ip->i_number);
		else
			TAILQ_INSERT_TAIL(&ump->um_snapshots, ip, i_nextsnap);
		vp->v_flag |= VSYSTEM;
		VI_UNLOCK(devvp);
		VOP_UNLOCK(vp, 0);
	}
	/*
	 * No usable snapshots found.
	 */
	if (vp == NULL)
		return;
	/*
	 * Allocate the space for the block hints list. We always want to
	 * use the list from the newest snapshot.
	 */
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	/*
	 * The hint list is stored past the last filesystem block of the
	 * snapshot file; its leading word is the list length.
	 */
	error = vn_rdwr(UIO_READ, vp,
	    (caddr_t)&snaplistsize, sizeof(snaplistsize),
	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
	    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, p->p_ucred, NULL, NULL);
	if (error) {
		printf("ffs_snapshot_mount: read_1 failed %d\n", error);
		VOP_UNLOCK(vp, 0);
		return;
	}
	snaplistsize = ufs_rw64(snaplistsize, ns);
	MALLOC(snapblklist, ufs2_daddr_t *, snaplistsize * sizeof(ufs2_daddr_t),
	    M_UFSMNT, M_WAITOK);
	error = vn_rdwr(UIO_READ, vp,
	    (caddr_t)snapblklist, snaplistsize * sizeof(ufs2_daddr_t),
	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
	    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, p->p_ucred, NULL, NULL);
	if (error) {
		printf("ffs_snapshot_mount: read_2 failed %d\n", error);
		VOP_UNLOCK(vp, 0);
		FREE(snapblklist, M_UFSMNT);
		return;
	}
	VOP_UNLOCK(vp, 0);
	VI_LOCK(devvp);
	ump->um_snaplistsize = snaplistsize;
	ump->um_snapblklist = snapblklist;
	VI_UNLOCK(devvp);
	/* Hook device writes so changed blocks are copied first. */
	vn_cow_establish(devvp, ffs_copyonwrite, devvp);
}
1765
/*
 * Disassociate snapshot files when unmounting.
 */
void
ffs_snapshot_unmount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct vnode *devvp = ump->um_devvp;
	struct lock *lkp = NULL;
	struct inode *xp;
	struct vnode *vp;

	VI_LOCK(devvp);
	while ((xp = TAILQ_FIRST(&ump->um_snapshots)) != 0) {
		vp = ITOV(xp);
		/* Give the vnode back its private lock. */
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		TAILQ_REMOVE(&ump->um_snapshots, xp, i_nextsnap);
		xp->i_nextsnap.tqe_prev = 0;
		if (xp->i_ffs_effnlink > 0) {
			/*
			 * Drop the extra reference held on active
			 * snapshots (see ffs_snapgone); vrele() may
			 * sleep, so release the interlock around it.
			 */
			VI_UNLOCK(devvp);
			vrele(vp);
			VI_LOCK(devvp);
		}
	}
	if (ump->um_snapblklist != NULL) {
		FREE(ump->um_snapblklist, M_UFSMNT);
		ump->um_snapblklist = NULL;
		ump->um_snaplistsize = 0;
	}
	VI_UNLOCK(devvp);
	if (lkp != NULL) {
		/*
		 * At least one snapshot existed: remove the
		 * copy-on-write hook and free the shared lock.
		 */
		vn_cow_disestablish(devvp, ffs_copyonwrite, devvp);
		FREE(lkp, M_UFSMNT);
	}
}
1803
/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 *
 * Installed via vn_cow_establish(); v is the device vnode, bp the
 * write about to hit it.  Returns 0 on success or an errno.
 */
static int
ffs_copyonwrite(v, bp)
	void *v;
	struct buf *bp;
{
	struct buf *ibp;
	struct fs *fs;
	struct inode *ip;
	struct vnode *devvp = v, *vp = 0, *saved_vp = NULL;
	struct ufsmount *ump = VFSTOUFS(devvp->v_specmountpoint);
	caddr_t saved_data = NULL;
	ufs2_daddr_t lbn, blkno, *snapblklist;
	int lower, upper, mid, s, ns, indiroff, snapshot_locked = 0, error = 0;

	/*
	 * Check for valid snapshots.
	 */
	VI_LOCK(devvp);
	ip = TAILQ_FIRST(&ump->um_snapshots);
	if (ip == NULL) {
		VI_UNLOCK(devvp);
		return 0;
	}
	/*
	 * First check to see if it is in the preallocated list.
	 * By doing this check we avoid several potential deadlocks.
	 */
	fs = ip->i_fs;
	ns = UFS_FSNEEDSWAP(fs);
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	snapblklist = ump->um_snapblklist;
	upper = ump->um_snaplistsize - 1;
	lower = 1;	/* slot 0 holds the list length, not a block */
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (ufs_rw64(snapblklist[mid], ns) == lbn)
			break;
		if (ufs_rw64(snapblklist[mid], ns) < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	if (lower <= upper) {
		/* Found in the hint list: no copy needed. */
		VI_UNLOCK(devvp);
		return 0;
	}
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
retry:
	TAILQ_FOREACH(ip, &ump->um_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in VOP_BALLOC.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied. We do not have
		 * to hold the snapshot lock while doing this lookup as it
		 * will never require any additional allocations for the
		 * snapshot inode.
		 */
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			/*
			 * Indirect lookup may allocate: take the shared
			 * snapshot lock first.  LK_SLEEPFAIL restarts
			 * the scan if we had to sleep for it.
			 */
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			    LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			    VI_MTX(devvp)) != 0) {
				VI_LOCK(devvp);
				goto retry;
			}
			snapshot_locked = 1;
			s = cow_enter();
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			   fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			cow_leave(s);
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = idb_get(ip, ibp->b_data, indiroff);
			brelse(ibp);
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		/* Non-zero slot: already copied or marked don't-care. */
		if (blkno != 0)
			continue;
#ifdef DIAGNOSTIC
		if (curlwp->l_flag & L_COWINPROGRESS)
			printf("ffs_copyonwrite: recursive call\n");
#endif
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		    LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
		    VI_MTX(devvp)) != 0) {
			VI_LOCK(devvp);
			goto retry;
		}
		snapshot_locked = 1;
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %d lbn %" PRId64 " for ",
			    ip->i_number, lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %d", VTOI(bp->b_vp)->i_number);
			printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (saved_data) {
			error = writevnblk(vp, saved_data, lbn);
			if (error)
				break;
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
		saved_vp = vp;
		if ((error = readfsblk(vp, saved_data, lbn)) != 0) {
			free(saved_data, M_UFSMNT);
			saved_data = NULL;
			break;
		}
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (saved_data) {
		error = writevnblk(saved_vp, saved_data, lbn);
		free(saved_data, M_UFSMNT);
	}
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0);
	else
		VI_UNLOCK(devvp);
	return error;
}
1973
1974 /*
1975 * Read the specified block from disk. Vp is usually a snapshot vnode.
1976 */
1977 static int
1978 readfsblk(vp, data, lbn)
1979 struct vnode *vp;
1980 caddr_t data;
1981 ufs2_daddr_t lbn;
1982 {
1983 int s, error;
1984 struct inode *ip = VTOI(vp);
1985 struct fs *fs = ip->i_fs;
1986 struct buf *nbp;
1987
1988 s = splbio();
1989 nbp = pool_get(&bufpool, PR_WAITOK);
1990 splx(s);
1991
1992 BUF_INIT(nbp);
1993 nbp->b_flags = B_READ;
1994 nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize;
1995 nbp->b_error = 0;
1996 nbp->b_data = data;
1997 nbp->b_blkno = nbp->b_rawblkno = fsbtodb(fs, blkstofrags(fs, lbn));
1998 nbp->b_proc = NULL;
1999 nbp->b_dev = ip->i_devvp->v_rdev;
2000 nbp->b_vp = NULLVP;
2001
2002 DEV_STRATEGY(nbp);
2003
2004 error = biowait(nbp);
2005
2006 s = splbio();
2007 pool_put(&bufpool, nbp);
2008 splx(s);
2009
2010 return error;
2011 }
2012
2013 /*
2014 * Read the specified block. Bypass UBC to prevent deadlocks.
2015 */
2016 static int
2017 readvnblk(vp, data, lbn)
2018 struct vnode *vp;
2019 caddr_t data;
2020 ufs2_daddr_t lbn;
2021 {
2022 int error;
2023 daddr_t bn;
2024 off_t offset;
2025 struct inode *ip = VTOI(vp);
2026 struct fs *fs = ip->i_fs;
2027
2028 error = VOP_BMAP(vp, lbn, NULL, &bn, NULL);
2029 if (error)
2030 return error;
2031
2032 if (bn != (daddr_t)-1) {
2033 offset = dbtob(bn);
2034 simple_lock(&vp->v_interlock);
2035 error = VOP_PUTPAGES(vp, trunc_page(offset),
2036 round_page(offset+fs->fs_bsize),
2037 PGO_CLEANIT|PGO_SYNCIO|PGO_FREE);
2038 if (error)
2039 return error;
2040
2041 return readfsblk(vp, data, fragstoblks(fs, dbtofsb(fs, bn)));
2042 }
2043
2044 bzero(data, fs->fs_bsize);
2045
2046 return 0;
2047 }
2048
2049 /*
2050 * Write the specified block. Bypass UBC to prevent deadlocks.
2051 */
2052 static int
2053 writevnblk(vp, data, lbn)
2054 struct vnode *vp;
2055 caddr_t data;
2056 ufs2_daddr_t lbn;
2057 {
2058 int s, error;
2059 off_t offset;
2060 struct buf *bp;
2061 struct inode *ip = VTOI(vp);
2062 struct fs *fs = ip->i_fs;
2063
2064 offset = lblktosize(fs, (off_t)lbn);
2065 s = cow_enter();
2066 simple_lock(&vp->v_interlock);
2067 error = VOP_PUTPAGES(vp, trunc_page(offset),
2068 round_page(offset+fs->fs_bsize), PGO_CLEANIT|PGO_SYNCIO|PGO_FREE);
2069 if (error == 0)
2070 error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
2071 fs->fs_bsize, KERNCRED, B_SYNC, &bp);
2072 cow_leave(s);
2073 if (error)
2074 return error;
2075
2076 bcopy(data, bp->b_data, fs->fs_bsize);
2077 bp->b_flags |= B_NOCACHE;
2078
2079 return bwrite(bp);
2080 }
2081
2082 /*
2083 * Set/reset lwp's L_COWINPROGRESS flag.
2084 * May be called recursive.
2085 */
2086 static inline int
2087 cow_enter(void)
2088 {
2089 struct lwp *l = curlwp;
2090
2091 if (l->l_flag & L_COWINPROGRESS) {
2092 return 0;
2093 } else {
2094 l->l_flag |= L_COWINPROGRESS;
2095 return L_COWINPROGRESS;
2096 }
2097 }
2098
2099 static inline void
2100 cow_leave(int flag)
2101 {
2102 struct lwp *l = curlwp;
2103
2104 l->l_flag &= ~flag;
2105 }
2106
2107 /*
2108 * Get/Put direct block from inode or buffer containing disk addresses. Take
2109 * care for fs type (UFS1/UFS2) and byte swapping. These functions should go
2110 * into a global include.
2111 */
2112 static inline ufs2_daddr_t
2113 db_get(struct inode *ip, int loc)
2114 {
2115 if (ip->i_ump->um_fstype == UFS1)
2116 return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip));
2117 else
2118 return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip));
2119 }
2120
2121 static inline void
2122 db_assign(struct inode *ip, int loc, ufs2_daddr_t val)
2123 {
2124 if (ip->i_ump->um_fstype == UFS1)
2125 ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2126 else
2127 ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2128 }
2129
2130 static inline ufs2_daddr_t
2131 idb_get(struct inode *ip, caddr_t buf, int loc)
2132 {
2133 if (ip->i_ump->um_fstype == UFS1)
2134 return ufs_rw32(((ufs1_daddr_t *)(buf))[loc],
2135 UFS_IPNEEDSWAP(ip));
2136 else
2137 return ufs_rw64(((ufs2_daddr_t *)(buf))[loc],
2138 UFS_IPNEEDSWAP(ip));
2139 }
2140
2141 static inline void
2142 idb_assign(struct inode *ip, caddr_t buf, int loc, ufs2_daddr_t val)
2143 {
2144 if (ip->i_ump->um_fstype == UFS1)
2145 ((ufs1_daddr_t *)(buf))[loc] =
2146 ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2147 else
2148 ((ufs2_daddr_t *)(buf))[loc] =
2149 ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2150 }
2151