lfs_rfw.c revision 1.33 1 /* $NetBSD: lfs_rfw.c,v 1.33 2018/12/10 14:46:25 maxv Exp $ */
2
3 /*-
4 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Konrad E. Schroder <perseant (at) hhhh.org>.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: lfs_rfw.c,v 1.33 2018/12/10 14:46:25 maxv Exp $");
34
35 #if defined(_KERNEL_OPT)
36 #include "opt_quota.h"
37 #endif
38
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/namei.h>
42 #include <sys/proc.h>
43 #include <sys/kernel.h>
44 #include <sys/vnode.h>
45 #include <sys/mount.h>
46 #include <sys/kthread.h>
47 #include <sys/buf.h>
48 #include <sys/device.h>
49 #include <sys/file.h>
50 #include <sys/disklabel.h>
51 #include <sys/ioctl.h>
52 #include <sys/errno.h>
53 #include <sys/malloc.h>
54 #include <sys/pool.h>
55 #include <sys/socket.h>
56 #include <sys/syslog.h>
57 #include <uvm/uvm_extern.h>
58 #include <sys/sysctl.h>
59 #include <sys/conf.h>
60 #include <sys/kauth.h>
61
62 #include <miscfs/specfs/specdev.h>
63
64 #include <ufs/lfs/ulfs_quotacommon.h>
65 #include <ufs/lfs/ulfs_inode.h>
66 #include <ufs/lfs/ulfsmount.h>
67 #include <ufs/lfs/ulfs_extern.h>
68
69 #include <uvm/uvm.h>
70 #include <uvm/uvm_stat.h>
71 #include <uvm/uvm_pager.h>
72 #include <uvm/uvm_pdaemon.h>
73
74 #include <ufs/lfs/lfs.h>
75 #include <ufs/lfs/lfs_accessors.h>
76 #include <ufs/lfs/lfs_kernel.h>
77 #include <ufs/lfs/lfs_extern.h>
78
79 #include <miscfs/genfs/genfs.h>
80 #include <miscfs/genfs/genfs_node.h>
81
82 /*
83 * Roll-forward code.
84 */
85 static daddr_t check_segsum(struct lfs *, daddr_t, u_int64_t,
86 kauth_cred_t, int, int *, struct lwp *);
87
88 extern int lfs_do_rfw;
89
90 /*
91 * Allocate a particular inode with a particular version number, freeing
92 * any previous versions of this inode that may have gone before.
93 * Used by the roll-forward code.
94 *
95 * XXX this function does not have appropriate locking to be used on a live fs;
96 * XXX but something similar could probably be used for an "undelete" call.
97 *
98 * Called with the Ifile inode locked.
99 */
100 int
101 lfs_rf_valloc(struct lfs *fs, ino_t ino, int vers, struct lwp *l,
102 struct vnode **vpp)
103 {
104 struct vattr va;
105 struct vnode *vp;
106 struct inode *ip;
107 int error;
108
109 ASSERT_SEGLOCK(fs); /* XXX it doesn't, really */
110
111 /*
112 * First, just try a vget. If the version number is the one we want,
113 * we don't have to do anything else. If the version number is wrong,
114 * take appropriate action.
115 */
116 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, &vp);
117 if (error == 0) {
118 DLOG((DLOG_RF, "lfs_rf_valloc[1]: ino %d vp %p\n", ino, vp));
119
120 *vpp = vp;
121 ip = VTOI(vp);
122 if (ip->i_gen == vers)
123 return 0;
124 else if (ip->i_gen < vers) {
125 lfs_truncate(vp, (off_t)0, 0, NOCRED);
126 ip->i_gen = vers;
127 lfs_dino_setgen(fs, ip->i_din, vers);
128 LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE);
129 return 0;
130 } else {
131 DLOG((DLOG_RF, "ino %d: sought version %d, got %d\n",
132 ino, vers, lfs_dino_getgen(fs, ip->i_din)));
133 vput(vp);
134 *vpp = NULLVP;
135 return EEXIST;
136 }
137 }
138
139 /* Not found, create as regular file. */
140 vattr_null(&va);
141 va.va_type = VREG;
142 va.va_mode = 0;
143 va.va_fileid = ino;
144 va.va_gen = vers;
145 error = vcache_new(fs->lfs_ivnode->v_mount, NULL, &va, NOCRED, &vp);
146 if (error)
147 return error;
148 error = vn_lock(vp, LK_EXCLUSIVE);
149 if (error) {
150 vrele(vp);
151 *vpp = NULLVP;
152 return error;
153 }
154 ip = VTOI(vp);
155 ip->i_nlink = 1;
156 lfs_dino_setnlink(fs, ip->i_din, 1);
157 *vpp = vp;
158 return 0;
159 }
160
161 /*
162 * Load the appropriate indirect block, and change the appropriate pointer.
163 * Mark the block dirty. Do segment and avail accounting.
164 */
165 static int
166 update_meta(struct lfs *fs, ino_t ino, int vers, daddr_t lbn,
167 daddr_t ndaddr, size_t size, struct lwp *l)
168 {
169 int error;
170 struct vnode *vp;
171 struct inode *ip;
172 #ifdef DEBUG
173 daddr_t odaddr;
174 struct indir a[ULFS_NIADDR];
175 int num;
176 int i;
177 #endif /* DEBUG */
178 struct buf *bp;
179 SEGUSE *sup;
180
181 KASSERT(lbn >= 0); /* no indirect blocks */
182
183 if ((error = lfs_rf_valloc(fs, ino, vers, l, &vp)) != 0) {
184 DLOG((DLOG_RF, "update_meta: ino %d: lfs_rf_valloc"
185 " returned %d\n", ino, error));
186 return error;
187 }
188
189 if ((error = lfs_balloc(vp, (lbn << lfs_sb_getbshift(fs)), size,
190 NOCRED, 0, &bp)) != 0) {
191 vput(vp);
192 return (error);
193 }
194 /* No need to write, the block is already on disk */
195 if (bp->b_oflags & BO_DELWRI) {
196 LFS_UNLOCK_BUF(bp);
197 lfs_sb_addavail(fs, lfs_btofsb(fs, bp->b_bcount));
198 /* XXX should this wake up fs->lfs_availsleep? */
199 }
200 brelse(bp, BC_INVAL);
201
202 /*
203 * Extend the file, if it is not large enough already.
204 * XXX this is not exactly right, we don't know how much of the
205 * XXX last block is actually used. We hope that an inode will
206 * XXX appear later to give the correct size.
207 */
208 ip = VTOI(vp);
209 if (ip->i_size <= (lbn << lfs_sb_getbshift(fs))) {
210 u_int64_t newsize;
211
212 if (lbn < ULFS_NDADDR) {
213 newsize = (lbn << lfs_sb_getbshift(fs)) +
214 (size - lfs_sb_getfsize(fs)) + 1;
215 } else {
216 newsize = (lbn << lfs_sb_getbshift(fs)) + 1;
217 }
218 lfs_dino_setsize(fs, ip->i_din, newsize);
219
220 if (ip->i_size < newsize) {
221 ip->i_size = newsize;
222 /*
223 * tell vm our new size for the case the inode won't
224 * appear later.
225 */
226 uvm_vnp_setsize(vp, newsize);
227 }
228 }
229
230 lfs_update_single(fs, NULL, vp, lbn, ndaddr, size);
231
232 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp);
233 sup->su_nbytes += size;
234 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp);
235
236 /* differences here should be due to UNWRITTEN indirect blocks. */
237 KASSERT((lfs_lblkno(fs, ip->i_size) > ULFS_NDADDR &&
238 ip->i_lfs_effnblks == lfs_dino_getblocks(fs, ip->i_din)) ||
239 ip->i_lfs_effnblks >= lfs_dino_getblocks(fs, ip->i_din));
240
241 #ifdef DEBUG
242 /* Now look again to make sure it worked */
243 ulfs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL);
244 for (i = num; i > 0; i--) {
245 if (!a[i].in_exists)
246 panic("update_meta: absent %d lv indirect block", i);
247 }
248 if (LFS_DBTOFSB(fs, odaddr) != ndaddr)
249 DLOG((DLOG_RF, "update_meta: failed setting ino %d lbn %"
250 PRId64 " to %" PRId64 "\n", ino, lbn, ndaddr));
251 #endif /* DEBUG */
252 vput(vp);
253 return 0;
254 }
255
256 /*
257 * Copy some the fields of the dinode as needed by update_inoblk().
258 */
259 static void
260 update_inoblk_copy_dinode(struct lfs *fs,
261 union lfs_dinode *dstu, const union lfs_dinode *srcu)
262 {
263 if (fs->lfs_is64) {
264 struct lfs64_dinode *dst = &dstu->u_64;
265 const struct lfs64_dinode *src = &srcu->u_64;
266 unsigned i;
267
268 /*
269 * Copy everything but the block pointers and di_blocks.
270 * XXX what about di_extb?
271 */
272 dst->di_mode = src->di_mode;
273 dst->di_nlink = src->di_nlink;
274 dst->di_uid = src->di_uid;
275 dst->di_gid = src->di_gid;
276 dst->di_blksize = src->di_blksize;
277 dst->di_size = src->di_size;
278 dst->di_atime = src->di_atime;
279 dst->di_mtime = src->di_mtime;
280 dst->di_ctime = src->di_ctime;
281 dst->di_birthtime = src->di_birthtime;
282 dst->di_mtimensec = src->di_mtimensec;
283 dst->di_atimensec = src->di_atimensec;
284 dst->di_ctimensec = src->di_ctimensec;
285 dst->di_birthnsec = src->di_birthnsec;
286 dst->di_gen = src->di_gen;
287 dst->di_kernflags = src->di_kernflags;
288 dst->di_flags = src->di_flags;
289 dst->di_extsize = src->di_extsize;
290 dst->di_modrev = src->di_modrev;
291 dst->di_inumber = src->di_inumber;
292 for (i = 0; i < __arraycount(src->di_spare); i++) {
293 dst->di_spare[i] = src->di_spare[i];
294 }
295 } else {
296 struct lfs32_dinode *dst = &dstu->u_32;
297 const struct lfs32_dinode *src = &srcu->u_32;
298
299 /* Get mode, link count, size, and times */
300 memcpy(dst, src, offsetof(struct lfs32_dinode, di_db[0]));
301
302 /* Then the rest, except di_blocks */
303 dst->di_flags = src->di_flags;
304 dst->di_gen = src->di_gen;
305 dst->di_uid = src->di_uid;
306 dst->di_gid = src->di_gid;
307 dst->di_modrev = src->di_modrev;
308 }
309 }
310
311 static int
312 update_inoblk(struct lfs *fs, daddr_t offset, kauth_cred_t cred,
313 struct lwp *l)
314 {
315 struct vnode *devvp, *vp;
316 struct inode *ip;
317 union lfs_dinode *dip;
318 struct buf *dbp, *ibp;
319 int error;
320 daddr_t daddr;
321 IFILE *ifp;
322 SEGUSE *sup;
323 unsigned i, num;
324
325 devvp = VTOI(fs->lfs_ivnode)->i_devvp;
326
327 /*
328 * Get the inode, update times and perms.
329 * DO NOT update disk blocks, we do that separately.
330 */
331 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs),
332 0, &dbp);
333 if (error) {
334 DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error));
335 return error;
336 }
337 num = LFS_INOPB(fs);
338 for (i = num; i-- > 0; ) {
339 dip = DINO_IN_BLOCK(fs, dbp->b_data, i);
340 if (lfs_dino_getinumber(fs, dip) > LFS_IFILE_INUM) {
341 error = lfs_rf_valloc(fs, lfs_dino_getinumber(fs, dip),
342 lfs_dino_getgen(fs, dip),
343 l, &vp);
344 if (error) {
345 DLOG((DLOG_RF, "update_inoblk: lfs_rf_valloc"
346 " returned %d\n", error));
347 continue;
348 }
349 ip = VTOI(vp);
350 if (lfs_dino_getsize(fs, dip) != ip->i_size)
351 lfs_truncate(vp, lfs_dino_getsize(fs, dip), 0,
352 NOCRED);
353 update_inoblk_copy_dinode(fs, ip->i_din, dip);
354
355 ip->i_flags = lfs_dino_getflags(fs, dip);
356 ip->i_gen = lfs_dino_getgen(fs, dip);
357 ip->i_uid = lfs_dino_getuid(fs, dip);
358 ip->i_gid = lfs_dino_getgid(fs, dip);
359
360 ip->i_mode = lfs_dino_getmode(fs, dip);
361 ip->i_nlink = lfs_dino_getnlink(fs, dip);
362 ip->i_size = lfs_dino_getsize(fs, dip);
363
364 LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE);
365
366 /* Re-initialize to get type right */
367 ulfs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p,
368 &vp);
369 vput(vp);
370
371 /* Record change in location */
372 LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp);
373 daddr = lfs_if_getdaddr(fs, ifp);
374 lfs_if_setdaddr(fs, ifp, LFS_DBTOFSB(fs, dbp->b_blkno));
375 error = LFS_BWRITE_LOG(ibp); /* Ifile */
376 /* And do segment accounting */
377 if (lfs_dtosn(fs, daddr) != lfs_dtosn(fs, LFS_DBTOFSB(fs, dbp->b_blkno))) {
378 if (daddr > 0) {
379 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, daddr),
380 ibp);
381 sup->su_nbytes -= DINOSIZE(fs);
382 LFS_WRITESEGENTRY(sup, fs,
383 lfs_dtosn(fs, daddr),
384 ibp);
385 }
386 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, LFS_DBTOFSB(fs, dbp->b_blkno)),
387 ibp);
388 sup->su_nbytes += DINOSIZE(fs);
389 LFS_WRITESEGENTRY(sup, fs,
390 lfs_dtosn(fs, LFS_DBTOFSB(fs, dbp->b_blkno)),
391 ibp);
392 }
393 }
394 }
395 brelse(dbp, BC_AGE);
396
397 return 0;
398 }
399
400 #define CHECK_CKSUM 0x0001 /* Check the checksum to make sure it's valid */
401 #define CHECK_UPDATE 0x0002 /* Update Ifile for new data blocks / inodes */
402
403 static daddr_t
404 check_segsum(struct lfs *fs, daddr_t offset, u_int64_t nextserial,
405 kauth_cred_t cred, int flags, int *pseg_flags, struct lwp *l)
406 {
407 struct vnode *devvp;
408 struct buf *bp, *dbp;
409 int error, nblocks = 0, ninos, i, j; /* XXX: gcc */
410 SEGSUM *ssp;
411 u_long *dp = NULL, *datap = NULL; /* XXX u_int32_t */
412 daddr_t oldoffset;
413 IINFO *iip;
414 FINFO *fip;
415 SEGUSE *sup;
416 size_t size;
417 uint32_t datasum, foundsum;
418
419 devvp = VTOI(fs->lfs_ivnode)->i_devvp;
420 /*
421 * If the segment has a superblock and we're at the top
422 * of the segment, skip the superblock.
423 */
424 if (lfs_sntod(fs, lfs_dtosn(fs, offset)) == offset) {
425 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp);
426 if (sup->su_flags & SEGUSE_SUPERBLOCK)
427 offset += lfs_btofsb(fs, LFS_SBPAD);
428 brelse(bp, 0);
429 }
430
431 /* Read in the segment summary */
432 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getsumsize(fs),
433 0, &bp);
434 if (error)
435 return -1;
436
437 /* Check summary checksum */
438 ssp = (SEGSUM *)bp->b_data;
439 if (flags & CHECK_CKSUM) {
440 size_t sumstart;
441
442 sumstart = lfs_ss_getsumstart(fs);
443 if (lfs_ss_getsumsum(fs, ssp) !=
444 cksum((char *)ssp + sumstart,
445 lfs_sb_getsumsize(fs) - sumstart)) {
446 DLOG((DLOG_RF, "Sumsum error at 0x%" PRIx64 "\n", offset));
447 offset = -1;
448 goto err1;
449 }
450 if (lfs_ss_getnfinfo(fs, ssp) == 0 &&
451 lfs_ss_getninos(fs, ssp) == 0) {
452 DLOG((DLOG_RF, "Empty pseg at 0x%" PRIx64 "\n", offset));
453 offset = -1;
454 goto err1;
455 }
456 if (lfs_ss_getcreate(fs, ssp) < lfs_sb_gettstamp(fs)) {
457 DLOG((DLOG_RF, "Old data at 0x%" PRIx64 "\n", offset));
458 offset = -1;
459 goto err1;
460 }
461 }
462 if (lfs_sb_getversion(fs) > 1) {
463 if (lfs_ss_getserial(fs, ssp) != nextserial) {
464 DLOG((DLOG_RF, "Unexpected serial number at 0x%" PRIx64
465 "\n", offset));
466 offset = -1;
467 goto err1;
468 }
469 if (lfs_ss_getident(fs, ssp) != lfs_sb_getident(fs)) {
470 DLOG((DLOG_RF, "Incorrect fsid (0x%x vs 0x%x) at 0x%"
471 PRIx64 "\n", lfs_ss_getident(fs, ssp),
472 lfs_sb_getident(fs), offset));
473 offset = -1;
474 goto err1;
475 }
476 }
477 if (pseg_flags)
478 *pseg_flags = lfs_ss_getflags(fs, ssp);
479 oldoffset = offset;
480 offset += lfs_btofsb(fs, lfs_sb_getsumsize(fs));
481
482 ninos = howmany(lfs_ss_getninos(fs, ssp), LFS_INOPB(fs));
483 iip = SEGSUM_IINFOSTART(fs, bp->b_data);
484 if (flags & CHECK_CKSUM) {
485 /* Count blocks */
486 nblocks = 0;
487 fip = SEGSUM_FINFOBASE(fs, (SEGSUM *)bp->b_data);
488 for (i = 0; i < lfs_ss_getnfinfo(fs, ssp); ++i) {
489 nblocks += lfs_fi_getnblocks(fs, fip);
490 if (lfs_fi_getnblocks(fs, fip) <= 0)
491 break;
492 fip = NEXT_FINFO(fs, fip);
493 }
494 nblocks += ninos;
495 /* Create the sum array */
496 datap = dp = malloc(nblocks * sizeof(u_long),
497 M_SEGMENT, M_WAITOK);
498 }
499
500 /* Handle individual blocks */
501 fip = SEGSUM_FINFOBASE(fs, (SEGSUM *)bp->b_data);
502 for (i = 0; i < lfs_ss_getnfinfo(fs, ssp) || ninos; ++i) {
503 /* Inode block? */
504 if (ninos && lfs_ii_getblock(fs, iip) == offset) {
505 if (flags & CHECK_CKSUM) {
506 /* Read in the head and add to the buffer */
507 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getbsize(fs),
508 0, &dbp);
509 if (error) {
510 offset = -1;
511 goto err2;
512 }
513 /* XXX this can't be right, on-disk u_long? */
514 (*dp++) = ((u_long *)(dbp->b_data))[0];
515 brelse(dbp, BC_AGE);
516 }
517 if (flags & CHECK_UPDATE) {
518 if ((error = update_inoblk(fs, offset, cred, l))
519 != 0) {
520 offset = -1;
521 goto err2;
522 }
523 }
524 offset += lfs_btofsb(fs, lfs_sb_getibsize(fs));
525 iip = NEXTLOWER_IINFO(fs, iip);
526 --ninos;
527 --i; /* compensate for ++i in loop header */
528 continue;
529 }
530 size = lfs_sb_getbsize(fs);
531 for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) {
532 if (j == lfs_fi_getnblocks(fs, fip) - 1)
533 size = lfs_fi_getlastlength(fs, fip);
534 if (flags & CHECK_CKSUM) {
535 error = bread(devvp, LFS_FSBTODB(fs, offset), size,
536 0, &dbp);
537 if (error) {
538 offset = -1;
539 goto err2;
540 }
541 (*dp++) = ((u_long *)(dbp->b_data))[0];
542 brelse(dbp, BC_AGE);
543 }
544 /* Account for and update any direct blocks */
545 if ((flags & CHECK_UPDATE) &&
546 lfs_fi_getino(fs, fip) > LFS_IFILE_INUM &&
547 lfs_fi_getblock(fs, fip, j) >= 0) {
548 update_meta(fs, lfs_fi_getino(fs, fip),
549 lfs_fi_getversion(fs, fip),
550 lfs_fi_getblock(fs, fip, j),
551 offset, size, l);
552 }
553 offset += lfs_btofsb(fs, size);
554 }
555 fip = NEXT_FINFO(fs, fip);
556 }
557 /* Checksum the array, compare */
558 datasum = lfs_ss_getdatasum(fs, ssp);
559 foundsum = cksum(datap, nblocks * sizeof(u_long));
560 if ((flags & CHECK_CKSUM) && datasum != foundsum) {
561 DLOG((DLOG_RF, "Datasum error at 0x%" PRIx64
562 " (wanted %x got %x)\n",
563 offset, datasum, foundsum));
564 offset = -1;
565 goto err2;
566 }
567
568 /* If we're at the end of the segment, move to the next */
569 if (lfs_dtosn(fs, offset + lfs_btofsb(fs, lfs_sb_getsumsize(fs) + lfs_sb_getbsize(fs))) !=
570 lfs_dtosn(fs, offset)) {
571 if (lfs_dtosn(fs, offset) == lfs_dtosn(fs, lfs_ss_getnext(fs, ssp))) {
572 offset = -1;
573 goto err2;
574 }
575 offset = lfs_ss_getnext(fs, ssp);
576 DLOG((DLOG_RF, "LFS roll forward: moving to offset 0x%" PRIx64
577 " -> segment %d\n", offset, lfs_dtosn(fs,offset)));
578 }
579
580 if (flags & CHECK_UPDATE) {
581 lfs_sb_subavail(fs, offset - oldoffset);
582 /* Don't clog the buffer queue */
583 mutex_enter(&lfs_lock);
584 if (locked_queue_count > LFS_MAX_BUFS ||
585 locked_queue_bytes > LFS_MAX_BYTES) {
586 lfs_flush(fs, SEGM_CKP, 0);
587 }
588 mutex_exit(&lfs_lock);
589 }
590
591 err2:
592 if (flags & CHECK_CKSUM)
593 free(datap, M_SEGMENT);
594 err1:
595 brelse(bp, BC_AGE);
596
597 /* XXX should we update the serial number even for bad psegs? */
598 if ((flags & CHECK_UPDATE) && offset > 0 && lfs_sb_getversion(fs) > 1)
599 lfs_sb_setserial(fs, nextserial);
600 return offset;
601 }
602
603 void
604 lfs_roll_forward(struct lfs *fs, struct mount *mp, struct lwp *l)
605 {
606 int flags, dirty;
607 daddr_t offset, oldoffset, lastgoodpseg;
608 int sn, curseg, do_rollforward;
609 struct proc *p;
610 kauth_cred_t cred;
611 SEGUSE *sup;
612 struct buf *bp;
613
614 p = l ? l->l_proc : NULL;
615 cred = p ? p->p_cred : NOCRED;
616
617 /*
618 * Roll forward.
619 *
620 * We don't roll forward for v1 filesystems, because
621 * of the danger that the clock was turned back between the last
622 * checkpoint and crash. This would roll forward garbage.
623 *
624 * v2 filesystems don't have this problem because they use a
625 * monotonically increasing serial number instead of a timestamp.
626 */
627 do_rollforward = (!(lfs_sb_getpflags(fs) & LFS_PF_CLEAN) &&
628 lfs_do_rfw && lfs_sb_getversion(fs) > 1 && p != NULL);
629 if (do_rollforward) {
630 u_int64_t nextserial;
631 /*
632 * Phase I: Find the address of the last good partial
633 * segment that was written after the checkpoint. Mark
634 * the segments in question dirty, so they won't be
635 * reallocated.
636 */
637 lastgoodpseg = oldoffset = offset = lfs_sb_getoffset(fs);
638 flags = 0x0;
639 DLOG((DLOG_RF, "LFS roll forward phase 1: start at offset 0x%"
640 PRIx64 "\n", offset));
641 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp);
642 if (!(sup->su_flags & SEGUSE_DIRTY))
643 lfs_sb_subnclean(fs, 1);
644 sup->su_flags |= SEGUSE_DIRTY;
645 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp);
646 nextserial = lfs_sb_getserial(fs) + 1;
647 while ((offset = check_segsum(fs, offset, nextserial,
648 cred, CHECK_CKSUM, &flags, l)) > 0) {
649 nextserial++;
650 if (lfs_sntod(fs, oldoffset) != lfs_sntod(fs, offset)) {
651 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, oldoffset),
652 bp);
653 if (!(sup->su_flags & SEGUSE_DIRTY))
654 lfs_sb_subnclean(fs, 1);
655 sup->su_flags |= SEGUSE_DIRTY;
656 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, oldoffset),
657 bp);
658 }
659
660 DLOG((DLOG_RF, "LFS roll forward phase 1: offset=0x%"
661 PRIx64 "\n", offset));
662 if (flags & SS_DIROP) {
663 DLOG((DLOG_RF, "lfs_mountfs: dirops at 0x%"
664 PRIx64 "\n", oldoffset));
665 if (!(flags & SS_CONT)) {
666 DLOG((DLOG_RF, "lfs_mountfs: dirops end "
667 "at 0x%" PRIx64 "\n", oldoffset));
668 }
669 }
670 if (!(flags & SS_CONT))
671 lastgoodpseg = offset;
672 oldoffset = offset;
673 }
674 if (flags & SS_CONT) {
675 DLOG((DLOG_RF, "LFS roll forward: warning: incomplete "
676 "dirops discarded\n"));
677 }
678 DLOG((DLOG_RF, "LFS roll forward phase 1: completed: "
679 "lastgoodpseg=0x%" PRIx64 "\n", lastgoodpseg));
680 oldoffset = lfs_sb_getoffset(fs);
681 if (lfs_sb_getoffset(fs) != lastgoodpseg) {
682 /* Don't overwrite what we're trying to preserve */
683 offset = lfs_sb_getoffset(fs);
684 lfs_sb_setoffset(fs, lastgoodpseg);
685 lfs_sb_setcurseg(fs, lfs_sntod(fs, lfs_dtosn(fs, lfs_sb_getoffset(fs))));
686 for (sn = curseg = lfs_dtosn(fs, lfs_sb_getcurseg(fs));;) {
687 sn = (sn + 1) % lfs_sb_getnseg(fs);
688 if (sn == curseg)
689 panic("lfs_mountfs: no clean segments");
690 LFS_SEGENTRY(sup, fs, sn, bp);
691 dirty = (sup->su_flags & SEGUSE_DIRTY);
692 brelse(bp, 0);
693 if (!dirty)
694 break;
695 }
696 lfs_sb_setnextseg(fs, lfs_sntod(fs, sn));
697
698 /*
699 * Phase II: Roll forward from the first superblock.
700 */
701 while (offset != lastgoodpseg) {
702 DLOG((DLOG_RF, "LFS roll forward phase 2: 0x%"
703 PRIx64 "\n", offset));
704 offset = check_segsum(fs, offset,
705 lfs_sb_getserial(fs) + 1, cred, CHECK_UPDATE,
706 NULL, l);
707 }
708
709 /*
710 * Finish: flush our changes to disk.
711 */
712 lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC);
713 DLOG((DLOG_RF, "lfs_mountfs: roll forward ",
714 "recovered %jd blocks\n",
715 (intmax_t)(lastgoodpseg - oldoffset)));
716 }
717 DLOG((DLOG_RF, "LFS roll forward complete\n"));
718 }
719 }
720