lfs_rfw.c revision 1.37 1 /* $NetBSD: lfs_rfw.c,v 1.37 2025/09/17 04:37:47 perseant Exp $ */
2
3 /*-
4 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Konrad E. Schroder <perseant (at) hhhh.org>.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: lfs_rfw.c,v 1.37 2025/09/17 04:37:47 perseant Exp $");
34
35 #if defined(_KERNEL_OPT)
36 #include "opt_quota.h"
37 #endif
38
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/namei.h>
42 #include <sys/proc.h>
43 #include <sys/kernel.h>
44 #include <sys/vnode.h>
45 #include <sys/mount.h>
46 #include <sys/kthread.h>
47 #include <sys/buf.h>
48 #include <sys/device.h>
49 #include <sys/file.h>
50 #include <sys/disklabel.h>
51 #include <sys/ioctl.h>
52 #include <sys/errno.h>
53 #include <sys/malloc.h>
54 #include <sys/pool.h>
55 #include <sys/socket.h>
56 #include <sys/stat.h>
57 #include <sys/syslog.h>
58 #include <sys/sysctl.h>
59 #include <sys/conf.h>
60 #include <sys/kauth.h>
61
62 #include <miscfs/specfs/specdev.h>
63
64 #include <ufs/lfs/ulfs_quotacommon.h>
65 #include <ufs/lfs/ulfs_inode.h>
66 #include <ufs/lfs/ulfsmount.h>
67 #include <ufs/lfs/ulfs_extern.h>
68
69 #include <uvm/uvm_extern.h>
70
71 #include <ufs/lfs/lfs.h>
72 #include <ufs/lfs/lfs_accessors.h>
73 #include <ufs/lfs/lfs_kernel.h>
74 #include <ufs/lfs/lfs_extern.h>
75
76 #include <miscfs/genfs/genfs.h>
77 #include <miscfs/genfs/genfs_node.h>
78
79 /*
80 * Roll-forward code.
81 */
82 static daddr_t check_segsum(struct lfs *, daddr_t, u_int64_t,
83 kauth_cred_t, int, int *, struct lwp *);
84
85 static bool all_selector(void *, struct vnode *);
86 static void drop_vnode_pages(struct mount *, struct lwp *);
87 static int update_inogen(struct lfs *, daddr_t);
88 static void update_inoblk_copy_dinode(struct lfs *, union lfs_dinode *, const union lfs_dinode *);
89
90 extern int lfs_do_rfw;
91 int rblkcnt;
92 int lfs_rfw_max_psegs = 0;
93
94 /*
95 * Allocate a particular inode with a particular version number, freeing
96 * any previous versions of this inode that may have gone before.
97 * Used by the roll-forward code.
98 *
99 * XXX this function does not have appropriate locking to be used on a live fs;
100 * XXX but something similar could probably be used for an "undelete" call.
101 *
102 * Called with the Ifile inode locked.
103 */
104 int
105 lfs_rf_valloc(struct lfs *fs, ino_t ino, int vers, struct lwp *l,
106 struct vnode **vpp, union lfs_dinode *dip)
107 {
108 struct vattr va;
109 struct vnode *vp;
110 struct inode *ip;
111 int error;
112
113 KASSERT(ino > LFS_IFILE_INUM);
114 ASSERT_SEGLOCK(fs); /* XXX it doesn't, really */
115
116 /*
117 * First, just try a vget. If the version number is the one we want,
118 * we don't have to do anything else. If the version number is wrong,
119 * take appropriate action.
120 */
121 error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, LK_EXCLUSIVE, &vp);
122 if (error == 0) {
123 DLOG((DLOG_RF, "lfs_rf_valloc[1]: ino %d vp %p\n",
124 (int)ino, vp));
125
126 *vpp = vp;
127 ip = VTOI(vp);
128 DLOG((DLOG_RF, " ip->i_gen=%jd dip nlink %jd seeking"
129 " version %jd\n", (intmax_t)ip->i_gen,
130 (intmax_t)(dip == NULL ? -1
131 : lfs_dino_getnlink(fs, dip)), (intmax_t)vers));
132 if (ip->i_gen == vers) {
133 /*
134 * We have what we wanted already.
135 */
136 DLOG((DLOG_RF, " pre-existing\n"));
137 return 0;
138 } else if (ip->i_gen < vers && dip != NULL
139 && lfs_dino_getnlink(fs, dip) > 0) {
140 /*
141 * We have found a newer version. Truncate
142 * the old vnode to zero and re-initialize
143 * from the given dinode.
144 */
145 DLOG((DLOG_RF, " replace old version %jd\n",
146 (intmax_t)ip->i_gen));
147 lfs_truncate(vp, (off_t)0, 0, NOCRED);
148 ip->i_gen = vers;
149 vp->v_type = IFTOVT(lfs_dino_getmode(fs, dip));
150 update_inoblk_copy_dinode(fs, ip->i_din, dip);
151 LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE);
152 return 0;
153 } else {
154 /*
155 * Not the right version and nothing to
156 * initialize from. Don't recover this data.
157 */
158 DLOG((DLOG_RF, "ino %d: sought version %d, got %d\n",
159 (int)ino, (int)vers,
160 (int)lfs_dino_getgen(fs, ip->i_din)));
161 vput(vp);
162 *vpp = NULLVP;
163 return EEXIST;
164 }
165 }
166
167 /*
168 * No version of this inode was found in the cache.
169 * Make a new one from the dinode. We will add data blocks
170 * as they come in, so scrub any block addresses off of the
171 * inode and reset block counts to zero.
172 */
173 if (dip == NULL)
174 return ENOENT;
175
176 vattr_null(&va);
177 va.va_type = IFTOVT(lfs_dino_getmode(fs, dip));
178 va.va_mode = lfs_dino_getmode(fs, dip) & ALLPERMS;
179 va.va_fileid = ino;
180 va.va_gen = vers;
181 error = vcache_new(fs->lfs_ivnode->v_mount, NULL, &va, NOCRED, NULL,
182 &vp);
183 if (error)
184 return error;
185 error = vn_lock(vp, LK_EXCLUSIVE);
186 if (error)
187 goto err;
188
189 ip = VTOI(vp);
190 update_inoblk_copy_dinode(fs, ip->i_din, dip);
191
192 DLOG((DLOG_RF, "lfs_valloc[2] ino %d vp %p size=%lld effnblks=%d,"
193 " blocks=%d\n", (int)ino, vp, (long long)ip->i_size,
194 (int)ip->i_lfs_effnblks,
195 (int)lfs_dino_getblocks(fs, ip->i_din)));
196 *vpp = vp;
197 return 0;
198
199 err:
200 vrele(vp);
201 *vpp = NULLVP;
202 return error;
203 }
204
205 /*
206 * Load the appropriate indirect block, and change the appropriate pointer.
207 * Mark the block dirty. Do segment and avail accounting.
208 */
209 static int
210 update_meta(struct lfs *fs, ino_t ino, int vers, daddr_t lbn,
211 daddr_t ndaddr, size_t size, struct lwp *l)
212 {
213 int error;
214 struct vnode *vp;
215 struct inode *ip;
216 daddr_t odaddr;
217 struct indir a[ULFS_NIADDR];
218 int num;
219 struct buf *bp;
220 SEGUSE *sup;
221 u_int64_t newsize, loff;
222
223 KASSERT(lbn >= 0); /* no indirect blocks */
224 KASSERT(ino > LFS_IFILE_INUM);
225
226 DLOG((DLOG_RF, "update_meta: ino %d lbn %d size %d at 0x%jx\n",
227 (int)ino, (int)lbn, (int)size, (uintmax_t)ndaddr));
228
229 if ((error = lfs_rf_valloc(fs, ino, vers, l, &vp, NULL)) != 0)
230 return error;
231 ip = VTOI(vp);
232
233 /*
234 * If block already exists, note its new location
235 * but do not account it as new.
236 */
237 ulfs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL);
238 if (odaddr == UNASSIGNED) {
239 if ((error = lfs_balloc(vp, (lbn << lfs_sb_getbshift(fs)),
240 size, NOCRED, 0, &bp)) != 0) {
241 vput(vp);
242 return (error);
243 }
244 /* No need to write, the block is already on disk */
245 if (bp->b_oflags & BO_DELWRI) {
246 LFS_UNLOCK_BUF(bp);
247 /* Account recovery of the previous version */
248 lfs_sb_addavail(fs, lfs_btofsb(fs, bp->b_bcount));
249 }
250 brelse(bp, BC_INVAL);
251 DLOG((DLOG_RF, "balloc ip->i_lfs_effnblks = %d,"
252 " lfs_dino_getblocks(fs, ip->i_din) = %d\n",
253 (int)ip->i_lfs_effnblks,
254 (int)lfs_dino_getblocks(fs, ip->i_din)));
255 } else {
256 /* XXX fragextend? */
257 DLOG((DLOG_RF, "block exists, no balloc\n"));
258 }
259
260 /*
261 * Extend the file, if it is not large enough already.
262 * XXX this is not exactly right, we don't know how much of the
263 * XXX last block is actually used.
264 */
265 loff = lfs_lblktosize(fs, lbn);
266 if (loff >= (ULFS_NDADDR << lfs_sb_getbshift(fs))) {
267 /* No fragments */
268 newsize = loff + 1;
269 } else {
270 /* Subtract only a fragment to account for block size */
271 newsize = loff + size - lfs_fsbtob(fs, 1) + 1;
272 }
273
274 if (ip->i_size < newsize) {
275 DLOG((DLOG_RF, "ino %d size %d -> %d\n",
276 (int)ino, (int)ip->i_size, (int)newsize));
277 lfs_dino_setsize(fs, ip->i_din, newsize);
278 ip->i_size = newsize;
279 /*
280 * tell vm our new size for the case the inode won't
281 * appear later.
282 */
283 uvm_vnp_setsize(vp, newsize);
284 }
285
286 lfs_update_single(fs, NULL, vp, lbn, ndaddr, size);
287
288 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp);
289 sup->su_nbytes += size;
290 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, ndaddr), bp);
291
292 /* differences here should be due to UNWRITTEN indirect blocks. */
293 if (vp->v_type != VLNK) {
294 if (!(ip->i_lfs_effnblks >= lfs_dino_getblocks(fs, ip->i_din))
295 #if 0
296 || !(lfs_lblkno(fs, ip->i_size) > ULFS_NDADDR ||
297 ip->i_lfs_effnblks == lfs_dino_getblocks(fs, ip->i_din))
298 #endif /* 0 */
299 ) {
300 vprint("vnode", vp);
301 printf("effnblks=%jd dino_getblocks=%jd\n",
302 (intmax_t)ip->i_lfs_effnblks,
303 (intmax_t)lfs_dino_getblocks(fs, ip->i_din));
304 }
305 KASSERT(ip->i_lfs_effnblks >= lfs_dino_getblocks(fs, ip->i_din));
306 #if 0
307 KASSERT(lfs_lblkno(fs, ip->i_size) > ULFS_NDADDR ||
308 ip->i_lfs_effnblks == lfs_dino_getblocks(fs, ip->i_din));
309 #endif /* 0 */
310 }
311
312 #ifdef DEBUG
313 /* Now look again to make sure it worked */
314 ulfs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL);
315 if (LFS_DBTOFSB(fs, odaddr) != ndaddr)
316 DLOG((DLOG_RF, "update_meta: failed setting ino %jd lbn %jd"
317 " to %jd\n", (intmax_t)ino, (intmax_t)lbn, (intmax_t)ndaddr));
318 #endif /* DEBUG */
319 vput(vp);
320 return 0;
321 }
322
323 /*
324 * Copy some the fields of the dinode as needed by update_inoblk().
325 */
326 static void
327 update_inoblk_copy_dinode(struct lfs *fs,
328 union lfs_dinode *dstu, const union lfs_dinode *srcu)
329 {
330 if (fs->lfs_is64) {
331 struct lfs64_dinode *dst = &dstu->u_64;
332 const struct lfs64_dinode *src = &srcu->u_64;
333 unsigned i;
334
335 /*
336 * Copy everything but the block pointers and di_blocks.
337 * XXX what about di_extb?
338 */
339 dst->di_mode = src->di_mode;
340 dst->di_nlink = src->di_nlink;
341 dst->di_uid = src->di_uid;
342 dst->di_gid = src->di_gid;
343 dst->di_blksize = src->di_blksize;
344 dst->di_size = src->di_size;
345 dst->di_atime = src->di_atime;
346 dst->di_mtime = src->di_mtime;
347 dst->di_ctime = src->di_ctime;
348 dst->di_birthtime = src->di_birthtime;
349 dst->di_mtimensec = src->di_mtimensec;
350 dst->di_atimensec = src->di_atimensec;
351 dst->di_ctimensec = src->di_ctimensec;
352 dst->di_birthnsec = src->di_birthnsec;
353 dst->di_gen = src->di_gen;
354 dst->di_kernflags = src->di_kernflags;
355 dst->di_flags = src->di_flags;
356 dst->di_extsize = src->di_extsize;
357 dst->di_modrev = src->di_modrev;
358 dst->di_inumber = src->di_inumber;
359 for (i = 0; i < __arraycount(src->di_spare); i++) {
360 dst->di_spare[i] = src->di_spare[i];
361 }
362 /* Short symlinks store their data in di_db. */
363 if ((src->di_mode & LFS_IFMT) == LFS_IFLNK
364 && src->di_size < lfs_sb_getmaxsymlinklen(fs)) {
365 memcpy(dst->di_db, src->di_db, src->di_size);
366 }
367 } else {
368 struct lfs32_dinode *dst = &dstu->u_32;
369 const struct lfs32_dinode *src = &srcu->u_32;
370
371 /* Get mode, link count, size, and times */
372 memcpy(dst, src, offsetof(struct lfs32_dinode, di_db[0]));
373
374 /* Then the rest, except di_blocks */
375 dst->di_flags = src->di_flags;
376 dst->di_gen = src->di_gen;
377 dst->di_uid = src->di_uid;
378 dst->di_gid = src->di_gid;
379 dst->di_modrev = src->di_modrev;
380
381 /* Short symlinks store their data in di_db. */
382 if ((src->di_mode & LFS_IFMT) == LFS_IFLNK
383 && src->di_size < lfs_sb_getmaxsymlinklen(fs)) {
384 memcpy(dst->di_db, src->di_db, src->di_size);
385 }
386 }
387 }
388
389 static int
390 update_inoblk(struct lfs *fs, daddr_t offset, kauth_cred_t cred,
391 struct lwp *l)
392 {
393 struct vnode *devvp, *vp;
394 struct inode *ip;
395 union lfs_dinode *dip;
396 struct buf *dbp, *ibp;
397 int error;
398 daddr_t daddr;
399 IFILE *ifp;
400 SEGUSE *sup;
401 unsigned i, num;
402 uint32_t gen;
403 char *buf;
404
405 devvp = VTOI(fs->lfs_ivnode)->i_devvp;
406
407 /*
408 * Get the inode, update times and perms.
409 * DO NOT update disk blocks, we do that separately.
410 */
411 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs),
412 0, &dbp);
413 if (error) {
414 DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error));
415 return error;
416 }
417 buf = malloc(dbp->b_bcount, M_SEGMENT, M_WAITOK);
418 memcpy(buf, dbp->b_data, dbp->b_bcount);
419 brelse(dbp, BC_AGE);
420 num = LFS_INOPB(fs);
421 for (i = num; i-- > 0; ) {
422 dip = DINO_IN_BLOCK(fs, buf, i);
423 if (lfs_dino_getinumber(fs, dip) <= LFS_IFILE_INUM)
424 continue;
425
426 /* Check generation number */
427 LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp);
428 gen = lfs_if_getversion(fs, ifp);
429 brelse(ibp, 0);
430 if (lfs_dino_getgen(fs, dip) < gen) {
431 continue;
432 }
433
434 /*
435 * This inode is the newest generation. Load it.
436 */
437 error = lfs_rf_valloc(fs, lfs_dino_getinumber(fs, dip),
438 lfs_dino_getgen(fs, dip),
439 l, &vp, dip);
440 if (error) {
441 DLOG((DLOG_RF, "update_inoblk: lfs_rf_valloc"
442 " returned %d\n", error));
443 continue;
444 }
445 ip = VTOI(vp);
446 if (lfs_dino_getsize(fs, dip) != ip->i_size
447 && vp->v_type != VLNK) {
448 /* XXX What should we do sith symlinks? */
449 DLOG((DLOG_RF, " ino %jd size %jd -> %jd\n",
450 (intmax_t)lfs_dino_getinumber(fs, dip),
451 (intmax_t)ip->i_size,
452 (intmax_t)lfs_dino_getsize(fs, dip)));
453 lfs_truncate(vp, lfs_dino_getsize(fs, dip), 0,
454 NOCRED);
455 }
456 update_inoblk_copy_dinode(fs, ip->i_din, dip);
457
458 ip->i_flags = lfs_dino_getflags(fs, dip);
459 ip->i_gen = lfs_dino_getgen(fs, dip);
460 ip->i_uid = lfs_dino_getuid(fs, dip);
461 ip->i_gid = lfs_dino_getgid(fs, dip);
462
463 ip->i_mode = lfs_dino_getmode(fs, dip);
464 ip->i_nlink = lfs_dino_getnlink(fs, dip);
465 ip->i_size = lfs_dino_getsize(fs, dip);
466
467 LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE);
468
469 /* Re-initialize to get type right */
470 ulfs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p,
471 &vp);
472
473 /* Record change in location */
474 LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp);
475 daddr = lfs_if_getdaddr(fs, ifp);
476 lfs_if_setdaddr(fs, ifp, LFS_DBTOFSB(fs, dbp->b_blkno));
477 error = LFS_BWRITE_LOG(ibp); /* Ifile */
478 /* And do segment accounting */
479 if (lfs_dtosn(fs, daddr)
480 != lfs_dtosn(fs, LFS_DBTOFSB(fs, dbp->b_blkno))) {
481 if (!DADDR_IS_BAD(daddr)) {
482 LFS_SEGENTRY(sup, fs,
483 lfs_dtosn(fs, daddr), ibp);
484 sup->su_nbytes -= DINOSIZE(fs);
485 LFS_WRITESEGENTRY(sup, fs,
486 lfs_dtosn(fs, daddr),
487 ibp);
488 }
489 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs,
490 LFS_DBTOFSB(fs, dbp->b_blkno)),
491 ibp);
492 sup->su_nbytes += DINOSIZE(fs);
493 LFS_WRITESEGENTRY(sup, fs,
494 lfs_dtosn(fs, LFS_DBTOFSB(fs,
495 dbp->b_blkno)),
496 ibp);
497 }
498 vput(vp);
499 }
500 free(buf, M_SEGMENT);
501
502 return 0;
503 }
504
505 /*
506 * Note the highest generation number of each inode in the Ifile.
507 * This allows us to skip processing data for intermediate versions.
508 */
509 static int
510 update_inogen(struct lfs *fs, daddr_t offset)
511 {
512 struct vnode *devvp;
513 union lfs_dinode *dip;
514 struct buf *dbp, *ibp;
515 int error;
516 IFILE *ifp;
517 unsigned i, num;
518
519 devvp = VTOI(fs->lfs_ivnode)->i_devvp;
520
521 /* Read inode block */
522 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getibsize(fs),
523 0, &dbp);
524 if (error) {
525 DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error));
526 return error;
527 }
528
529 /* Check each inode against ifile entry */
530 num = LFS_INOPB(fs);
531 for (i = num; i-- > 0; ) {
532 dip = DINO_IN_BLOCK(fs, dbp->b_data, i);
533 if (lfs_dino_getinumber(fs, dip) == LFS_IFILE_INUM)
534 continue;
535
536 /* Update generation number */
537 LFS_IENTRY(ifp, fs, lfs_dino_getinumber(fs, dip), ibp);
538 if (lfs_if_getversion(fs, ifp) < lfs_dino_getgen(fs, dip))
539 lfs_if_setversion(fs, ifp, lfs_dino_getgen(fs, dip));
540 error = LFS_BWRITE_LOG(ibp); /* Ifile */
541 if (error)
542 break;
543 }
544 brelse(dbp, BC_AGE);
545
546 return error;
547 }
548
/*
 * Values for the "phase" argument of check_segsum().  lfs_roll_forward()
 * steps from CHECK_GEN to CHECK_DATA with ++phase, so these must stay
 * consecutive.
 */
#define	CHECK_CKSUM	1	/* Verify the checksums of a partial segment */
#define	CHECK_GEN	2	/* Record the highest generation numbers */
#define	CHECK_INODES	3	/* Read in and process the inodes */
#define	CHECK_DATA	4	/* Find and process the data blocks */
553
554 static daddr_t
555 check_segsum(struct lfs *fs, daddr_t offset, u_int64_t nextserial,
556 kauth_cred_t cred, int phase, int *pseg_flags, struct lwp *l)
557 {
558 struct vnode *devvp;
559 struct buf *bp, *dbp;
560 int error, ninos, i, j;
561 SEGSUM *ssp;
562 daddr_t prevoffset;
563 IINFO *iip;
564 FINFO *fip;
565 SEGUSE *sup;
566 size_t size;
567 uint32_t datasum, foundsum;
568 char *buf;
569
570 devvp = VTOI(fs->lfs_ivnode)->i_devvp;
571
572 /*
573 * If this is segment 0, skip the label.
574 * If the segment has a superblock and we're at the top
575 * of the segment, skip the superblock.
576 */
577 if (offset == lfs_sb_gets0addr(fs))
578 offset += lfs_btofsb(fs, LFS_LABELPAD);
579 if (lfs_sntod(fs, lfs_dtosn(fs, offset)) == offset) {
580 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp);
581 if (sup->su_flags & SEGUSE_SUPERBLOCK)
582 offset += lfs_btofsb(fs, LFS_SBPAD);
583 brelse(bp, 0);
584 }
585
586 /* Read in the segment summary */
587 error = bread(devvp, LFS_FSBTODB(fs, offset), lfs_sb_getsumsize(fs),
588 0, &bp);
589 if (error)
590 return -1;
591 buf = malloc(bp->b_bcount, M_SEGMENT, M_WAITOK);
592 memcpy(buf, bp->b_data, bp->b_bcount);
593 brelse(bp, BC_AGE);
594
595 ssp = (SEGSUM *)buf;
596
597 /*
598 * Phase I: Check summary checksum.
599 */
600 if (phase == CHECK_CKSUM) {
601 size_t sumstart;
602
603 sumstart = lfs_ss_getsumstart(fs);
604 if (lfs_ss_getsumsum(fs, ssp) !=
605 cksum((char *)ssp + sumstart,
606 lfs_sb_getsumsize(fs) - sumstart)) {
607 DLOG((DLOG_RF, "Sumsum error at 0x%" PRIx64 "\n",
608 offset));
609 offset = -1;
610 goto err;
611 }
612 if (lfs_ss_getnfinfo(fs, ssp) == 0 &&
613 lfs_ss_getninos(fs, ssp) == 0) {
614 DLOG((DLOG_RF, "Empty pseg at 0x%" PRIx64 "\n",
615 offset));
616 offset = -1;
617 goto err;
618 }
619 if (lfs_sb_getversion(fs) == 1) {
620 if (lfs_ss_getcreate(fs, ssp) < lfs_sb_gettstamp(fs)) {
621 DLOG((DLOG_RF, "Old data at 0x%" PRIx64 "\n", offset));
622 offset = -1;
623 goto err;
624 }
625 } else {
626 if (lfs_ss_getserial(fs, ssp) != nextserial) {
627 DLOG((DLOG_RF, "Serial number at 0x%jx given as 0x%jx,"
628 " expected 0x%jx\n", (intmax_t)offset,
629 (intmax_t)lfs_ss_getserial(fs, ssp),
630 (intmax_t)nextserial));
631 offset = -1;
632 goto err;
633 }
634 if (lfs_ss_getident(fs, ssp) != lfs_sb_getident(fs)) {
635 DLOG((DLOG_RF, "Incorrect fsid (0x%x vs 0x%x) at 0x%"
636 PRIx64 "\n", lfs_ss_getident(fs, ssp),
637 lfs_sb_getident(fs), offset));
638 offset = -1;
639 goto err;
640 }
641 }
642 }
643 if (pseg_flags)
644 *pseg_flags = lfs_ss_getflags(fs, ssp);
645 prevoffset = offset;
646 offset += lfs_btofsb(fs, lfs_sb_getsumsize(fs));
647
648 /* Handle individual blocks */
649 foundsum = 0;
650 ninos = howmany(lfs_ss_getninos(fs, ssp), LFS_INOPB(fs));
651 iip = SEGSUM_IINFOSTART(fs, buf);
652 fip = SEGSUM_FINFOBASE(fs, (SEGSUM *)buf);
653 for (i = 0; i < lfs_ss_getnfinfo(fs, ssp) || ninos; ++i) {
654 /* Inode block? */
655 if (ninos && lfs_ii_getblock(fs, iip) == offset) {
656 if (phase == CHECK_CKSUM) {
657 /* Read in the head and add to the buffer */
658 error = bread(devvp, LFS_FSBTODB(fs, offset),
659 lfs_sb_getbsize(fs), 0, &dbp);
660 if (error) {
661 offset = -1;
662 goto err;
663 }
664 foundsum = lfs_cksum_part(dbp->b_data,
665 sizeof(uint32_t), foundsum);
666 brelse(dbp, BC_AGE);
667 }
668 if (phase == CHECK_GEN) {
669 if ((error = update_inogen(fs, offset))
670 != 0) {
671 offset = -1;
672 goto err;
673 }
674 }
675 if (phase == CHECK_INODES) {
676 if ((error = update_inoblk(fs, offset, cred, l))
677 != 0) {
678 offset = -1;
679 goto err;
680 }
681 }
682 offset += lfs_btofsb(fs, lfs_sb_getibsize(fs));
683 iip = NEXTLOWER_IINFO(fs, iip);
684 --ninos;
685 --i; /* compensate for ++i in loop header */
686 continue;
687 }
688
689 /* File block */
690 size = lfs_sb_getbsize(fs);
691 for (j = 0; j < lfs_fi_getnblocks(fs, fip); ++j) {
692 if (j == lfs_fi_getnblocks(fs, fip) - 1)
693 size = lfs_fi_getlastlength(fs, fip);
694 if (phase == CHECK_CKSUM) {
695 error = bread(devvp, LFS_FSBTODB(fs, offset),
696 size, 0, &dbp);
697 if (error) {
698 offset = -1;
699 goto err;
700 }
701 foundsum = lfs_cksum_part(dbp->b_data,
702 sizeof(uint32_t), foundsum);
703 brelse(dbp, BC_AGE);
704 }
705 /* Account for and update any direct blocks */
706 if (phase == CHECK_DATA &&
707 lfs_fi_getino(fs, fip) > LFS_IFILE_INUM &&
708 lfs_fi_getblock(fs, fip, j) >= 0) {
709 update_meta(fs, lfs_fi_getino(fs, fip),
710 lfs_fi_getversion(fs, fip),
711 lfs_fi_getblock(fs, fip, j),
712 offset, size, l);
713 ++rblkcnt;
714 }
715 offset += lfs_btofsb(fs, size);
716 }
717
718 fip = NEXT_FINFO(fs, fip);
719 }
720
721 /* Checksum the array, compare */
722 if (phase == CHECK_CKSUM) {
723 datasum = lfs_ss_getdatasum(fs, ssp);
724 foundsum = lfs_cksum_fold(foundsum);
725 if (datasum != foundsum) {
726 DLOG((DLOG_RF, "Datasum error at 0x%" PRIx64
727 " (wanted %x got %x)\n",
728 offset, datasum, foundsum));
729 offset = -1;
730 goto err;
731 }
732 }
733
734 if (phase == CHECK_CKSUM)
735 lfs_sb_subavail(fs, offset - prevoffset);
736 else {
737 /* Don't clog the buffer queue */
738 mutex_enter(&lfs_lock);
739 if (locked_queue_count > LFS_MAX_BUFS ||
740 locked_queue_bytes > LFS_MAX_BYTES) {
741 lfs_flush(fs, SEGM_CKP, 0);
742 }
743 mutex_exit(&lfs_lock);
744 }
745
746 /*
747 * If we're at the end of the segment, move to the next.
748 * A partial segment needs space for a segment header (1 fsb)
749 * and a full block ("frag" fsb). Thus, adding "frag" fsb should
750 * still be within the current segment (whereas frag + 1 might
751 * be at the start of the next segment).
752 *
753 * This needs to match the definition of LFS_PARTIAL_FITS
754 * in lfs_segment.c.
755 */
756 if (lfs_dtosn(fs, offset + lfs_sb_getfrag(fs))
757 != lfs_dtosn(fs, offset)) {
758 if (lfs_dtosn(fs, offset) == lfs_dtosn(fs, lfs_ss_getnext(fs,
759 ssp))) {
760 printf("WHOA! at 0x%jx/seg %jd moving to 0x%jx/seg %jd\n",
761 (intmax_t)offset,
762 (intmax_t)lfs_dtosn(fs, offset),
763 (intmax_t)lfs_ss_getnext(fs, ssp),
764 (intmax_t)lfs_dtosn(fs, lfs_ss_getnext(fs, ssp)));
765 offset = -1;
766 goto err;
767 }
768 offset = lfs_ss_getnext(fs, ssp);
769 DLOG((DLOG_RF, "LFS roll forward: moving to offset 0x%" PRIx64
770 " -> segment %d\n", offset, lfs_dtosn(fs,offset)));
771 }
772
773 err:
774 free(buf, M_SEGMENT);
775
776 return offset;
777 }
778
779 void
780 lfs_roll_forward(struct lfs *fs, struct mount *mp, struct lwp *l)
781 {
782 int flags, dirty, phase;
783 daddr_t startoffset, offset, nextoffset, endpseg;
784 u_int64_t nextserial, startserial, endserial;
785 int sn, curseg;
786 struct proc *p;
787 kauth_cred_t cred;
788 SEGUSE *sup;
789 struct buf *bp;
790
791 p = l ? l->l_proc : NULL;
792 cred = p ? p->p_cred : NOCRED;
793
794 /*
795 * Roll forward.
796 *
797 * We don't roll forward for v1 filesystems, because
798 * of the danger that the clock was turned back between the last
799 * checkpoint and crash. This would roll forward garbage.
800 *
801 * v2 filesystems don't have this problem because they use a
802 * monotonically increasing serial number instead of a timestamp.
803 */
804 rblkcnt = 0;
805 if ((lfs_sb_getpflags(fs) & LFS_PF_CLEAN) || !lfs_do_rfw
806 || lfs_sb_getversion(fs) <= 1 || p == NULL)
807 return;
808
809 DLOG((DLOG_RF, "%s: begin roll forward at serial 0x%jx\n",
810 lfs_sb_getfsmnt(fs), (intmax_t)lfs_sb_getserial(fs)));
811 DEBUG_CHECK_FREELIST(fs);
812
813 /*
814 * Phase I: Find the address of the last good partial
815 * segment that was written after the checkpoint. Mark
816 * the segments in question dirty, so they won't be
817 * reallocated.
818 */
819 endpseg = startoffset = offset = lfs_sb_getoffset(fs);
820 flags = 0x0;
821 DLOG((DLOG_RF, "LFS roll forward phase 1: start at offset 0x%"
822 PRIx64 "\n", offset));
823 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp);
824 if (!(sup->su_flags & SEGUSE_DIRTY))
825 lfs_sb_subnclean(fs, 1);
826 sup->su_flags |= SEGUSE_DIRTY;
827 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp);
828
829 startserial = lfs_sb_getserial(fs);
830 endserial = nextserial = startserial + 1;
831 while ((nextoffset = check_segsum(fs, offset, nextserial,
832 cred, CHECK_CKSUM, &flags, l)) > 0) {
833 if (lfs_sntod(fs, offset) != lfs_sntod(fs, nextoffset)) {
834 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, offset),
835 bp);
836 if (!(sup->su_flags & SEGUSE_DIRTY))
837 lfs_sb_subnclean(fs, 1);
838 sup->su_flags |= SEGUSE_DIRTY;
839 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, offset), bp);
840 }
841
842 DLOG((DLOG_RF, "LFS roll forward phase 1: offset=0x%jx"
843 " serial=0x%jx\n", (intmax_t)nextoffset,
844 (intmax_t)nextserial));
845 if (flags & SS_DIROP) {
846 DLOG((DLOG_RF, "lfs_mountfs: dirops at 0x%"
847 PRIx64 "\n", offset));
848 if (!(flags & SS_CONT)) {
849 DLOG((DLOG_RF, "lfs_mountfs: dirops end "
850 "at 0x%" PRIx64 "\n", offset));
851 }
852 }
853 offset = nextoffset;
854 ++nextserial;
855
856 if (!(flags & SS_CONT)) {
857 endpseg = nextoffset;
858 endserial = nextserial;
859 }
860 if (lfs_rfw_max_psegs > 0
861 && nextserial > startserial + lfs_rfw_max_psegs)
862 break;
863 }
864 if (flags & SS_CONT) {
865 DLOG((DLOG_RF, "LFS roll forward: warning: incomplete "
866 "dirops discarded (0x%jx < 0x%jx)\n",
867 endpseg, nextoffset));
868 }
869 if (lfs_sb_getversion(fs) > 1)
870 lfs_sb_setserial(fs, endserial);
871 DLOG((DLOG_RF, "LFS roll forward phase 1: completed: "
872 "endpseg=0x%" PRIx64 "\n", endpseg));
873 offset = startoffset;
874 if (offset != endpseg) {
875 /* Don't overwrite what we're trying to preserve */
876 lfs_sb_setoffset(fs, endpseg);
877 lfs_sb_setcurseg(fs, lfs_sntod(fs, lfs_dtosn(fs, endpseg)));
878 for (sn = curseg = lfs_dtosn(fs, lfs_sb_getcurseg(fs));;) {
879 sn = (sn + 1) % lfs_sb_getnseg(fs);
880 /* XXX could we just fail to roll forward? */
881 if (sn == curseg)
882 panic("lfs_mountfs: no clean segments");
883 LFS_SEGENTRY(sup, fs, sn, bp);
884 dirty = (sup->su_flags & SEGUSE_DIRTY);
885 brelse(bp, 0);
886 if (!dirty)
887 break;
888 }
889 lfs_sb_setnextseg(fs, lfs_sntod(fs, sn));
890 /* Explicitly set this segment dirty */
891 LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, endpseg), bp);
892 sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE;
893 LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, endpseg), bp);
894
895
896 /*
897 * Phase II: Identify the highest generation of each
898 * inode.
899 *
900 * Phase III: Update inodes. We end up with the
901 * last version of each inode present, and can ignore
902 * data blocks belonging to previous versions.
903 *
904 * Phase IV: Roll forward, updating data blocks.
905 */
906 for (phase = CHECK_GEN; phase <= CHECK_DATA; ++phase) {
907 offset = startoffset;
908 nextserial = startserial + 1;
909 printf("LFS roll forward phase %d beginning\n", phase);
910 while (offset > 0 && offset != endpseg) {
911 if (phase == CHECK_DATA) {
912 DLOG((DLOG_RF, "LFS roll forward"
913 " phase %d: offset=0x%jx"
914 " serial=0x%jx\n",
915 phase, (intmax_t)offset,
916 (intmax_t)nextserial));
917 }
918 offset = check_segsum(fs, offset,
919 nextserial, cred,
920 phase, NULL, l);
921 ++nextserial;
922 DEBUG_CHECK_FREELIST(fs);
923 }
924 }
925
926 /*
927 * Finish: flush our changes to disk.
928 */
929 lfs_sb_setserial(fs, endserial);
930
931 lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC);
932 DLOG((DLOG_RF, "lfs_mountfs: roll forward "
933 "examined %jd blocks\n",
934 (intmax_t)(endpseg - startoffset)));
935 }
936
937 /* Get rid of our vnodes, except the ifile */
938 drop_vnode_pages(mp, l);
939 DLOG((DLOG_RF, "LFS roll forward complete\n"));
940 printf("%s: roll forward recovered %d data blocks\n",
941 lfs_sb_getfsmnt(fs), rblkcnt);
942
943 /*
944 * At this point we have no more changes to write to disk.
945 * Reset the "avail" count to match the segments as they
946 * appear on disk, and the clean segment count.
947 */
948 lfs_reset_avail(fs);
949 }
950
/*
 * Selector for vfs_vnode_iterator_next() that accepts every vnode.
 * Neither argument is examined.
 */
static bool
all_selector(void *cl, struct vnode *vp)
{
	(void)cl;
	(void)vp;

	return true;
}
956
957
958 /*
959 * Dump any pages from vnodes that may have been put on
960 * during truncation.
961 */
962 static void
963 drop_vnode_pages(struct mount *mp, struct lwp *l)
964 {
965 struct vnode_iterator *marker;
966 struct lfs *fs;
967 struct vnode *vp;
968
969 fs = VFSTOULFS(mp)->um_lfs;
970 vfs_vnode_iterator_init(mp, &marker);
971 while ((vp = vfs_vnode_iterator_next(marker,
972 all_selector, NULL)) != NULL) {
973 if (vp == fs->lfs_ivnode)
974 continue;
975 VOP_LOCK(vp, LK_EXCLUSIVE | LK_RETRY);
976 uvm_vnp_setsize(vp, 0);
977 uvm_vnp_setsize(vp, VTOI(vp)->i_size);
978 VOP_UNLOCK(vp);
979 vrele(vp);
980 }
981 vfs_vnode_iterator_destroy(marker);
982 }
983
984