/*	$NetBSD: lfs_syscalls.c,v 1.96 2003/07/30 12:38:53 yamt Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Konrad E. Schroder <perseant (at) hhhh.org>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 1991, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)lfs_syscalls.c	8.10 (Berkeley) 5/14/95
 */
72
73 #include <sys/cdefs.h>
74 __KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.96 2003/07/30 12:38:53 yamt Exp $");
75
76 #ifndef LFS
77 # define LFS /* for prototypes in syscallargs.h */
78 #endif
79
80 #include <sys/param.h>
81 #include <sys/systm.h>
82 #include <sys/proc.h>
83 #include <sys/buf.h>
84 #include <sys/mount.h>
85 #include <sys/vnode.h>
86 #include <sys/malloc.h>
87 #include <sys/kernel.h>
88
89 #include <sys/sa.h>
90 #include <sys/syscallargs.h>
91
92 #include <ufs/ufs/inode.h>
93 #include <ufs/ufs/ufsmount.h>
94 #include <ufs/ufs/ufs_extern.h>
95
96 #include <ufs/lfs/lfs.h>
97 #include <ufs/lfs/lfs_extern.h>
98
/* Prototypes for helpers defined later in this file. */
struct buf *lfs_fakebuf(struct lfs *, struct vnode *, int, size_t, caddr_t);
int lfs_fasthashget(dev_t, ino_t, struct vnode **);

/* Cleaner debugging/statistics counters. */
int debug_cleaner = 0;
int clean_vnlocked = 0;	/* hash hits whose vnode was VXLOCKed */
int clean_inlocked = 0;	/* hash hits whose vnode could not be vref'ed */
int verbose_debug = 0;

/* PID of the userland cleaner; recorded on each lfs_bmapv() call. */
pid_t lfs_cleaner_pid = 0;

/*
 * Sentinel disk address passed in bi_daddr by the cleaner to request
 * that a block be written even if its address no longer matches.
 */
#define LFS_FORCE_WRITE UNASSIGNED
110
111 /*
112 * sys_lfs_markv:
113 *
114 * This will mark inodes and blocks dirty, so they are written into the log.
115 * It will block until all the blocks have been written. The segment create
116 * time passed in the block_info and inode_info structures is used to decide
117 * if the data is valid for each block (in case some process dirtied a block
118 * or inode that is being cleaned between the determination that a block is
119 * live and the lfs_markv call).
120 *
121 * 0 on success
122 * -1/errno is return on error.
123 */
124 #ifdef USE_64BIT_SYSCALLS
125 int
126 sys_lfs_markv(struct proc *p, void *v, register_t *retval)
127 {
128 struct sys_lfs_markv_args /* {
129 syscallarg(fsid_t *) fsidp;
130 syscallarg(struct block_info *) blkiov;
131 syscallarg(int) blkcnt;
132 } */ *uap = v;
133 BLOCK_INFO *blkiov;
134 int blkcnt, error;
135 fsid_t fsid;
136
137 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
138 return (error);
139
140 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
141 return (error);
142
143 blkcnt = SCARG(uap, blkcnt);
144 if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
145 return (EINVAL);
146
147 blkiov = malloc(blkcnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK);
148 if ((error = copyin(SCARG(uap, blkiov), blkiov,
149 blkcnt * sizeof(BLOCK_INFO))) != 0)
150 goto out;
151
152 if ((error = lfs_markv(p, &fsid, blkiov, blkcnt)) == 0)
153 copyout(blkiov, SCARG(uap, blkiov),
154 blkcnt * sizeof(BLOCK_INFO));
155 out:
156 free(blkiov, M_SEGMENT);
157 return error;
158 }
159 #else
160 int
161 sys_lfs_markv(struct lwp *l, void *v, register_t *retval)
162 {
163 struct sys_lfs_markv_args /* {
164 syscallarg(fsid_t *) fsidp;
165 syscallarg(struct block_info *) blkiov;
166 syscallarg(int) blkcnt;
167 } */ *uap = v;
168 BLOCK_INFO *blkiov;
169 BLOCK_INFO_15 *blkiov15;
170 int i, blkcnt, error;
171 fsid_t fsid;
172
173 if ((error = suser(l->l_proc->p_ucred, &l->l_proc->p_acflag)) != 0)
174 return (error);
175
176 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
177 return (error);
178
179 blkcnt = SCARG(uap, blkcnt);
180 if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
181 return (EINVAL);
182
183 blkiov = malloc(blkcnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK);
184 blkiov15 = malloc(blkcnt * sizeof(BLOCK_INFO_15), M_SEGMENT, M_WAITOK);
185 if ((error = copyin(SCARG(uap, blkiov), blkiov15,
186 blkcnt * sizeof(BLOCK_INFO_15))) != 0)
187 goto out;
188
189 for (i = 0; i < blkcnt; i++) {
190 blkiov[i].bi_inode = blkiov15[i].bi_inode;
191 blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
192 blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
193 blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
194 blkiov[i].bi_version = blkiov15[i].bi_version;
195 blkiov[i].bi_bp = blkiov15[i].bi_bp;
196 blkiov[i].bi_size = blkiov15[i].bi_size;
197 }
198
199 if ((error = lfs_markv(l->l_proc, &fsid, blkiov, blkcnt)) == 0) {
200 for (i = 0; i < blkcnt; i++) {
201 blkiov15[i].bi_inode = blkiov[i].bi_inode;
202 blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
203 blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
204 blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
205 blkiov15[i].bi_version = blkiov[i].bi_version;
206 blkiov15[i].bi_bp = blkiov[i].bi_bp;
207 blkiov15[i].bi_size = blkiov[i].bi_size;
208 }
209 copyout(blkiov15, SCARG(uap, blkiov),
210 blkcnt * sizeof(BLOCK_INFO_15));
211 }
212 out:
213 free(blkiov, M_SEGMENT);
214 free(blkiov15, M_SEGMENT);
215 return error;
216 }
217 #endif
218
219 #define LFS_MARKV_MAX_BLOCKS (LFS_MAX_BUFS)
220
/*
 * Do the work of the lfs_markv system call: re-dirty the listed blocks
 * and inodes so the segment writer copies them forward into a new
 * segment, then finish with a synchronous checkpoint write.  Holds the
 * segment lock for the entire operation.
 *
 * Returns 0 on success, EAGAIN if some blocks could not be cleaned and
 * the cleaner should retry, or an errno on hard failure.
 */
int
lfs_markv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
{
	BLOCK_INFO *blkp;
	IFILE *ifp;
	struct buf *bp;
	struct inode *ip = NULL;
	struct lfs *fs;
	struct mount *mntp;
	struct vnode *vp;
#ifdef DEBUG_LFS
	int vputc = 0, iwritten = 0;
#endif
	ino_t lastino;
	daddr_t b_daddr, v_daddr;
	int cnt, error;
	int do_again = 0;	/* nonzero => tell the cleaner to retry */
#ifdef CHECK_COPYIN
	int i;
#endif /* CHECK_COPYIN */
	int numrefed = 0;	/* vnode references we currently hold */
	ino_t maxino;		/* one past the largest valid inode number */
	size_t obsize;		/* expected on-disk size of the block */

	/* number of blocks/inodes that we have already bwrite'ed */
	int nblkwritten, ninowritten;

	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		return (ENOENT);

	fs = VFSTOUFS(mntp)->um_lfs;

	if (fs->lfs_ronly)
		return EROFS;

	/* Derive the valid inode range from the size of the ifile. */
	maxino = (fragstoblks(fs, fsbtofrags(fs, VTOI(fs->lfs_ivnode)->i_ffs1_blocks)) -
		      fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb;

	cnt = blkcnt;

	if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0)
		return (error);

	/*
	 * This seglock is just to prevent the fact that we might have to sleep
	 * from allowing the possibility that our blocks might become
	 * invalid.
	 *
	 * It is also important to note here that unless we specify SEGM_CKP,
	 * any Ifile blocks that we might be asked to clean will never get
	 * to the disk.
	 */
	lfs_seglock(fs, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);

	/* Mark blocks/inodes dirty.  */
	error = 0;

#ifdef DEBUG_LFS
	/* Run through and count the inodes */
	lastino = LFS_UNUSED_INUM;
	for (blkp = blkiov; cnt--; ++blkp) {
		if (lastino != blkp->bi_inode) {
			lastino = blkp->bi_inode;
			vputc++;
		}
	}
	cnt = blkcnt;
	printf("[%d/",vputc);
	iwritten = 0;
#endif /* DEBUG_LFS */
	/* these were inside the initialization for the for loop */
	v_daddr = LFS_UNUSED_DADDR;
	lastino = LFS_UNUSED_INUM;
	nblkwritten = ninowritten = 0;
	for (blkp = blkiov; cnt--; ++blkp)
	{
		if (blkp->bi_daddr == LFS_FORCE_WRITE)
			printf("lfs_markv: warning: force-writing ino %d "
			       "lbn %lld\n",
			       blkp->bi_inode, (long long)blkp->bi_lbn);
		/* Bounds-check incoming data, avoid panic for failed VGET */
		if (blkp->bi_inode <= 0 || blkp->bi_inode >= maxino) {
			error = EINVAL;
			goto err3;
		}
		/*
		 * Get the IFILE entry (only once) and see if the file still
		 * exists.
		 */
		if (lastino != blkp->bi_inode) {
			/*
			 * Finish the old file, if there was one.  The presence
			 * of a usable vnode in vp is signaled by a valid v_daddr.
			 */
			if (v_daddr != LFS_UNUSED_DADDR) {
#ifdef DEBUG_LFS
				if (ip->i_flag & (IN_MODIFIED|IN_CLEANING))
					iwritten++;
#endif
				lfs_vunref(vp);
				numrefed--;
			}

			/*
			 * Start a new file
			 */
			lastino = blkp->bi_inode;
			if (blkp->bi_inode == LFS_IFILE_INUM)
				v_daddr = fs->lfs_idaddr;
			else {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				/* XXX fix for force write */
				v_daddr = ifp->if_daddr;
				brelse(bp);
			}
			/* Don't force-write the ifile */
			if (blkp->bi_inode == LFS_IFILE_INUM
			    && blkp->bi_daddr == LFS_FORCE_WRITE)
			{
				continue;
			}
			/* A dead file is skipped unless force-written. */
			if (v_daddr == LFS_UNUSED_DADDR
			    && blkp->bi_daddr != LFS_FORCE_WRITE)
			{
				continue;
			}

			/* Get the vnode/inode. */
			error = lfs_fastvget(mntp, blkp->bi_inode, v_daddr,
					     &vp,
					     (blkp->bi_lbn == LFS_UNUSED_LBN
					      ? blkp->bi_bp
					      : NULL));

			if (!error) {
				numrefed++;
			}
			if (error) {
#ifdef DEBUG_LFS
				printf("lfs_markv: lfs_fastvget failed with %d (ino %d, segment %d)\n",
				       error, blkp->bi_inode,
				       dtosn(fs, blkp->bi_daddr));
#endif /* DEBUG_LFS */
				/*
				 * If we got EAGAIN, that means that the
				 * Inode was locked.  This is
				 * recoverable: just clean the rest of
				 * this segment, and let the cleaner try
				 * again with another.  (When the
				 * cleaner runs again, this segment will
				 * sort high on the list, since it is
				 * now almost entirely empty.)  But, we
				 * still set v_daddr = LFS_UNUSED_ADDR
				 * so as not to test this over and over
				 * again.
				 */
				if (error == EAGAIN) {
					error = 0;
					do_again++;
				}
#ifdef DIAGNOSTIC
				else if (error != ENOENT)
					panic("lfs_markv VFS_VGET FAILED");
#endif
				/* lastino = LFS_UNUSED_INUM; */
				v_daddr = LFS_UNUSED_DADDR;
				vp = NULL;
				ip = NULL;
				continue;
			}
			ip = VTOI(vp);
			ninowritten++;
		} else if (v_daddr == LFS_UNUSED_DADDR) {
			/*
			 * This can only happen if the vnode is dead (or
			 * in any case we can't get it...e.g., it is
			 * inlocked).  Keep going.
			 */
			continue;
		}

		/* Past this point we are guaranteed that vp, ip are valid. */

		/* If this BLOCK_INFO didn't contain a block, keep going. */
		if (blkp->bi_lbn == LFS_UNUSED_LBN) {
			/* XXX need to make sure that the inode gets written in this case */
			/* XXX but only write the inode if it's the right one */
			if (blkp->bi_inode != LFS_IFILE_INUM) {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				if (ifp->if_daddr == blkp->bi_daddr
				    || blkp->bi_daddr == LFS_FORCE_WRITE)
				{
					LFS_SET_UINO(ip, IN_CLEANING);
				}
				brelse(bp);
			}
			continue;
		}

		/*
		 * Verify the block still lives where the cleaner thinks it
		 * does; if not, it has been rewritten since and we skip it.
		 */
		b_daddr = 0;
		if (blkp->bi_daddr != LFS_FORCE_WRITE) {
			if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &b_daddr, NULL) ||
			    dbtofsb(fs, b_daddr) != blkp->bi_daddr)
			{
				if (dtosn(fs,dbtofsb(fs, b_daddr))
				    == dtosn(fs,blkp->bi_daddr))
				{
					printf("lfs_markv: wrong da same seg: %llx vs %llx\n",
					       (long long)blkp->bi_daddr, (long long)dbtofsb(fs, b_daddr));
				}
				do_again++;
				continue;
			}
		}

		/*
		 * Check block sizes.  The blocks being cleaned come from
		 * disk, so they should have the same size as their on-disk
		 * counterparts.
		 */
		if (blkp->bi_lbn >= 0)
			obsize = blksize(fs, ip, blkp->bi_lbn);
		else
			obsize = fs->lfs_bsize;
		/* Check for fragment size change */
		if (blkp->bi_lbn >= 0 && blkp->bi_lbn < NDADDR) {
			obsize = ip->i_lfs_fragsize[blkp->bi_lbn];
		}
		if (obsize != blkp->bi_size) {
			printf("lfs_markv: ino %d lbn %lld wrong size (%ld != %d), try again\n",
			       blkp->bi_inode, (long long)blkp->bi_lbn,
			       (long) obsize, blkp->bi_size);
			do_again++;
			continue;
		}

		/*
		 * If we get to here, then we are keeping the block.  If
		 * it is an indirect block, we want to actually put it
		 * in the buffer cache so that it can be updated in the
		 * finish_meta section.  If it's not, we need to
		 * allocate a fake buffer so that writeseg can perform
		 * the copyin and write the buffer.
		 */
		if (ip->i_number != LFS_IFILE_INUM && blkp->bi_lbn >= 0) {
			/* Data Block */
			/*
			 * NOTE(review): lfs_fakebuf() returns NULL if its
			 * copyin fails, and bp is dereferenced immediately
			 * below without a check -- confirm this cannot
			 * happen here, or add handling.
			 */
			bp = lfs_fakebuf(fs, vp, blkp->bi_lbn,
					 blkp->bi_size, blkp->bi_bp);
			/* Pretend we used bread() to get it */
			bp->b_blkno = fsbtodb(fs, blkp->bi_daddr);
		} else {
			/* Indirect block or ifile */
			if (blkp->bi_size != fs->lfs_bsize &&
			    ip->i_number != LFS_IFILE_INUM)
				panic("lfs_markv: partial indirect block?"
				      " size=%d\n", blkp->bi_size);
			bp = getblk(vp, blkp->bi_lbn, blkp->bi_size, 0, 0);
			if (!(bp->b_flags & (B_DONE|B_DELWRI))) { /* B_CACHE */
				/*
				 * The block in question was not found
				 * in the cache; i.e., the block that
				 * getblk() returned is empty.  So, we
				 * can (and should) copy in the
				 * contents, because we've already
				 * determined that this was the right
				 * version of this block on disk.
				 *
				 * And, it can't have changed underneath
				 * us, because we have the segment lock.
				 */
				error = copyin(blkp->bi_bp, bp->b_data, blkp->bi_size);
				if (error)
					goto err2;
			}
		}
		if ((error = lfs_bwrite_ext(bp, BW_CLEAN)) != 0)
			goto err2;

		nblkwritten++;
		/*
		 * XXX should account indirect blocks and ifile pages as well
		 */
		if (nblkwritten + lblkno(fs, ninowritten * sizeof (struct ufs1_dinode))
		    > LFS_MARKV_MAX_BLOCKS) {
#ifdef DEBUG_LFS
			printf("lfs_markv: writing %d blks %d inos\n",
			       nblkwritten, ninowritten);
#endif
			/* Flush what we have so far to bound memory use. */
			lfs_segwrite(mntp, SEGM_CLEAN);
			nblkwritten = ninowritten = 0;
		}
	}

	/*
	 * Finish the old file, if there was one
	 */
	if (v_daddr != LFS_UNUSED_DADDR) {
#ifdef DEBUG_LFS
		if (ip->i_flag & (IN_MODIFIED|IN_CLEANING))
			iwritten++;
#endif
		lfs_vunref(vp);
		numrefed--;
	}

#ifdef DEBUG_LFS
	printf("%d]",iwritten);
	if (numrefed != 0) {
		panic("lfs_markv: numrefed=%d", numrefed);
	}
#endif

#ifdef DEBUG_LFS
	printf("lfs_markv: writing %d blks %d inos (check point)\n",
	       nblkwritten, ninowritten);
#endif
	/*
	 * The last write has to be SEGM_SYNC, because of calling semantics.
	 * It also has to be SEGM_CKP, because otherwise we could write
	 * over the newly cleaned data contained in a checkpoint, and then
	 * we'd be unhappy at recovery time.
	 */
	lfs_segwrite(mntp, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);

	lfs_segunlock(fs);

	vfs_unbusy(mntp);
	if (error)
		return (error);
	else if (do_again)
		return EAGAIN;

	return 0;

err2:
	printf("lfs_markv err2\n");

	/*
	 * XXX we're here because copyin() failed.
	 * XXX it means that we can't trust the cleanerd.  too bad.
	 * XXX how can we recover from this?
	 */

err3:
	/*
	 * XXX should do segwrite here anyway?
	 */

	/* Drop the reference to the current file, if we hold one. */
	if (v_daddr != LFS_UNUSED_DADDR) {
		lfs_vunref(vp);
		--numrefed;
	}

	lfs_segunlock(fs);
	vfs_unbusy(mntp);
#ifdef DEBUG_LFS
	if (numrefed != 0) {
		panic("lfs_markv: numrefed=%d", numrefed);
	}
#endif

	return (error);
}
584
585 /*
586 * sys_lfs_bmapv:
587 *
588 * This will fill in the current disk address for arrays of blocks.
589 *
590 * 0 on success
591 * -1/errno is return on error.
592 */
593 #ifdef USE_64BIT_SYSCALLS
594 int
595 sys_lfs_bmapv(struct proc *p, void *v, register_t *retval)
596 {
597 struct sys_lfs_bmapv_args /* {
598 syscallarg(fsid_t *) fsidp;
599 syscallarg(struct block_info *) blkiov;
600 syscallarg(int) blkcnt;
601 } */ *uap = v;
602 BLOCK_INFO *blkiov;
603 int blkcnt, error;
604 fsid_t fsid;
605
606 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
607 return (error);
608
609 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
610 return (error);
611
612 blkcnt = SCARG(uap, blkcnt);
613 if ((u_int) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
614 return (EINVAL);
615 blkiov = malloc(blkcnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK);
616 if ((error = copyin(SCARG(uap, blkiov), blkiov,
617 blkcnt * sizeof(BLOCK_INFO))) != 0)
618 goto out;
619
620 if ((error = lfs_bmapv(p, &fsid, blkiov, blkcnt)) == 0)
621 copyout(blkiov, SCARG(uap, blkiov),
622 blkcnt * sizeof(BLOCK_INFO));
623 out:
624 free(blkiov, M_SEGMENT);
625 return error;
626 }
627 #else
628 int
629 sys_lfs_bmapv(struct lwp *l, void *v, register_t *retval)
630 {
631 struct sys_lfs_bmapv_args /* {
632 syscallarg(fsid_t *) fsidp;
633 syscallarg(struct block_info *) blkiov;
634 syscallarg(int) blkcnt;
635 } */ *uap = v;
636 struct proc *p = l->l_proc;
637 BLOCK_INFO *blkiov;
638 BLOCK_INFO_15 *blkiov15;
639 int i, blkcnt, error;
640 fsid_t fsid;
641
642 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
643 return (error);
644
645 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
646 return (error);
647
648 blkcnt = SCARG(uap, blkcnt);
649 if ((size_t) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
650 return (EINVAL);
651 blkiov = malloc(blkcnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK);
652 blkiov15 = malloc(blkcnt * sizeof(BLOCK_INFO_15), M_SEGMENT, M_WAITOK);
653 if ((error = copyin(SCARG(uap, blkiov), blkiov15,
654 blkcnt * sizeof(BLOCK_INFO_15))) != 0)
655 goto out;
656
657 for (i = 0; i < blkcnt; i++) {
658 blkiov[i].bi_inode = blkiov15[i].bi_inode;
659 blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
660 blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
661 blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
662 blkiov[i].bi_version = blkiov15[i].bi_version;
663 blkiov[i].bi_bp = blkiov15[i].bi_bp;
664 blkiov[i].bi_size = blkiov15[i].bi_size;
665 }
666
667 if ((error = lfs_bmapv(p, &fsid, blkiov, blkcnt)) == 0) {
668 for (i = 0; i < blkcnt; i++) {
669 blkiov15[i].bi_inode = blkiov[i].bi_inode;
670 blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
671 blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
672 blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
673 blkiov15[i].bi_version = blkiov[i].bi_version;
674 blkiov15[i].bi_bp = blkiov[i].bi_bp;
675 blkiov15[i].bi_size = blkiov[i].bi_size;
676 }
677 copyout(blkiov15, SCARG(uap, blkiov),
678 blkcnt * sizeof(BLOCK_INFO_15));
679 }
680 out:
681 free(blkiov, M_SEGMENT);
682 free(blkiov15, M_SEGMENT);
683 return error;
684 }
685 #endif
686
/*
 * Do the work of the lfs_bmapv system call: for each BLOCK_INFO entry,
 * fill in the block's current disk address (and size), or
 * LFS_UNUSED_DADDR if the file or block no longer exists.  Always
 * returns 0; per-entry lookup failures are reported through bi_daddr.
 */
int
lfs_bmapv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
{
	BLOCK_INFO *blkp;
	IFILE *ifp;
	struct buf *bp;
	struct inode *ip = NULL;
	struct lfs *fs;
	struct mount *mntp;
	struct ufsmount *ump;
	struct vnode *vp;
	ino_t lastino;
	daddr_t v_daddr;
	int cnt, error;
	int numrefed = 0;	/* vnode references we currently hold */

	/* Remember who the cleaner is (it is the only caller of bmapv). */
	lfs_cleaner_pid = p->p_pid;

	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		return (ENOENT);

	ump = VFSTOUFS(mntp);
	if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0)
		return (error);

	cnt = blkcnt;

	fs = VFSTOUFS(mntp)->um_lfs;

	error = 0;

	/* these were inside the initialization for the for loop */
	v_daddr = LFS_UNUSED_DADDR;
	lastino = LFS_UNUSED_INUM;
	for (blkp = blkiov; cnt--; ++blkp)
	{
		/*
		 * Get the IFILE entry (only once) and see if the file still
		 * exists.
		 */
		if (lastino != blkp->bi_inode) {
			/*
			 * Finish the old file, if there was one.  The presence
			 * of a usable vnode in vp is signaled by a valid
			 * v_daddr.
			 */
			if (v_daddr != LFS_UNUSED_DADDR) {
				lfs_vunref(vp);
				numrefed--;
			}

			/*
			 * Start a new file
			 */
			lastino = blkp->bi_inode;
			if (blkp->bi_inode == LFS_IFILE_INUM)
				v_daddr = fs->lfs_idaddr;
			else {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				v_daddr = ifp->if_daddr;
				brelse(bp);
			}
			if (v_daddr == LFS_UNUSED_DADDR) {
				/* The file has been deleted. */
				blkp->bi_daddr = LFS_UNUSED_DADDR;
				continue;
			}
			/*
			 * A regular call to VFS_VGET could deadlock
			 * here.  Instead, we try an unlocked access.
			 */
			vp = ufs_ihashlookup(ump->um_dev, blkp->bi_inode);
			if (vp != NULL && !(vp->v_flag & VXLOCK)) {
				ip = VTOI(vp);
				if (lfs_vref(vp)) {
					v_daddr = LFS_UNUSED_DADDR;
					continue;
				}
				numrefed++;
			} else {
				/* Not in the hash; fall back to VFS_VGET. */
				error = VFS_VGET(mntp, blkp->bi_inode, &vp);
				if (error) {
#ifdef DEBUG_LFS
					printf("lfs_bmapv: vget of ino %d failed with %d",blkp->bi_inode,error);
#endif
					v_daddr = LFS_UNUSED_DADDR;
					continue;
				} else {
					KASSERT(VOP_ISLOCKED(vp));
					VOP_UNLOCK(vp, 0);
					numrefed++;
				}
			}
			ip = VTOI(vp);
		} else if (v_daddr == LFS_UNUSED_DADDR) {
			/*
			 * This can only happen if the vnode is dead.
			 * Keep going.	Note that we DO NOT set the
			 * bi_addr to anything -- if we failed to get
			 * the vnode, for example, we want to assume
			 * conservatively that all of its blocks *are*
			 * located in the segment in question.
			 * lfs_markv will throw them out if we are
			 * wrong.
			 */
			/* blkp->bi_daddr = LFS_UNUSED_DADDR; */
			continue;
		}

		/* Past this point we are guaranteed that vp, ip are valid. */

		if (blkp->bi_lbn == LFS_UNUSED_LBN) {
			/*
			 * We just want the inode address, which is
			 * conveniently in v_daddr.
			 */
			blkp->bi_daddr = v_daddr;
		} else {
			daddr_t bi_daddr;

			/* XXX ondisk32 */
			error = VOP_BMAP(vp, blkp->bi_lbn, NULL,
					 &bi_daddr, NULL);
			if (error)
			{
				blkp->bi_daddr = LFS_UNUSED_DADDR;
				continue;
			}
			blkp->bi_daddr = dbtofsb(fs, bi_daddr);
			/* Fill in the block size, too */
			if (blkp->bi_lbn >= 0)
				blkp->bi_size = blksize(fs, ip, blkp->bi_lbn);
			else
				blkp->bi_size = fs->lfs_bsize;
		}
	}

	/*
	 * Finish the old file, if there was one.  The presence
	 * of a usable vnode in vp is signaled by a valid v_daddr.
	 */
	if (v_daddr != LFS_UNUSED_DADDR) {
		lfs_vunref(vp);
		numrefed--;
	}

#ifdef DEBUG_LFS
	if (numrefed != 0) {
		panic("lfs_bmapv: numrefed=%d", numrefed);
	}
#endif

	vfs_unbusy(mntp);

	return 0;
}
842
843 /*
844 * sys_lfs_segclean:
845 *
846 * Mark the segment clean.
847 *
848 * 0 on success
849 * -1/errno is return on error.
850 */
851 int
852 sys_lfs_segclean(struct lwp *l, void *v, register_t *retval)
853 {
854 struct sys_lfs_segclean_args /* {
855 syscallarg(fsid_t *) fsidp;
856 syscallarg(u_long) segment;
857 } */ *uap = v;
858 struct lfs *fs;
859 struct mount *mntp;
860 fsid_t fsid;
861 int error;
862 unsigned long segnum;
863 struct proc *p = l->l_proc;
864
865 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
866 return (error);
867
868 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
869 return (error);
870 if ((mntp = vfs_getvfs(&fsid)) == NULL)
871 return (ENOENT);
872
873 fs = VFSTOUFS(mntp)->um_lfs;
874 segnum = SCARG(uap, segment);
875
876 if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0)
877 return (error);
878
879 lfs_seglock(fs, SEGM_PROT);
880 error = lfs_do_segclean(fs, segnum);
881 lfs_segunlock(fs);
882 vfs_unbusy(mntp);
883 return error;
884 }
885
886 /*
887 * Actually mark the segment clean.
888 * Must be called with the segment lock held.
889 */
int
lfs_do_segclean(struct lfs *fs, unsigned long segnum)
{
	struct buf *bp;
	CLEANERINFO *cip;
	SEGUSE *sup;

	/* The segment currently being written can never be cleaned. */
	if (dtosn(fs, fs->lfs_curseg) == segnum) {
		return (EBUSY);
	}

	LFS_SEGENTRY(sup, fs, segnum, bp);
	/* Refuse if the segment still holds live data. */
	if (sup->su_nbytes) {
		printf("lfs_segclean: not cleaning segment %lu: %d live bytes\n",
		       segnum, sup->su_nbytes);
		brelse(bp);
		return (EBUSY);
	}
	/* Refuse if the segment is marked active. */
	if (sup->su_flags & SEGUSE_ACTIVE) {
		brelse(bp);
		return (EBUSY);
	}
	/* Already clean: nothing to do. */
	if (!(sup->su_flags & SEGUSE_DIRTY)) {
		brelse(bp);
		return (EALREADY);
	}

	/* Return the segment's space to the available pool... */
	fs->lfs_avail += segtod(fs, 1);
	/* ...minus any space reserved for a superblock copy... */
	if (sup->su_flags & SEGUSE_SUPERBLOCK)
		fs->lfs_avail -= btofsb(fs, LFS_SBPAD);
	/* ...and minus the label pad at the front of segment 0 (v2+). */
	if (fs->lfs_version > 1 && segnum == 0 &&
	    fs->lfs_start < btofsb(fs, LFS_LABELPAD))
		fs->lfs_avail -= btofsb(fs, LFS_LABELPAD) - fs->lfs_start;
	/* The segment's summary and inode blocks become free blocks. */
	fs->lfs_bfree += sup->su_nsums * btofsb(fs, fs->lfs_sumsize) +
		btofsb(fs, sup->su_ninos * fs->lfs_ibsize);
	fs->lfs_dmeta -= sup->su_nsums * btofsb(fs, fs->lfs_sumsize) +
		btofsb(fs, sup->su_ninos * fs->lfs_ibsize);
	if (fs->lfs_dmeta < 0)
		fs->lfs_dmeta = 0;
	sup->su_flags &= ~SEGUSE_DIRTY;
	LFS_WRITESEGENTRY(sup, fs, segnum, bp);

	/* Update the on-disk cleaner info to match, and wake waiters. */
	LFS_CLEANERINFO(cip, fs, bp);
	++cip->clean;
	--cip->dirty;
	fs->lfs_nclean = cip->clean;
	cip->bfree = fs->lfs_bfree;
	cip->avail = fs->lfs_avail - fs->lfs_ravail;
	(void) LFS_BWRITE_LOG(bp);
	wakeup(&fs->lfs_avail);

	return (0);
}
943
944 /*
945 * This will block until a segment in file system fsid is written. A timeout
946 * in milliseconds may be specified which will awake the cleaner automatically.
947 * An fsid of -1 means any file system, and a timeout of 0 means forever.
948 */
949 int
950 lfs_segwait(fsid_t *fsidp, struct timeval *tv)
951 {
952 struct mount *mntp;
953 void *addr;
954 u_long timeout;
955 int error, s;
956
957 if ((mntp = vfs_getvfs(fsidp)) == NULL)
958 addr = &lfs_allclean_wakeup;
959 else
960 addr = &VFSTOUFS(mntp)->um_lfs->lfs_nextseg;
961 /*
962 * XXX THIS COULD SLEEP FOREVER IF TIMEOUT IS {0,0}!
963 * XXX IS THAT WHAT IS INTENDED?
964 */
965 s = splclock();
966 timeradd(tv, &time, tv);
967 timeout = hzto(tv);
968 splx(s);
969 error = tsleep(addr, PCATCH | PUSER, "segment", timeout);
970 return (error == ERESTART ? EINTR : 0);
971 }
972
973 /*
974 * sys_lfs_segwait:
975 *
976 * System call wrapper around lfs_segwait().
977 *
978 * 0 on success
979 * 1 on timeout
980 * -1/errno is return on error.
981 */
982 int
983 sys_lfs_segwait(struct lwp *l, void *v, register_t *retval)
984 {
985 struct sys_lfs_segwait_args /* {
986 syscallarg(fsid_t *) fsidp;
987 syscallarg(struct timeval *) tv;
988 } */ *uap = v;
989 struct proc *p = l->l_proc;
990 struct timeval atv;
991 fsid_t fsid;
992 int error;
993
994 /* XXX need we be su to segwait? */
995 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) {
996 return (error);
997 }
998 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
999 return (error);
1000
1001 if (SCARG(uap, tv)) {
1002 error = copyin(SCARG(uap, tv), &atv, sizeof(struct timeval));
1003 if (error)
1004 return (error);
1005 if (itimerfix(&atv))
1006 return (EINVAL);
1007 } else /* NULL or invalid */
1008 atv.tv_sec = atv.tv_usec = 0;
1009 return lfs_segwait(&fsid, &atv);
1010 }
1011
1012 /*
1013 * VFS_VGET call specialized for the cleaner. The cleaner already knows the
1014 * daddr from the ifile, so don't look it up again. If the cleaner is
1015 * processing IINFO structures, it may have the ondisk inode already, so
1016 * don't go retrieving it again.
1017 *
1018 * we lfs_vref, and it is the caller's responsibility to lfs_vunref
1019 * when finished.
1020 */
1021 extern struct lock ufs_hashlock;
1022
1023 int
1024 lfs_fasthashget(dev_t dev, ino_t ino, struct vnode **vpp)
1025 {
1026
1027 /*
1028 * This is playing fast and loose. Someone may have the inode
1029 * locked, in which case they are going to be distinctly unhappy
1030 * if we trash something.
1031 */
1032 if ((*vpp = ufs_ihashlookup(dev, ino)) != NULL) {
1033 if ((*vpp)->v_flag & VXLOCK) {
1034 printf("lfs_fastvget: vnode VXLOCKed for ino %d\n",
1035 ino);
1036 clean_vnlocked++;
1037 #ifdef LFS_EAGAIN_FAIL
1038 return EAGAIN;
1039 #endif
1040 }
1041 if (lfs_vref(*vpp)) {
1042 clean_inlocked++;
1043 return EAGAIN;
1044 }
1045 } else
1046 *vpp = NULL;
1047
1048 return (0);
1049 }
1050
/*
 * Fetch the vnode/inode for (mp, ino), given that the cleaner already
 * knows the inode's disk address daddr.  If dinp is non-NULL it points
 * at a userland copy of the on-disk inode, which is copied in instead
 * of reading the inode block from disk.  On success *vpp holds an
 * unlocked, referenced vnode; the caller must lfs_vunref() it.
 */
int
lfs_fastvget(struct mount *mp, ino_t ino, daddr_t daddr, struct vnode **vpp, struct ufs1_dinode *dinp)
{
	struct inode *ip;
	struct ufs1_dinode *dip;
	struct vnode *vp;
	struct ufsmount *ump;
	dev_t dev;
	int error, retries;
	struct buf *bp;
	struct lfs *fs;

	ump = VFSTOUFS(mp);
	dev = ump->um_dev;
	fs = ump->um_lfs;

	/*
	 * Wait until the filesystem is fully mounted before allowing vget
	 * to complete.	 This prevents possible problems with roll-forward.
	 */
	while (fs->lfs_flags & LFS_NOTYET) {
		tsleep(&fs->lfs_flags, PRIBIO+1, "lfs_fnotyet", 0);
	}
	/*
	 * This is playing fast and loose.  Someone may have the inode
	 * locked, in which case they are going to be distinctly unhappy
	 * if we trash something.
	 */

	/* Fast path: the vnode may already be in the inode hash. */
	error = lfs_fasthashget(dev, ino, vpp);
	if (error != 0 || *vpp != NULL)
		return (error);

	if ((error = getnewvnode(VT_LFS, mp, lfs_vnodeop_p, &vp)) != 0) {
		*vpp = NULL;
		return (error);
	}

	/*
	 * Re-check the hash each time we fail to take ufs_hashlock, in
	 * case someone else created the vnode while we slept.
	 */
	do {
		error = lfs_fasthashget(dev, ino, vpp);
		if (error != 0 || *vpp != NULL) {
			ungetnewvnode(vp);
			return (error);
		}
	} while (lockmgr(&ufs_hashlock, LK_EXCLUSIVE|LK_SLEEPFAIL, 0));

	/* Allocate new vnode/inode. */
	lfs_vcreate(mp, ino, vp);

	/*
	 * Put it onto its hash chain and lock it so that other requests for
	 * this inode will block if they arrive while we are sleeping waiting
	 * for old data structures to be purged or for the contents of the
	 * disk portion of this inode to be read.
	 */
	ip = VTOI(vp);
	ufs_ihashins(ip);
	lockmgr(&ufs_hashlock, LK_RELEASE, 0);

	/*
	 * XXX
	 * This may not need to be here, logically it should go down with
	 * the i_devvp initialization.
	 * Ask Kirk.
	 */
	ip->i_lfs = fs;

	/* Read in the disk contents for the inode, copy into the inode. */
	if (dinp) {
		/* The cleaner supplied the on-disk inode directly. */
		error = copyin(dinp, ip->i_din.ffs1_din, sizeof (struct ufs1_dinode));
		if (error) {
			printf("lfs_fastvget: dinode copyin failed for ino %d\n", ino);
			ufs_ihashrem(ip);

			/* Unlock and discard unneeded inode. */
			lockmgr(&vp->v_lock, LK_RELEASE, &vp->v_interlock);
			lfs_vunref(vp);
			*vpp = NULL;
			return (error);
		}
		if (ip->i_number != ino)
			panic("lfs_fastvget: I was fed the wrong inode!");
	} else {
		/* Read the inode block from disk at the given address. */
		retries = 0;
	    again:
		error = bread(ump->um_devvp, fsbtodb(fs, daddr), fs->lfs_ibsize,
			      NOCRED, &bp);
		if (error) {
			printf("lfs_fastvget: bread failed with %d\n",error);
			/*
			 * The inode does not contain anything useful, so it
			 * would be misleading to leave it on its hash chain.
			 * Iput() will return it to the free list.
			 */
			ufs_ihashrem(ip);

			/* Unlock and discard unneeded inode. */
			lockmgr(&vp->v_lock, LK_RELEASE, &vp->v_interlock);
			lfs_vunref(vp);
			brelse(bp);
			*vpp = NULL;
			return (error);
		}
		dip = lfs_ifind(ump->um_lfs, ino, bp);
		if (dip == NULL) {
			/* Assume write has not completed yet; try again */
			bp->b_flags |= B_INVAL;
			brelse(bp);
			++retries;
			if (retries > LFS_IFIND_RETRIES)
				panic("lfs_fastvget: dinode not found");
			printf("lfs_fastvget: dinode not found, retrying...\n");
			goto again;
		}
		*ip->i_din.ffs1_din = *dip;
		brelse(bp);
	}
	lfs_vinit(mp, vp);

	*vpp = vp;

	/* Return the vnode unlocked but referenced. */
	KASSERT(VOP_ISLOCKED(vp));
	VOP_UNLOCK(vp, 0);

	return (0);
}
1177
1178 /*
1179 * Make up a "fake" cleaner buffer, copy the data from userland into it.
1180 */
1181 struct buf *
1182 lfs_fakebuf(struct lfs *fs, struct vnode *vp, int lbn, size_t size, caddr_t uaddr)
1183 {
1184 struct buf *bp;
1185 int error;
1186
1187 KASSERT(VTOI(vp)->i_number != LFS_IFILE_INUM);
1188
1189 bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, size, LFS_NB_CLEAN);
1190 error = copyin(uaddr, bp->b_data, size);
1191 if (error) {
1192 lfs_freebuf(fs, bp);
1193 return NULL;
1194 }
1195 KDASSERT(bp->b_iodone == lfs_callback);
1196
1197 #if 0
1198 bp->b_saveaddr = (caddr_t)fs;
1199 ++fs->lfs_iocount;
1200 #endif
1201 bp->b_bufsize = size;
1202 bp->b_bcount = size;
1203 return (bp);
1204 }
1205