/*	$NetBSD: lfs_syscalls.c,v 1.102 2005/02/26 22:32:20 perry Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Konrad E. Schroder <perseant@hhhh.org>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 1991, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)lfs_syscalls.c	8.10 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.102 2005/02/26 22:32:20 perry Exp $");

#ifndef LFS
# define LFS		/* for prototypes in syscallargs.h */
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/kernel.h>

#include <sys/sa.h>
#include <sys/syscallargs.h>

#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_extern.h>

struct buf *lfs_fakebuf(struct lfs *, struct vnode *, int, size_t, caddr_t);
int lfs_fasthashget(dev_t, ino_t, struct vnode **);

int debug_cleaner = 0;
int clean_vnlocked = 0;
int clean_inlocked = 0;
int verbose_debug = 0;

pid_t lfs_cleaner_pid = 0;

#define LFS_FORCE_WRITE UNASSIGNED
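/*
 * A bi_daddr of LFS_FORCE_WRITE asks lfs_markv() to write the block
 * even if its on-disk address no longer matches the cleaner's record
 * of it; the Ifile itself is never force-written (see the checks in
 * lfs_markv() below).
 */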

/*
 * sys_lfs_markv:
 *
 * This will mark inodes and blocks dirty, so they are written into the log.
 * It will block until all the blocks have been written.  The segment create
 * time passed in the block_info and inode_info structures is used to decide
 * if the data is valid for each block (in case some process dirtied a block
 * or inode that is being cleaned between the determination that a block is
 * live and the lfs_markv call).
 *
 *  0 on success
 * -1/errno is returned on error.
 */
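/*
 * Sketch of a userland caller (hypothetical variable names): a cleaner
 * that has gathered the live blocks of a segment into a BLOCK_INFO
 * array might call
 *
 *	if (lfs_markv(&fsid, bi, nblocks) < 0 && errno == EAGAIN)
 *		rescan the segment and try again;
 *
 * since EAGAIN only means that some inode was busy (see below).
 */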
#ifdef USE_64BIT_SYSCALLS
int
sys_lfs_markv(struct proc *p, void *v, register_t *retval)
{
	struct sys_lfs_markv_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */ *uap = v;
	BLOCK_INFO *blkiov;
	int blkcnt, error;
	fsid_t fsid;

	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
		return (EINVAL);

	blkiov = malloc(blkcnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK);
	if ((error = copyin(SCARG(uap, blkiov), blkiov,
	     blkcnt * sizeof(BLOCK_INFO))) != 0)
		goto out;

	if ((error = lfs_markv(p, &fsid, blkiov, blkcnt)) == 0)
		copyout(blkiov, SCARG(uap, blkiov),
			blkcnt * sizeof(BLOCK_INFO));
    out:
	free(blkiov, M_SEGMENT);
	return error;
}
#else
int
sys_lfs_markv(struct lwp *l, void *v, register_t *retval)
{
	struct sys_lfs_markv_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */ *uap = v;
	BLOCK_INFO *blkiov;
	BLOCK_INFO_15 *blkiov15;
	int i, blkcnt, error;
	fsid_t fsid;

	if ((error = suser(l->l_proc->p_ucred, &l->l_proc->p_acflag)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
		return (EINVAL);

	blkiov = malloc(blkcnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK);
	blkiov15 = malloc(blkcnt * sizeof(BLOCK_INFO_15), M_SEGMENT, M_WAITOK);
	if ((error = copyin(SCARG(uap, blkiov), blkiov15,
	     blkcnt * sizeof(BLOCK_INFO_15))) != 0)
		goto out;

	for (i = 0; i < blkcnt; i++) {
		blkiov[i].bi_inode = blkiov15[i].bi_inode;
		blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
		blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
		blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
		blkiov[i].bi_version = blkiov15[i].bi_version;
		blkiov[i].bi_bp = blkiov15[i].bi_bp;
		blkiov[i].bi_size = blkiov15[i].bi_size;
	}

	if ((error = lfs_markv(l->l_proc, &fsid, blkiov, blkcnt)) == 0) {
		for (i = 0; i < blkcnt; i++) {
			blkiov15[i].bi_inode = blkiov[i].bi_inode;
			blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
			blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
			blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
			blkiov15[i].bi_version = blkiov[i].bi_version;
			blkiov15[i].bi_bp = blkiov[i].bi_bp;
			blkiov15[i].bi_size = blkiov[i].bi_size;
		}
		copyout(blkiov15, SCARG(uap, blkiov),
			blkcnt * sizeof(BLOCK_INFO_15));
	}
    out:
	free(blkiov, M_SEGMENT);
	free(blkiov15, M_SEGMENT);
	return error;
}
#endif

#define LFS_MARKV_MAX_BLOCKS	(LFS_MAX_BUFS)
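/*
 * lfs_markv() flushes a partial segment with lfs_segwrite() each time
 * this many buffers (counting the inodes written so far as blocks)
 * have accumulated, so a single call cannot pin an unbounded number
 * of buffers.
 */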

int
lfs_markv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
{
	BLOCK_INFO *blkp;
	IFILE *ifp;
	struct buf *bp;
	struct inode *ip = NULL;
	struct lfs *fs;
	struct mount *mntp;
	struct vnode *vp;
#ifdef DEBUG_LFS
	int vputc = 0, iwritten = 0;
#endif
	ino_t lastino;
	daddr_t b_daddr, v_daddr;
	int cnt, error;
	int do_again = 0;
	int numrefed = 0;
	ino_t maxino;
	size_t obsize;

	/* number of blocks/inodes that we have already bwrite'ed */
	int nblkwritten, ninowritten;

	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		return (ENOENT);

	fs = VFSTOUFS(mntp)->um_lfs;

	if (fs->lfs_ronly)
		return EROFS;

	maxino = (fragstoblks(fs, fsbtofrags(fs, VTOI(fs->lfs_ivnode)->i_ffs1_blocks)) -
		  fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb;

	cnt = blkcnt;

	if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0)
		return (error);

	/*
	 * Hold the seglock so that our blocks cannot become invalid
	 * while we might have to sleep.
	 *
	 * It is also important to note here that unless we specify SEGM_CKP,
	 * any Ifile blocks that we might be asked to clean will never get
	 * to the disk.
	 */
	lfs_seglock(fs, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);

	/* Mark blocks/inodes dirty.  */
	error = 0;

#ifdef DEBUG_LFS
	/* Run through and count the inodes */
	lastino = LFS_UNUSED_INUM;
	for (blkp = blkiov; cnt--; ++blkp) {
		if (lastino != blkp->bi_inode) {
			lastino = blkp->bi_inode;
			vputc++;
		}
	}
	cnt = blkcnt;
	printf("[%d/", vputc);
	iwritten = 0;
#endif /* DEBUG_LFS */
	/* these were inside the initialization for the for loop */
	v_daddr = LFS_UNUSED_DADDR;
	lastino = LFS_UNUSED_INUM;
	nblkwritten = ninowritten = 0;
	for (blkp = blkiov; cnt--; ++blkp)
	{
		if (blkp->bi_daddr == LFS_FORCE_WRITE)
			printf("lfs_markv: warning: force-writing ino %d "
			       "lbn %lld\n",
			       blkp->bi_inode, (long long)blkp->bi_lbn);
		/* Bounds-check incoming data, avoid panic for failed VGET */
		if (blkp->bi_inode <= 0 || blkp->bi_inode >= maxino) {
			error = EINVAL;
			goto err3;
		}
		/*
		 * Get the IFILE entry (only once) and see if the file still
		 * exists.
		 */
		if (lastino != blkp->bi_inode) {
			/*
			 * Finish the old file, if there was one.  The presence
			 * of a usable vnode in vp is signaled by a valid
			 * v_daddr.
			 */
			if (v_daddr != LFS_UNUSED_DADDR) {
#ifdef DEBUG_LFS
				if (ip->i_flag & (IN_MODIFIED|IN_CLEANING))
					iwritten++;
#endif
				lfs_vunref(vp);
				numrefed--;
			}

			/*
			 * Start a new file
			 */
			lastino = blkp->bi_inode;
			if (blkp->bi_inode == LFS_IFILE_INUM)
				v_daddr = fs->lfs_idaddr;
			else {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				/* XXX fix for force write */
				v_daddr = ifp->if_daddr;
				brelse(bp);
			}
			/* Don't force-write the ifile */
			if (blkp->bi_inode == LFS_IFILE_INUM
			    && blkp->bi_daddr == LFS_FORCE_WRITE)
			{
				continue;
			}
			if (v_daddr == LFS_UNUSED_DADDR
			    && blkp->bi_daddr != LFS_FORCE_WRITE)
			{
				continue;
			}

			/* Get the vnode/inode. */
			error = lfs_fastvget(mntp, blkp->bi_inode, v_daddr,
					     &vp,
					     (blkp->bi_lbn == LFS_UNUSED_LBN
					      ? blkp->bi_bp
					      : NULL));

			if (!error) {
				numrefed++;
			}
			if (error) {
#ifdef DEBUG_LFS
				printf("lfs_markv: lfs_fastvget failed with %d (ino %d, segment %d)\n",
				       error, blkp->bi_inode,
				       dtosn(fs, blkp->bi_daddr));
#endif /* DEBUG_LFS */
				/*
				 * If we got EAGAIN, that means that the
				 * Inode was locked.  This is
				 * recoverable: just clean the rest of
				 * this segment, and let the cleaner try
				 * again with another.  (When the
				 * cleaner runs again, this segment will
				 * sort high on the list, since it is
				 * now almost entirely empty.)  But, we
				 * still set v_daddr = LFS_UNUSED_DADDR
				 * so as not to test this over and over
				 * again.
				 */
				if (error == EAGAIN) {
					error = 0;
					do_again++;
				}
#ifdef DIAGNOSTIC
				else if (error != ENOENT)
					panic("lfs_markv VFS_VGET FAILED");
#endif
				/* lastino = LFS_UNUSED_INUM; */
				v_daddr = LFS_UNUSED_DADDR;
				vp = NULL;
				ip = NULL;
				continue;
			}
			ip = VTOI(vp);
			ninowritten++;
		} else if (v_daddr == LFS_UNUSED_DADDR) {
			/*
			 * This can only happen if the vnode is dead (or
			 * in any case we can't get it... e.g., it is
			 * locked).  Keep going.
			 */
			continue;
		}

		/* Past this point we are guaranteed that vp, ip are valid. */

		/* If this BLOCK_INFO didn't contain a block, keep going. */
		if (blkp->bi_lbn == LFS_UNUSED_LBN) {
			/* XXX need to make sure that the inode gets written in this case */
			/* XXX but only write the inode if it's the right one */
			if (blkp->bi_inode != LFS_IFILE_INUM) {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				if (ifp->if_daddr == blkp->bi_daddr
				    || blkp->bi_daddr == LFS_FORCE_WRITE)
				{
					LFS_SET_UINO(ip, IN_CLEANING);
				}
				brelse(bp);
			}
			continue;
		}

		b_daddr = 0;
		if (blkp->bi_daddr != LFS_FORCE_WRITE) {
			if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &b_daddr, NULL) ||
			    dbtofsb(fs, b_daddr) != blkp->bi_daddr)
			{
				if (dtosn(fs, dbtofsb(fs, b_daddr))
				    == dtosn(fs, blkp->bi_daddr))
				{
					printf("lfs_markv: wrong daddr (same seg): %llx vs %llx\n",
					       (long long)blkp->bi_daddr, (long long)dbtofsb(fs, b_daddr));
				}
				do_again++;
				continue;
			}
		}

		/*
		 * Check block sizes.  The blocks being cleaned come from
		 * disk, so they should have the same size as their on-disk
		 * counterparts.
		 */
		if (blkp->bi_lbn >= 0)
			obsize = blksize(fs, ip, blkp->bi_lbn);
		else
			obsize = fs->lfs_bsize;
		/* Check for fragment size change */
		if (blkp->bi_lbn >= 0 && blkp->bi_lbn < NDADDR) {
			obsize = ip->i_lfs_fragsize[blkp->bi_lbn];
		}
		if (obsize != blkp->bi_size) {
			printf("lfs_markv: ino %d lbn %lld wrong size (%ld != %d), try again\n",
			       blkp->bi_inode, (long long)blkp->bi_lbn,
			       (long) obsize, blkp->bi_size);
			do_again++;
			continue;
		}

		/*
		 * If we get to here, then we are keeping the block.  If
		 * it is an indirect block, we want to actually put it
		 * in the buffer cache so that it can be updated in the
		 * finish_meta section.  If it's not, we need to
		 * allocate a fake buffer so that writeseg can perform
		 * the copyin and write the buffer.
		 */
		if (ip->i_number != LFS_IFILE_INUM && blkp->bi_lbn >= 0) {
			/* Data Block */
			bp = lfs_fakebuf(fs, vp, blkp->bi_lbn,
					 blkp->bi_size, blkp->bi_bp);
			/* Pretend we used bread() to get it */
			bp->b_blkno = fsbtodb(fs, blkp->bi_daddr);
		} else {
			/* Indirect block or ifile */
			if (blkp->bi_size != fs->lfs_bsize &&
			    ip->i_number != LFS_IFILE_INUM)
				panic("lfs_markv: partial indirect block?"
				      " size=%d\n", blkp->bi_size);
			bp = getblk(vp, blkp->bi_lbn, blkp->bi_size, 0, 0);
			if (!(bp->b_flags & (B_DONE|B_DELWRI))) { /* B_CACHE */
				/*
				 * The block in question was not found
				 * in the cache; i.e., the block that
				 * getblk() returned is empty.  So, we
				 * can (and should) copy in the
				 * contents, because we've already
				 * determined that this was the right
				 * version of this block on disk.
				 *
				 * And, it can't have changed underneath
				 * us, because we have the segment lock.
				 */
				error = copyin(blkp->bi_bp, bp->b_data, blkp->bi_size);
				if (error)
					goto err2;
			}
		}
		if ((error = lfs_bwrite_ext(bp, BW_CLEAN)) != 0)
			goto err2;

		nblkwritten++;
		/*
		 * XXX should account indirect blocks and ifile pages as well
		 */
		if (nblkwritten + lblkno(fs, ninowritten * sizeof (struct ufs1_dinode))
		    > LFS_MARKV_MAX_BLOCKS) {
#ifdef DEBUG_LFS
			printf("lfs_markv: writing %d blks %d inos\n",
			       nblkwritten, ninowritten);
#endif
			lfs_segwrite(mntp, SEGM_CLEAN);
			nblkwritten = ninowritten = 0;
		}
	}

	/*
	 * Finish the old file, if there was one
	 */
	if (v_daddr != LFS_UNUSED_DADDR) {
#ifdef DEBUG_LFS
		if (ip->i_flag & (IN_MODIFIED|IN_CLEANING))
			iwritten++;
#endif
		lfs_vunref(vp);
		numrefed--;
	}

#ifdef DEBUG_LFS
	printf("%d]", iwritten);
	if (numrefed != 0) {
		panic("lfs_markv: numrefed=%d", numrefed);
	}
#endif

#ifdef DEBUG_LFS
	printf("lfs_markv: writing %d blks %d inos (check point)\n",
	       nblkwritten, ninowritten);
#endif
	/*
	 * The last write has to be SEGM_SYNC, because of calling semantics.
	 * It also has to be SEGM_CKP, because otherwise we could write
	 * over the newly cleaned data contained in a checkpoint, and then
	 * we'd be unhappy at recovery time.
	 */
	lfs_segwrite(mntp, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);

	lfs_segunlock(fs);

	vfs_unbusy(mntp);
	if (error)
		return (error);
	else if (do_again)
		return EAGAIN;

	return 0;

    err2:
	printf("lfs_markv err2\n");

	/*
	 * XXX we're here because copyin() failed.
	 * XXX it means that we can't trust the cleanerd.  too bad.
	 * XXX how can we recover from this?
	 */

    err3:
	/*
	 * XXX should do segwrite here anyway?
	 */

	if (v_daddr != LFS_UNUSED_DADDR) {
		lfs_vunref(vp);
		--numrefed;
	}

	lfs_segunlock(fs);
	vfs_unbusy(mntp);
#ifdef DEBUG_LFS
	if (numrefed != 0) {
		panic("lfs_markv: numrefed=%d", numrefed);
	}
#endif

	return (error);
}

/*
 * sys_lfs_bmapv:
 *
 * This will fill in the current disk address for arrays of blocks.
 *
 *  0 on success
 * -1/errno is returned on error.
 */
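/*
 * Sketch of a userland caller (hypothetical variable names): before
 * cleaning a segment, the cleaner asks where each candidate block now
 * lives and keeps only the ones still in that segment:
 *
 *	if (lfs_bmapv(&fsid, bi, nblocks) == 0)
 *		for (i = 0; i < nblocks; i++)
 *			if (bi[i].bi_daddr still lies in the segment)
 *				pass bi[i] on to lfs_markv();
 */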
#ifdef USE_64BIT_SYSCALLS
int
sys_lfs_bmapv(struct proc *p, void *v, register_t *retval)
{
	struct sys_lfs_bmapv_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */ *uap = v;
	BLOCK_INFO *blkiov;
	int blkcnt, error;
	fsid_t fsid;

	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
		return (EINVAL);
	blkiov = malloc(blkcnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK);
	if ((error = copyin(SCARG(uap, blkiov), blkiov,
	     blkcnt * sizeof(BLOCK_INFO))) != 0)
		goto out;

	if ((error = lfs_bmapv(p, &fsid, blkiov, blkcnt)) == 0)
		copyout(blkiov, SCARG(uap, blkiov),
			blkcnt * sizeof(BLOCK_INFO));
    out:
	free(blkiov, M_SEGMENT);
	return error;
}
#else
int
sys_lfs_bmapv(struct lwp *l, void *v, register_t *retval)
{
	struct sys_lfs_bmapv_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	BLOCK_INFO *blkiov;
	BLOCK_INFO_15 *blkiov15;
	int i, blkcnt, error;
	fsid_t fsid;

	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	blkcnt = SCARG(uap, blkcnt);
	if ((size_t) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
		return (EINVAL);
	blkiov = malloc(blkcnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK);
	blkiov15 = malloc(blkcnt * sizeof(BLOCK_INFO_15), M_SEGMENT, M_WAITOK);
	if ((error = copyin(SCARG(uap, blkiov), blkiov15,
	     blkcnt * sizeof(BLOCK_INFO_15))) != 0)
		goto out;

	for (i = 0; i < blkcnt; i++) {
		blkiov[i].bi_inode = blkiov15[i].bi_inode;
		blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
		blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
		blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
		blkiov[i].bi_version = blkiov15[i].bi_version;
		blkiov[i].bi_bp = blkiov15[i].bi_bp;
		blkiov[i].bi_size = blkiov15[i].bi_size;
	}

	if ((error = lfs_bmapv(p, &fsid, blkiov, blkcnt)) == 0) {
		for (i = 0; i < blkcnt; i++) {
			blkiov15[i].bi_inode = blkiov[i].bi_inode;
			blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
			blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
			blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
			blkiov15[i].bi_version = blkiov[i].bi_version;
			blkiov15[i].bi_bp = blkiov[i].bi_bp;
			blkiov15[i].bi_size = blkiov[i].bi_size;
		}
		copyout(blkiov15, SCARG(uap, blkiov),
			blkcnt * sizeof(BLOCK_INFO_15));
	}
    out:
	free(blkiov, M_SEGMENT);
	free(blkiov15, M_SEGMENT);
	return error;
}
#endif

int
lfs_bmapv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
{
	BLOCK_INFO *blkp;
	IFILE *ifp;
	struct buf *bp;
	struct inode *ip = NULL;
	struct lfs *fs;
	struct mount *mntp;
	struct ufsmount *ump;
	struct vnode *vp;
	ino_t lastino;
	daddr_t v_daddr;
	int cnt, error;
	int numrefed = 0;

	lfs_cleaner_pid = p->p_pid;

	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		return (ENOENT);

	ump = VFSTOUFS(mntp);
	if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0)
		return (error);

	cnt = blkcnt;

	fs = VFSTOUFS(mntp)->um_lfs;

	error = 0;

	/* these were inside the initialization for the for loop */
	v_daddr = LFS_UNUSED_DADDR;
	lastino = LFS_UNUSED_INUM;
	for (blkp = blkiov; cnt--; ++blkp)
	{
		/*
		 * Get the IFILE entry (only once) and see if the file still
		 * exists.
		 */
		if (lastino != blkp->bi_inode) {
			/*
			 * Finish the old file, if there was one.  The presence
			 * of a usable vnode in vp is signaled by a valid
			 * v_daddr.
			 */
			if (v_daddr != LFS_UNUSED_DADDR) {
				lfs_vunref(vp);
				numrefed--;
			}

			/*
			 * Start a new file
			 */
			lastino = blkp->bi_inode;
			if (blkp->bi_inode == LFS_IFILE_INUM)
				v_daddr = fs->lfs_idaddr;
			else {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				v_daddr = ifp->if_daddr;
				brelse(bp);
			}
			if (v_daddr == LFS_UNUSED_DADDR) {
				blkp->bi_daddr = LFS_UNUSED_DADDR;
				continue;
			}
			/*
			 * A regular call to VFS_VGET could deadlock
			 * here.  Instead, we try an unlocked access.
			 */
			vp = ufs_ihashlookup(ump->um_dev, blkp->bi_inode);
			if (vp != NULL && !(vp->v_flag & VXLOCK)) {
				ip = VTOI(vp);
				if (lfs_vref(vp)) {
					v_daddr = LFS_UNUSED_DADDR;
					continue;
				}
				numrefed++;
			} else {
				/*
				 * Don't VFS_VGET if we're being unmounted,
				 * since we hold vfs_busy().
				 */
				if (mntp->mnt_iflag & IMNT_UNMOUNT) {
					v_daddr = LFS_UNUSED_DADDR;
					continue;
				}
				error = VFS_VGET(mntp, blkp->bi_inode, &vp);
				if (error) {
#ifdef DEBUG_LFS
					printf("lfs_bmapv: vget of ino %d failed with %d",
					       blkp->bi_inode, error);
#endif
					v_daddr = LFS_UNUSED_DADDR;
					continue;
				} else {
					KASSERT(VOP_ISLOCKED(vp));
					VOP_UNLOCK(vp, 0);
					numrefed++;
				}
			}
			ip = VTOI(vp);
		} else if (v_daddr == LFS_UNUSED_DADDR) {
			/*
			 * This can only happen if the vnode is dead.
			 * Keep going.  Note that we DO NOT set the
			 * bi_daddr to anything -- if we failed to get
			 * the vnode, for example, we want to assume
			 * conservatively that all of its blocks *are*
			 * located in the segment in question.
			 * lfs_markv will throw them out if we are
			 * wrong.
			 */
			/* blkp->bi_daddr = LFS_UNUSED_DADDR; */
			continue;
		}

		/* Past this point we are guaranteed that vp, ip are valid. */

		if (blkp->bi_lbn == LFS_UNUSED_LBN) {
			/*
			 * We just want the inode address, which is
			 * conveniently in v_daddr.
			 */
			blkp->bi_daddr = v_daddr;
		} else {
			daddr_t bi_daddr;

			/* XXX ondisk32 */
			error = VOP_BMAP(vp, blkp->bi_lbn, NULL,
					 &bi_daddr, NULL);
			if (error)
			{
				blkp->bi_daddr = LFS_UNUSED_DADDR;
				continue;
			}
			blkp->bi_daddr = dbtofsb(fs, bi_daddr);
			/* Fill in the block size, too */
			if (blkp->bi_lbn >= 0)
				blkp->bi_size = blksize(fs, ip, blkp->bi_lbn);
			else
				blkp->bi_size = fs->lfs_bsize;
		}
	}

	/*
	 * Finish the old file, if there was one.  The presence
	 * of a usable vnode in vp is signaled by a valid v_daddr.
	 */
	if (v_daddr != LFS_UNUSED_DADDR) {
		lfs_vunref(vp);
		numrefed--;
	}

#ifdef DEBUG_LFS
	if (numrefed != 0) {
		panic("lfs_bmapv: numrefed=%d", numrefed);
	}
#endif

	vfs_unbusy(mntp);

	return 0;
}

/*
 * sys_lfs_segclean:
 *
 * Mark the segment clean.
 *
 *  0 on success
 * -1/errno is returned on error.
 */
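/*
 * A segment can be marked clean only if it is dirty, is not the
 * current segment, is not ACTIVE, and holds no live bytes; otherwise
 * lfs_do_segclean() below returns EBUSY (or EALREADY if the segment
 * is already clean).
 */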
int
sys_lfs_segclean(struct lwp *l, void *v, register_t *retval)
{
	struct sys_lfs_segclean_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(u_long) segment;
	} */ *uap = v;
	struct lfs *fs;
	struct mount *mntp;
	fsid_t fsid;
	int error;
	unsigned long segnum;
	struct proc *p = l->l_proc;

	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);
	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);

	fs = VFSTOUFS(mntp)->um_lfs;
	segnum = SCARG(uap, segment);

	if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0)
		return (error);

	lfs_seglock(fs, SEGM_PROT);
	error = lfs_do_segclean(fs, segnum);
	lfs_segunlock(fs);
	vfs_unbusy(mntp);
	return error;
}

/*
 * Actually mark the segment clean.
 * Must be called with the segment lock held.
 */
int
lfs_do_segclean(struct lfs *fs, unsigned long segnum)
{
	struct buf *bp;
	CLEANERINFO *cip;
	SEGUSE *sup;

	if (dtosn(fs, fs->lfs_curseg) == segnum) {
		return (EBUSY);
	}

	LFS_SEGENTRY(sup, fs, segnum, bp);
	if (sup->su_nbytes) {
		printf("lfs_segclean: not cleaning segment %lu: %d live bytes\n",
		       segnum, sup->su_nbytes);
		brelse(bp);
		return (EBUSY);
	}
	if (sup->su_flags & SEGUSE_ACTIVE) {
		brelse(bp);
		return (EBUSY);
	}
	if (!(sup->su_flags & SEGUSE_DIRTY)) {
		brelse(bp);
		return (EALREADY);
	}

	fs->lfs_avail += segtod(fs, 1);
	if (sup->su_flags & SEGUSE_SUPERBLOCK)
		fs->lfs_avail -= btofsb(fs, LFS_SBPAD);
	if (fs->lfs_version > 1 && segnum == 0 &&
	    fs->lfs_start < btofsb(fs, LFS_LABELPAD))
		fs->lfs_avail -= btofsb(fs, LFS_LABELPAD) - fs->lfs_start;
	fs->lfs_bfree += sup->su_nsums * btofsb(fs, fs->lfs_sumsize) +
		btofsb(fs, sup->su_ninos * fs->lfs_ibsize);
	fs->lfs_dmeta -= sup->su_nsums * btofsb(fs, fs->lfs_sumsize) +
		btofsb(fs, sup->su_ninos * fs->lfs_ibsize);
	if (fs->lfs_dmeta < 0)
		fs->lfs_dmeta = 0;
	sup->su_flags &= ~SEGUSE_DIRTY;
	LFS_WRITESEGENTRY(sup, fs, segnum, bp);

	LFS_CLEANERINFO(cip, fs, bp);
	++cip->clean;
	--cip->dirty;
	fs->lfs_nclean = cip->clean;
	cip->bfree = fs->lfs_bfree;
	cip->avail = fs->lfs_avail - fs->lfs_ravail - fs->lfs_favail;
	(void) LFS_BWRITE_LOG(bp);
	wakeup(&fs->lfs_avail);

	return (0);
}

/*
 * This will block until a segment in file system fsid is written.  A timeout
 * may be specified, after which the cleaner is awakened automatically.
 * An fsid of -1 means any file system, and a timeout of 0 means forever.
 */
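/*
 * Sketch of a userland caller (hypothetical values): a cleaner might
 * pause between passes, waking at least once per second:
 *
 *	struct timeval tv = { 1, 0 };
 *	(void)lfs_segwait(&fsid, &tv);
 */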
int
lfs_segwait(fsid_t *fsidp, struct timeval *tv)
{
	struct mount *mntp;
	void *addr;
	u_long timeout;
	int error, s;

	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		addr = &lfs_allclean_wakeup;
	else
		addr = &VFSTOUFS(mntp)->um_lfs->lfs_nextseg;
	/*
	 * XXX THIS COULD SLEEP FOREVER IF TIMEOUT IS {0,0}!
	 * XXX IS THAT WHAT IS INTENDED?
	 */
	s = splclock();
	timeradd(tv, &time, tv);
	timeout = hzto(tv);
	splx(s);
	error = tsleep(addr, PCATCH | PUSER, "segment", timeout);
	return (error == ERESTART ? EINTR : 0);
}

/*
 * sys_lfs_segwait:
 *
 * System call wrapper around lfs_segwait().
 *
 *  0 on success
 *  1 on timeout
 * -1/errno is returned on error.
 */
int
sys_lfs_segwait(struct lwp *l, void *v, register_t *retval)
{
	struct sys_lfs_segwait_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct timeval *) tv;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	struct timeval atv;
	fsid_t fsid;
	int error;

	/* XXX need we be su to segwait? */
	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) {
		return (error);
	}
	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if (SCARG(uap, tv)) {
		error = copyin(SCARG(uap, tv), &atv, sizeof(struct timeval));
		if (error)
			return (error);
		if (itimerfix(&atv))
			return (EINVAL);
	} else /* NULL or invalid */
		atv.tv_sec = atv.tv_usec = 0;
	return lfs_segwait(&fsid, &atv);
}

/*
 * VFS_VGET call specialized for the cleaner.  The cleaner already knows the
 * daddr from the ifile, so don't look it up again.  If the cleaner is
 * processing IINFO structures, it may have the ondisk inode already, so
 * don't go retrieving it again.
 *
 * We lfs_vref the vnode here; it is the caller's responsibility to
 * lfs_vunref it when finished.
 */
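/*
 * On success the vnode comes back referenced but unlocked.  An EAGAIN
 * return means the inode was busy; lfs_markv() above treats that as
 * "skip this file and let the cleaner retry the segment later".
 */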
extern struct lock ufs_hashlock;

int
lfs_fasthashget(dev_t dev, ino_t ino, struct vnode **vpp)
{

	/*
	 * This is playing fast and loose.  Someone may have the inode
	 * locked, in which case they are going to be distinctly unhappy
	 * if we trash something.
	 */
	if ((*vpp = ufs_ihashlookup(dev, ino)) != NULL) {
		if ((*vpp)->v_flag & VXLOCK) {
			printf("lfs_fastvget: vnode VXLOCKed for ino %d\n",
			       ino);
			clean_vnlocked++;
#ifdef LFS_EAGAIN_FAIL
			return EAGAIN;
#endif
		}
		if (lfs_vref(*vpp)) {
			clean_inlocked++;
			return EAGAIN;
		}
	} else
		*vpp = NULL;

	return (0);
}

int
lfs_fastvget(struct mount *mp, ino_t ino, daddr_t daddr, struct vnode **vpp,
    struct ufs1_dinode *dinp)
{
	struct inode *ip;
	struct ufs1_dinode *dip;
	struct vnode *vp;
	struct ufsmount *ump;
	dev_t dev;
	int error, retries;
	struct buf *bp;
	struct lfs *fs;

	ump = VFSTOUFS(mp);
	dev = ump->um_dev;
	fs = ump->um_lfs;

	/*
	 * Wait until the filesystem is fully mounted before allowing vget
	 * to complete.  This prevents possible problems with roll-forward.
	 */
	while (fs->lfs_flags & LFS_NOTYET) {
		tsleep(&fs->lfs_flags, PRIBIO+1, "lfs_fnotyet", 0);
	}
	/*
	 * This is playing fast and loose.  Someone may have the inode
	 * locked, in which case they are going to be distinctly unhappy
	 * if we trash something.
	 */

	error = lfs_fasthashget(dev, ino, vpp);
	if (error != 0 || *vpp != NULL)
		return (error);

	/*
	 * getnewvnode(9) will call vfs_busy, which will block if the
	 * filesystem is being unmounted; but umount(9) is waiting for
	 * us because we're already holding the fs busy.
	 * XXXMP
	 */
	if (mp->mnt_iflag & IMNT_UNMOUNT) {
		*vpp = NULL;
		return EDEADLK;
	}
	if ((error = getnewvnode(VT_LFS, mp, lfs_vnodeop_p, &vp)) != 0) {
		*vpp = NULL;
		return (error);
	}

	do {
		error = lfs_fasthashget(dev, ino, vpp);
		if (error != 0 || *vpp != NULL) {
			ungetnewvnode(vp);
			return (error);
		}
	} while (lockmgr(&ufs_hashlock, LK_EXCLUSIVE|LK_SLEEPFAIL, 0));

	/* Allocate new vnode/inode. */
	lfs_vcreate(mp, ino, vp);

	/*
	 * Put it onto its hash chain and lock it so that other requests for
	 * this inode will block if they arrive while we are sleeping waiting
	 * for old data structures to be purged or for the contents of the
	 * disk portion of this inode to be read.
	 */
	ip = VTOI(vp);
	ufs_ihashins(ip);
	lockmgr(&ufs_hashlock, LK_RELEASE, 0);

	/*
	 * XXX
	 * This may not need to be here, logically it should go down with
	 * the i_devvp initialization.
	 * Ask Kirk.
	 */
	ip->i_lfs = fs;

	/* Read in the disk contents for the inode, copy into the inode. */
	if (dinp) {
		error = copyin(dinp, ip->i_din.ffs1_din, sizeof (struct ufs1_dinode));
		if (error) {
			printf("lfs_fastvget: dinode copyin failed for ino %d\n", ino);
			ufs_ihashrem(ip);

			/* Unlock and discard unneeded inode. */
			lockmgr(&vp->v_lock, LK_RELEASE, &vp->v_interlock);
			lfs_vunref(vp);
			*vpp = NULL;
			return (error);
		}
		if (ip->i_number != ino)
			panic("lfs_fastvget: I was fed the wrong inode!");
	} else {
		retries = 0;
	    again:
		error = bread(ump->um_devvp, fsbtodb(fs, daddr), fs->lfs_ibsize,
			      NOCRED, &bp);
		if (error) {
			printf("lfs_fastvget: bread failed with %d\n", error);
			/*
			 * The inode does not contain anything useful, so it
			 * would be misleading to leave it on its hash chain.
			 * Iput() will return it to the free list.
			 */
			ufs_ihashrem(ip);

			/* Unlock and discard unneeded inode. */
			lockmgr(&vp->v_lock, LK_RELEASE, &vp->v_interlock);
			lfs_vunref(vp);
			brelse(bp);
			*vpp = NULL;
			return (error);
		}
		dip = lfs_ifind(ump->um_lfs, ino, bp);
		if (dip == NULL) {
			/* Assume write has not completed yet; try again */
			bp->b_flags |= B_INVAL;
			brelse(bp);
			++retries;
			if (retries > LFS_IFIND_RETRIES)
				panic("lfs_fastvget: dinode not found");
			printf("lfs_fastvget: dinode not found, retrying...\n");
			goto again;
		}
		*ip->i_din.ffs1_din = *dip;
		brelse(bp);
	}
	lfs_vinit(mp, &vp);

	*vpp = vp;

	KASSERT(VOP_ISLOCKED(vp));
	VOP_UNLOCK(vp, 0);

	return (0);
}

/*
 * Make up a "fake" cleaner buffer, copy the data from userland into it.
 */
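/*
 * Returns NULL if the copyin() of the cleaner's data fails; on success
 * the buffer is a cleaner buffer (LFS_NB_CLEAN) whose b_iodone is
 * lfs_callback (see the KDASSERT below).
 */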
struct buf *
lfs_fakebuf(struct lfs *fs, struct vnode *vp, int lbn, size_t size, caddr_t uaddr)
{
	struct buf *bp;
	int error;

	KASSERT(VTOI(vp)->i_number != LFS_IFILE_INUM);

	bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, size, LFS_NB_CLEAN);
	error = copyin(uaddr, bp->b_data, size);
	if (error) {
		lfs_freebuf(fs, bp);
		return NULL;
	}
	KDASSERT(bp->b_iodone == lfs_callback);

#if 0
	++fs->lfs_iocount;
#endif
	bp->b_bufsize = size;
	bp->b_bcount = size;
	return (bp);
}