/* $NetBSD: lfs_syscalls.c,v 1.93 2003/06/29 22:32:40 fvdl Exp $ */

/*-
 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Konrad E. Schroder <perseant@hhhh.org>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the NetBSD
 *      Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 1991, 1993, 1994
 *      The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)lfs_syscalls.c      8.10 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.93 2003/06/29 22:32:40 fvdl Exp $");

#ifndef LFS
# define LFS            /* for prototypes in syscallargs.h */
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/kernel.h>

#include <sys/sa.h>
#include <sys/syscallargs.h>

#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_extern.h>

struct buf *lfs_fakebuf(struct lfs *, struct vnode *, int, size_t, caddr_t);
int lfs_fasthashget(dev_t, ino_t, struct vnode **);

int debug_cleaner = 0;
int clean_vnlocked = 0;
int clean_inlocked = 0;
int verbose_debug = 0;

pid_t lfs_cleaner_pid = 0;

/*
 * Definitions for the buffer free lists.
 */
#define BQUEUES         4       /* number of free buffer queues */

#define BQ_LOCKED       0       /* super-blocks &c */
#define BQ_LRU          1       /* lru, useful buffers */
#define BQ_AGE          2       /* rubbish */
#define BQ_EMPTY        3       /* buffer headers with no memory */

extern TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];

#define LFS_FORCE_WRITE UNASSIGNED

#define LFS_VREF_THRESHOLD 128

/*
 * sys_lfs_markv:
 *
 * This will mark inodes and blocks dirty, so they are written into the log.
 * It will block until all the blocks have been written.  The segment create
 * time passed in the block_info and inode_info structures is used to decide
 * if the data is valid for each block (in case some process dirtied a block
 * or inode that is being cleaned between the determination that a block is
 * live and the lfs_markv call).
 *
 *  0 on success
 * -1/errno is returned on error.
 */
#ifdef USE_64BIT_SYSCALLS
int
sys_lfs_markv(struct proc *p, void *v, register_t *retval)
{
        struct sys_lfs_markv_args /* {
                syscallarg(fsid_t *) fsidp;
                syscallarg(struct block_info *) blkiov;
                syscallarg(int) blkcnt;
        } */ *uap = v;
        BLOCK_INFO *blkiov;
        int blkcnt, error;
        fsid_t fsid;

        if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
                return (error);

        if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
                return (error);

        blkcnt = SCARG(uap, blkcnt);
        if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
                return (EINVAL);

        blkiov = malloc(blkcnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK);
        if ((error = copyin(SCARG(uap, blkiov), blkiov,
                            blkcnt * sizeof(BLOCK_INFO))) != 0)
                goto out;

        if ((error = lfs_markv(p, &fsid, blkiov, blkcnt)) == 0)
                copyout(blkiov, SCARG(uap, blkiov),
                        blkcnt * sizeof(BLOCK_INFO));
out:
        free(blkiov, M_SEGMENT);
        return error;
}
#else
int
sys_lfs_markv(struct lwp *l, void *v, register_t *retval)
{
        struct sys_lfs_markv_args /* {
                syscallarg(fsid_t *) fsidp;
                syscallarg(struct block_info *) blkiov;
                syscallarg(int) blkcnt;
        } */ *uap = v;
        BLOCK_INFO *blkiov;
        BLOCK_INFO_15 *blkiov15;
        int i, blkcnt, error;
        fsid_t fsid;

        if ((error = suser(l->l_proc->p_ucred, &l->l_proc->p_acflag)) != 0)
                return (error);

        if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
                return (error);

        blkcnt = SCARG(uap, blkcnt);
        if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
                return (EINVAL);

        blkiov = malloc(blkcnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK);
        blkiov15 = malloc(blkcnt * sizeof(BLOCK_INFO_15), M_SEGMENT, M_WAITOK);
        if ((error = copyin(SCARG(uap, blkiov), blkiov15,
                            blkcnt * sizeof(BLOCK_INFO_15))) != 0)
                goto out;

        for (i = 0; i < blkcnt; i++) {
                blkiov[i].bi_inode = blkiov15[i].bi_inode;
                blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
                blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
                blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
                blkiov[i].bi_version = blkiov15[i].bi_version;
                blkiov[i].bi_bp = blkiov15[i].bi_bp;
                blkiov[i].bi_size = blkiov15[i].bi_size;
        }

        if ((error = lfs_markv(l->l_proc, &fsid, blkiov, blkcnt)) == 0) {
                for (i = 0; i < blkcnt; i++) {
                        blkiov15[i].bi_inode = blkiov[i].bi_inode;
                        blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
                        blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
                        blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
                        blkiov15[i].bi_version = blkiov[i].bi_version;
                        blkiov15[i].bi_bp = blkiov[i].bi_bp;
                        blkiov15[i].bi_size = blkiov[i].bi_size;
                }
                copyout(blkiov15, SCARG(uap, blkiov),
                        blkcnt * sizeof(BLOCK_INFO_15));
        }
out:
        free(blkiov, M_SEGMENT);
        free(blkiov15, M_SEGMENT);
        return error;
}
#endif
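
#if 0
/*
 * Illustrative sketch (not compiled into the kernel): how a userland
 * cleaner might hand a batch of blocks it believes are live back to the
 * kernel.  The direct syscall(2) invocation and the markv_example()
 * wrapper are assumptions made for brevity; the real cleaner,
 * libexec/lfs_cleanerd, assembles blkiov from segment summaries and the
 * ifile before making this call.
 */
#include <sys/syscall.h>
#include <unistd.h>

static int
markv_example(fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
{
        /* Returns 0 on success, -1 with errno set (EAGAIN: retry). */
        return syscall(SYS_lfs_markv, fsidp, blkiov, blkcnt);
}
#endif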

#define LFS_MARKV_MAX_BLOCKS    (LFS_MAX_BUFS)

int
lfs_markv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
{
        BLOCK_INFO *blkp;
        IFILE *ifp;
        struct buf *bp, *nbp;
        struct inode *ip = NULL;
        struct lfs *fs;
        struct mount *mntp;
        struct vnode *vp;
#ifdef DEBUG_LFS
        int vputc = 0, iwritten = 0;
#endif
        ino_t lastino;
        daddr_t b_daddr, v_daddr;
        int cnt, error;
        int do_again = 0;
        int s;
#ifdef CHECK_COPYIN
        int i;
#endif /* CHECK_COPYIN */
        int numrefed = 0;
        ino_t maxino;
        size_t obsize;

        /* number of blocks/inodes that we have already bwrite'ed */
        int nblkwritten, ninowritten;

        if ((mntp = vfs_getvfs(fsidp)) == NULL)
                return (ENOENT);

        fs = VFSTOUFS(mntp)->um_lfs;
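        /*
         * Descriptive note (added): valid inode numbers are bounded by
         * the capacity of the ifile -- its total block count, less the
         * cleanerinfo and segment-table blocks at the front, times the
         * number of inode entries per ifile block.
         */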
        maxino = (fragstoblks(fs, fsbtofrags(fs, VTOI(fs->lfs_ivnode)->i_ffs1_blocks)) -
                  fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb;

        cnt = blkcnt;

        if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0)
                return (error);

        /*
         * This seglock is just to prevent the fact that we might have to sleep
         * from allowing the possibility that our blocks might become
         * invalid.
         *
         * It is also important to note here that unless we specify SEGM_CKP,
         * any Ifile blocks that we might be asked to clean will never get
         * to the disk.
         */
        lfs_seglock(fs, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);

        /* Mark blocks/inodes dirty.  */
        error = 0;

#ifdef DEBUG_LFS
        /* Run through and count the inodes */
        lastino = LFS_UNUSED_INUM;
        for (blkp = blkiov; cnt--; ++blkp) {
                if (lastino != blkp->bi_inode) {
                        lastino = blkp->bi_inode;
                        vputc++;
                }
        }
        cnt = blkcnt;
        printf("[%d/", vputc);
        iwritten = 0;
#endif /* DEBUG_LFS */
        /* these were inside the initialization for the for loop */
        v_daddr = LFS_UNUSED_DADDR;
        lastino = LFS_UNUSED_INUM;
        nblkwritten = ninowritten = 0;
        for (blkp = blkiov; cnt--; ++blkp)
        {
                if (blkp->bi_daddr == LFS_FORCE_WRITE)
                        printf("lfs_markv: warning: force-writing ino %d "
                               "lbn %lld\n",
                               blkp->bi_inode, (long long)blkp->bi_lbn);
                /* Bounds-check incoming data, avoid panic for failed VGET */
                if (blkp->bi_inode <= 0 || blkp->bi_inode >= maxino) {
                        error = EINVAL;
                        goto again;
                }
                /*
                 * Get the IFILE entry (only once) and see if the file still
                 * exists.
                 */
                if (lastino != blkp->bi_inode) {
                        /*
                         * Finish the old file, if there was one.  The presence
                         * of a usable vnode in vp is signaled by a valid v_daddr.
                         */
                        if (v_daddr != LFS_UNUSED_DADDR) {
#ifdef DEBUG_LFS
                                if (ip->i_flag & (IN_MODIFIED|IN_CLEANING))
                                        iwritten++;
#endif
                                lfs_vunref(vp);
                                numrefed--;
                        }

                        /*
                         * Start a new file
                         */
                        lastino = blkp->bi_inode;
                        if (blkp->bi_inode == LFS_IFILE_INUM)
                                v_daddr = fs->lfs_idaddr;
                        else {
                                LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
                                /* XXX fix for force write */
                                v_daddr = ifp->if_daddr;
                                brelse(bp);
                        }
                        /* Don't force-write the ifile */
                        if (blkp->bi_inode == LFS_IFILE_INUM
                            && blkp->bi_daddr == LFS_FORCE_WRITE)
                        {
                                continue;
                        }
                        if (v_daddr == LFS_UNUSED_DADDR
                            && blkp->bi_daddr != LFS_FORCE_WRITE)
                        {
                                continue;
                        }

                        /* Get the vnode/inode. */
                        error = lfs_fastvget(mntp, blkp->bi_inode, v_daddr,
                                             &vp,
                                             (blkp->bi_lbn == LFS_UNUSED_LBN
                                              ? blkp->bi_bp
                                              : NULL));

                        if (!error) {
                                numrefed++;
                        }
                        if (error) {
#ifdef DEBUG_LFS
                                printf("lfs_markv: lfs_fastvget failed with %d (ino %d, segment %d)\n",
                                       error, blkp->bi_inode,
                                       dtosn(fs, blkp->bi_daddr));
#endif /* DEBUG_LFS */
                                /*
                                 * If we got EAGAIN, that means that the
                                 * Inode was locked.  This is
                                 * recoverable: just clean the rest of
                                 * this segment, and let the cleaner try
                                 * again with another.  (When the
                                 * cleaner runs again, this segment will
                                 * sort high on the list, since it is
                                 * now almost entirely empty.)  But, we
                                 * still set v_daddr = LFS_UNUSED_DADDR
                                 * so as not to test this over and over
                                 * again.
                                 */
                                if (error == EAGAIN) {
                                        error = 0;
                                        do_again++;
                                }
#ifdef DIAGNOSTIC
                                else if (error != ENOENT)
                                        panic("lfs_markv VFS_VGET FAILED");
#endif
                                /* lastino = LFS_UNUSED_INUM; */
                                v_daddr = LFS_UNUSED_DADDR;
                                vp = NULL;
                                ip = NULL;
                                continue;
                        }
                        ip = VTOI(vp);
                        ninowritten++;
                } else if (v_daddr == LFS_UNUSED_DADDR) {
                        /*
                         * This can only happen if the vnode is dead (or
                         * in any case we can't get it...e.g., it is
                         * inlocked).  Keep going.
                         */
                        continue;
                }

                /* Past this point we are guaranteed that vp, ip are valid. */

                /* If this BLOCK_INFO didn't contain a block, keep going. */
                if (blkp->bi_lbn == LFS_UNUSED_LBN) {
                        /* XXX need to make sure that the inode gets written in this case */
                        /* XXX but only write the inode if it's the right one */
                        if (blkp->bi_inode != LFS_IFILE_INUM) {
                                LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
                                if (ifp->if_daddr == blkp->bi_daddr
                                    || blkp->bi_daddr == LFS_FORCE_WRITE)
                                {
                                        LFS_SET_UINO(ip, IN_CLEANING);
                                }
                                brelse(bp);
                        }
                        continue;
                }

                b_daddr = 0;
                if (blkp->bi_daddr != LFS_FORCE_WRITE) {
                        if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &b_daddr, NULL) ||
                            dbtofsb(fs, b_daddr) != blkp->bi_daddr)
                        {
                                if (dtosn(fs, dbtofsb(fs, b_daddr))
                                    == dtosn(fs, blkp->bi_daddr))
                                {
                                        printf("lfs_markv: wrong da same seg: %llx vs %llx\n",
                                               (long long)blkp->bi_daddr, (long long)dbtofsb(fs, b_daddr));
                                }
                                do_again++;
                                continue;
                        }
                }

                /*
                 * Check block sizes.  The blocks being cleaned come from
                 * disk, so they should have the same size as their on-disk
                 * counterparts.
                 */
                if (blkp->bi_lbn >= 0)
                        obsize = blksize(fs, ip, blkp->bi_lbn);
                else
                        obsize = fs->lfs_bsize;
                /* Check for fragment size change */
                if (blkp->bi_lbn >= 0 && blkp->bi_lbn < NDADDR) {
                        obsize = ip->i_lfs_fragsize[blkp->bi_lbn];
                }
                if (obsize != blkp->bi_size) {
                        printf("lfs_markv: ino %d lbn %lld wrong size (%ld != %d), try again\n",
                               blkp->bi_inode, (long long)blkp->bi_lbn,
                               (long) obsize, blkp->bi_size);
                        do_again++;
                        continue;
                }

                /*
                 * If we get to here, then we are keeping the block.  If
                 * it is an indirect block, we want to actually put it
                 * in the buffer cache so that it can be updated in the
                 * finish_meta section.  If it's not, we need to
                 * allocate a fake buffer so that writeseg can perform
                 * the copyin and write the buffer.
                 */
                if (ip->i_number != LFS_IFILE_INUM && blkp->bi_lbn >= 0) {
                        /* Data Block */
                        bp = lfs_fakebuf(fs, vp, blkp->bi_lbn,
                                         blkp->bi_size, blkp->bi_bp);
                        /* Pretend we used bread() to get it */
                        bp->b_blkno = fsbtodb(fs, blkp->bi_daddr);
                } else {
                        /* Indirect block or ifile */
                        if (blkp->bi_size != fs->lfs_bsize &&
                            ip->i_number != LFS_IFILE_INUM)
                                panic("lfs_markv: partial indirect block?"
                                      " size=%d\n", blkp->bi_size);
                        bp = getblk(vp, blkp->bi_lbn, blkp->bi_size, 0, 0);
                        if (!(bp->b_flags & (B_DONE|B_DELWRI))) { /* B_CACHE */
                                /*
                                 * The block in question was not found
                                 * in the cache; i.e., the block that
                                 * getblk() returned is empty.  So, we
                                 * can (and should) copy in the
                                 * contents, because we've already
                                 * determined that this was the right
                                 * version of this block on disk.
                                 *
                                 * And, it can't have changed underneath
                                 * us, because we have the segment lock.
                                 */
                                error = copyin(blkp->bi_bp, bp->b_data, blkp->bi_size);
                                if (error)
                                        goto err2;
                        }
                }
                if ((error = lfs_bwrite_ext(bp, BW_CLEAN)) != 0)
                        goto err2;

                nblkwritten++;
                /*
                 * XXX should account indirect blocks and ifile pages as well
                 */
                if (nblkwritten + lblkno(fs, ninowritten * sizeof (struct ufs1_dinode))
                    > LFS_MARKV_MAX_BLOCKS) {
#ifdef DEBUG_LFS
                        printf("lfs_markv: writing %d blks %d inos\n",
                               nblkwritten, ninowritten);
#endif
                        lfs_segwrite(mntp, SEGM_CLEAN);
                        nblkwritten = ninowritten = 0;
                }
        }

        /*
         * Finish the old file, if there was one
         */
        if (v_daddr != LFS_UNUSED_DADDR) {
#ifdef DEBUG_LFS
                if (ip->i_flag & (IN_MODIFIED|IN_CLEANING))
                        iwritten++;
#endif
                lfs_vunref(vp);
                numrefed--;
        }

#ifdef DEBUG_LFS
        printf("%d]", iwritten);
        if (numrefed != 0) {
                panic("lfs_markv: numrefed=%d", numrefed);
        }
#endif

#ifdef DEBUG_LFS
        printf("lfs_markv: writing %d blks %d inos (check point)\n",
               nblkwritten, ninowritten);
#endif
        /*
         * The last write has to be SEGM_SYNC, because of calling semantics.
         * It also has to be SEGM_CKP, because otherwise we could write
         * over the newly cleaned data contained in a checkpoint, and then
         * we'd be unhappy at recovery time.
         */
        lfs_segwrite(mntp, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);

        lfs_segunlock(fs);

        vfs_unbusy(mntp);
        if (error)
                return (error);
        else if (do_again)
                return EAGAIN;

        return 0;

err2:
        printf("lfs_markv err2\n");
        lfs_vunref(vp);
        --numrefed;

        /* Free up fakebuffers -- have to take these from the LOCKED list */
again:
        s = splbio();
        for (bp = bufqueues[BQ_LOCKED].tqh_first; bp; bp = nbp) {
                nbp = bp->b_freelist.tqe_next;
                if (LFS_IS_MALLOC_BUF(bp)) {
                        if (bp->b_flags & B_BUSY) { /* not bloody likely */
                                bp->b_flags |= B_WANTED;
                                tsleep(bp, PRIBIO+1, "markv", 0);
                                splx(s);
                                goto again;
                        }
                        if (bp->b_flags & B_DELWRI)
                                fs->lfs_avail += btofsb(fs, bp->b_bcount);
                        bremfree(bp);
                        splx(s);
                        brelse(bp);
                        s = splbio();
                }
        }
        splx(s);
        lfs_segunlock(fs);
        vfs_unbusy(mntp);
#ifdef DEBUG_LFS
        if (numrefed != 0) {
                panic("lfs_markv: numrefed=%d", numrefed);
        }
#endif

        return (error);
}

/*
 * sys_lfs_bmapv:
 *
 * This will fill in the current disk address for arrays of blocks.
 *
 *  0 on success
 * -1/errno is returned on error.
 */
#ifdef USE_64BIT_SYSCALLS
int
sys_lfs_bmapv(struct proc *p, void *v, register_t *retval)
{
        struct sys_lfs_bmapv_args /* {
                syscallarg(fsid_t *) fsidp;
                syscallarg(struct block_info *) blkiov;
                syscallarg(int) blkcnt;
        } */ *uap = v;
        BLOCK_INFO *blkiov;
        int blkcnt, error;
        fsid_t fsid;

        if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
                return (error);

        if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
                return (error);

        blkcnt = SCARG(uap, blkcnt);
        if ((u_int) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
                return (EINVAL);
        blkiov = malloc(blkcnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK);
        if ((error = copyin(SCARG(uap, blkiov), blkiov,
                            blkcnt * sizeof(BLOCK_INFO))) != 0)
                goto out;

        if ((error = lfs_bmapv(p, &fsid, blkiov, blkcnt)) == 0)
                copyout(blkiov, SCARG(uap, blkiov),
                        blkcnt * sizeof(BLOCK_INFO));
out:
        free(blkiov, M_SEGMENT);
        return error;
}
#else
int
sys_lfs_bmapv(struct lwp *l, void *v, register_t *retval)
{
        struct sys_lfs_bmapv_args /* {
                syscallarg(fsid_t *) fsidp;
                syscallarg(struct block_info *) blkiov;
                syscallarg(int) blkcnt;
        } */ *uap = v;
        struct proc *p = l->l_proc;
        BLOCK_INFO *blkiov;
        BLOCK_INFO_15 *blkiov15;
        int i, blkcnt, error;
        fsid_t fsid;

        if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
                return (error);

        if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
                return (error);

        blkcnt = SCARG(uap, blkcnt);
        if ((size_t) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
                return (EINVAL);
        blkiov = malloc(blkcnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK);
        blkiov15 = malloc(blkcnt * sizeof(BLOCK_INFO_15), M_SEGMENT, M_WAITOK);
        if ((error = copyin(SCARG(uap, blkiov), blkiov15,
                            blkcnt * sizeof(BLOCK_INFO_15))) != 0)
                goto out;

        for (i = 0; i < blkcnt; i++) {
                blkiov[i].bi_inode = blkiov15[i].bi_inode;
                blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
                blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
                blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
                blkiov[i].bi_version = blkiov15[i].bi_version;
                blkiov[i].bi_bp = blkiov15[i].bi_bp;
                blkiov[i].bi_size = blkiov15[i].bi_size;
        }

        if ((error = lfs_bmapv(p, &fsid, blkiov, blkcnt)) == 0) {
                for (i = 0; i < blkcnt; i++) {
                        blkiov15[i].bi_inode = blkiov[i].bi_inode;
                        blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
                        blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
                        blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
                        blkiov15[i].bi_version = blkiov[i].bi_version;
                        blkiov15[i].bi_bp = blkiov[i].bi_bp;
                        blkiov15[i].bi_size = blkiov[i].bi_size;
                }
                copyout(blkiov15, SCARG(uap, blkiov),
                        blkcnt * sizeof(BLOCK_INFO_15));
        }
out:
        free(blkiov, M_SEGMENT);
        free(blkiov15, M_SEGMENT);
        return error;
}
#endif
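
#if 0
/*
 * Illustrative sketch (not compiled into the kernel): a userland cleaner
 * asking for the current disk addresses of candidate blocks.  As in the
 * markv sketch above, the direct syscall(2) call and the wrapper name
 * are assumptions made for brevity.
 */
#include <sys/syscall.h>
#include <unistd.h>

static int
bmapv_example(fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
{
        /*
         * On success each bi_daddr holds the block's current address
         * (LFS_UNUSED_DADDR if the inode is gone); entries that still
         * point into the segment being cleaned are live and would be
         * passed on to lfs_markv.
         */
        return syscall(SYS_lfs_bmapv, fsidp, blkiov, blkcnt);
}
#endif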

int
lfs_bmapv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
{
        BLOCK_INFO *blkp;
        IFILE *ifp;
        struct buf *bp;
        struct inode *ip = NULL;
        struct lfs *fs;
        struct mount *mntp;
        struct ufsmount *ump;
        struct vnode *vp;
        ino_t lastino;
        daddr_t v_daddr;
        int cnt, error;
        int numrefed = 0;

        lfs_cleaner_pid = p->p_pid;

        if ((mntp = vfs_getvfs(fsidp)) == NULL)
                return (ENOENT);

        ump = VFSTOUFS(mntp);
        if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0)
                return (error);

        cnt = blkcnt;

        fs = VFSTOUFS(mntp)->um_lfs;

        error = 0;

        /* these were inside the initialization for the for loop */
        v_daddr = LFS_UNUSED_DADDR;
        lastino = LFS_UNUSED_INUM;
        for (blkp = blkiov; cnt--; ++blkp)
        {
                /*
                 * Get the IFILE entry (only once) and see if the file still
                 * exists.
                 */
                if (lastino != blkp->bi_inode) {
                        /*
                         * Finish the old file, if there was one.  The presence
                         * of a usable vnode in vp is signaled by a valid
                         * v_daddr.
                         */
                        if (v_daddr != LFS_UNUSED_DADDR) {
                                lfs_vunref(vp);
                                numrefed--;
                        }

                        /*
                         * Start a new file
                         */
                        lastino = blkp->bi_inode;
                        if (blkp->bi_inode == LFS_IFILE_INUM)
                                v_daddr = fs->lfs_idaddr;
                        else {
                                LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
                                v_daddr = ifp->if_daddr;
                                brelse(bp);
                        }
                        if (v_daddr == LFS_UNUSED_DADDR) {
                                blkp->bi_daddr = LFS_UNUSED_DADDR;
                                continue;
                        }
                        /*
                         * A regular call to VFS_VGET could deadlock
                         * here.  Instead, we try an unlocked access.
                         */
                        vp = ufs_ihashlookup(ump->um_dev, blkp->bi_inode);
                        if (vp != NULL && !(vp->v_flag & VXLOCK)) {
                                ip = VTOI(vp);
                                if (lfs_vref(vp)) {
                                        v_daddr = LFS_UNUSED_DADDR;
                                        continue;
                                }
                                numrefed++;
                        } else {
                                error = VFS_VGET(mntp, blkp->bi_inode, &vp);
                                if (error) {
#ifdef DEBUG_LFS
                                        printf("lfs_bmapv: vget of ino %d failed with %d\n",
                                               blkp->bi_inode, error);
#endif
                                        v_daddr = LFS_UNUSED_DADDR;
                                        continue;
                                } else {
                                        KASSERT(VOP_ISLOCKED(vp));
                                        VOP_UNLOCK(vp, 0);
                                        numrefed++;
                                }
                        }
                        ip = VTOI(vp);
                } else if (v_daddr == LFS_UNUSED_DADDR) {
                        /*
                         * This can only happen if the vnode is dead.
                         * Keep going.  Note that we DO NOT set the
                         * bi_addr to anything -- if we failed to get
                         * the vnode, for example, we want to assume
                         * conservatively that all of its blocks *are*
                         * located in the segment in question.
                         * lfs_markv will throw them out if we are
                         * wrong.
                         */
                        /* blkp->bi_daddr = LFS_UNUSED_DADDR; */
                        continue;
                }

                /* Past this point we are guaranteed that vp, ip are valid. */

                if (blkp->bi_lbn == LFS_UNUSED_LBN) {
                        /*
                         * We just want the inode address, which is
                         * conveniently in v_daddr.
                         */
                        blkp->bi_daddr = v_daddr;
                } else {
                        daddr_t bi_daddr;

                        /* XXX ondisk32 */
                        error = VOP_BMAP(vp, blkp->bi_lbn, NULL,
                                         &bi_daddr, NULL);
                        if (error)
                        {
                                blkp->bi_daddr = LFS_UNUSED_DADDR;
                                continue;
                        }
                        blkp->bi_daddr = dbtofsb(fs, bi_daddr);
                        /* Fill in the block size, too */
                        if (blkp->bi_lbn >= 0)
                                blkp->bi_size = blksize(fs, ip, blkp->bi_lbn);
                        else
                                blkp->bi_size = fs->lfs_bsize;
                }
        }

        /*
         * Finish the old file, if there was one.  The presence
         * of a usable vnode in vp is signaled by a valid v_daddr.
         */
        if (v_daddr != LFS_UNUSED_DADDR) {
                lfs_vunref(vp);
                numrefed--;
        }

#ifdef DEBUG_LFS
        if (numrefed != 0) {
                panic("lfs_bmapv: numrefed=%d", numrefed);
        }
#endif

        vfs_unbusy(mntp);

        return 0;
}

/*
 * sys_lfs_segclean:
 *
 * Mark the segment clean.
 *
 *  0 on success
 * -1/errno is returned on error.
 */
int
sys_lfs_segclean(struct lwp *l, void *v, register_t *retval)
{
        struct sys_lfs_segclean_args /* {
                syscallarg(fsid_t *) fsidp;
                syscallarg(u_long) segment;
        } */ *uap = v;
        struct lfs *fs;
        struct mount *mntp;
        fsid_t fsid;
        int error;
        unsigned long segnum;
        struct proc *p = l->l_proc;

        if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
                return (error);

        if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
                return (error);
        if ((mntp = vfs_getvfs(&fsid)) == NULL)
                return (ENOENT);

        fs = VFSTOUFS(mntp)->um_lfs;
        segnum = SCARG(uap, segment);

        if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0)
                return (error);

        lfs_seglock(fs, SEGM_PROT);
        error = lfs_do_segclean(fs, segnum);
        lfs_segunlock(fs);
        vfs_unbusy(mntp);
        return error;
}

/*
 * Actually mark the segment clean.
 * Must be called with the segment lock held.
 */
int
lfs_do_segclean(struct lfs *fs, unsigned long segnum)
{
        struct buf *bp;
        CLEANERINFO *cip;
        SEGUSE *sup;

        if (dtosn(fs, fs->lfs_curseg) == segnum) {
                return (EBUSY);
        }

        LFS_SEGENTRY(sup, fs, segnum, bp);
        if (sup->su_nbytes) {
                printf("lfs_segclean: not cleaning segment %lu: %d live bytes\n",
                       segnum, sup->su_nbytes);
                brelse(bp);
                return (EBUSY);
        }
        if (sup->su_flags & SEGUSE_ACTIVE) {
                brelse(bp);
                return (EBUSY);
        }
        if (!(sup->su_flags & SEGUSE_DIRTY)) {
                brelse(bp);
                return (EALREADY);
        }

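        /*
         * Descriptive note (added): credit the segment's space back to
         * the filesystem -- a full segment's worth of fsb, less any
         * superblock pad (and, on v2 filesystems, the disklabel pad in
         * segment 0); the summary and inode blocks it held return to
         * bfree and are debited from the metadata count.
         */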
        fs->lfs_avail += segtod(fs, 1);
        if (sup->su_flags & SEGUSE_SUPERBLOCK)
                fs->lfs_avail -= btofsb(fs, LFS_SBPAD);
        if (fs->lfs_version > 1 && segnum == 0 &&
            fs->lfs_start < btofsb(fs, LFS_LABELPAD))
                fs->lfs_avail -= btofsb(fs, LFS_LABELPAD) - fs->lfs_start;
        fs->lfs_bfree += sup->su_nsums * btofsb(fs, fs->lfs_sumsize) +
                btofsb(fs, sup->su_ninos * fs->lfs_ibsize);
        fs->lfs_dmeta -= sup->su_nsums * btofsb(fs, fs->lfs_sumsize) +
                btofsb(fs, sup->su_ninos * fs->lfs_ibsize);
        if (fs->lfs_dmeta < 0)
                fs->lfs_dmeta = 0;
        sup->su_flags &= ~SEGUSE_DIRTY;
        LFS_WRITESEGENTRY(sup, fs, segnum, bp);

        LFS_CLEANERINFO(cip, fs, bp);
        ++cip->clean;
        --cip->dirty;
        fs->lfs_nclean = cip->clean;
        cip->bfree = fs->lfs_bfree;
        cip->avail = fs->lfs_avail - fs->lfs_ravail;
        (void) LFS_BWRITE_LOG(bp);
        wakeup(&fs->lfs_avail);

        return (0);
}

/*
 * This will block until a segment in file system fsid is written.  A timeout
 * in milliseconds may be specified which will wake the cleaner automatically.
 * An fsid of -1 means any file system, and a timeout of 0 means forever.
 */
int
lfs_segwait(fsid_t *fsidp, struct timeval *tv)
{
        struct mount *mntp;
        void *addr;
        u_long timeout;
        int error, s;

        if ((mntp = vfs_getvfs(fsidp)) == NULL)
                addr = &lfs_allclean_wakeup;
        else
                addr = &VFSTOUFS(mntp)->um_lfs->lfs_nextseg;
        /*
         * XXX THIS COULD SLEEP FOREVER IF TIMEOUT IS {0,0}!
         * XXX IS THAT WHAT IS INTENDED?
         */
        s = splclock();
        timeradd(tv, &time, tv);
        timeout = hzto(tv);
        splx(s);
        error = tsleep(addr, PCATCH | PUSER, "segment", timeout);
        return (error == ERESTART ? EINTR : 0);
}

/*
 * sys_lfs_segwait:
 *
 * System call wrapper around lfs_segwait().
 *
 *  0 on success
 *  1 on timeout
 * -1/errno is returned on error.
 */
int
sys_lfs_segwait(struct lwp *l, void *v, register_t *retval)
{
        struct sys_lfs_segwait_args /* {
                syscallarg(fsid_t *) fsidp;
                syscallarg(struct timeval *) tv;
        } */ *uap = v;
        struct proc *p = l->l_proc;
        struct timeval atv;
        fsid_t fsid;
        int error;

        /* XXX need we be su to segwait? */
        if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) {
                return (error);
        }
        if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
                return (error);

        if (SCARG(uap, tv)) {
                error = copyin(SCARG(uap, tv), &atv, sizeof(struct timeval));
                if (error)
                        return (error);
                if (itimerfix(&atv))
                        return (EINVAL);
        } else /* NULL or invalid */
                atv.tv_sec = atv.tv_usec = 0;
        return lfs_segwait(&fsid, &atv);
}
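
#if 0
/*
 * Illustrative sketch (not compiled into the kernel): wait for the next
 * segment write with a one-second cap, since a timeout of {0,0} means
 * "wait forever" (see the XXX above).  The direct syscall(2) call and
 * the wrapper name are assumptions made for brevity.
 */
#include <sys/syscall.h>
#include <sys/time.h>
#include <unistd.h>

static int
segwait_example(fsid_t *fsidp)
{
        struct timeval tv;

        tv.tv_sec = 1;
        tv.tv_usec = 0;
        return syscall(SYS_lfs_segwait, fsidp, &tv);
}
#endif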

/*
 * VFS_VGET call specialized for the cleaner.  The cleaner already knows the
 * daddr from the ifile, so don't look it up again.  If the cleaner is
 * processing IINFO structures, it may have the ondisk inode already, so
 * don't go retrieving it again.
 *
 * We lfs_vref the vnode; it is the caller's responsibility to lfs_vunref
 * it when finished.
 */
extern struct lock ufs_hashlock;

int
lfs_fasthashget(dev_t dev, ino_t ino, struct vnode **vpp)
{

        /*
         * This is playing fast and loose.  Someone may have the inode
         * locked, in which case they are going to be distinctly unhappy
         * if we trash something.
         */
        if ((*vpp = ufs_ihashlookup(dev, ino)) != NULL) {
                if ((*vpp)->v_flag & VXLOCK) {
                        printf("lfs_fastvget: vnode VXLOCKed for ino %d\n",
                               ino);
                        clean_vnlocked++;
#ifdef LFS_EAGAIN_FAIL
                        return EAGAIN;
#endif
                }
                if (lfs_vref(*vpp)) {
                        clean_inlocked++;
                        return EAGAIN;
                }
        } else
                *vpp = NULL;

        return (0);
}

int
lfs_fastvget(struct mount *mp, ino_t ino, daddr_t daddr, struct vnode **vpp, struct ufs1_dinode *dinp)
{
        struct inode *ip;
        struct ufs1_dinode *dip;
        struct vnode *vp;
        struct ufsmount *ump;
        dev_t dev;
        int error, retries;
        struct buf *bp;
        struct lfs *fs;

        ump = VFSTOUFS(mp);
        dev = ump->um_dev;
        fs = ump->um_lfs;

        /*
         * Wait until the filesystem is fully mounted before allowing vget
         * to complete.  This prevents possible problems with roll-forward.
         */
        while (fs->lfs_flags & LFS_NOTYET) {
                tsleep(&fs->lfs_flags, PRIBIO+1, "lfs_fnotyet", 0);
        }
        /*
         * This is playing fast and loose.  Someone may have the inode
         * locked, in which case they are going to be distinctly unhappy
         * if we trash something.
         */

        error = lfs_fasthashget(dev, ino, vpp);
        if (error != 0 || *vpp != NULL)
                return (error);

        if ((error = getnewvnode(VT_LFS, mp, lfs_vnodeop_p, &vp)) != 0) {
                *vpp = NULL;
                return (error);
        }

        do {
                error = lfs_fasthashget(dev, ino, vpp);
                if (error != 0 || *vpp != NULL) {
                        ungetnewvnode(vp);
                        return (error);
                }
        } while (lockmgr(&ufs_hashlock, LK_EXCLUSIVE|LK_SLEEPFAIL, 0));

        /* Allocate new vnode/inode. */
        lfs_vcreate(mp, ino, vp);

        /*
         * Put it onto its hash chain and lock it so that other requests for
         * this inode will block if they arrive while we are sleeping waiting
         * for old data structures to be purged or for the contents of the
         * disk portion of this inode to be read.
         */
        ip = VTOI(vp);
        ufs_ihashins(ip);
        lockmgr(&ufs_hashlock, LK_RELEASE, 0);

        /*
         * XXX
         * This may not need to be here, logically it should go down with
         * the i_devvp initialization.
         * Ask Kirk.
         */
        ip->i_lfs = fs;

        /* Read in the disk contents for the inode, copy into the inode. */
        if (dinp) {
                error = copyin(dinp, ip->i_din.ffs1_din, sizeof (struct ufs1_dinode));
                if (error) {
                        printf("lfs_fastvget: dinode copyin failed for ino %d\n", ino);
                        ufs_ihashrem(ip);

                        /* Unlock and discard unneeded inode. */
                        lockmgr(&vp->v_lock, LK_RELEASE, &vp->v_interlock);
                        lfs_vunref(vp);
                        *vpp = NULL;
                        return (error);
                }
                if (ip->i_number != ino)
                        panic("lfs_fastvget: I was fed the wrong inode!");
        } else {
                retries = 0;
again:
                error = bread(ump->um_devvp, fsbtodb(fs, daddr), fs->lfs_ibsize,
                              NOCRED, &bp);
                if (error) {
                        printf("lfs_fastvget: bread failed with %d\n", error);
                        /*
                         * The inode does not contain anything useful, so it
                         * would be misleading to leave it on its hash chain.
                         * Iput() will return it to the free list.
                         */
                        ufs_ihashrem(ip);

                        /* Unlock and discard unneeded inode. */
                        lockmgr(&vp->v_lock, LK_RELEASE, &vp->v_interlock);
                        lfs_vunref(vp);
                        brelse(bp);
                        *vpp = NULL;
                        return (error);
                }
                dip = lfs_ifind(ump->um_lfs, ino, bp);
                if (dip == NULL) {
                        /* Assume write has not completed yet; try again */
                        bp->b_flags |= B_INVAL;
                        brelse(bp);
                        ++retries;
                        if (retries > LFS_IFIND_RETRIES)
                                panic("lfs_fastvget: dinode not found");
                        printf("lfs_fastvget: dinode not found, retrying...\n");
                        goto again;
                }
                *ip->i_din.ffs1_din = *dip;
                brelse(bp);
        }
        lfs_vinit(mp, vp);

        *vpp = vp;

        KASSERT(VOP_ISLOCKED(vp));
        VOP_UNLOCK(vp, 0);

        return (0);
}

/*
 * Make up a "fake" cleaner buffer, copy the data from userland into it.
 */
struct buf *
lfs_fakebuf(struct lfs *fs, struct vnode *vp, int lbn, size_t size, caddr_t uaddr)
{
        struct buf *bp;
        int error;

        KASSERT(VTOI(vp)->i_number != LFS_IFILE_INUM);

        bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, size, LFS_NB_CLEAN);
        error = copyin(uaddr, bp->b_data, size);
        if (error) {
                lfs_freebuf(fs, bp);
                return NULL;
        }
        KDASSERT(bp->b_iodone == lfs_callback);

#if 0
        bp->b_saveaddr = (caddr_t)fs;
        ++fs->lfs_iocount;
#endif
        bp->b_bufsize = size;
        bp->b_bcount = size;
        return (bp);
}