/*	$NetBSD: lfs_syscalls.c,v 1.22 1999/03/10 00:20:00 perseant Exp $	*/

/*-
 * Copyright (c) 1999 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Konrad E. Schroder <perseant@hhhh.org>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 1991, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)lfs_syscalls.c	8.10 (Berkeley) 5/14/95
 */

#include "fs_lfs.h"		/* for prototypes in syscallargs.h */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/kernel.h>

#include <sys/syscallargs.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_extern.h>

/* Flags for return from lfs_fastvget */
#define FVG_UNLOCK	0x01	/* Needs to be unlocked */
#define FVG_PUT		0x02	/* Needs to be vput() */

struct buf *lfs_fakebuf __P((struct vnode *, int, size_t, caddr_t));

int debug_cleaner = 0;
int clean_vnlocked = 0;
int clean_inlocked = 0;
int verbose_debug = 0;
int lfs_clean_vnhead = 1;

pid_t lfs_cleaner_pid = 0;

/*
 * Definitions for the buffer free lists.
 */
#define BQUEUES		4	/* number of free buffer queues */

#define BQ_LOCKED	0	/* super-blocks &c */
#define BQ_LRU		1	/* lru, useful buffers */
#define BQ_AGE		2	/* rubbish */
#define BQ_EMPTY	3	/* buffer headers with no memory */

extern TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];

#define LFS_FORCE_WRITE UNASSIGNED
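/*
 * LFS_FORCE_WRITE is a sentinel bi_daddr value by which the cleaner
 * asks lfs_markv to write a block unconditionally, bypassing the
 * usual check against the ifile disk address (see its uses below).
 */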

#define LFS_VREF_THRESHOLD	128
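/*
 * LFS_VREF_THRESHOLD bounds how many vnodes lfs_markv will hold
 * referenced at once when THROTTLE_REFERENCES is defined; at the
 * threshold it writes a segment and drops the accumulated references.
 */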

/*
 * lfs_markv:
 *
 * This will mark inodes and blocks dirty, so they are written into the log.
 * It will block until all the blocks have been written.  The segment create
 * time passed in the block_info and inode_info structures is used to decide
 * if the data is valid for each block (in case some process dirtied a block
 * or inode that is being cleaned between the determination that a block is
 * live and the lfs_markv call).
 *
 *  0 on success
 * -1/errno is returned on error.
 */
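/*
 * A minimal sketch of the calling convention, as a hypothetical
 * userland cleaner might use it (the variable names are illustrative
 * only, not part of any real interface):
 *
 *	BLOCK_INFO bi[NBLK];
 *	... fill in bi_inode, bi_lbn, bi_daddr, bi_size, bi_bp ...
 *	if (lfs_markv(&fsid, bi, NBLK) < 0)
 *		err(1, "lfs_markv");
 *
 * Entries that share an inode should be adjacent in the array, since
 * the loop below fetches a fresh vnode only when bi_inode changes.
 */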
int
lfs_markv(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct lfs_markv_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */ *uap = v;
	BLOCK_INFO *blkp;
	IFILE *ifp;
	struct buf *bp, *nbp;
	struct inode *ip = NULL;
	struct lfs *fs;
	struct mount *mntp;
	struct vnode *vp;
#ifdef DEBUG_LFS
	int vputc = 0, iwritten = 0;
#endif
	fsid_t fsid;
	void *start;
	ino_t lastino;
	ufs_daddr_t b_daddr, v_daddr;
	int origcnt, cnt, error, lfs_fastvget_unlock;
	int do_again = 0;
	int s;
#ifdef CHECK_COPYIN
	int i;
#endif /* CHECK_COPYIN */
#ifdef LFS_TRACK_IOS
	int j;
#endif
#ifdef THROTTLE_REFERENCES
	ino_t refed_vnodes[LFS_VREF_THRESHOLD];
#endif
	int numlocked = 0, numrefed = 0;

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (EINVAL);

	fs = VFSTOUFS(mntp)->um_lfs;

	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
		return (error);

	origcnt = cnt = SCARG(uap, blkcnt);
	start = malloc(cnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK);
	error = copyin(SCARG(uap, blkiov), start, cnt * sizeof(BLOCK_INFO));
	if (error)
		goto err1;

	/*
	 * Take the seglock, so that even though we may have to sleep,
	 * the blocks we are cleaning cannot become invalid in the
	 * meantime.
	 *
	 * It is also important to note here that unless we specify SEGM_CKP,
	 * any Ifile blocks that we might be asked to clean will never get
	 * to the disk.
	 */
	lfs_seglock(fs, SEGM_SYNC|SEGM_CLEAN|SEGM_CKP);

	/* Mark blocks/inodes dirty.  */
	error = 0;

#ifdef DEBUG_LFS
	/* Run through and count the inodes */
	lastino = LFS_UNUSED_INUM;
	for (blkp = start; cnt--; ++blkp) {
		if (lastino != blkp->bi_inode) {
			lastino = blkp->bi_inode;
			vputc++;
		}
	}
	cnt = origcnt;
	printf("[%d/", vputc);
	iwritten = 0;
#endif /* DEBUG_LFS */
	/* these were inside the initialization for the for loop */
	v_daddr = LFS_UNUSED_DADDR;
	lastino = LFS_UNUSED_INUM;
	for (blkp = start; cnt--; ++blkp)
	{
#ifdef LFS_TRACK_IOS
		/*
		 * If there is I/O on this segment that is not yet complete,
		 * the cleaner probably does not have the right information.
		 * Send it packing.
		 */
		for (j = 0; j < LFS_THROTTLE; j++) {
			if (fs->lfs_pending[j] != LFS_UNUSED_DADDR
			    && datosn(fs, fs->lfs_pending[j]) == datosn(fs, blkp->bi_daddr)
			    && blkp->bi_daddr != LFS_FORCE_WRITE)
			{
				printf("lfs_markv: attempt to clean pending segment? (#%d)\n",
				       datosn(fs, fs->lfs_pending[j]));
				/* free(start, M_SEGMENT); */
				/* return (EBUSY); */
			}
		}
#endif /* LFS_TRACK_IOS */
		/*
		 * Get the IFILE entry (only once) and see if the file still
		 * exists.
		 */
		if (lastino != blkp->bi_inode) {
			/*
			 * Finish the old file, if there was one.  The presence
			 * of a usable vnode in vp is signaled by a valid v_daddr.
			 */
			if (v_daddr != LFS_UNUSED_DADDR) {
#ifdef DEBUG_LFS
				if (ip->i_flag & (IN_MODIFIED|IN_CLEANING))
					iwritten++;
#endif
				if (lfs_fastvget_unlock) {
					VOP_UNLOCK(vp, 0);
					numlocked--;
				}
#ifndef THROTTLE_REFERENCES
				lfs_vunref(vp);
				numrefed--;
#else
				/*
				 * Have to do this so that getnewvnode doesn't
				 * get ahold of one of these vnodes while
				 * we're still processing others, set VXLOCK,
				 * and prevent us from writing it out.
				 * XXX Yuck.
				 */
				if (numrefed == LFS_VREF_THRESHOLD - 1) {
					lfs_segwrite(mntp, SEGM_SYNC|SEGM_CLEAN|SEGM_CKP);
					while (--numrefed) {
						vp = ufs_ihashlookup(VFSTOUFS(mntp)->um_dev, refed_vnodes[numrefed]);
						if (vp && (VTOI(vp)->i_flag & IN_CLEANING))
							lfs_vunref(vp);
					}
				}
#endif
			}

			/*
			 * Start a new file
			 */
			lastino = blkp->bi_inode;
			if (blkp->bi_inode == LFS_IFILE_INUM)
				v_daddr = fs->lfs_idaddr;
			else {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				/* XXX fix for force write */
				v_daddr = ifp->if_daddr;
				brelse(bp);
			}
			/* Don't force-write the ifile */
			if (blkp->bi_inode == LFS_IFILE_INUM
			    && blkp->bi_daddr == LFS_FORCE_WRITE)
			{
				continue;
			}
			if (v_daddr == LFS_UNUSED_DADDR
			    && blkp->bi_daddr != LFS_FORCE_WRITE)
			{
				continue;
			}

			/* Get the vnode/inode. */
			error = lfs_fastvget(mntp, blkp->bi_inode, v_daddr,
					     &vp,
					     (blkp->bi_lbn == LFS_UNUSED_LBN
					      ? blkp->bi_bp
					      : NULL),
					     &lfs_fastvget_unlock);
			if (lfs_fastvget_unlock)
				numlocked++;

			if (!error) {
#ifndef THROTTLE_REFERENCES
				numrefed++;
#else
				refed_vnodes[numrefed++] = blkp->bi_inode;
#endif
			}
			if (error) {
#ifdef DIAGNOSTIC
				printf("lfs_markv: VFS_VGET failed with %d (ino %d, segment %d)\n",
				       error, blkp->bi_inode,
				       datosn(fs, blkp->bi_daddr));
#endif /* DIAGNOSTIC */
				/*
				 * If we got EAGAIN, that means that the
				 * Inode was locked.  This is
				 * recoverable: just clean the rest of
				 * this segment, and let the cleaner try
				 * again with another.  (When the
				 * cleaner runs again, this segment will
				 * sort high on the list, since it is
				 * now almost entirely empty.)  But, we
				 * still set v_daddr = LFS_UNUSED_DADDR
				 * so as not to test this over and over
				 * again.
				 */
				if (error == EAGAIN) {
					error = 0;
					do_again++;
				}
#ifdef DIAGNOSTIC
				else if (error != ENOENT)
					panic("lfs_markv VFS_VGET FAILED");
#endif
				/* lastino = LFS_UNUSED_INUM; */
				v_daddr = LFS_UNUSED_DADDR;
				vp = NULL;
				ip = NULL;
				continue;
			}
			ip = VTOI(vp);
		} else if (v_daddr == LFS_UNUSED_DADDR) {
			/*
			 * This can only happen if the vnode is dead (or
			 * in any case we can't get it...e.g., it is
			 * inlocked).  Keep going.
			 */
			continue;
		}

		/* Past this point we are guaranteed that vp, ip are valid. */

		/* If this BLOCK_INFO didn't contain a block, keep going. */
		if (blkp->bi_lbn == LFS_UNUSED_LBN) {
			/* XXX need to make sure that the inode gets written in this case */
			/* XXX but only write the inode if it's the right one */
			if (blkp->bi_inode != LFS_IFILE_INUM) {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				if (ifp->if_daddr == blkp->bi_daddr
				    || blkp->bi_daddr == LFS_FORCE_WRITE)
				{
#ifndef LFS_STINGY_CLEAN
					if (!(ip->i_flag & IN_MODIFIED))
						fs->lfs_uinodes++;
					ip->i_flag |= IN_MODIFIED;
#else
					if (!(ip->i_flag & IN_CLEANING))
						fs->lfs_uinodes++;
					ip->i_flag |= IN_CLEANING;
#endif
				}
				brelse(bp);
			}
			continue;
		}

		b_daddr = 0;
		if (blkp->bi_daddr != LFS_FORCE_WRITE) {
			if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &b_daddr, NULL) ||
			    b_daddr != blkp->bi_daddr)
			{
				if (datosn(fs, b_daddr)
				    == datosn(fs, blkp->bi_daddr))
				{
					printf("Wrong da same seg: %x vs %x\n",
					       blkp->bi_daddr, b_daddr);
				}
				continue;
			}
		}
		/*
		 * If we got to here, then we are keeping the block.  If
		 * it is an indirect block, we want to actually put it
		 * in the buffer cache so that it can be updated in the
		 * finish_meta section.  If it's not, we need to
		 * allocate a fake buffer so that writeseg can perform
		 * the copyin and write the buffer.
		 */
#if 0 && defined(LFS_STINGY_CLEAN)
		if (!(ip->i_flag & IN_CLEANING))
			fs->lfs_uinodes++;
		ip->i_flag |= IN_CLEANING;
#endif
		/*
		 * XXX - if the block we are reading has been *extended* since
		 * it was written to disk, then we risk throwing away
		 * the extension in bread()/getblk().  Check the size
		 * here.
		 */
		if (blkp->bi_size < fs->lfs_bsize) {
			s = splbio();
			bp = incore(vp, blkp->bi_lbn);
			if (bp && bp->b_bcount > blkp->bi_size) {
				printf("lfs_markv: %ld > %d (fixed)\n",
				       bp->b_bcount, blkp->bi_size);
				blkp->bi_size = bp->b_bcount;
			}
			splx(s);
		}
		if (blkp->bi_lbn >= 0) {	/* Data Block */
			/* XXX KS - should we use incore here, or just always use getblk()? */
			if ((bp = incore(vp, blkp->bi_lbn)) != NULL) {
				if (bp->b_bcount > blkp->bi_size) {
					printf("lfs_markv: %ld > %d (fixed)\n",
					       bp->b_bcount, blkp->bi_size);
					blkp->bi_size = bp->b_bcount;
				}
				bp = getblk(vp, blkp->bi_lbn, blkp->bi_size, 0, 0);
			} else
				bp = lfs_fakebuf(vp, blkp->bi_lbn, blkp->bi_size, blkp->bi_bp);
		} else {			/* Indirect block */
			bp = getblk(vp, blkp->bi_lbn, blkp->bi_size, 0, 0);
			if (!(bp->b_flags & (B_DONE|B_DELWRI))) { /* B_CACHE */
				/*
				 * The block in question was not found
				 * in the cache; i.e., the block that
				 * getblk() returned is empty.  So, we
				 * can (and should) copy in the
				 * contents, because we've already
				 * determined that this was the right
				 * version of this block on disk.
				 *
				 * And, it can't have changed underneath
				 * us, because we have the segment lock.
				 */
				error = copyin(blkp->bi_bp, bp->b_data, blkp->bi_size);
				if (error)
					goto err2;
			}
		}
#ifndef LFS_STINGY_CLEAN
		/*
		 * At this point, we just write the block to be written again.
		 * lfs_bwrite will not block for us since we are calling it
		 * with the no-wait flag.
		 */
		ip->i_flag |= IN_UPDATE;
#endif /* LFS_STINGY_CLEAN */
		if ((error = lfs_bwrite_ext(bp, BW_CLEAN)) != 0)
			goto err2;
	}

	/*
	 * Finish the old file, if there was one
	 */
	if (v_daddr != LFS_UNUSED_DADDR) {
#ifdef DEBUG_LFS
		if (ip->i_flag & (IN_MODIFIED|IN_CLEANING))
			iwritten++;
#endif
		if (lfs_fastvget_unlock) {
			VOP_UNLOCK(vp, 0);
			numlocked--;
		}
#ifndef THROTTLE_REFERENCES
		lfs_vunref(vp);
		numrefed--;
#endif
	}

	/*
	 * The last write has to be SEGM_SYNC, because of calling semantics.
	 * It also has to be SEGM_CKP, because otherwise we could write
	 * over the newly cleaned data contained in a checkpoint, and then
	 * we'd be unhappy at recovery time.
	 */
	lfs_segwrite(mntp, SEGM_SYNC|SEGM_CLEAN|SEGM_CKP);
#ifdef THROTTLE_REFERENCES
	/* unref the last few vnodes */
	while (--numrefed) {
		vp = ufs_ihashlookup(VFSTOUFS(mntp)->um_dev, refed_vnodes[numrefed]);
		if (vp && (VTOI(vp)->i_flag & IN_CLEANING))
			lfs_vunref(vp);
	}
#endif
	free(start, M_SEGMENT);

#ifdef LFS_STINGY_CLEAN
	/*
	 * Now that we've finished the segwrite, go back and unmark all
	 * of the vnodes.
	 */
	/* XXX this inverts the vnode freelist, use the back-hack instead */
loop:
	for (vp = mntp->mnt_vnodelist.lh_first;
	     vp != NULL;
	     vp = vp->v_mntvnodes.le_next)
	{
		if (vp->v_mount != mntp)
			goto loop;
		if (lfs_vref(vp))
			continue;
		ip = VTOI(vp);
		if (ip->i_flag & IN_CLEANING) {
			ip->i_flag &= ~IN_CLEANING;
			printf("{%d}", ip->i_number);
			if (ip->i_flag & IN_MODIFIED) {
				fs->lfs_uinodes--;
#ifdef DEBUG_LFS
				if ((int32_t)fs->lfs_uinodes < 0) {
					printf("U3");
					fs->lfs_uinodes = 0;
				}
#endif
			} else
				ip->i_flag |= IN_MODIFIED;
			if (lfs_clean_vnhead
			    && (VTOI(vp)->i_flag & (IN_ACCESS|IN_UPDATE|IN_CHANGE|IN_MODIFIED)) == 0)
			{
				lfs_vunref_head(vp);
				continue;
			}
		}
		lfs_vunref(vp);
	}
#endif /* LFS_STINGY_CLEAN */

	lfs_segunlock(fs);

#ifdef DEBUG_LFS
	printf("%d]", iwritten);
	if (numlocked != 0 || numrefed != 0) {
		panic("lfs_markv: numlocked=%d numrefed=%d", numlocked, numrefed);
	}
#endif

	if (error)
		return (error);
	else if (do_again)
		return EAGAIN;

	return 0;

err2:
	printf("markv err2\n");
	lfs_vunref(vp);
	/* Free up fakebuffers -- have to take these from the LOCKED list */
again:
	for (bp = bufqueues[BQ_LOCKED].tqh_first; bp; bp = nbp) {
		nbp = bp->b_freelist.tqe_next;
		if (bp->b_flags & B_CALL) {
			s = splbio();
			if (bp->b_flags & B_BUSY) { /* not bloody likely */
				bp->b_flags |= B_WANTED;
				tsleep(bp, PRIBIO+1, "markv", 0);
				splx(s);
				goto again;
			}
			bremfree(bp);
			splx(s);
			brelse(bp);
		}
	}
	free(start, M_SEGMENT);
	lfs_segunlock(fs);
	vfs_unbusy(mntp);
	return (error);

err1:
	printf("markv err1\n");
	free(start, M_SEGMENT);
	return (error);
}

/*
 * lfs_bmapv:
 *
 * This will fill in the current disk address for arrays of blocks.
 *
 *  0 on success
 * -1/errno is returned on error.
 */
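/*
 * Illustrative use, assuming the usual cleaning flow: the cleaner
 * fills in bi_inode and bi_lbn for each block it found in a segment,
 * calls
 *
 *	lfs_bmapv(&fsid, bi, nblk);	(fills in each bi_daddr)
 *
 * and then discards entries whose returned address has moved out of
 * the segment being cleaned before passing the rest to lfs_markv.
 */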

int
lfs_bmapv(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct lfs_bmapv_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */ *uap = v;
	BLOCK_INFO *blkp;
	IFILE *ifp;
	struct buf *bp;
	struct inode *ip = NULL;
	struct lfs *fs;
	struct mount *mntp;
	struct ufsmount *ump;
	struct vnode *vp;
	fsid_t fsid;
	void *start;
	ino_t lastino;
	ufs_daddr_t v_daddr;
	int origcnt, cnt, error, need_unlock = 0;
	int numlocked = 0, numrefed = 0;
#ifdef LFS_TRACK_IOS
	int j;
#endif

	lfs_cleaner_pid = p->p_pid;

	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);
	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (EINVAL);

	ump = VFSTOUFS(mntp);

	origcnt = cnt = SCARG(uap, blkcnt);
	start = malloc(cnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK);
	error = copyin(SCARG(uap, blkiov), start, cnt * sizeof(BLOCK_INFO));
	if (error) {
		free(start, M_SEGMENT);
		return (error);
	}

	fs = VFSTOUFS(mntp)->um_lfs;

	error = 0;

	/* these were inside the initialization for the for loop */
	v_daddr = LFS_UNUSED_DADDR;
	lastino = LFS_UNUSED_INUM;
	for (blkp = start; cnt--; ++blkp)
	{
#ifdef DEBUG
		if (datosn(fs, fs->lfs_curseg) == datosn(fs, blkp->bi_daddr)) {
			printf("Hm, attempt to clean current segment? (#%d)\n",
			       datosn(fs, fs->lfs_curseg));
			free(start, M_SEGMENT);
			return (EBUSY);
		}
#endif /* DEBUG */
#ifdef LFS_TRACK_IOS
		/*
		 * If there is I/O on this segment that is not yet complete,
		 * the cleaner probably does not have the right information.
		 * Send it packing.
		 */
		for (j = 0; j < LFS_THROTTLE; j++) {
			if (fs->lfs_pending[j] != LFS_UNUSED_DADDR
			    && datosn(fs, fs->lfs_pending[j]) == datosn(fs, blkp->bi_daddr))
			{
				printf("lfs_bmapv: attempt to clean pending segment? (#%d)\n",
				       datosn(fs, fs->lfs_pending[j]));
				free(start, M_SEGMENT);
				return (EBUSY);
			}
		}

#endif /* LFS_TRACK_IOS */
		/*
		 * Get the IFILE entry (only once) and see if the file still
		 * exists.
		 */
		if (lastino != blkp->bi_inode) {
			/*
			 * Finish the old file, if there was one.  The presence
			 * of a usable vnode in vp is signaled by a valid
			 * v_daddr.
			 */
			if (v_daddr != LFS_UNUSED_DADDR) {
				if (need_unlock) {
					VOP_UNLOCK(vp, 0);
					numlocked--;
				}
				lfs_vunref(vp);
				numrefed--;
			}

			/*
			 * Start a new file
			 */
			lastino = blkp->bi_inode;
			if (blkp->bi_inode == LFS_IFILE_INUM)
				v_daddr = fs->lfs_idaddr;
			else {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				v_daddr = ifp->if_daddr;
				brelse(bp);
			}
			if (v_daddr == LFS_UNUSED_DADDR) {
				blkp->bi_daddr = LFS_UNUSED_DADDR;
				continue;
			}
			/*
			 * A regular call to VFS_VGET could deadlock
			 * here.  Instead, we try an unlocked access.
			 */
			vp = ufs_ihashlookup(ump->um_dev, blkp->bi_inode);
			if (vp != NULL) {
				ip = VTOI(vp);
				if (VOP_ISLOCKED(vp)) {
					/* printf("inode %d inlocked in bmapv\n", ip->i_number); */
					need_unlock = 0;
				} else {
					VOP_LOCK(vp, LK_EXCLUSIVE);
					need_unlock = FVG_UNLOCK;
					numlocked++;
				}
				lfs_vref(vp);
				numrefed++;
			} else {
				error = VFS_VGET(mntp, blkp->bi_inode, &vp);
				if (error) {
					v_daddr = LFS_UNUSED_DADDR;
					need_unlock = 0;
					continue;
				} else {
					need_unlock = FVG_PUT;
					numlocked++;
					numrefed++;
				}
			}
			ip = VTOI(vp);
		} else if (v_daddr == LFS_UNUSED_DADDR) {
			/*
			 * This can only happen if the vnode is dead.
			 * Keep going.  Note that we DO NOT set the
			 * bi_addr to anything -- if we failed to get
			 * the vnode, for example, we want to assume
			 * conservatively that all of its blocks *are*
			 * located in the segment in question.
			 * lfs_markv will throw them out if we are
			 * wrong.
			 */
			/* blkp->bi_daddr = LFS_UNUSED_DADDR; */
			continue;
		}

		/* Past this point we are guaranteed that vp, ip are valid. */

		if (blkp->bi_lbn == LFS_UNUSED_LBN) {
			/*
			 * We just want the inode address, which is
			 * conveniently in v_daddr.
			 */
			blkp->bi_daddr = v_daddr;
		} else {
			error = VOP_BMAP(vp, blkp->bi_lbn, NULL,
					 &(blkp->bi_daddr), NULL);
			if (error)
			{
				blkp->bi_daddr = LFS_UNUSED_DADDR;
				continue;
			}
		}
	}

	/*
	 * Finish the old file, if there was one.  The presence
	 * of a usable vnode in vp is signaled by a valid v_daddr.
	 */
	if (v_daddr != LFS_UNUSED_DADDR) {
		if (need_unlock) {
			VOP_UNLOCK(vp, 0);
			numlocked--;
		}
		lfs_vunref(vp);
		numrefed--;
	}

	if (numlocked != 0 || numrefed != 0) {
		panic("lfs_bmapv: numlocked=%d numrefed=%d", numlocked,
		      numrefed);
	}

	copyout(start, SCARG(uap, blkiov), origcnt * sizeof(BLOCK_INFO));
	free(start, M_SEGMENT);

	return 0;
}

/*
 * lfs_segclean:
 *
 * Mark the segment clean.
 *
 *  0 on success
 * -1/errno is returned on error.
 */
int
lfs_segclean(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct lfs_segclean_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(u_long) segment;
	} */ *uap = v;
	CLEANERINFO *cip;
	SEGUSE *sup;
	struct buf *bp;
	struct mount *mntp;
	struct lfs *fs;
	fsid_t fsid;
	int error;

	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);
	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (EINVAL);

	fs = VFSTOUFS(mntp)->um_lfs;

	if (datosn(fs, fs->lfs_curseg) == SCARG(uap, segment))
		return (EBUSY);

	LFS_SEGENTRY(sup, fs, SCARG(uap, segment), bp);
	if (sup->su_flags & SEGUSE_ACTIVE) {
		brelse(bp);
		return (EBUSY);
	}
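
	/*
	 * Credit the segment back to the free counts: lfs_avail regains
	 * (nearly) the whole segment, while lfs_bfree regains the room
	 * that the segment's summary and inode blocks were occupying.
	 */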
	fs->lfs_avail += fsbtodb(fs, fs->lfs_ssize) - 1;
	fs->lfs_bfree += (sup->su_nsums * LFS_SUMMARY_SIZE / DEV_BSIZE) +
		sup->su_ninos * btodb(fs->lfs_bsize);
	sup->su_flags &= ~SEGUSE_DIRTY;
#if 1
	/*
	 * XXX KS - before we return, really empty the segment, overwriting
	 * it with a recognizable pattern ('Z').  This is only for debugging
	 * purposes.
	 */
	{
		daddr_t start;
		int offset, sizeleft, bufsize;
		struct buf *zbp;

		start = sntoda(fs, SCARG(uap, segment));
		offset = (sup->su_flags & SEGUSE_SUPERBLOCK) ? LFS_SBPAD : 0;
		sizeleft = fs->lfs_ssize / DEV_BSIZE - offset;
		while (sizeleft > 0) {
			bufsize = (sizeleft < MAXPHYS) ? sizeleft : MAXPHYS;
			zbp = lfs_newbuf(VTOI(fs->lfs_ivnode)->i_devvp, start + offset, bufsize);
			memset(zbp->b_data, 'Z', bufsize);
			VOP_STRATEGY(zbp);
			offset += bufsize;
			sizeleft -= bufsize;
		}
	}
#endif
	(void) VOP_BWRITE(bp);

	LFS_CLEANERINFO(cip, fs, bp);
	++cip->clean;
	--cip->dirty;
	fs->lfs_nclean = cip->clean;
	(void) VOP_BWRITE(bp);
	wakeup(&fs->lfs_avail);

	return (0);
}

/*
 * lfs_segwait:
 *
 * This will block until a segment in file system fsid is written.  A
 * timeout may be specified, after which the cleaner is awakened
 * automatically.  An fsid of -1 means any file system, and a timeout
 * of 0 means forever.
 *
 *  0 on success
 *  1 on timeout
 * -1/errno is returned on error.
 */
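/*
 * E.g., a cleaner's main loop might look like this (illustrative only;
 * clean_segments() stands for a hypothetical cleaner routine):
 *
 *	struct timeval tv = { 300, 0 };
 *	while (lfs_segwait(&fsid, &tv) >= 0)
 *		clean_segments();
 */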
int
lfs_segwait(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct lfs_segwait_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct timeval *) tv;
	} */ *uap = v;
	extern int lfs_allclean_wakeup;
	struct mount *mntp;
	struct timeval atv;
	fsid_t fsid;
	void *addr;
	u_long timeout;
	int error, s;

	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) {
		return (error);
	}
	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);
	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		addr = &lfs_allclean_wakeup;
	else
		addr = &VFSTOUFS(mntp)->um_lfs->lfs_nextseg;

	if (SCARG(uap, tv)) {
		error = copyin(SCARG(uap, tv), &atv, sizeof(struct timeval));
		if (error)
			return (error);
		if (itimerfix(&atv))
			return (EINVAL);
		s = splclock();
		timeradd(&atv, &time, &atv);
		timeout = hzto(&atv);
		splx(s);
	} else
		timeout = 0;

	error = tsleep(addr, PCATCH | PUSER, "segment", timeout);
	return (error == ERESTART ? EINTR : 0);
}

/*
 * VFS_VGET call specialized for the cleaner.  The cleaner already knows the
 * daddr from the ifile, so don't look it up again.  If the cleaner is
 * processing IINFO structures, it may have the on-disk inode already, so
 * don't go retrieving it again.
 *
 * If we find the vnode on the hash chain, then it may be locked by another
 * process; so we set (*need_unlock) to zero.
 *
 * If we don't, we call ufs_ihashins, which locks the inode, and we set
 * (*need_unlock) to non-zero.
 *
 * In either case we lfs_vref, and it is the caller's responsibility to
 * lfs_vunref and VOP_UNLOCK (if necessary) when finished.
 */
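/*
 * Caller-side sketch (cf. the use in lfs_markv above):
 *
 *	error = lfs_fastvget(mntp, ino, daddr, &vp, dinp, &flags);
 *	if (error == 0) {
 *		... use vp ...
 *		if (flags)		(FVG_UNLOCK or FVG_PUT set)
 *			VOP_UNLOCK(vp, 0);
 *		lfs_vunref(vp);
 *	}
 */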
#ifdef USE_UFS_HASHLOCK
extern struct lock ufs_hashlock;
#endif

int
lfs_fastvget(mp, ino, daddr, vpp, dinp, need_unlock)
	struct mount *mp;
	ino_t ino;
	ufs_daddr_t daddr;
	struct vnode **vpp;
	struct dinode *dinp;
	int *need_unlock;
{
	register struct inode *ip;
	struct vnode *vp;
	struct ufsmount *ump;
	dev_t dev;
	int error;
	struct buf *bp;

	ump = VFSTOUFS(mp);
	dev = ump->um_dev;
	*need_unlock = 0;
	/*
	 * This is playing fast and loose.  Someone may have the inode
	 * locked, in which case they are going to be distinctly unhappy
	 * if we trash something.
	 */
#ifdef USE_UFS_HASHLOCK
	do {
#endif
		if ((*vpp = ufs_ihashlookup(dev, ino)) != NULL) {
			lfs_vref(*vpp);
			if ((*vpp)->v_flag & VXLOCK) {
				/* printf("vnode VXLOCKed\n"); */
				clean_vnlocked++;
#ifdef LFS_EAGAIN_FAIL
				lfs_vunref(*vpp);
				return EAGAIN;
#endif
			}
			ip = VTOI(*vpp);
			if (VOP_ISLOCKED(*vpp)) {
				printf("ino %d inlocked by pid %d\n", ip->i_number,
				       ip->i_lock.lk_lockholder);
				clean_inlocked++;
#ifdef LFS_EAGAIN_FAIL
				lfs_vunref(*vpp);
				return EAGAIN;
#endif /* LFS_EAGAIN_FAIL */
			} else {
				VOP_LOCK(*vpp, LK_EXCLUSIVE);
				*need_unlock |= FVG_UNLOCK;
			}
#ifndef LFS_STINGY_CLEAN
			if (!(ip->i_flag & IN_MODIFIED))
				++ump->um_lfs->lfs_uinodes;
			ip->i_flag |= IN_MODIFIED;
#endif /* LFS_STINGY_CLEAN */
			return (0);
		}
#ifdef USE_UFS_HASHLOCK
	} while (lockmgr(&ufs_hashlock, LK_EXCLUSIVE|LK_SLEEPFAIL, 0));
#endif

	/* Allocate new vnode/inode. */
	if ((error = lfs_vcreate(mp, ino, &vp)) != 0) {
		*vpp = NULL;
#ifdef USE_UFS_HASHLOCK
		lockmgr(&ufs_hashlock, LK_RELEASE, 0);
#endif
		return (error);
	}
	/*
	 * Put it onto its hash chain and lock it so that other requests for
	 * this inode will block if they arrive while we are sleeping waiting
	 * for old data structures to be purged or for the contents of the
	 * disk portion of this inode to be read.
	 */
	ip = VTOI(vp);
	ufs_ihashins(ip);
#ifdef USE_UFS_HASHLOCK
	lockmgr(&ufs_hashlock, LK_RELEASE, 0);
#endif

	/*
	 * XXX
	 * This may not need to be here, logically it should go down with
	 * the i_devvp initialization.
	 * Ask Kirk.
	 */
	ip->i_lfs = ump->um_lfs;

	/* Read in the disk contents for the inode, copy into the inode. */
	if (dinp) {
		error = copyin(dinp, &ip->i_din.ffs_din, DINODE_SIZE);
		if (error) {
			ufs_ihashrem(ip);

			/* Unlock and discard unneeded inode. */
			lfs_vunref(vp);
			*vpp = NULL;
			return (error);
		}
		if (ip->i_number != ino)
			panic("lfs_fastvget: I was fed the wrong inode!");
	} else {
		error = bread(ump->um_devvp, daddr,
			      (int)ump->um_lfs->lfs_bsize, NOCRED, &bp);
		if (error) {
			printf("error != 0 at %s:%d\n", __FILE__, __LINE__);
			/*
			 * The inode does not contain anything useful, so it
			 * would be misleading to leave it on its hash chain.
			 * Iput() will return it to the free list.
			 */
			ufs_ihashrem(ip);

			/* Unlock and discard unneeded inode. */
			lfs_vunref(vp);
			brelse(bp);
			*vpp = NULL;
			return (error);
		}
		ip->i_din.ffs_din =
			*lfs_ifind(ump->um_lfs, ino, (struct dinode *)bp->b_data);
		brelse(bp);
	}

	/*
	 * Initialize the vnode from the inode, check for aliases.  In all
	 * cases re-init ip, the underlying vnode/inode may have changed.
	 */
	error = ufs_vinit(mp, lfs_specop_p, lfs_fifoop_p, &vp);
	if (error) {
		lfs_vunref(vp);
		*vpp = NULL;
		return (error);
	}
#ifdef DEBUG_LFS
	if (vp->v_type == VNON) {
		printf("lfs_fastvget: ino %d is type VNON! (ifmt=%o, dinp=%p)\n",
		       ip->i_number, (ip->i_ffs_mode & IFMT) >> 12, dinp);
		lfs_dump_dinode(&ip->i_din.ffs_din);
#ifdef DDB
		Debugger();
#endif
	}
#endif /* DEBUG_LFS */
	/*
	 * Finish inode initialization now that aliasing has been resolved.
	 */
	ip->i_devvp = ump->um_devvp;
#ifndef LFS_STINGY_CLEAN
	ip->i_flag |= IN_MODIFIED;
	++ump->um_lfs->lfs_uinodes;
#endif
	VREF(ip->i_devvp);
	*vpp = vp;
	*need_unlock |= FVG_PUT;

	return (0);
}
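
/*
 * Construct a "fake" buffer for a data block whose contents still live
 * in the cleaner's address space: b_saveaddr carries the user address,
 * and writeseg performs the copyin when the block is finally written
 * (see the fake-buffer comment in lfs_markv).  B_INVAL is set,
 * presumably so the buffer is never taken for valid cache contents.
 */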
struct buf *
lfs_fakebuf(vp, lbn, size, uaddr)
	struct vnode *vp;
	int lbn;
	size_t size;
	caddr_t uaddr;
{
	struct buf *bp;

#ifdef DEBUG
	/* Check for duplicates too */
	if (incore(vp, lbn)) {
		printf("Fake buffer (%d/%d) is in core\n", VTOI(vp)->i_number,
		       lbn);
		if (bread(vp, lbn, size, NOCRED, &bp))
			return NULL;
	}
#endif
	bp = lfs_newbuf(vp, lbn, 0);
	bp->b_saveaddr = uaddr;
	bp->b_bufsize = size;
	bp->b_bcount = size;
	bp->b_flags |= B_INVAL;
	return (bp);
}