lfs_syscalls.c revision 1.49 1 /* $NetBSD: lfs_syscalls.c,v 1.49 2000/09/09 04:49:55 perseant Exp $ */
2
3 /*-
4 * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Konrad E. Schroder <perseant (at) hhhh.org>.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the NetBSD
21 * Foundation, Inc. and its contributors.
22 * 4. Neither the name of The NetBSD Foundation nor the names of its
23 * contributors may be used to endorse or promote products derived
24 * from this software without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
37 */
38 /*-
39 * Copyright (c) 1991, 1993, 1994
40 * The Regents of the University of California. All rights reserved.
41 *
42 * Redistribution and use in source and binary forms, with or without
43 * modification, are permitted provided that the following conditions
44 * are met:
45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer.
47 * 2. Redistributions in binary form must reproduce the above copyright
48 * notice, this list of conditions and the following disclaimer in the
49 * documentation and/or other materials provided with the distribution.
50 * 3. All advertising materials mentioning features or use of this software
51 * must display the following acknowledgement:
52 * This product includes software developed by the University of
53 * California, Berkeley and its contributors.
54 * 4. Neither the name of the University nor the names of its contributors
55 * may be used to endorse or promote products derived from this software
56 * without specific prior written permission.
57 *
58 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
59 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
60 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
61 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
62 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
63 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
64 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68 * SUCH DAMAGE.
69 *
70 * @(#)lfs_syscalls.c 8.10 (Berkeley) 5/14/95
71 */
72
73 #include "fs_lfs.h" /* for prototypes in syscallargs.h */
74
75 #include <sys/param.h>
76 #include <sys/systm.h>
77 #include <sys/proc.h>
78 #include <sys/buf.h>
79 #include <sys/mount.h>
80 #include <sys/vnode.h>
81 #include <sys/malloc.h>
82 #include <sys/kernel.h>
83
84 #include <sys/syscallargs.h>
85
86 #include <ufs/ufs/quota.h>
87 #include <ufs/ufs/inode.h>
88 #include <ufs/ufs/ufsmount.h>
89 #include <ufs/ufs/ufs_extern.h>
90
91 #include <ufs/lfs/lfs.h>
92 #include <ufs/lfs/lfs_extern.h>
93
94 /* Flags for return from lfs_fastvget */
/*
 * These bits are OR'ed into the need_unlock out-parameter: FVG_UNLOCK by
 * lfs_fasthashget once it has taken the vnode lock itself, FVG_PUT by
 * lfs_fastvget once it has built a brand-new vnode.  sys_lfs_bmapv reuses
 * the same values for its private need_unlock variable.
 */
95 #define FVG_UNLOCK 0x01 /* Needs to be unlocked */
96 #define FVG_PUT 0x02 /* Needs to be vput() */
97
/* Prototypes for helpers defined later in this file. */
98 struct buf *lfs_fakebuf __P((struct vnode *, int, size_t, caddr_t));
99 int lfs_fasthashget __P((dev_t, ino_t, int *, struct vnode **));
100
/*
 * Debugging knobs and counters.  debug_cleaner and verbose_debug are not
 * referenced by the code visible in this file.  clean_vnlocked is bumped by
 * lfs_fasthashget when it meets a VXLOCKed vnode; clean_inlocked is bumped
 * when lfs_vref fails or the vnode is already locked.
 */
101 int debug_cleaner = 0;
102 int clean_vnlocked = 0;
103 int clean_inlocked = 0;
104 int verbose_debug = 0;
105
/* Set by sys_lfs_bmapv to the pid of the process that called it. */
106 pid_t lfs_cleaner_pid = 0;
107
108 /*
109 * Definitions for the buffer free lists.
110 */
/*
 * NOTE(review): these presumably mirror definitions private to the buffer
 * cache implementation -- confirm they stay in sync.  Only BQ_LOCKED is used
 * in this file (sys_lfs_markv walks it to discard B_CALL buffers on error).
 */
111 #define BQUEUES 4 /* number of free buffer queues */
112
113 #define BQ_LOCKED 0 /* super-blocks &c */
114 #define BQ_LRU 1 /* lru, useful buffers */
115 #define BQ_AGE 2 /* rubbish */
116 #define BQ_EMPTY 3 /* buffer headers with no memory */
117
118 extern TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
119
/*
 * A bi_daddr equal to LFS_FORCE_WRITE asks sys_lfs_markv to write the block
 * or inode without comparing it against its current disk address.
 * UNASSIGNED is defined outside this file.
 */
120 #define LFS_FORCE_WRITE UNASSIGNED
121
/* Not referenced by the code visible in this file. */
122 #define LFS_VREF_THRESHOLD 128
123
124 /*
125 * sys_lfs_markv:
126 *
127 * This will mark inodes and blocks dirty, so they are written into the log.
128 * It will block until all the blocks have been written. The segment create
129 * time passed in the block_info and inode_info structures is used to decide
130 * if the data is valid for each block (in case some process dirtied a block
131 * or inode that is being cleaned between the determination that a block is
132 * live and the lfs_markv call).
133 *
134 * 0 on success
135 * -1/errno is return on error.
136 */
137 int
138 sys_lfs_markv(p, v, retval)
139 struct proc *p;
140 void *v;
141 register_t *retval;
142 {
143 struct sys_lfs_markv_args /* {
144 syscallarg(fsid_t *) fsidp;
145 syscallarg(struct block_info *) blkiov;
146 syscallarg(int) blkcnt;
147 } */ *uap = v;
148 BLOCK_INFO *blkp;
149 IFILE *ifp;
150 struct buf *bp, *nbp;
151 struct inode *ip = NULL;
152 struct lfs *fs;
153 struct mount *mntp;
154 struct vnode *vp;
155 #ifdef DEBUG_LFS
156 int vputc=0, iwritten=0;
157 #endif
158 fsid_t fsid;
159 void *start;
160 ino_t lastino;
161 ufs_daddr_t b_daddr, v_daddr;
162 int origcnt, cnt, error, lfs_fastvget_unlock;
163 int do_again=0;
164 int s;
165 #ifdef CHECK_COPYIN
166 int i;
167 #endif /* CHECK_COPYIN */
168 #ifdef LFS_TRACK_IOS
169 int j;
170 #endif
171 int numlocked=0, numrefed=0;
172 ino_t maxino;
173
174 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
175 return (error);
176
177 if ((mntp = vfs_getvfs(&fsid)) == NULL)
178 return (EINVAL);
179
180 fs = VFSTOUFS(mntp)->um_lfs;
181
182 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
183 return (error);
184
185 maxino = (dbtofsb(fs, VTOI(fs->lfs_ivnode)->i_ffs_blocks) -
186 fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb;
187
188 origcnt = cnt = SCARG(uap, blkcnt);
189 start = malloc(cnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK);
190 error = copyin(SCARG(uap, blkiov), start, cnt * sizeof(BLOCK_INFO));
191 if (error)
192 goto err1;
193
194 /*
195 * This seglock is just to prevent the fact that we might have to sleep
196 * from allowing the possibility that our blocks might become
197 * invalid.
198 *
199 * It is also important to note here that unless we specify SEGM_CKP,
200 * any Ifile blocks that we might be asked to clean will never get
201 * to the disk.
202 */
203 lfs_seglock(fs, SEGM_SYNC|SEGM_CLEAN|SEGM_CKP);
204
205 /* Mark blocks/inodes dirty. */
206 error = 0;
207
208 #ifdef DEBUG_LFS
209 /* Run through and count the inodes */
210 lastino = LFS_UNUSED_INUM;
211 for(blkp = start; cnt--; ++blkp) {
212 if(lastino != blkp->bi_inode) {
213 lastino = blkp->bi_inode;
214 vputc++;
215 }
216 }
217 cnt = origcnt;
218 printf("[%d/",vputc);
219 iwritten=0;
220 #endif /* DEBUG_LFS */
221 /* these were inside the initialization for the for loop */
222 v_daddr = LFS_UNUSED_DADDR;
223 lastino = LFS_UNUSED_INUM;
224 for (blkp = start; cnt--; ++blkp)
225 {
226 if(blkp->bi_daddr == LFS_FORCE_WRITE)
227 printf("lfs_markv: warning: force-writing ino %d lbn %d\n",
228 blkp->bi_inode, blkp->bi_lbn);
229 #ifdef LFS_TRACK_IOS
230 /*
231 * If there is I/O on this segment that is not yet complete,
232 * the cleaner probably does not have the right information.
233 * Send it packing.
234 */
235 for(j=0;j<LFS_THROTTLE;j++) {
236 if(fs->lfs_pending[j] != LFS_UNUSED_DADDR
237 && datosn(fs,fs->lfs_pending[j])==datosn(fs,blkp->bi_daddr)
238 && blkp->bi_daddr != LFS_FORCE_WRITE)
239 {
240 printf("lfs_markv: attempt to clean pending segment? (#%d)\n",
241 datosn(fs, fs->lfs_pending[j]));
242 /* free(start,M_SEGMENT); */
243 /* return (EBUSY); */
244 }
245 }
246 #endif /* LFS_TRACK_IOS */
247 /* Bounds-check incoming data, avoid panic for failed VGET */
248 if (blkp->bi_inode <= 0 || blkp->bi_inode >= maxino) {
249 error = EINVAL;
250 goto again;
251 }
252 /*
253 * Get the IFILE entry (only once) and see if the file still
254 * exists.
255 */
256 if (lastino != blkp->bi_inode) {
257 /*
258 * Finish the old file, if there was one. The presence
259 * of a usable vnode in vp is signaled by a valid v_daddr.
260 */
261 if(v_daddr != LFS_UNUSED_DADDR) {
262 #ifdef DEBUG_LFS
263 if(ip->i_flag & (IN_MODIFIED|IN_CLEANING))
264 iwritten++;
265 #endif
266 if(lfs_fastvget_unlock) {
267 VOP_UNLOCK(vp, 0);
268 numlocked--;
269 }
270 lfs_vunref(vp);
271 numrefed--;
272 }
273
274 /*
275 * Start a new file
276 */
277 lastino = blkp->bi_inode;
278 if (blkp->bi_inode == LFS_IFILE_INUM)
279 v_daddr = fs->lfs_idaddr;
280 else {
281 LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
282 /* XXX fix for force write */
283 v_daddr = ifp->if_daddr;
284 brelse(bp);
285 }
286 /* Don't force-write the ifile */
287 if (blkp->bi_inode == LFS_IFILE_INUM
288 && blkp->bi_daddr == LFS_FORCE_WRITE)
289 {
290 continue;
291 }
292 if (v_daddr == LFS_UNUSED_DADDR
293 && blkp->bi_daddr != LFS_FORCE_WRITE)
294 {
295 continue;
296 }
297
298 /* Get the vnode/inode. */
299 error=lfs_fastvget(mntp, blkp->bi_inode, v_daddr,
300 &vp,
301 (blkp->bi_lbn==LFS_UNUSED_LBN
302 ? blkp->bi_bp
303 : NULL),
304 &lfs_fastvget_unlock);
305 if(lfs_fastvget_unlock)
306 numlocked++;
307
308 if(!error) {
309 numrefed++;
310 }
311 if(error) {
312 #ifdef DEBUG_LFS
313 printf("lfs_markv: lfs_fastvget failed with %d (ino %d, segment %d)\n",
314 error, blkp->bi_inode,
315 datosn(fs, blkp->bi_daddr));
316 #endif /* DEBUG_LFS */
317 /*
318 * If we got EAGAIN, that means that the
319 * Inode was locked. This is
320 * recoverable: just clean the rest of
321 * this segment, and let the cleaner try
322 * again with another. (When the
323 * cleaner runs again, this segment will
324 * sort high on the list, since it is
325 * now almost entirely empty.) But, we
326 * still set v_daddr = LFS_UNUSED_ADDR
327 * so as not to test this over and over
328 * again.
329 */
330 if(error == EAGAIN) {
331 error = 0;
332 do_again++;
333 }
334 #ifdef DIAGNOSTIC
335 else if(error != ENOENT)
336 panic("lfs_markv VFS_VGET FAILED");
337 #endif
338 /* lastino = LFS_UNUSED_INUM; */
339 v_daddr = LFS_UNUSED_DADDR;
340 vp = NULL;
341 ip = NULL;
342 continue;
343 }
344 ip = VTOI(vp);
345 } else if (v_daddr == LFS_UNUSED_DADDR) {
346 /*
347 * This can only happen if the vnode is dead (or
348 * in any case we can't get it...e.g., it is
349 * inlocked). Keep going.
350 */
351 continue;
352 }
353
354 /* Past this point we are guaranteed that vp, ip are valid. */
355
356 /* If this BLOCK_INFO didn't contain a block, keep going. */
357 if (blkp->bi_lbn == LFS_UNUSED_LBN) {
358 /* XXX need to make sure that the inode gets written in this case */
359 /* XXX but only write the inode if it's the right one */
360 if (blkp->bi_inode != LFS_IFILE_INUM) {
361 LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
362 if(ifp->if_daddr == blkp->bi_daddr
363 || blkp->bi_daddr == LFS_FORCE_WRITE)
364 {
365 LFS_SET_UINO(ip, IN_CLEANING);
366 }
367 brelse(bp);
368 }
369 continue;
370 }
371
372 b_daddr = 0;
373 if(blkp->bi_daddr != LFS_FORCE_WRITE) {
374 if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &b_daddr, NULL) ||
375 b_daddr != blkp->bi_daddr)
376 {
377 if(datosn(fs,b_daddr)
378 == datosn(fs,blkp->bi_daddr))
379 {
380 printf("lfs_markv: wrong da same seg: %x vs %x\n",
381 blkp->bi_daddr, b_daddr);
382 }
383 continue;
384 }
385 }
386 /*
387 * If we got to here, then we are keeping the block. If
388 * it is an indirect block, we want to actually put it
389 * in the buffer cache so that it can be updated in the
390 * finish_meta section. If it's not, we need to
391 * allocate a fake buffer so that writeseg can perform
392 * the copyin and write the buffer.
393 */
394 /*
395 * XXX - if the block we are reading has been *extended* since
396 * it was written to disk, then we risk throwing away
397 * the extension in bread()/getblk(). Check the size
398 * here.
399 */
400 if(blkp->bi_size < fs->lfs_bsize) {
401 s = splbio();
402 bp = incore(vp, blkp->bi_lbn);
403 if(bp && bp->b_bcount > blkp->bi_size) {
404 printf("lfs_markv: %ld > %d (fixed)\n",
405 bp->b_bcount, blkp->bi_size);
406 blkp->bi_size = bp->b_bcount;
407 }
408 splx(s);
409 }
410 if (ip->i_number != LFS_IFILE_INUM && blkp->bi_lbn >= 0) {
411 /* Data Block */
412 bp = lfs_fakebuf(vp, blkp->bi_lbn,
413 blkp->bi_size, blkp->bi_bp);
414 /* Pretend we used bread() to get it */
415 bp->b_blkno = blkp->bi_daddr;
416 } else {
417 /* Indirect block */
418 bp = getblk(vp, blkp->bi_lbn, blkp->bi_size, 0, 0);
419 if (!(bp->b_flags & (B_DONE|B_DELWRI))) { /* B_CACHE */
420 /*
421 * The block in question was not found
422 * in the cache; i.e., the block that
423 * getblk() returned is empty. So, we
424 * can (and should) copy in the
425 * contents, because we've already
426 * determined that this was the right
427 * version of this block on disk.
428 *
429 * And, it can't have changed underneath
430 * us, because we have the segment lock.
431 */
432 error = copyin(blkp->bi_bp, bp->b_data, blkp->bi_size);
433 if(error)
434 goto err2;
435 }
436 }
437 if ((error = lfs_bwrite_ext(bp,BW_CLEAN)) != 0)
438 goto err2;
439 }
440
441 /*
442 * Finish the old file, if there was one
443 */
444 if(v_daddr != LFS_UNUSED_DADDR) {
445 #ifdef DEBUG_LFS
446 if(ip->i_flag & (IN_MODIFIED|IN_CLEANING))
447 iwritten++;
448 #endif
449 if(lfs_fastvget_unlock) {
450 VOP_UNLOCK(vp, 0);
451 numlocked--;
452 }
453 lfs_vunref(vp);
454 numrefed--;
455 }
456
457 /*
458 * The last write has to be SEGM_SYNC, because of calling semantics.
459 * It also has to be SEGM_CKP, because otherwise we could write
460 * over the newly cleaned data contained in a checkpoint, and then
461 * we'd be unhappy at recovery time.
462 */
463 lfs_segwrite(mntp, SEGM_SYNC|SEGM_CLEAN|SEGM_CKP);
464 free(start, M_SEGMENT);
465
466 lfs_segunlock(fs);
467
468 #ifdef DEBUG_LFS
469 printf("%d]",iwritten);
470 if(numlocked != 0 || numrefed != 0) {
471 panic("lfs_markv: numlocked=%d numrefed=%d", numlocked, numrefed);
472 }
473 #endif
474
475 if(error)
476 return (error);
477 else if(do_again)
478 return EAGAIN;
479
480 return 0;
481
482 err2:
483 printf("lfs_markv err2\n");
484 lfs_vunref(vp);
485 /* Free up fakebuffers -- have to take these from the LOCKED list */
486 again:
487 s = splbio();
488 for(bp = bufqueues[BQ_LOCKED].tqh_first; bp; bp=nbp) {
489 nbp = bp->b_freelist.tqe_next;
490 if(bp->b_flags & B_CALL) {
491 if(bp->b_flags & B_BUSY) { /* not bloody likely */
492 bp->b_flags |= B_WANTED;
493 tsleep(bp, PRIBIO+1, "markv", 0);
494 splx(s);
495 goto again;
496 }
497 bremfree(bp);
498 splx(s);
499 brelse(bp);
500 s = splbio();
501 }
502 }
503 splx(s);
504 free(start, M_SEGMENT);
505 lfs_segunlock(fs);
506 vfs_unbusy(mntp);
507 return (error);
508
509 err1:
510 printf("lfs_markv err1\n");
511 free(start, M_SEGMENT);
512 return (error);
513 }
514
515 /*
516 * sys_lfs_bmapv:
517 *
518 * This will fill in the current disk address for arrays of blocks.
519 *
520 * 0 on success
521 * -1/errno is return on error.
522 */
523
524 int
525 sys_lfs_bmapv(p, v, retval)
526 struct proc *p;
527 void *v;
528 register_t *retval;
529 {
530 struct sys_lfs_bmapv_args /* {
531 syscallarg(fsid_t *) fsidp;
532 syscallarg(struct block_info *) blkiov;
533 syscallarg(int) blkcnt;
534 } */ *uap = v;
535 BLOCK_INFO *blkp;
536 IFILE *ifp;
537 struct buf *bp;
538 struct inode *ip = NULL;
539 struct lfs *fs;
540 struct mount *mntp;
541 struct ufsmount *ump;
542 struct vnode *vp;
543 fsid_t fsid;
544 void *start;
545 ino_t lastino;
546 ufs_daddr_t v_daddr;
547 int origcnt, cnt, error, need_unlock=0;
548 int numlocked=0, numrefed=0;
549 #ifdef LFS_TRACK_IOS
550 int j;
551 #endif
552
553 lfs_cleaner_pid = p->p_pid;
554
555 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
556 return (error);
557
558 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
559 return (error);
560 if ((mntp = vfs_getvfs(&fsid)) == NULL)
561 return (EINVAL);
562
563 ump = VFSTOUFS(mntp);
564
565 origcnt = cnt = SCARG(uap, blkcnt);
566 start = malloc(cnt * sizeof(BLOCK_INFO), M_SEGMENT, M_WAITOK);
567 error = copyin(SCARG(uap, blkiov), start, cnt * sizeof(BLOCK_INFO));
568 if (error) {
569 free(start, M_SEGMENT);
570 return (error);
571 }
572
573 fs = VFSTOUFS(mntp)->um_lfs;
574
575 error = 0;
576
577 /* these were inside the initialization for the for loop */
578 v_daddr = LFS_UNUSED_DADDR;
579 lastino = LFS_UNUSED_INUM;
580 for (blkp = start; cnt--; ++blkp)
581 {
582 #ifdef DEBUG
583 if (datosn(fs, fs->lfs_curseg) == datosn(fs, blkp->bi_daddr)) {
584 printf("lfs_bmapv: attempt to clean current segment? (#%d)\n",
585 datosn(fs, fs->lfs_curseg));
586 free(start,M_SEGMENT);
587 return (EBUSY);
588 }
589 #endif /* DEBUG */
590 #ifdef LFS_TRACK_IOS
591 /*
592 * If there is I/O on this segment that is not yet complete,
593 * the cleaner probably does not have the right information.
594 * Send it packing.
595 */
596 for(j=0;j<LFS_THROTTLE;j++) {
597 if(fs->lfs_pending[j] != LFS_UNUSED_DADDR
598 && datosn(fs,fs->lfs_pending[j])==datosn(fs,blkp->bi_daddr))
599 {
600 printf("lfs_bmapv: attempt to clean pending segment? (#%d)\n",
601 datosn(fs, fs->lfs_pending[j]));
602 free(start,M_SEGMENT);
603 return (EBUSY);
604 }
605 }
606
607 #endif /* LFS_TRACK_IOS */
608 /*
609 * Get the IFILE entry (only once) and see if the file still
610 * exists.
611 */
612 if (lastino != blkp->bi_inode) {
613 /*
614 * Finish the old file, if there was one. The presence
615 * of a usable vnode in vp is signaled by a valid
616 * v_daddr.
617 */
618 if(v_daddr != LFS_UNUSED_DADDR) {
619 if(need_unlock) {
620 VOP_UNLOCK(vp, 0);
621 numlocked--;
622 }
623 lfs_vunref(vp);
624 numrefed--;
625 }
626
627 /*
628 * Start a new file
629 */
630 lastino = blkp->bi_inode;
631 if (blkp->bi_inode == LFS_IFILE_INUM)
632 v_daddr = fs->lfs_idaddr;
633 else {
634 LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
635 v_daddr = ifp->if_daddr;
636 brelse(bp);
637 }
638 if (v_daddr == LFS_UNUSED_DADDR) {
639 blkp->bi_daddr = LFS_UNUSED_DADDR;
640 continue;
641 }
642 /*
643 * A regular call to VFS_VGET could deadlock
644 * here. Instead, we try an unlocked access.
645 */
646 vp = ufs_ihashlookup(ump->um_dev, blkp->bi_inode);
647 if (vp != NULL && !(vp->v_flag & VXLOCK)) {
648 ip = VTOI(vp);
649 if (lfs_vref(vp)) {
650 v_daddr = LFS_UNUSED_DADDR;
651 need_unlock = 0;
652 continue;
653 }
654 numrefed++;
655 if(VOP_ISLOCKED(vp)) {
656 #ifdef DEBUG_LFS
657 printf("lfs_bmapv: inode %d inlocked\n",ip->i_number);
658 #endif
659 v_daddr = LFS_UNUSED_DADDR;
660 need_unlock = 0;
661 lfs_vunref(vp);
662 --numrefed;
663 continue;
664 } else {
665 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
666 need_unlock = FVG_UNLOCK;
667 numlocked++;
668 }
669 } else {
670 error = VFS_VGET(mntp, blkp->bi_inode, &vp);
671 if(error) {
672 #ifdef DEBUG_LFS
673 printf("lfs_bmapv: vget of ino %d failed with %d",blkp->bi_inode,error);
674 #endif
675 v_daddr = LFS_UNUSED_DADDR;
676 need_unlock = 0;
677 continue;
678 } else {
679 need_unlock = FVG_PUT;
680 numlocked++;
681 numrefed++;
682 }
683 }
684 ip = VTOI(vp);
685 } else if (v_daddr == LFS_UNUSED_DADDR) {
686 /*
687 * This can only happen if the vnode is dead.
688 * Keep going. Note that we DO NOT set the
689 * bi_addr to anything -- if we failed to get
690 * the vnode, for example, we want to assume
691 * conservatively that all of its blocks *are*
692 * located in the segment in question.
693 * lfs_markv will throw them out if we are
694 * wrong.
695 */
696 /* blkp->bi_daddr = LFS_UNUSED_DADDR; */
697 continue;
698 }
699
700 /* Past this point we are guaranteed that vp, ip are valid. */
701
702 if(blkp->bi_lbn == LFS_UNUSED_LBN) {
703 /*
704 * We just want the inode address, which is
705 * conveniently in v_daddr.
706 */
707 blkp->bi_daddr = v_daddr;
708 } else {
709 error = VOP_BMAP(vp, blkp->bi_lbn, NULL,
710 &(blkp->bi_daddr), NULL);
711 if(error)
712 {
713 blkp->bi_daddr = LFS_UNUSED_DADDR;
714 continue;
715 }
716 }
717 }
718
719 /*
720 * Finish the old file, if there was one. The presence
721 * of a usable vnode in vp is signaled by a valid v_daddr.
722 */
723 if(v_daddr != LFS_UNUSED_DADDR) {
724 if(need_unlock) {
725 VOP_UNLOCK(vp, 0);
726 numlocked--;
727 }
728 lfs_vunref(vp);
729 numrefed--;
730 }
731
732 if(numlocked != 0 || numrefed != 0) {
733 panic("lfs_bmapv: numlocked=%d numrefed=%d", numlocked,
734 numrefed);
735 }
736
737 copyout(start, SCARG(uap, blkiov), origcnt * sizeof(BLOCK_INFO));
738 free(start, M_SEGMENT);
739
740 return 0;
741 }
742
743 /*
744 * sys_lfs_segclean:
745 *
746 * Mark the segment clean.
747 *
748 * 0 on success
749 * -1/errno is return on error.
750 */
751 int
752 sys_lfs_segclean(p, v, retval)
753 struct proc *p;
754 void *v;
755 register_t *retval;
756 {
757 struct sys_lfs_segclean_args /* {
758 syscallarg(fsid_t *) fsidp;
759 syscallarg(u_long) segment;
760 } */ *uap = v;
761 CLEANERINFO *cip;
762 SEGUSE *sup;
763 struct buf *bp;
764 struct mount *mntp;
765 struct lfs *fs;
766 fsid_t fsid;
767 int error;
768
769 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
770 return (error);
771
772 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
773 return (error);
774 if ((mntp = vfs_getvfs(&fsid)) == NULL)
775 return (EINVAL);
776
777 fs = VFSTOUFS(mntp)->um_lfs;
778
779 if (datosn(fs, fs->lfs_curseg) == SCARG(uap, segment))
780 return (EBUSY);
781
782 LFS_SEGENTRY(sup, fs, SCARG(uap, segment), bp);
783 if (sup->su_flags & SEGUSE_ACTIVE) {
784 brelse(bp);
785 return (EBUSY);
786 }
787
788 fs->lfs_avail += fsbtodb(fs, fs->lfs_ssize);
789 if (sup->su_flags & SEGUSE_SUPERBLOCK)
790 fs->lfs_avail -= btodb(LFS_SBPAD);
791 fs->lfs_bfree += sup->su_nsums * btodb(LFS_SUMMARY_SIZE) +
792 fsbtodb(fs, sup->su_ninos);
793 fs->lfs_dmeta -= sup->su_nsums * btodb(LFS_SUMMARY_SIZE) +
794 fsbtodb(fs, sup->su_ninos);
795 if (fs->lfs_dmeta < 0)
796 fs->lfs_dmeta = 0;
797 sup->su_flags &= ~SEGUSE_DIRTY;
798 (void) VOP_BWRITE(bp);
799
800 LFS_CLEANERINFO(cip, fs, bp);
801 ++cip->clean;
802 --cip->dirty;
803 fs->lfs_nclean = cip->clean;
804 cip->bfree = fs->lfs_bfree;
805 cip->avail = fs->lfs_avail - fs->lfs_ravail;
806 (void) VOP_BWRITE(bp);
807 wakeup(&fs->lfs_avail);
808
809 return (0);
810 }
811
812 /*
813 * sys_lfs_segwait:
814 *
815 * This will block until a segment in file system fsid is written. A timeout
816 * in milliseconds may be specified which will awake the cleaner automatically.
817 * An fsid of -1 means any file system, and a timeout of 0 means forever.
818 *
819 * 0 on success
820 * 1 on timeout
821 * -1/errno is return on error.
822 */
823 int
824 sys_lfs_segwait(p, v, retval)
825 struct proc *p;
826 void *v;
827 register_t *retval;
828 {
829 struct sys_lfs_segwait_args /* {
830 syscallarg(fsid_t *) fsidp;
831 syscallarg(struct timeval *) tv;
832 } */ *uap = v;
833 extern int lfs_allclean_wakeup;
834 struct mount *mntp;
835 struct timeval atv;
836 fsid_t fsid;
837 void *addr;
838 u_long timeout;
839 int error, s;
840
841 if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) {
842 return (error);
843 }
844 if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
845 return (error);
846 if ((mntp = vfs_getvfs(&fsid)) == NULL)
847 addr = &lfs_allclean_wakeup;
848 else
849 addr = &VFSTOUFS(mntp)->um_lfs->lfs_nextseg;
850
851 if (SCARG(uap, tv)) {
852 error = copyin(SCARG(uap, tv), &atv, sizeof(struct timeval));
853 if (error)
854 return (error);
855 if (itimerfix(&atv))
856 return (EINVAL);
857 /*
858 * XXX THIS COULD SLEEP FOREVER IF TIMEOUT IS {0,0}!
859 * XXX IS THAT WHAT IS INTENDED?
860 */
861 s = splclock();
862 timeradd(&atv, &time, &atv);
863 timeout = hzto(&atv);
864 splx(s);
865 } else
866 timeout = 0;
867
868 error = tsleep(addr, PCATCH | PUSER, "segment", timeout);
869 return (error == ERESTART ? EINTR : 0);
870 }
871
872 /*
873 * VFS_VGET call specialized for the cleaner. The cleaner already knows the
874 * daddr from the ifile, so don't look it up again. If the cleaner is
875 * processing IINFO structures, it may have the ondisk inode already, so
876 * don't go retrieving it again.
877 *
878 * If we find the vnode on the hash chain, then it may be locked by another
879 * process; so we set (*need_unlock) to zero.
880 *
881 * If we don't, we call ufs_ihashins, which locks the inode, and we set
882 * (*need_unlock) to non-zero.
883 *
884 * In either case we lfs_vref, and it is the caller's responsibility to
885 * lfs_vunref and VOP_UNLOCK (if necessary) when finished.
886 */
887 extern struct lock ufs_hashlock;
888
889 int
890 lfs_fasthashget(dev, ino, need_unlock, vpp)
891 dev_t dev;
892 ino_t ino;
893 int *need_unlock;
894 struct vnode **vpp;
895 {
896 struct inode *ip;
897
898 /*
899 * This is playing fast and loose. Someone may have the inode
900 * locked, in which case they are going to be distinctly unhappy
901 * if we trash something.
902 */
903 if ((*vpp = ufs_ihashlookup(dev, ino)) != NULL) {
904 if ((*vpp)->v_flag & VXLOCK) {
905 printf("lfs_fastvget: vnode VXLOCKed for ino %d\n",
906 ino);
907 clean_vnlocked++;
908 #ifdef LFS_EAGAIN_FAIL
909 return EAGAIN;
910 #endif
911 }
912 ip = VTOI(*vpp);
913 if (lfs_vref(*vpp)) {
914 clean_inlocked++;
915 return EAGAIN;
916 }
917 if (VOP_ISLOCKED(*vpp)) {
918 #ifdef DEBUG_LFS
919 printf("lfs_fastvget: ino %d inlocked by pid %d\n",
920 ip->i_number, (*vpp)->v_lock.lk_lockholder);
921 #endif
922 clean_inlocked++;
923 #ifdef LFS_EAGAIN_FAIL
924 lfs_vunref(*vpp);
925 return EAGAIN;
926 #endif /* LFS_EAGAIN_FAIL */
927 } else {
928 vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
929 *need_unlock |= FVG_UNLOCK;
930 }
931 } else
932 *vpp = NULL;
933
934 return (0);
935 }
936
937 int
938 lfs_fastvget(mp, ino, daddr, vpp, dinp, need_unlock)
939 struct mount *mp;
940 ino_t ino;
941 ufs_daddr_t daddr;
942 struct vnode **vpp;
943 struct dinode *dinp;
944 int *need_unlock;
945 {
946 struct inode *ip;
947 struct vnode *vp;
948 struct ufsmount *ump;
949 dev_t dev;
950 int error;
951 struct buf *bp;
952
953 ump = VFSTOUFS(mp);
954 dev = ump->um_dev;
955 *need_unlock = 0;
956
957 error = lfs_fasthashget(dev, ino, need_unlock, vpp);
958 if (error != 0 || *vpp != NULL)
959 return (error);
960
961 if ((error = getnewvnode(VT_LFS, mp, lfs_vnodeop_p, &vp)) != 0) {
962 *vpp = NULL;
963 return (error);
964 }
965
966 do {
967 error = lfs_fasthashget(dev, ino, need_unlock, vpp);
968 if (error != 0 || *vpp != NULL) {
969 ungetnewvnode(vp);
970 return (error);
971 }
972 } while (lockmgr(&ufs_hashlock, LK_EXCLUSIVE|LK_SLEEPFAIL, 0));
973
974 /* Allocate new vnode/inode. */
975 lfs_vcreate(mp, ino, vp);
976
977 /*
978 * Put it onto its hash chain and lock it so that other requests for
979 * this inode will block if they arrive while we are sleeping waiting
980 * for old data structures to be purged or for the contents of the
981 * disk portion of this inode to be read.
982 */
983 ip = VTOI(vp);
984 ufs_ihashins(ip);
985 lockmgr(&ufs_hashlock, LK_RELEASE, 0);
986
987 /*
988 * XXX
989 * This may not need to be here, logically it should go down with
990 * the i_devvp initialization.
991 * Ask Kirk.
992 */
993 ip->i_lfs = ump->um_lfs;
994
995 /* Read in the disk contents for the inode, copy into the inode. */
996 if (dinp) {
997 error = copyin(dinp, &ip->i_din.ffs_din, DINODE_SIZE);
998 if (error) {
999 printf("lfs_fastvget: dinode copyin failed for ino %d\n", ino);
1000 ufs_ihashrem(ip);
1001
1002 /* Unlock and discard unneeded inode. */
1003 lockmgr(&vp->v_lock, LK_RELEASE, &vp->v_interlock);
1004 lfs_vunref(vp);
1005 *vpp = NULL;
1006 return (error);
1007 }
1008 if(ip->i_number != ino)
1009 panic("lfs_fastvget: I was fed the wrong inode!");
1010 } else {
1011 error = bread(ump->um_devvp, daddr,
1012 (int)ump->um_lfs->lfs_bsize, NOCRED, &bp);
1013 if (error) {
1014 printf("lfs_fastvget: bread failed with %d\n",error);
1015 /*
1016 * The inode does not contain anything useful, so it
1017 * would be misleading to leave it on its hash chain.
1018 * Iput() will return it to the free list.
1019 */
1020 ufs_ihashrem(ip);
1021
1022 /* Unlock and discard unneeded inode. */
1023 lockmgr(&vp->v_lock, LK_RELEASE, &vp->v_interlock);
1024 lfs_vunref(vp);
1025 brelse(bp);
1026 *vpp = NULL;
1027 return (error);
1028 }
1029 ip->i_din.ffs_din =
1030 *lfs_ifind(ump->um_lfs, ino, bp);
1031 brelse(bp);
1032 }
1033 ip->i_ffs_effnlink = ip->i_ffs_nlink;
1034
1035 /*
1036 * Initialize the vnode from the inode, check for aliases. In all
1037 * cases re-init ip, the underlying vnode/inode may have changed.
1038 */
1039 error = ufs_vinit(mp, lfs_specop_p, lfs_fifoop_p, &vp);
1040 if (error) {
1041 /* This CANNOT happen (see ufs_vinit) */
1042 printf("lfs_fastvget: ufs_vinit returned %d for ino %d\n", error, ino);
1043 lockmgr(&vp->v_lock, LK_RELEASE, &vp->v_interlock);
1044 lfs_vunref(vp);
1045 *vpp = NULL;
1046 return (error);
1047 }
1048 #ifdef DEBUG_LFS
1049 if(vp->v_type == VNON) {
1050 printf("lfs_fastvget: ino %d is type VNON! (ifmt=%o, dinp=%p)\n",
1051 ip->i_number, (ip->i_ffs_mode & IFMT)>>12, dinp);
1052 lfs_dump_dinode(&ip->i_din.ffs_din);
1053 #ifdef DDB
1054 Debugger();
1055 #endif
1056 }
1057 #endif /* DEBUG_LFS */
1058 /*
1059 * Finish inode initialization now that aliasing has been resolved.
1060 */
1061 ip->i_devvp = ump->um_devvp;
1062 VREF(ip->i_devvp);
1063 *vpp = vp;
1064 *need_unlock |= FVG_PUT;
1065
1066 return (0);
1067 }
1068
1069 struct buf *
1070 lfs_fakebuf(vp, lbn, size, uaddr)
1071 struct vnode *vp;
1072 int lbn;
1073 size_t size;
1074 caddr_t uaddr;
1075 {
1076 struct buf *bp;
1077 int error;
1078
1079 #ifndef ALLOW_VFLUSH_CORRUPTION
1080 bp = lfs_newbuf(vp, lbn, size);
1081 error = copyin(uaddr, bp->b_data, size);
1082 if(error) {
1083 lfs_freebuf(bp);
1084 return NULL;
1085 }
1086 #else
1087 bp = lfs_newbuf(vp, lbn, 0);
1088 bp->b_flags |= B_INVAL;
1089 bp->b_saveaddr = uaddr;
1090 #endif
1091
1092 bp->b_bufsize = size;
1093 bp->b_bcount = size;
1094 return (bp);
1095 }
1096