/*	$NetBSD: lfs_syscalls.c,v 1.106 2005/05/20 19:48:25 perseant Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Konrad E. Schroder <perseant@hhhh.org>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 1991, 1993, 1994
 *	The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)lfs_syscalls.c	8.10 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.106 2005/05/20 19:48:25 perseant Exp $");

#ifndef LFS
# define LFS		/* for prototypes in syscallargs.h */
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/kernel.h>

#include <sys/sa.h>
#include <sys/syscallargs.h>

#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_extern.h>

struct buf *lfs_fakebuf(struct lfs *, struct vnode *, int, size_t, caddr_t);
int lfs_fasthashget(dev_t, ino_t, struct vnode **);

pid_t lfs_cleaner_pid = 0;

#define LFS_FORCE_WRITE UNASSIGNED

/*
 * sys_lfs_markv:
 *
 * This will mark inodes and blocks dirty, so they are written into the log.
 * It will block until all the blocks have been written. The segment create
 * time passed in the block_info and inode_info structures is used to decide
 * if the data is valid for each block (in case some process dirtied a block
 * or inode that is being cleaned between the determination that a block is
 * live and the lfs_markv call).
 *
 *  0 on success
 * -1/errno is returned on error.
 */
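/*
 * A hypothetical userland sketch of how a cleaner might drive this call
 * (the libc-level prototype and error conventions are assumptions, and
 * gathering the candidate blocks from segment summaries is elided):
 *
 *	fsid_t fsid;			(fsid of the mounted LFS)
 *	BLOCK_INFO blkiov[NBLOCKS];	(bi_inode, bi_lbn, bi_daddr,
 *					 bi_segcreate, bi_version, bi_bp
 *					 and bi_size filled in per block)
 *
 *	if (lfs_markv(&fsid, blkiov, NBLOCKS) < 0 && errno == EAGAIN)
 *		(some vnodes were busy; clean another segment and retry
 *		 this one on a later pass)
 */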
#ifdef USE_64BIT_SYSCALLS
int
sys_lfs_markv(struct proc *p, void *v, register_t *retval)
{
	struct sys_lfs_markv_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */ *uap = v;
	BLOCK_INFO *blkiov;
	int blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOUFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
		return (EINVAL);

	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov,
	    blkcnt * sizeof(BLOCK_INFO))) != 0)
		goto out;

	if ((error = lfs_markv(p, &fsid, blkiov, blkcnt)) == 0)
		copyout(blkiov, SCARG(uap, blkiov),
		    blkcnt * sizeof(BLOCK_INFO));
out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	return error;
}
#else
int
sys_lfs_markv(struct lwp *l, void *v, register_t *retval)
{
	struct sys_lfs_markv_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */ *uap = v;
	BLOCK_INFO *blkiov;
	BLOCK_INFO_15 *blkiov15;
	int i, blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = suser(l->l_proc->p_ucred, &l->l_proc->p_acflag)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOUFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
		return (EINVAL);

	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov15,
	    blkcnt * sizeof(BLOCK_INFO_15))) != 0)
		goto out;

	for (i = 0; i < blkcnt; i++) {
		blkiov[i].bi_inode = blkiov15[i].bi_inode;
		blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
		blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
		blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
		blkiov[i].bi_version = blkiov15[i].bi_version;
		blkiov[i].bi_bp = blkiov15[i].bi_bp;
		blkiov[i].bi_size = blkiov15[i].bi_size;
	}

	if ((error = lfs_markv(l->l_proc, &fsid, blkiov, blkcnt)) == 0) {
		for (i = 0; i < blkcnt; i++) {
			blkiov15[i].bi_inode = blkiov[i].bi_inode;
			blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
			blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
			blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
			blkiov15[i].bi_version = blkiov[i].bi_version;
			blkiov15[i].bi_bp = blkiov[i].bi_bp;
			blkiov15[i].bi_size = blkiov[i].bi_size;
		}
		copyout(blkiov15, SCARG(uap, blkiov),
		    blkcnt * sizeof(BLOCK_INFO_15));
	}
out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	lfs_free(fs, blkiov15, LFS_NB_BLKIOV);
	return error;
}
#endif

#define LFS_MARKV_MAX_BLOCKS	(LFS_MAX_BUFS)

int
lfs_markv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
{
	BLOCK_INFO *blkp;
	IFILE *ifp;
	struct buf *bp;
	struct inode *ip = NULL;
	struct lfs *fs;
	struct mount *mntp;
	struct vnode *vp;
	ino_t lastino;
	daddr_t b_daddr, v_daddr;
	int cnt, error;
	int do_again = 0;
	int numrefed = 0;
	ino_t maxino;
	size_t obsize;

	/* number of blocks/inodes that we have already bwrite'ed */
	int nblkwritten, ninowritten;

	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		return (ENOENT);

	fs = VFSTOUFS(mntp)->um_lfs;

	if (fs->lfs_ronly)
		return EROFS;

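	/*
	 * Compute the largest valid inode number, i.e. the number of
	 * inode entries the ifile can currently hold: its block count,
	 * less the cleaner-info and segment-table blocks, times the
	 * number of ifile entries per block (lfs_ifpb).
	 */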
	maxino = (fragstoblks(fs, fsbtofrags(fs, VTOI(fs->lfs_ivnode)->i_ffs1_blocks)) -
		  fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb;

	cnt = blkcnt;

	if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0)
		return (error);

	/*
	 * This seglock is just to prevent the fact that we might have to sleep
	 * from allowing the possibility that our blocks might become
	 * invalid.
	 *
	 * It is also important to note here that unless we specify SEGM_CKP,
	 * any Ifile blocks that we might be asked to clean will never get
	 * to the disk.
	 */
	lfs_seglock(fs, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);

	/* Mark blocks/inodes dirty. */
	error = 0;

	/* these were inside the initialization for the for loop */
	v_daddr = LFS_UNUSED_DADDR;
	lastino = LFS_UNUSED_INUM;
	nblkwritten = ninowritten = 0;
	for (blkp = blkiov; cnt--; ++blkp)
	{
		if (blkp->bi_daddr == LFS_FORCE_WRITE)
			DLOG((DLOG_CLEAN, "lfs_markv: warning: force-writing"
			    " ino %d lbn %lld\n", blkp->bi_inode,
			    (long long)blkp->bi_lbn));
		/* Bounds-check incoming data, avoid panic for failed VGET */
		if (blkp->bi_inode <= 0 || blkp->bi_inode >= maxino) {
			error = EINVAL;
			goto err3;
		}
		/*
		 * Get the IFILE entry (only once) and see if the file still
		 * exists.
		 */
		if (lastino != blkp->bi_inode) {
			/*
			 * Finish the old file, if there was one. The presence
			 * of a usable vnode in vp is signaled by a valid v_daddr.
			 */
			if (v_daddr != LFS_UNUSED_DADDR) {
				lfs_vunref(vp);
				numrefed--;
			}

			/*
			 * Start a new file
			 */
			lastino = blkp->bi_inode;
			if (blkp->bi_inode == LFS_IFILE_INUM)
				v_daddr = fs->lfs_idaddr;
			else {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				/* XXX fix for force write */
				v_daddr = ifp->if_daddr;
				brelse(bp);
			}
			/* Don't force-write the ifile */
			if (blkp->bi_inode == LFS_IFILE_INUM
			    && blkp->bi_daddr == LFS_FORCE_WRITE)
			{
				continue;
			}
			if (v_daddr == LFS_UNUSED_DADDR
			    && blkp->bi_daddr != LFS_FORCE_WRITE)
			{
				continue;
			}

			/* Get the vnode/inode. */
			error = lfs_fastvget(mntp, blkp->bi_inode, v_daddr,
			    &vp,
			    (blkp->bi_lbn == LFS_UNUSED_LBN
			    ? blkp->bi_bp
			    : NULL));

			if (!error) {
				numrefed++;
			}
			if (error) {
				DLOG((DLOG_CLEAN, "lfs_markv: lfs_fastvget"
				    " failed with %d (ino %d, segment %d)\n",
				    error, blkp->bi_inode,
				    dtosn(fs, blkp->bi_daddr)));
				/*
				 * If we got EAGAIN, that means that the
				 * Inode was locked. This is
				 * recoverable: just clean the rest of
				 * this segment, and let the cleaner try
				 * again with another. (When the
				 * cleaner runs again, this segment will
				 * sort high on the list, since it is
				 * now almost entirely empty.) But, we
				 * still set v_daddr = LFS_UNUSED_DADDR
				 * so as not to test this over and over
				 * again.
				 */
				if (error == EAGAIN) {
					error = 0;
					do_again++;
				}
#ifdef DIAGNOSTIC
				else if (error != ENOENT)
					panic("lfs_markv VFS_VGET FAILED");
#endif
				/* lastino = LFS_UNUSED_INUM; */
				v_daddr = LFS_UNUSED_DADDR;
				vp = NULL;
				ip = NULL;
				continue;
			}
			ip = VTOI(vp);
			ninowritten++;
		} else if (v_daddr == LFS_UNUSED_DADDR) {
			/*
			 * This can only happen if the vnode is dead (or
			 * in any case we can't get it...e.g., it is
			 * locked). Keep going.
			 */
			continue;
		}

		/* Past this point we are guaranteed that vp, ip are valid. */

		/* If this BLOCK_INFO didn't contain a block, keep going. */
		if (blkp->bi_lbn == LFS_UNUSED_LBN) {
			/* XXX need to make sure that the inode gets written in this case */
			/* XXX but only write the inode if it's the right one */
			if (blkp->bi_inode != LFS_IFILE_INUM) {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				if (ifp->if_daddr == blkp->bi_daddr
				    || blkp->bi_daddr == LFS_FORCE_WRITE)
				{
					LFS_SET_UINO(ip, IN_CLEANING);
				}
				brelse(bp);
			}
			continue;
		}

		b_daddr = 0;
		if (blkp->bi_daddr != LFS_FORCE_WRITE) {
			if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &b_daddr, NULL) ||
			    dbtofsb(fs, b_daddr) != blkp->bi_daddr)
			{
				if (dtosn(fs, dbtofsb(fs, b_daddr))
				    == dtosn(fs, blkp->bi_daddr))
				{
					DLOG((DLOG_CLEAN, "lfs_markv: wrong da same seg: %llx vs %llx\n",
					    (long long)blkp->bi_daddr, (long long)dbtofsb(fs, b_daddr)));
				}
				do_again++;
				continue;
			}
		}

		/*
		 * Check block sizes. The blocks being cleaned come from
		 * disk, so they should have the same size as their on-disk
		 * counterparts.
		 */
		if (blkp->bi_lbn >= 0)
			obsize = blksize(fs, ip, blkp->bi_lbn);
		else
			obsize = fs->lfs_bsize;
		/* Check for fragment size change */
		if (blkp->bi_lbn >= 0 && blkp->bi_lbn < NDADDR) {
			obsize = ip->i_lfs_fragsize[blkp->bi_lbn];
		}
		if (obsize != blkp->bi_size) {
			DLOG((DLOG_CLEAN, "lfs_markv: ino %d lbn %lld wrong"
			    " size (%ld != %d), try again\n",
			    blkp->bi_inode, (long long)blkp->bi_lbn,
			    (long) obsize, blkp->bi_size));
			do_again++;
			continue;
		}

		/*
		 * If we get to here, then we are keeping the block. If
		 * it is an indirect block, we want to actually put it
		 * in the buffer cache so that it can be updated in the
		 * finish_meta section. If it's not, we need to
		 * allocate a fake buffer so that writeseg can perform
		 * the copyin and write the buffer.
		 */
		if (ip->i_number != LFS_IFILE_INUM && blkp->bi_lbn >= 0) {
			/* Data Block */
			bp = lfs_fakebuf(fs, vp, blkp->bi_lbn,
			    blkp->bi_size, blkp->bi_bp);
			/* Pretend we used bread() to get it */
			bp->b_blkno = fsbtodb(fs, blkp->bi_daddr);
		} else {
			/* Indirect block or ifile */
			if (blkp->bi_size != fs->lfs_bsize &&
			    ip->i_number != LFS_IFILE_INUM)
				panic("lfs_markv: partial indirect block?"
				    " size=%d\n", blkp->bi_size);
			bp = getblk(vp, blkp->bi_lbn, blkp->bi_size, 0, 0);
			if (!(bp->b_flags & (B_DONE|B_DELWRI))) { /* B_CACHE */
				/*
				 * The block in question was not found
				 * in the cache; i.e., the block that
				 * getblk() returned is empty. So, we
				 * can (and should) copy in the
				 * contents, because we've already
				 * determined that this was the right
				 * version of this block on disk.
				 *
				 * And, it can't have changed underneath
				 * us, because we have the segment lock.
				 */
				error = copyin(blkp->bi_bp, bp->b_data, blkp->bi_size);
				if (error)
					goto err2;
			}
		}
		if ((error = lfs_bwrite_ext(bp, BW_CLEAN)) != 0)
			goto err2;

		nblkwritten++;
		/*
		 * XXX should account indirect blocks and ifile pages as well
		 */
		if (nblkwritten + lblkno(fs, ninowritten * sizeof (struct ufs1_dinode))
		    > LFS_MARKV_MAX_BLOCKS) {
			DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos\n",
			    nblkwritten, ninowritten));
			lfs_segwrite(mntp, SEGM_CLEAN);
			nblkwritten = ninowritten = 0;
		}
	}

	/*
	 * Finish the old file, if there was one
	 */
	if (v_daddr != LFS_UNUSED_DADDR) {
		lfs_vunref(vp);
		numrefed--;
	}

#ifdef DIAGNOSTIC
	if (numrefed != 0)
		panic("lfs_markv: numrefed=%d", numrefed);
#endif
	DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos (check point)\n",
	    nblkwritten, ninowritten));

	/*
	 * The last write has to be SEGM_SYNC, because of calling semantics.
	 * It also has to be SEGM_CKP, because otherwise we could write
	 * over the newly cleaned data contained in a checkpoint, and then
	 * we'd be unhappy at recovery time.
	 */
	lfs_segwrite(mntp, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);

	lfs_segunlock(fs);

	vfs_unbusy(mntp);
	if (error)
		return (error);
	else if (do_again)
		return EAGAIN;

	return 0;

err2:
	DLOG((DLOG_CLEAN, "lfs_markv err2\n"));

	/*
	 * XXX we're here because copyin() failed.
	 * XXX it means that we can't trust the cleanerd. too bad.
	 * XXX how can we recover from this?
	 */

err3:
	/*
	 * XXX should do segwrite here anyway?
	 */

	if (v_daddr != LFS_UNUSED_DADDR) {
		lfs_vunref(vp);
		--numrefed;
	}

	lfs_segunlock(fs);
	vfs_unbusy(mntp);
#ifdef DIAGNOSTIC
	if (numrefed != 0)
		panic("lfs_markv: numrefed=%d", numrefed);
#endif

	return (error);
}

/*
 * sys_lfs_bmapv:
 *
 * This will fill in the current disk address for arrays of blocks.
 *
 *  0 on success
 * -1/errno is returned on error.
 */
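/*
 * A hypothetical userland sketch (prototype assumed, as above): the
 * cleaner loads bi_inode/bi_lbn pairs from a segment summary, asks
 * lfs_bmapv(2) where those blocks live now, and treats a block as live
 * only if its returned bi_daddr still falls in the segment being
 * cleaned; the live blocks are then handed to lfs_markv(2).
 *
 *	if (lfs_bmapv(&fsid, blkiov, nblocks) == 0)
 *		for (i = 0; i < nblocks; i++)
 *			if (blkiov[i].bi_daddr is in the target segment)
 *				(keep blkiov[i] for lfs_markv)
 */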
#ifdef USE_64BIT_SYSCALLS
int
sys_lfs_bmapv(struct proc *p, void *v, register_t *retval)
{
	struct sys_lfs_bmapv_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */ *uap = v;
	BLOCK_INFO *blkiov;
	int blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOUFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
		return (EINVAL);
	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov,
	    blkcnt * sizeof(BLOCK_INFO))) != 0)
		goto out;

	if ((error = lfs_bmapv(p, &fsid, blkiov, blkcnt)) == 0)
		copyout(blkiov, SCARG(uap, blkiov),
		    blkcnt * sizeof(BLOCK_INFO));
out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	return error;
}
#else
int
sys_lfs_bmapv(struct lwp *l, void *v, register_t *retval)
{
	struct sys_lfs_bmapv_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	BLOCK_INFO *blkiov;
	BLOCK_INFO_15 *blkiov15;
	int i, blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOUFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((size_t) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
		return (EINVAL);
	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov15,
	    blkcnt * sizeof(BLOCK_INFO_15))) != 0)
		goto out;

	for (i = 0; i < blkcnt; i++) {
		blkiov[i].bi_inode = blkiov15[i].bi_inode;
		blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
		blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
		blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
		blkiov[i].bi_version = blkiov15[i].bi_version;
		blkiov[i].bi_bp = blkiov15[i].bi_bp;
		blkiov[i].bi_size = blkiov15[i].bi_size;
	}

	if ((error = lfs_bmapv(p, &fsid, blkiov, blkcnt)) == 0) {
		for (i = 0; i < blkcnt; i++) {
			blkiov15[i].bi_inode = blkiov[i].bi_inode;
			blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
			blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
			blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
			blkiov15[i].bi_version = blkiov[i].bi_version;
			blkiov15[i].bi_bp = blkiov[i].bi_bp;
			blkiov15[i].bi_size = blkiov[i].bi_size;
		}
		copyout(blkiov15, SCARG(uap, blkiov),
		    blkcnt * sizeof(BLOCK_INFO_15));
	}
out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	lfs_free(fs, blkiov15, LFS_NB_BLKIOV);
	return error;
}
#endif

int
lfs_bmapv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
{
	BLOCK_INFO *blkp;
	IFILE *ifp;
	struct buf *bp;
	struct inode *ip = NULL;
	struct lfs *fs;
	struct mount *mntp;
	struct ufsmount *ump;
	struct vnode *vp;
	ino_t lastino;
	daddr_t v_daddr;
	int cnt, error;
	int numrefed = 0;

	lfs_cleaner_pid = p->p_pid;

	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		return (ENOENT);

	ump = VFSTOUFS(mntp);
	if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0)
		return (error);

	cnt = blkcnt;

	fs = VFSTOUFS(mntp)->um_lfs;

	error = 0;

	/* these were inside the initialization for the for loop */
	v_daddr = LFS_UNUSED_DADDR;
	lastino = LFS_UNUSED_INUM;
	for (blkp = blkiov; cnt--; ++blkp)
	{
		/*
		 * Get the IFILE entry (only once) and see if the file still
		 * exists.
		 */
		if (lastino != blkp->bi_inode) {
			/*
			 * Finish the old file, if there was one. The presence
			 * of a usable vnode in vp is signaled by a valid
			 * v_daddr.
			 */
			if (v_daddr != LFS_UNUSED_DADDR) {
				lfs_vunref(vp);
				numrefed--;
			}

			/*
			 * Start a new file
			 */
			lastino = blkp->bi_inode;
			if (blkp->bi_inode == LFS_IFILE_INUM)
				v_daddr = fs->lfs_idaddr;
			else {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				v_daddr = ifp->if_daddr;
				brelse(bp);
			}
			if (v_daddr == LFS_UNUSED_DADDR) {
				blkp->bi_daddr = LFS_UNUSED_DADDR;
				continue;
			}
			/*
			 * A regular call to VFS_VGET could deadlock
			 * here. Instead, we try an unlocked access.
			 */
			vp = ufs_ihashlookup(ump->um_dev, blkp->bi_inode);
			if (vp != NULL && !(vp->v_flag & VXLOCK)) {
				ip = VTOI(vp);
				if (lfs_vref(vp)) {
					v_daddr = LFS_UNUSED_DADDR;
					continue;
				}
				numrefed++;
			} else {
				/*
				 * Don't VFS_VGET if we're being unmounted,
				 * since we hold vfs_busy().
				 */
				if (mntp->mnt_iflag & IMNT_UNMOUNT) {
					v_daddr = LFS_UNUSED_DADDR;
					continue;
				}
				error = VFS_VGET(mntp, blkp->bi_inode, &vp);
				if (error) {
					DLOG((DLOG_CLEAN, "lfs_bmapv: vget"
					    " ino %d failed with %d\n",
					    blkp->bi_inode, error));
					v_daddr = LFS_UNUSED_DADDR;
					continue;
				} else {
					KASSERT(VOP_ISLOCKED(vp));
					VOP_UNLOCK(vp, 0);
					numrefed++;
				}
			}
			ip = VTOI(vp);
		} else if (v_daddr == LFS_UNUSED_DADDR) {
			/*
			 * This can only happen if the vnode is dead.
			 * Keep going. Note that we DO NOT set the
			 * bi_addr to anything -- if we failed to get
			 * the vnode, for example, we want to assume
			 * conservatively that all of its blocks *are*
			 * located in the segment in question.
			 * lfs_markv will throw them out if we are
			 * wrong.
			 */
			/* blkp->bi_daddr = LFS_UNUSED_DADDR; */
			continue;
		}

		/* Past this point we are guaranteed that vp, ip are valid. */

		if (blkp->bi_lbn == LFS_UNUSED_LBN) {
			/*
			 * We just want the inode address, which is
			 * conveniently in v_daddr.
			 */
			blkp->bi_daddr = v_daddr;
		} else {
			daddr_t bi_daddr;

			/* XXX ondisk32 */
			error = VOP_BMAP(vp, blkp->bi_lbn, NULL,
			    &bi_daddr, NULL);
			if (error)
			{
				blkp->bi_daddr = LFS_UNUSED_DADDR;
				continue;
			}
			blkp->bi_daddr = dbtofsb(fs, bi_daddr);
			/* Fill in the block size, too */
			if (blkp->bi_lbn >= 0)
				blkp->bi_size = blksize(fs, ip, blkp->bi_lbn);
			else
				blkp->bi_size = fs->lfs_bsize;
		}
	}

	/*
	 * Finish the old file, if there was one. The presence
	 * of a usable vnode in vp is signaled by a valid v_daddr.
	 */
	if (v_daddr != LFS_UNUSED_DADDR) {
		lfs_vunref(vp);
		numrefed--;
	}

#ifdef DIAGNOSTIC
	if (numrefed != 0)
		panic("lfs_bmapv: numrefed=%d", numrefed);
#endif

	vfs_unbusy(mntp);

	return 0;
}

/*
 * sys_lfs_segclean:
 *
 * Mark the segment clean.
 *
 *  0 on success
 * -1/errno is returned on error.
 */
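/*
 * A hypothetical userland sketch (prototype assumed): once lfs_markv(2)
 * has rewritten every live block, the cleaner retires the segment.
 * EBUSY and EALREADY are not fatal; the segment is simply retried on a
 * later pass.
 *
 *	if (lfs_segclean(&fsid, segnum) < 0 &&
 *	    errno != EBUSY && errno != EALREADY)
 *		warn("lfs_segclean");
 */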
int
sys_lfs_segclean(struct lwp *l, void *v, register_t *retval)
{
	struct sys_lfs_segclean_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(u_long) segment;
	} */ *uap = v;
	struct lfs *fs;
	struct mount *mntp;
	fsid_t fsid;
	int error;
	unsigned long segnum;
	struct proc *p = l->l_proc;

	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);
	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);

	fs = VFSTOUFS(mntp)->um_lfs;
	segnum = SCARG(uap, segment);

	if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0)
		return (error);

	lfs_seglock(fs, SEGM_PROT);
	error = lfs_do_segclean(fs, segnum);
	lfs_segunlock(fs);
	vfs_unbusy(mntp);
	return error;
}

/*
 * Actually mark the segment clean.
 * Must be called with the segment lock held.
 */
int
lfs_do_segclean(struct lfs *fs, unsigned long segnum)
{
	struct buf *bp;
	CLEANERINFO *cip;
	SEGUSE *sup;

	if (dtosn(fs, fs->lfs_curseg) == segnum) {
		return (EBUSY);
	}

	LFS_SEGENTRY(sup, fs, segnum, bp);
	if (sup->su_nbytes) {
		DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
		    " %d live bytes\n", segnum, sup->su_nbytes));
		brelse(bp);
		return (EBUSY);
	}
	if (sup->su_flags & SEGUSE_ACTIVE) {
		DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
		    " segment is active\n", segnum));
		brelse(bp);
		return (EBUSY);
	}
	if (!(sup->su_flags & SEGUSE_DIRTY)) {
		DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
		    " segment is already clean\n", segnum));
		brelse(bp);
		return (EALREADY);
	}

	fs->lfs_avail += segtod(fs, 1);
	if (sup->su_flags & SEGUSE_SUPERBLOCK)
		fs->lfs_avail -= btofsb(fs, LFS_SBPAD);
	if (fs->lfs_version > 1 && segnum == 0 &&
	    fs->lfs_start < btofsb(fs, LFS_LABELPAD))
		fs->lfs_avail -= btofsb(fs, LFS_LABELPAD) - fs->lfs_start;
	simple_lock(&fs->lfs_interlock);
	fs->lfs_bfree += sup->su_nsums * btofsb(fs, fs->lfs_sumsize) +
		btofsb(fs, sup->su_ninos * fs->lfs_ibsize);
	simple_unlock(&fs->lfs_interlock);
	fs->lfs_dmeta -= sup->su_nsums * btofsb(fs, fs->lfs_sumsize) +
		btofsb(fs, sup->su_ninos * fs->lfs_ibsize);
	if (fs->lfs_dmeta < 0)
		fs->lfs_dmeta = 0;
	sup->su_flags &= ~SEGUSE_DIRTY;
	LFS_WRITESEGENTRY(sup, fs, segnum, bp);

	LFS_CLEANERINFO(cip, fs, bp);
	++cip->clean;
	--cip->dirty;
	fs->lfs_nclean = cip->clean;
	cip->bfree = fs->lfs_bfree;
	simple_lock(&fs->lfs_interlock);
	cip->avail = fs->lfs_avail - fs->lfs_ravail - fs->lfs_favail;
	simple_unlock(&fs->lfs_interlock);
	(void) LFS_BWRITE_LOG(bp);
	wakeup(&fs->lfs_avail);

	++lfs_stats.segs_reclaimed;

	return (0);
}

/*
 * This will block until a segment in file system fsid is written. A timeout
 * (a struct timeval) may be specified, which will wake the cleaner
 * automatically. An fsid of -1 means any file system, and a timeout of 0
 * means forever.
 */
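/*
 * For example, a cleaner might block until the next segment write on
 * the filesystem, giving up after ten seconds (hypothetical userland
 * sketch; the wrapper is assumed to mirror this signature):
 *
 *	struct timeval tv = { 10, 0 };
 *
 *	lfs_segwait(&fsid, &tv);
 */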
int
lfs_segwait(fsid_t *fsidp, struct timeval *tv)
{
	struct mount *mntp;
	void *addr;
	u_long timeout;
	int error, s;

	if (fsidp == NULL || (mntp = vfs_getvfs(fsidp)) == NULL)
		addr = &lfs_allclean_wakeup;
	else
		addr = &VFSTOUFS(mntp)->um_lfs->lfs_nextseg;
	/*
	 * XXX THIS COULD SLEEP FOREVER IF TIMEOUT IS {0,0}!
	 * XXX IS THAT WHAT IS INTENDED?
	 */
	s = splclock();
	timeradd(tv, &time, tv);
	timeout = hzto(tv);
	splx(s);
	error = tsleep(addr, PCATCH | PUSER, "segment", timeout);
	return (error == ERESTART ? EINTR : 0);
}

/*
 * sys_lfs_segwait:
 *
 * System call wrapper around lfs_segwait().
 *
 *  0 on success
 *  1 on timeout
 * -1/errno is returned on error.
 */
int
sys_lfs_segwait(struct lwp *l, void *v, register_t *retval)
{
	struct sys_lfs_segwait_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct timeval *) tv;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	struct timeval atv;
	fsid_t fsid;
	int error;

	/* XXX need we be su to segwait? */
	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) {
		return (error);
	}
	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if (SCARG(uap, tv)) {
		error = copyin(SCARG(uap, tv), &atv, sizeof(struct timeval));
		if (error)
			return (error);
		if (itimerfix(&atv))
			return (EINVAL);
	} else /* NULL or invalid */
		atv.tv_sec = atv.tv_usec = 0;
	return lfs_segwait(&fsid, &atv);
}

/*
 * VFS_VGET call specialized for the cleaner. The cleaner already knows the
 * daddr from the ifile, so don't look it up again. If the cleaner is
 * processing IINFO structures, it may have the ondisk inode already, so
 * don't go retrieving it again.
 *
 * We lfs_vref, and it is the caller's responsibility to lfs_vunref
 * when finished.
 */
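/*
 * A minimal in-kernel usage sketch, mirroring lfs_markv() above (error
 * handling elided):
 *
 *	struct vnode *vp;
 *
 *	if (lfs_fastvget(mntp, ino, v_daddr, &vp, NULL) == 0) {
 *		(... operate on VTOI(vp) ...)
 *		lfs_vunref(vp);
 *	}
 */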
extern struct lock ufs_hashlock;

int
lfs_fasthashget(dev_t dev, ino_t ino, struct vnode **vpp)
{
	if ((*vpp = ufs_ihashlookup(dev, ino)) != NULL) {
		if ((*vpp)->v_flag & VXLOCK) {
			DLOG((DLOG_CLEAN, "lfs_fastvget: ino %d VXLOCK\n",
			    ino));
			lfs_stats.clean_vnlocked++;
			return EAGAIN;
		}
		if (lfs_vref(*vpp)) {
			DLOG((DLOG_CLEAN, "lfs_fastvget: lfs_vref failed"
			    " for ino %d\n", ino));
			lfs_stats.clean_inlocked++;
			return EAGAIN;
		}
	} else
		*vpp = NULL;

	return (0);
}

int
lfs_fastvget(struct mount *mp, ino_t ino, daddr_t daddr, struct vnode **vpp, struct ufs1_dinode *dinp)
{
	struct inode *ip;
	struct ufs1_dinode *dip;
	struct vnode *vp;
	struct ufsmount *ump;
	dev_t dev;
	int error, retries;
	struct buf *bp;
	struct lfs *fs;

	ump = VFSTOUFS(mp);
	dev = ump->um_dev;
	fs = ump->um_lfs;

	/*
	 * Wait until the filesystem is fully mounted before allowing vget
	 * to complete. This prevents possible problems with roll-forward.
	 */
	simple_lock(&fs->lfs_interlock);
	while (fs->lfs_flags & LFS_NOTYET) {
		ltsleep(&fs->lfs_flags, PRIBIO+1, "lfs_fnotyet", 0,
		    &fs->lfs_interlock);
	}
	simple_unlock(&fs->lfs_interlock);

	/*
	 * This is playing fast and loose. Someone may have the inode
	 * locked, in which case they are going to be distinctly unhappy
	 * if we trash something.
	 */

	error = lfs_fasthashget(dev, ino, vpp);
	if (error != 0 || *vpp != NULL)
		return (error);

	/*
	 * getnewvnode(9) will call vfs_busy, which will block if the
	 * filesystem is being unmounted; but umount(9) is waiting for
	 * us because we're already holding the fs busy.
	 * XXXMP
	 */
	if (mp->mnt_iflag & IMNT_UNMOUNT) {
		*vpp = NULL;
		return EDEADLK;
	}
	if ((error = getnewvnode(VT_LFS, mp, lfs_vnodeop_p, &vp)) != 0) {
		*vpp = NULL;
		return (error);
	}

	do {
		error = lfs_fasthashget(dev, ino, vpp);
		if (error != 0 || *vpp != NULL) {
			ungetnewvnode(vp);
			return (error);
		}
	} while (lockmgr(&ufs_hashlock, LK_EXCLUSIVE|LK_SLEEPFAIL, 0));

	/* Allocate new vnode/inode. */
	lfs_vcreate(mp, ino, vp);

	/*
	 * Put it onto its hash chain and lock it so that other requests for
	 * this inode will block if they arrive while we are sleeping waiting
	 * for old data structures to be purged or for the contents of the
	 * disk portion of this inode to be read.
	 */
	ip = VTOI(vp);
	ufs_ihashins(ip);
	lockmgr(&ufs_hashlock, LK_RELEASE, 0);

	/*
	 * XXX
	 * This may not need to be here, logically it should go down with
	 * the i_devvp initialization.
	 * Ask Kirk.
	 */
	ip->i_lfs = fs;

	/* Read in the disk contents for the inode, copy into the inode. */
	if (dinp) {
		error = copyin(dinp, ip->i_din.ffs1_din, sizeof (struct ufs1_dinode));
		if (error) {
			DLOG((DLOG_CLEAN, "lfs_fastvget: dinode copyin failed"
			    " for ino %d\n", ino));
			ufs_ihashrem(ip);

			/* Unlock and discard unneeded inode. */
			lockmgr(&vp->v_lock, LK_RELEASE, &vp->v_interlock);
			lfs_vunref(vp);
			*vpp = NULL;
			return (error);
		}
		if (ip->i_number != ino)
			panic("lfs_fastvget: I was fed the wrong inode!");
	} else {
		retries = 0;
again:
		error = bread(ump->um_devvp, fsbtodb(fs, daddr), fs->lfs_ibsize,
		    NOCRED, &bp);
		if (error) {
			DLOG((DLOG_CLEAN, "lfs_fastvget: bread failed (%d)\n",
			    error));
			/*
			 * The inode does not contain anything useful, so it
			 * would be misleading to leave it on its hash chain.
			 * Iput() will return it to the free list.
			 */
			ufs_ihashrem(ip);

			/* Unlock and discard unneeded inode. */
			lockmgr(&vp->v_lock, LK_RELEASE, &vp->v_interlock);
			lfs_vunref(vp);
			brelse(bp);
			*vpp = NULL;
			return (error);
		}
		dip = lfs_ifind(ump->um_lfs, ino, bp);
		if (dip == NULL) {
			/* Assume write has not completed yet; try again */
			bp->b_flags |= B_INVAL;
			brelse(bp);
			++retries;
			if (retries > LFS_IFIND_RETRIES)
				panic("lfs_fastvget: dinode not found");
			DLOG((DLOG_CLEAN, "lfs_fastvget: dinode not found,"
			    " retrying...\n"));
			goto again;
		}
		*ip->i_din.ffs1_din = *dip;
		brelse(bp);
	}
	lfs_vinit(mp, &vp);

	*vpp = vp;

	KASSERT(VOP_ISLOCKED(vp));
	VOP_UNLOCK(vp, 0);

	return (0);
}

/*
 * Make up a "fake" cleaner buffer, copy the data from userland into it.
 */
struct buf *
lfs_fakebuf(struct lfs *fs, struct vnode *vp, int lbn, size_t size, caddr_t uaddr)
{
	struct buf *bp;
	int error;

	KASSERT(VTOI(vp)->i_number != LFS_IFILE_INUM);

	bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, size, LFS_NB_CLEAN);
	error = copyin(uaddr, bp->b_data, size);
	if (error) {
		lfs_freebuf(fs, bp);
		return NULL;
	}
	KDASSERT(bp->b_iodone == lfs_callback);

#if 0
	simple_lock(&fs->lfs_interlock);
	++fs->lfs_iocount;
	simple_unlock(&fs->lfs_interlock);
#endif
	bp->b_bufsize = size;
	bp->b_bcount = size;
	return (bp);
}