/*	$NetBSD: lfs_syscalls.c,v 1.129 2008/04/21 11:45:34 ad Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2008
 *    The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Konrad E. Schroder <perseant@hhhh.org>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 1991, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)lfs_syscalls.c	8.10 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.129 2008/04/21 11:45:34 ad Exp $");

#ifndef LFS
# define LFS		/* for prototypes in syscallargs.h */
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/syscallargs.h>

#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_extern.h>

struct buf *lfs_fakebuf(struct lfs *, struct vnode *, int, size_t, void *);
int lfs_fasthashget(dev_t, ino_t, struct vnode **);

pid_t lfs_cleaner_pid = 0;

/*
 * sys_lfs_markv:
 *
 * This will mark inodes and blocks dirty, so they are written into the log.
 * It will block until all the blocks have been written.  The segment create
 * time passed in the block_info and inode_info structures is used to decide
 * if the data is valid for each block (in case some process dirtied a block
 * or inode that is being cleaned between the determination that a block is
 * live and the lfs_markv call).
 *
 * 0 on success
 * -1/errno is returned on error.
 */
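/*
 * Illustrative sketch only (not part of this file): a userland cleaner
 * would normally call lfs_bmapv first to learn which of the blocks it read
 * from the segment are still live, then hand the survivors to this
 * syscall, roughly
 *
 *	BLOCK_INFO bi[nblocks];
 *	... fill in bi_inode, bi_lbn, bi_daddr, bi_segcreate, bi_bp ...
 *	if (lfs_markv(&fsid, bi, nblocks) < 0)
 *		handle the error (EAGAIN means "retry the segment later");
 *
 * The real cleaner lives in libexec/lfs_cleanerd; the sketch above only
 * shows the calling convention implied by the syscallarg block below.
 */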
#ifdef USE_64BIT_SYSCALLS
int
sys_lfs_markv(struct lwp *l, const struct sys_lfs_markv_args *uap, register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */
	BLOCK_INFO *blkiov;
	int blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
	    NULL)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOUFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
		return (EINVAL);

	KERNEL_LOCK(1, NULL);
	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov,
	    blkcnt * sizeof(BLOCK_INFO))) != 0)
		goto out;

	if ((error = lfs_markv(l->l_proc, &fsid, blkiov, blkcnt)) == 0)
		copyout(blkiov, SCARG(uap, blkiov),
		    blkcnt * sizeof(BLOCK_INFO));
out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	KERNEL_UNLOCK_ONE(NULL);
	return error;
}
#else
int
sys_lfs_markv(struct lwp *l, const struct sys_lfs_markv_args *uap, register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */
	BLOCK_INFO *blkiov;
	BLOCK_INFO_15 *blkiov15;
	int i, blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
	    NULL)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOUFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
		return (EINVAL);

	KERNEL_LOCK(1, NULL);
	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov15,
	    blkcnt * sizeof(BLOCK_INFO_15))) != 0)
		goto out;

	for (i = 0; i < blkcnt; i++) {
		blkiov[i].bi_inode = blkiov15[i].bi_inode;
		blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
		blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
		blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
		blkiov[i].bi_version = blkiov15[i].bi_version;
		blkiov[i].bi_bp = blkiov15[i].bi_bp;
		blkiov[i].bi_size = blkiov15[i].bi_size;
	}

	if ((error = lfs_markv(l->l_proc, &fsid, blkiov, blkcnt)) == 0) {
		for (i = 0; i < blkcnt; i++) {
			blkiov15[i].bi_inode = blkiov[i].bi_inode;
			blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
			blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
			blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
			blkiov15[i].bi_version = blkiov[i].bi_version;
			blkiov15[i].bi_bp = blkiov[i].bi_bp;
			blkiov15[i].bi_size = blkiov[i].bi_size;
		}
		copyout(blkiov15, SCARG(uap, blkiov),
		    blkcnt * sizeof(BLOCK_INFO_15));
	}
out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	lfs_free(fs, blkiov15, LFS_NB_BLKIOV);
	KERNEL_UNLOCK_ONE(NULL);
	return error;
}
#endif

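/*
 * Rough cap on how many blocks' worth of cleaned data lfs_markv() will
 * accumulate before forcing a partial segment write; see the
 * lfs_segwrite(mntp, SEGM_CLEAN) call in the main loop below.
 */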
#define LFS_MARKV_MAX_BLOCKS	(LFS_MAX_BUFS)

int
lfs_markv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov,
    int blkcnt)
{
	BLOCK_INFO *blkp;
	IFILE *ifp;
	struct buf *bp;
	struct inode *ip = NULL;
	struct lfs *fs;
	struct mount *mntp;
	struct vnode *vp = NULL;
	ino_t lastino;
	daddr_t b_daddr, v_daddr;
	int cnt, error;
	int do_again = 0;
	int numrefed = 0;
	ino_t maxino;
	size_t obsize;

	/* number of blocks/inodes that we have already bwrite'ed */
	int nblkwritten, ninowritten;

	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		return (ENOENT);

	fs = VFSTOUFS(mntp)->um_lfs;

	if (fs->lfs_ronly)
		return EROFS;

	maxino = (fragstoblks(fs, fsbtofrags(fs, VTOI(fs->lfs_ivnode)->i_ffs1_blocks)) -
		      fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb;

	cnt = blkcnt;

	if ((error = vfs_trybusy(mntp, RW_READER, NULL)) != 0)
		return (error);

	/*
	 * Take the seglock here so that, even if we have to sleep below,
	 * our blocks cannot become invalid in the meantime.
	 *
	 * It is also important to note here that unless we specify SEGM_CKP,
	 * any Ifile blocks that we might be asked to clean will never get
	 * to the disk.
	 */
	lfs_seglock(fs, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);

	/* Mark blocks/inodes dirty.  */
	error = 0;

	/* these were inside the initialization for the for loop */
	v_daddr = LFS_UNUSED_DADDR;
	lastino = LFS_UNUSED_INUM;
	nblkwritten = ninowritten = 0;
	for (blkp = blkiov; cnt--; ++blkp)
	{
		/* Bounds-check incoming data, avoid panic for failed VGET */
		if (blkp->bi_inode <= 0 || blkp->bi_inode >= maxino) {
			error = EINVAL;
			goto err3;
		}
		/*
		 * Get the IFILE entry (only once) and see if the file still
		 * exists.
		 */
		if (lastino != blkp->bi_inode) {
			/*
			 * Finish the old file, if there was one.  The presence
			 * of a usable vnode in vp is signaled by a valid v_daddr.
			 */
			if (v_daddr != LFS_UNUSED_DADDR) {
				lfs_vunref(vp);
				numrefed--;
			}

			/*
			 * Start a new file
			 */
			lastino = blkp->bi_inode;
			if (blkp->bi_inode == LFS_IFILE_INUM)
				v_daddr = fs->lfs_idaddr;
			else {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				/* XXX fix for force write */
				v_daddr = ifp->if_daddr;
				brelse(bp, 0);
			}
			if (v_daddr == LFS_UNUSED_DADDR)
				continue;

			/* Get the vnode/inode. */
			error = lfs_fastvget(mntp, blkp->bi_inode, v_daddr,
			    &vp,
			    (blkp->bi_lbn == LFS_UNUSED_LBN
				? blkp->bi_bp
				: NULL));

			if (!error) {
				numrefed++;
			}
			if (error) {
				DLOG((DLOG_CLEAN, "lfs_markv: lfs_fastvget"
				    " failed with %d (ino %d, segment %d)\n",
				    error, blkp->bi_inode,
				    dtosn(fs, blkp->bi_daddr)));
				/*
				 * If we got EAGAIN, that means that the
				 * Inode was locked.  This is
				 * recoverable: just clean the rest of
				 * this segment, and let the cleaner try
				 * again with another.  (When the
				 * cleaner runs again, this segment will
				 * sort high on the list, since it is
				 * now almost entirely empty.)  But, we
				 * still set v_daddr = LFS_UNUSED_DADDR
				 * so as not to test this over and over
				 * again.
				 */
				if (error == EAGAIN) {
					error = 0;
					do_again++;
				}
#ifdef DIAGNOSTIC
				else if (error != ENOENT)
					panic("lfs_markv VFS_VGET FAILED");
#endif
				/* lastino = LFS_UNUSED_INUM; */
				v_daddr = LFS_UNUSED_DADDR;
				vp = NULL;
				ip = NULL;
				continue;
			}
			ip = VTOI(vp);
			ninowritten++;
		} else if (v_daddr == LFS_UNUSED_DADDR) {
			/*
			 * This can only happen if the vnode is dead (or,
			 * in any case, we can't get it; e.g. its inode is
			 * locked).  Keep going.
			 */
			continue;
		}

		/* Past this point we are guaranteed that vp, ip are valid. */

		/* Can't clean VU_DIROP directories in case of truncation */
		/* XXX - maybe we should mark removed dirs specially? */
		if (vp->v_type == VDIR && (vp->v_uflag & VU_DIROP)) {
			do_again++;
			continue;
		}

		/* If this BLOCK_INFO didn't contain a block, keep going. */
		if (blkp->bi_lbn == LFS_UNUSED_LBN) {
			/* XXX need to make sure that the inode gets written in this case */
			/* XXX but only write the inode if it's the right one */
			if (blkp->bi_inode != LFS_IFILE_INUM) {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				if (ifp->if_daddr == blkp->bi_daddr) {
					mutex_enter(&lfs_lock);
					LFS_SET_UINO(ip, IN_CLEANING);
					mutex_exit(&lfs_lock);
				}
				brelse(bp, 0);
			}
			continue;
		}

		b_daddr = 0;
		if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &b_daddr, NULL) ||
		    dbtofsb(fs, b_daddr) != blkp->bi_daddr)
		{
			if (dtosn(fs, dbtofsb(fs, b_daddr)) ==
			    dtosn(fs, blkp->bi_daddr))
			{
				DLOG((DLOG_CLEAN, "lfs_markv: wrong da same seg: %llx vs %llx\n",
				    (long long)blkp->bi_daddr, (long long)dbtofsb(fs, b_daddr)));
			}
			do_again++;
			continue;
		}

		/*
		 * Check block sizes.  The blocks being cleaned come from
		 * disk, so they should have the same size as their on-disk
		 * counterparts.
		 */
		if (blkp->bi_lbn >= 0)
			obsize = blksize(fs, ip, blkp->bi_lbn);
		else
			obsize = fs->lfs_bsize;
		/* Check for fragment size change */
		if (blkp->bi_lbn >= 0 && blkp->bi_lbn < NDADDR) {
			obsize = ip->i_lfs_fragsize[blkp->bi_lbn];
		}
		if (obsize != blkp->bi_size) {
			DLOG((DLOG_CLEAN, "lfs_markv: ino %d lbn %lld wrong"
			    " size (%ld != %d), try again\n",
			    blkp->bi_inode, (long long)blkp->bi_lbn,
			    (long) obsize, blkp->bi_size));
			do_again++;
			continue;
		}

		/*
		 * If we get to here, then we are keeping the block.  If
		 * it is an indirect block, we want to actually put it
		 * in the buffer cache so that it can be updated in the
		 * finish_meta section.  If it's not, we need to
		 * allocate a fake buffer so that writeseg can perform
		 * the copyin and write the buffer.
		 */
		if (ip->i_number != LFS_IFILE_INUM && blkp->bi_lbn >= 0) {
			/* Data Block */
			bp = lfs_fakebuf(fs, vp, blkp->bi_lbn,
			    blkp->bi_size, blkp->bi_bp);
			/* Pretend we used bread() to get it */
			bp->b_blkno = fsbtodb(fs, blkp->bi_daddr);
		} else {
			/* Indirect block or ifile */
			if (blkp->bi_size != fs->lfs_bsize &&
			    ip->i_number != LFS_IFILE_INUM)
				panic("lfs_markv: partial indirect block?"
				    " size=%d\n", blkp->bi_size);
			bp = getblk(vp, blkp->bi_lbn, blkp->bi_size, 0, 0);
			if (!(bp->b_oflags & (BO_DONE|BO_DELWRI))) {
				/*
				 * The block in question was not found
				 * in the cache; i.e., the block that
				 * getblk() returned is empty.  So, we
				 * can (and should) copy in the
				 * contents, because we've already
				 * determined that this was the right
				 * version of this block on disk.
				 *
				 * And, it can't have changed underneath
				 * us, because we have the segment lock.
				 */
				error = copyin(blkp->bi_bp, bp->b_data, blkp->bi_size);
				if (error)
					goto err2;
			}
		}
		if ((error = lfs_bwrite_ext(bp, BW_CLEAN)) != 0)
			goto err2;

		nblkwritten++;
		/*
		 * XXX should account indirect blocks and ifile pages as well
		 */
		if (nblkwritten + lblkno(fs, ninowritten * sizeof (struct ufs1_dinode))
		    > LFS_MARKV_MAX_BLOCKS) {
			DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos\n",
			    nblkwritten, ninowritten));
			lfs_segwrite(mntp, SEGM_CLEAN);
			nblkwritten = ninowritten = 0;
		}
	}

	/*
	 * Finish the old file, if there was one
	 */
	if (v_daddr != LFS_UNUSED_DADDR) {
		lfs_vunref(vp);
		numrefed--;
	}

#ifdef DIAGNOSTIC
	if (numrefed != 0)
		panic("lfs_markv: numrefed=%d", numrefed);
#endif
	DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos (check point)\n",
	    nblkwritten, ninowritten));

	/*
	 * The last write has to be SEGM_SYNC, because of calling semantics.
	 * It also has to be SEGM_CKP, because otherwise we could write
	 * over the newly cleaned data contained in a checkpoint, and then
	 * we'd be unhappy at recovery time.
	 */
	lfs_segwrite(mntp, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);

	lfs_segunlock(fs);

	vfs_unbusy(mntp, false);
	if (error)
		return (error);
	else if (do_again)
		return EAGAIN;

	return 0;

err2:
	DLOG((DLOG_CLEAN, "lfs_markv err2\n"));

	/*
	 * XXX we're here because copyin() failed.
	 * XXX it means that we can't trust the cleanerd.  too bad.
	 * XXX how can we recover from this?
	 */

err3:
	KERNEL_UNLOCK_ONE(NULL);
	/*
	 * XXX should do segwrite here anyway?
	 */

	if (v_daddr != LFS_UNUSED_DADDR) {
		lfs_vunref(vp);
		--numrefed;
	}

	lfs_segunlock(fs);
	vfs_unbusy(mntp, false);
#ifdef DIAGNOSTIC
	if (numrefed != 0)
		panic("lfs_markv: numrefed=%d", numrefed);
#endif

	return (error);
}

/*
 * sys_lfs_bmapv:
 *
 * This will fill in the current disk address for arrays of blocks.
 *
 * 0 on success
 * -1/errno is returned on error.
 */
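/*
 * Illustrative sketch only: the cleaner fills the BLOCK_INFO array from
 * the segment summaries it has read and then asks the kernel where each
 * block currently lives, e.g.
 *
 *	if (lfs_bmapv(&fsid, bi, nblocks) == 0)
 *		a block is still live if bi[i].bi_daddr points into the
 *		segment being cleaned;
 *
 * Entries whose inode no longer exists come back with
 * bi_daddr == LFS_UNUSED_DADDR.
 */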
#ifdef USE_64BIT_SYSCALLS
int
sys_lfs_bmapv(struct lwp *l, const struct sys_lfs_bmapv_args *uap, register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */
	BLOCK_INFO *blkiov;
	int blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
	    NULL)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOUFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
		return (EINVAL);
	KERNEL_LOCK(1, NULL);
	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov,
	    blkcnt * sizeof(BLOCK_INFO))) != 0)
		goto out;

	if ((error = lfs_bmapv(l->l_proc, &fsid, blkiov, blkcnt)) == 0)
		copyout(blkiov, SCARG(uap, blkiov),
		    blkcnt * sizeof(BLOCK_INFO));
out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	KERNEL_UNLOCK_ONE(NULL);
	return error;
}
#else
int
sys_lfs_bmapv(struct lwp *l, const struct sys_lfs_bmapv_args *uap, register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */
	BLOCK_INFO *blkiov;
	BLOCK_INFO_15 *blkiov15;
	int i, blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
	    NULL)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOUFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((size_t) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
		return (EINVAL);
	KERNEL_LOCK(1, NULL);
	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov15,
	    blkcnt * sizeof(BLOCK_INFO_15))) != 0)
		goto out;

	for (i = 0; i < blkcnt; i++) {
		blkiov[i].bi_inode = blkiov15[i].bi_inode;
		blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
		blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
		blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
		blkiov[i].bi_version = blkiov15[i].bi_version;
		blkiov[i].bi_bp = blkiov15[i].bi_bp;
		blkiov[i].bi_size = blkiov15[i].bi_size;
	}

	if ((error = lfs_bmapv(l->l_proc, &fsid, blkiov, blkcnt)) == 0) {
		for (i = 0; i < blkcnt; i++) {
			blkiov15[i].bi_inode = blkiov[i].bi_inode;
			blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
			blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
			blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
			blkiov15[i].bi_version = blkiov[i].bi_version;
			blkiov15[i].bi_bp = blkiov[i].bi_bp;
			blkiov15[i].bi_size = blkiov[i].bi_size;
		}
		copyout(blkiov15, SCARG(uap, blkiov),
		    blkcnt * sizeof(BLOCK_INFO_15));
	}
out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	lfs_free(fs, blkiov15, LFS_NB_BLKIOV);
	KERNEL_UNLOCK_ONE(NULL);
	return error;
}
#endif

int
lfs_bmapv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
{
	BLOCK_INFO *blkp;
	IFILE *ifp;
	struct buf *bp;
	struct inode *ip = NULL;
	struct lfs *fs;
	struct mount *mntp;
	struct ufsmount *ump;
	struct vnode *vp;
	ino_t lastino;
	daddr_t v_daddr;
	int cnt, error;
	int numrefed = 0;

	lfs_cleaner_pid = p->p_pid;

	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		return (ENOENT);

	ump = VFSTOUFS(mntp);
	if ((error = vfs_trybusy(mntp, RW_READER, NULL)) != 0)
		return (error);

	cnt = blkcnt;

	fs = VFSTOUFS(mntp)->um_lfs;

	error = 0;

	/* these were inside the initialization for the for loop */
	v_daddr = LFS_UNUSED_DADDR;
	lastino = LFS_UNUSED_INUM;
	for (blkp = blkiov; cnt--; ++blkp)
	{
		/*
		 * Get the IFILE entry (only once) and see if the file still
		 * exists.
		 */
		if (lastino != blkp->bi_inode) {
			/*
			 * Finish the old file, if there was one.  The presence
			 * of a usable vnode in vp is signaled by a valid
			 * v_daddr.
			 */
			if (v_daddr != LFS_UNUSED_DADDR) {
				lfs_vunref(vp);
				numrefed--;
			}

			/*
			 * Start a new file
			 */
			lastino = blkp->bi_inode;
			if (blkp->bi_inode == LFS_IFILE_INUM)
				v_daddr = fs->lfs_idaddr;
			else {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				v_daddr = ifp->if_daddr;
				brelse(bp, 0);
			}
			if (v_daddr == LFS_UNUSED_DADDR) {
				blkp->bi_daddr = LFS_UNUSED_DADDR;
				continue;
			}
			/*
			 * A regular call to VFS_VGET could deadlock
			 * here.  Instead, we try an unlocked access.
			 */
			mutex_enter(&ufs_ihash_lock);
			vp = ufs_ihashlookup(ump->um_dev, blkp->bi_inode);
			if (vp != NULL && !(vp->v_iflag & VI_XLOCK)) {
				ip = VTOI(vp);
				mutex_enter(&vp->v_interlock);
				mutex_exit(&ufs_ihash_lock);
				if (lfs_vref(vp)) {
					v_daddr = LFS_UNUSED_DADDR;
					continue;
				}
				numrefed++;
			} else {
				mutex_exit(&ufs_ihash_lock);
				/*
				 * Don't VFS_VGET if we're being unmounted,
				 * since we hold vfs_busy().
				 */
				if (mntp->mnt_iflag & IMNT_UNMOUNT) {
					v_daddr = LFS_UNUSED_DADDR;
					continue;
				}
				error = VFS_VGET(mntp, blkp->bi_inode, &vp);
				if (error) {
					DLOG((DLOG_CLEAN, "lfs_bmapv: vget ino %d"
					    " failed with %d", blkp->bi_inode, error));
					v_daddr = LFS_UNUSED_DADDR;
					continue;
				} else {
					KASSERT(VOP_ISLOCKED(vp));
					VOP_UNLOCK(vp, 0);
					numrefed++;
				}
			}
			ip = VTOI(vp);
		} else if (v_daddr == LFS_UNUSED_DADDR) {
			/*
			 * This can only happen if the vnode is dead.
			 * Keep going.  Note that we DO NOT set the
			 * bi_addr to anything -- if we failed to get
			 * the vnode, for example, we want to assume
			 * conservatively that all of its blocks *are*
			 * located in the segment in question.
			 * lfs_markv will throw them out if we are
			 * wrong.
			 */
			/* blkp->bi_daddr = LFS_UNUSED_DADDR; */
			continue;
		}

		/* Past this point we are guaranteed that vp, ip are valid. */

		if (blkp->bi_lbn == LFS_UNUSED_LBN) {
			/*
			 * We just want the inode address, which is
			 * conveniently in v_daddr.
			 */
			blkp->bi_daddr = v_daddr;
		} else {
			daddr_t bi_daddr;

			/* XXX ondisk32 */
			error = VOP_BMAP(vp, blkp->bi_lbn, NULL,
			    &bi_daddr, NULL);
			if (error)
			{
				blkp->bi_daddr = LFS_UNUSED_DADDR;
				continue;
			}
			blkp->bi_daddr = dbtofsb(fs, bi_daddr);
			/* Fill in the block size, too */
			if (blkp->bi_lbn >= 0)
				blkp->bi_size = blksize(fs, ip, blkp->bi_lbn);
			else
				blkp->bi_size = fs->lfs_bsize;
		}
	}

	/*
	 * Finish the old file, if there was one.  The presence
	 * of a usable vnode in vp is signaled by a valid v_daddr.
	 */
	if (v_daddr != LFS_UNUSED_DADDR) {
		lfs_vunref(vp);
		numrefed--;
	}

#ifdef DIAGNOSTIC
	if (numrefed != 0)
		panic("lfs_bmapv: numrefed=%d", numrefed);
#endif

	vfs_unbusy(mntp, false);

	return 0;
}

/*
 * sys_lfs_segclean:
 *
 * Mark the segment clean.
 *
 * 0 on success
 * -1/errno is returned on error.
 */
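/*
 * In the usual cleaning cycle this is called only after lfs_markv() has
 * rewritten all live data from the segment and checkpointed, so that
 * su_nbytes has dropped to zero; lfs_do_segclean() below refuses to clean
 * a segment that is active or still has live bytes.
 */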
int
sys_lfs_segclean(struct lwp *l, const struct sys_lfs_segclean_args *uap, register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(u_long) segment;
	} */
	struct lfs *fs;
	struct mount *mntp;
	fsid_t fsid;
	int error;
	unsigned long segnum;

	if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
	    NULL)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);
	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);

	fs = VFSTOUFS(mntp)->um_lfs;
	segnum = SCARG(uap, segment);

	if ((error = vfs_trybusy(mntp, RW_READER, NULL)) != 0)
		return (error);

	KERNEL_LOCK(1, NULL);
	lfs_seglock(fs, SEGM_PROT);
	error = lfs_do_segclean(fs, segnum);
	lfs_segunlock(fs);
	KERNEL_UNLOCK_ONE(NULL);
	vfs_unbusy(mntp, false);
	return error;
}

/*
 * Actually mark the segment clean.
 * Must be called with the segment lock held.
 */
int
lfs_do_segclean(struct lfs *fs, unsigned long segnum)
{
	extern int lfs_dostats;
	struct buf *bp;
	CLEANERINFO *cip;
	SEGUSE *sup;

	if (dtosn(fs, fs->lfs_curseg) == segnum) {
		return (EBUSY);
	}

	LFS_SEGENTRY(sup, fs, segnum, bp);
	if (sup->su_nbytes) {
		DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
		    " %d live bytes\n", segnum, sup->su_nbytes));
		brelse(bp, 0);
		return (EBUSY);
	}
	if (sup->su_flags & SEGUSE_ACTIVE) {
		DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
		    " segment is active\n", segnum));
		brelse(bp, 0);
		return (EBUSY);
	}
	if (!(sup->su_flags & SEGUSE_DIRTY)) {
		DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
		    " segment is already clean\n", segnum));
		brelse(bp, 0);
		return (EALREADY);
	}

	fs->lfs_avail += segtod(fs, 1);
	if (sup->su_flags & SEGUSE_SUPERBLOCK)
		fs->lfs_avail -= btofsb(fs, LFS_SBPAD);
	if (fs->lfs_version > 1 && segnum == 0 &&
	    fs->lfs_start < btofsb(fs, LFS_LABELPAD))
		fs->lfs_avail -= btofsb(fs, LFS_LABELPAD) - fs->lfs_start;
	mutex_enter(&lfs_lock);
	fs->lfs_bfree += sup->su_nsums * btofsb(fs, fs->lfs_sumsize) +
		btofsb(fs, sup->su_ninos * fs->lfs_ibsize);
	fs->lfs_dmeta -= sup->su_nsums * btofsb(fs, fs->lfs_sumsize) +
		btofsb(fs, sup->su_ninos * fs->lfs_ibsize);
	if (fs->lfs_dmeta < 0)
		fs->lfs_dmeta = 0;
	mutex_exit(&lfs_lock);
	sup->su_flags &= ~SEGUSE_DIRTY;
	LFS_WRITESEGENTRY(sup, fs, segnum, bp);

	LFS_CLEANERINFO(cip, fs, bp);
	++cip->clean;
	--cip->dirty;
	fs->lfs_nclean = cip->clean;
	cip->bfree = fs->lfs_bfree;
	mutex_enter(&lfs_lock);
	cip->avail = fs->lfs_avail - fs->lfs_ravail - fs->lfs_favail;
	wakeup(&fs->lfs_avail);
	mutex_exit(&lfs_lock);
	(void) LFS_BWRITE_LOG(bp);

	if (lfs_dostats)
		++lfs_stats.segs_reclaimed;

	return (0);
}

/*
 * This will block until a segment in file system fsid is written.  A timeout
 * in milliseconds may be specified which will awake the cleaner automatically.
 * An fsid of -1 means any file system, and a timeout of 0 means forever.
 */
int
lfs_segwait(fsid_t *fsidp, struct timeval *tv)
{
	struct mount *mntp;
	void *addr;
	u_long timeout;
	int error;

	KERNEL_LOCK(1, NULL);
	if (fsidp == NULL || (mntp = vfs_getvfs(fsidp)) == NULL)
		addr = &lfs_allclean_wakeup;
	else
		addr = &VFSTOUFS(mntp)->um_lfs->lfs_nextseg;
	/*
	 * XXX THIS COULD SLEEP FOREVER IF TIMEOUT IS {0,0}!
	 * XXX IS THAT WHAT IS INTENDED?
	 */
	timeout = tvtohz(tv);
	error = tsleep(addr, PCATCH | PVFS, "segment", timeout);
	KERNEL_UNLOCK_ONE(NULL);
	return (error == ERESTART ? EINTR : 0);
}

/*
 * sys_lfs_segwait:
 *
 * System call wrapper around lfs_segwait().
 *
 * 0 on success
 * 1 on timeout
 * -1/errno is returned on error.
 */
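/*
 * Illustrative sketch only: the cleaner typically sleeps here between
 * cleaning passes, e.g.
 *
 *	struct timeval tv = { 300, 0 };
 *	lfs_segwait(&fsid, &tv);	wakes on segment write or timeout
 *
 * Passing a zeroed timeval sleeps until a segment is written (see the
 * XXX note in lfs_segwait() above).
 */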
int
sys_lfs_segwait(struct lwp *l, const struct sys_lfs_segwait_args *uap, register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct timeval *) tv;
	} */
	struct timeval atv;
	fsid_t fsid;
	int error;

	/* XXX need we be su to segwait? */
	if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
	    NULL)) != 0)
		return (error);
	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if (SCARG(uap, tv)) {
		error = copyin(SCARG(uap, tv), &atv, sizeof(struct timeval));
		if (error)
			return (error);
		if (itimerfix(&atv))
			return (EINVAL);
	} else /* NULL or invalid */
		atv.tv_sec = atv.tv_usec = 0;
	return lfs_segwait(&fsid, &atv);
}

/*
 * VFS_VGET call specialized for the cleaner.  The cleaner already knows the
 * daddr from the ifile, so don't look it up again.  If the cleaner is
 * processing IINFO structures, it may have the ondisk inode already, so
 * don't go retrieving it again.
 *
 * We lfs_vref here; it is the caller's responsibility to lfs_vunref
 * when finished.
 */
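/*
 * In other words, on success *vpp comes back referenced but unlocked.
 * When the caller supplies the on-disk inode (dinp != NULL, taken from
 * the segment being cleaned) it is copied in directly; otherwise the
 * inode block at daddr is read in with bread().
 */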
extern kmutex_t ufs_hashlock;

int
lfs_fasthashget(dev_t dev, ino_t ino, struct vnode **vpp)
{
	struct vnode *vp;

	mutex_enter(&ufs_ihash_lock);
	if ((vp = ufs_ihashlookup(dev, ino)) != NULL) {
		mutex_enter(&vp->v_interlock);
		mutex_exit(&ufs_ihash_lock);
		if (vp->v_iflag & VI_XLOCK) {
			DLOG((DLOG_CLEAN, "lfs_fastvget: ino %d VI_XLOCK\n",
			    ino));
			lfs_stats.clean_vnlocked++;
			mutex_exit(&vp->v_interlock);
			return EAGAIN;
		}
		if (lfs_vref(vp)) {
			DLOG((DLOG_CLEAN, "lfs_fastvget: lfs_vref failed"
			    " for ino %d\n", ino));
			lfs_stats.clean_inlocked++;
			return EAGAIN;
		}
	} else {
		mutex_exit(&ufs_ihash_lock);
	}
	*vpp = vp;

	return (0);
}

int
lfs_fastvget(struct mount *mp, ino_t ino, daddr_t daddr, struct vnode **vpp,
	struct ufs1_dinode *dinp)
{
	struct inode *ip;
	struct ufs1_dinode *dip;
	struct vnode *vp;
	struct ufsmount *ump;
	dev_t dev;
	int error, retries;
	struct buf *bp;
	struct lfs *fs;

	ump = VFSTOUFS(mp);
	dev = ump->um_dev;
	fs = ump->um_lfs;

	/*
	 * Wait until the filesystem is fully mounted before allowing vget
	 * to complete.  This prevents possible problems with roll-forward.
	 */
	mutex_enter(&lfs_lock);
	while (fs->lfs_flags & LFS_NOTYET) {
		mtsleep(&fs->lfs_flags, PRIBIO+1, "lfs_fnotyet", 0,
		    &lfs_lock);
	}
	mutex_exit(&lfs_lock);

	/*
	 * This is playing fast and loose.  Someone may have the inode
	 * locked, in which case they are going to be distinctly unhappy
	 * if we trash something.
	 */

	error = lfs_fasthashget(dev, ino, vpp);
	if (error != 0 || *vpp != NULL)
		return (error);

	/*
	 * getnewvnode(9) will call vfs_busy, which will block if the
	 * filesystem is being unmounted; but umount(9) is waiting for
	 * us because we're already holding the fs busy.
	 * XXXMP
	 */
	if (mp->mnt_iflag & IMNT_UNMOUNT) {
		*vpp = NULL;
		return EDEADLK;
	}
	if ((error = getnewvnode(VT_LFS, mp, lfs_vnodeop_p, &vp)) != 0) {
		*vpp = NULL;
		return (error);
	}

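	/*
	 * getnewvnode() may have slept, so look in the hash table once
	 * more, this time holding ufs_hashlock, in case another thread
	 * created the vnode for this inode in the meantime.
	 */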
	mutex_enter(&ufs_hashlock);
	error = lfs_fasthashget(dev, ino, vpp);
	if (error != 0 || *vpp != NULL) {
		mutex_exit(&ufs_hashlock);
		ungetnewvnode(vp);
		return (error);
	}

	/* Allocate new vnode/inode. */
	lfs_vcreate(mp, ino, vp);

	/*
	 * Put it onto its hash chain and lock it so that other requests for
	 * this inode will block if they arrive while we are sleeping waiting
	 * for old data structures to be purged or for the contents of the
	 * disk portion of this inode to be read.
	 */
	ip = VTOI(vp);
	ufs_ihashins(ip);
	mutex_exit(&ufs_hashlock);

	/*
	 * XXX
	 * This may not need to be here, logically it should go down with
	 * the i_devvp initialization.
	 * Ask Kirk.
	 */
	ip->i_lfs = fs;

	/* Read in the disk contents for the inode, copy into the inode. */
	if (dinp) {
		error = copyin(dinp, ip->i_din.ffs1_din, sizeof (struct ufs1_dinode));
		if (error) {
			DLOG((DLOG_CLEAN, "lfs_fastvget: dinode copyin failed"
			    " for ino %d\n", ino));
			ufs_ihashrem(ip);

			/* Unlock and discard unneeded inode. */
			vlockmgr(&vp->v_lock, LK_RELEASE);
			lfs_vunref(vp);
			*vpp = NULL;
			return (error);
		}
		if (ip->i_number != ino)
			panic("lfs_fastvget: I was fed the wrong inode!");
	} else {
		retries = 0;
	again:
		error = bread(ump->um_devvp, fsbtodb(fs, daddr), fs->lfs_ibsize,
		    NOCRED, &bp);
		if (error) {
			DLOG((DLOG_CLEAN, "lfs_fastvget: bread failed (%d)\n",
			    error));
			/*
			 * The inode does not contain anything useful, so it
			 * would be misleading to leave it on its hash chain.
			 * Iput() will return it to the free list.
			 */
			ufs_ihashrem(ip);

			/* Unlock and discard unneeded inode. */
			vlockmgr(&vp->v_lock, LK_RELEASE);
			lfs_vunref(vp);
			brelse(bp, 0);
			*vpp = NULL;
			return (error);
		}
		dip = lfs_ifind(ump->um_lfs, ino, bp);
		if (dip == NULL) {
			/* Assume write has not completed yet; try again */
			brelse(bp, BC_INVAL);
			++retries;
			if (retries > LFS_IFIND_RETRIES)
				panic("lfs_fastvget: dinode not found");
			DLOG((DLOG_CLEAN, "lfs_fastvget: dinode not found,"
			    " retrying...\n"));
			goto again;
		}
		*ip->i_din.ffs1_din = *dip;
		brelse(bp, 0);
	}
	lfs_vinit(mp, &vp);

	*vpp = vp;

	KASSERT(VOP_ISLOCKED(vp));
	VOP_UNLOCK(vp, 0);

	return (0);
}

/*
 * Make up a "fake" cleaner buffer, copy the data from userland into it.
 */
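/*
 * The resulting LFS_NB_CLEAN buffer carries the cleaner's copy of the
 * data through the segment writer without going through the regular
 * buffer cache.  Returns NULL if the copyin of the user data fails.
 */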
struct buf *
lfs_fakebuf(struct lfs *fs, struct vnode *vp, int lbn, size_t size, void *uaddr)
{
	struct buf *bp;
	int error;

	KASSERT(VTOI(vp)->i_number != LFS_IFILE_INUM);

	bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, size, LFS_NB_CLEAN);
	error = copyin(uaddr, bp->b_data, size);
	if (error) {
		lfs_freebuf(fs, bp);
		return NULL;
	}
	KDASSERT(bp->b_iodone == lfs_callback);

#if 0
	mutex_enter(&lfs_lock);
	++fs->lfs_iocount;
	mutex_exit(&lfs_lock);
#endif
	bp->b_bufsize = size;
	bp->b_bcount = size;
	return (bp);
}