/*	$NetBSD: lfs_syscalls.c,v 1.127 2008/01/30 09:50:27 ad Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Konrad E. Schroder <perseant@hhhh.org>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 1991, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)lfs_syscalls.c	8.10 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.127 2008/01/30 09:50:27 ad Exp $");

#ifndef LFS
# define LFS		/* for prototypes in syscallargs.h */
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/syscallargs.h>

#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_extern.h>

struct buf *lfs_fakebuf(struct lfs *, struct vnode *, int, size_t, void *);
int lfs_fasthashget(dev_t, ino_t, struct vnode **);

pid_t lfs_cleaner_pid = 0;

/*
 * sys_lfs_markv:
 *
 * This will mark inodes and blocks dirty, so they are written into the log.
 * It will block until all the blocks have been written.  The segment create
 * time passed in the block_info and inode_info structures is used to decide
 * if the data is valid for each block (in case some process dirtied a block
 * or inode that is being cleaned between the determination that a block is
 * live and the lfs_markv call).
 *
 * 0 on success
 * -1/errno is returned on error.
 */
#ifdef USE_64BIT_SYSCALLS
int
sys_lfs_markv(struct lwp *l, const struct sys_lfs_markv_args *uap, register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */
	BLOCK_INFO *blkiov;
	int blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
	    NULL)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOUFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
		return (EINVAL);

	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov,
			    blkcnt * sizeof(BLOCK_INFO))) != 0)
		goto out;

	if ((error = lfs_markv(l->l_proc, &fsid, blkiov, blkcnt)) == 0)
		copyout(blkiov, SCARG(uap, blkiov),
			blkcnt * sizeof(BLOCK_INFO));
    out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	return error;
}
#else
int
sys_lfs_markv(struct lwp *l, const struct sys_lfs_markv_args *uap, register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */
	BLOCK_INFO *blkiov;
	BLOCK_INFO_15 *blkiov15;
	int i, blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
	    NULL)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOUFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
		return (EINVAL);

	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov15,
			    blkcnt * sizeof(BLOCK_INFO_15))) != 0)
		goto out;

	for (i = 0; i < blkcnt; i++) {
		blkiov[i].bi_inode = blkiov15[i].bi_inode;
		blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
		blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
		blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
		blkiov[i].bi_version = blkiov15[i].bi_version;
		blkiov[i].bi_bp = blkiov15[i].bi_bp;
		blkiov[i].bi_size = blkiov15[i].bi_size;
	}

	if ((error = lfs_markv(l->l_proc, &fsid, blkiov, blkcnt)) == 0) {
		for (i = 0; i < blkcnt; i++) {
			blkiov15[i].bi_inode = blkiov[i].bi_inode;
			blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
			blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
			blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
			blkiov15[i].bi_version = blkiov[i].bi_version;
			blkiov15[i].bi_bp = blkiov[i].bi_bp;
			blkiov15[i].bi_size = blkiov[i].bi_size;
		}
		copyout(blkiov15, SCARG(uap, blkiov),
			blkcnt * sizeof(BLOCK_INFO_15));
	}
    out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	lfs_free(fs, blkiov15, LFS_NB_BLKIOV);
	return error;
}
#endif

#define	LFS_MARKV_MAX_BLOCKS	(LFS_MAX_BUFS)

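/*
 * Guts of the markv operation, shared by both flavours of sys_lfs_markv
 * above.  Walk the cleaner's BLOCK_INFO array: for each entry, fetch the
 * inode, check that the block still lives at the disk address the cleaner
 * saw, and if so re-dirty it (through a fake or real buffer) so that it is
 * rewritten into a clean segment.  Entries that have moved, shrunk, or
 * whose vnode could not be obtained are skipped, and the call returns
 * EAGAIN so the cleaner knows to try that segment again.
 */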
int
lfs_markv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov,
    int blkcnt)
{
	BLOCK_INFO *blkp;
	IFILE *ifp;
	struct buf *bp;
	struct inode *ip = NULL;
	struct lfs *fs;
	struct mount *mntp;
	struct vnode *vp = NULL;
	ino_t lastino;
	daddr_t b_daddr, v_daddr;
	int cnt, error;
	int do_again = 0;
	int numrefed = 0;
	ino_t maxino;
	size_t obsize;

	/* number of blocks/inodes that we have already bwrite'ed */
	int nblkwritten, ninowritten;

	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		return (ENOENT);

	fs = VFSTOUFS(mntp)->um_lfs;

	if (fs->lfs_ronly)
		return EROFS;

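	/*
	 * The highest inode number the ifile can currently describe:
	 * ifile data blocks (total ifile blocks less the cleaner-info
	 * and segment-table blocks) times IFILE entries per block.
	 */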
	maxino = (fragstoblks(fs, fsbtofrags(fs, VTOI(fs->lfs_ivnode)->i_ffs1_blocks)) -
		      fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb;

	cnt = blkcnt;

	if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0)
		return (error);

	/*
	 * Take the segment lock so that, even though we may have to
	 * sleep, the blocks we are cleaning cannot become invalid
	 * underneath us.
	 *
	 * It is also important to note here that unless we specify SEGM_CKP,
	 * any Ifile blocks that we might be asked to clean will never get
	 * to the disk.
	 */
	lfs_seglock(fs, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);

	/* Mark blocks/inodes dirty.  */
	error = 0;

	/* these were inside the initialization for the for loop */
	v_daddr = LFS_UNUSED_DADDR;
	lastino = LFS_UNUSED_INUM;
	nblkwritten = ninowritten = 0;
	for (blkp = blkiov; cnt--; ++blkp)
	{
		/* Bounds-check incoming data, avoid panic for failed VGET */
		if (blkp->bi_inode <= 0 || blkp->bi_inode >= maxino) {
			error = EINVAL;
			goto err3;
		}
		/*
		 * Get the IFILE entry (only once) and see if the file still
		 * exists.
		 */
		if (lastino != blkp->bi_inode) {
			/*
			 * Finish the old file, if there was one.  The presence
			 * of a usable vnode in vp is signaled by a valid v_daddr.
			 */
			if (v_daddr != LFS_UNUSED_DADDR) {
				lfs_vunref(vp);
				numrefed--;
			}

			/*
			 * Start a new file
			 */
			lastino = blkp->bi_inode;
			if (blkp->bi_inode == LFS_IFILE_INUM)
				v_daddr = fs->lfs_idaddr;
			else {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				/* XXX fix for force write */
				v_daddr = ifp->if_daddr;
				brelse(bp, 0);
			}
			if (v_daddr == LFS_UNUSED_DADDR)
				continue;

			/* Get the vnode/inode. */
			error = lfs_fastvget(mntp, blkp->bi_inode, v_daddr,
					     &vp,
					     (blkp->bi_lbn == LFS_UNUSED_LBN
					      ? blkp->bi_bp
					      : NULL));

			if (!error) {
				numrefed++;
			}
			if (error) {
				DLOG((DLOG_CLEAN, "lfs_markv: lfs_fastvget"
				      " failed with %d (ino %d, segment %d)\n",
				      error, blkp->bi_inode,
				      dtosn(fs, blkp->bi_daddr)));
				/*
				 * If we got EAGAIN, that means that the
				 * Inode was locked.  This is
				 * recoverable: just clean the rest of
				 * this segment, and let the cleaner try
				 * again with another.	(When the
				 * cleaner runs again, this segment will
				 * sort high on the list, since it is
				 * now almost entirely empty.)	But, we
				 * still set v_daddr = LFS_UNUSED_DADDR
				 * so as not to test this over and over
				 * again.
				 */
				if (error == EAGAIN) {
					error = 0;
					do_again++;
				}
#ifdef DIAGNOSTIC
				else if (error != ENOENT)
					panic("lfs_markv VFS_VGET FAILED");
#endif
				/* lastino = LFS_UNUSED_INUM; */
				v_daddr = LFS_UNUSED_DADDR;
				vp = NULL;
				ip = NULL;
				continue;
			}
			ip = VTOI(vp);
			ninowritten++;
		} else if (v_daddr == LFS_UNUSED_DADDR) {
			/*
			 * This can only happen if the vnode is dead (or
			 * in any case we can't get it...e.g., it is
			 * locked).  Keep going.
			 */
			continue;
		}

		/* Past this point we are guaranteed that vp, ip are valid. */

		/* Can't clean VU_DIROP directories in case of truncation */
		/* XXX - maybe we should mark removed dirs specially? */
		if (vp->v_type == VDIR && (vp->v_uflag & VU_DIROP)) {
			do_again++;
			continue;
		}

		/* If this BLOCK_INFO didn't contain a block, keep going. */
		if (blkp->bi_lbn == LFS_UNUSED_LBN) {
			/* XXX need to make sure that the inode gets written in this case */
			/* XXX but only write the inode if it's the right one */
			if (blkp->bi_inode != LFS_IFILE_INUM) {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				if (ifp->if_daddr == blkp->bi_daddr) {
					mutex_enter(&lfs_lock);
					LFS_SET_UINO(ip, IN_CLEANING);
					mutex_exit(&lfs_lock);
				}
				brelse(bp, 0);
			}
			continue;
		}

		b_daddr = 0;
		if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &b_daddr, NULL) ||
		    dbtofsb(fs, b_daddr) != blkp->bi_daddr)
		{
			if (dtosn(fs, dbtofsb(fs, b_daddr)) ==
			    dtosn(fs, blkp->bi_daddr))
			{
				DLOG((DLOG_CLEAN, "lfs_markv: wrong da same seg: %llx vs %llx\n",
				      (long long)blkp->bi_daddr, (long long)dbtofsb(fs, b_daddr)));
			}
			do_again++;
			continue;
		}

		/*
		 * Check block sizes.  The blocks being cleaned come from
		 * disk, so they should have the same size as their on-disk
		 * counterparts.
		 */
		if (blkp->bi_lbn >= 0)
			obsize = blksize(fs, ip, blkp->bi_lbn);
		else
			obsize = fs->lfs_bsize;
		/* Check for fragment size change */
		if (blkp->bi_lbn >= 0 && blkp->bi_lbn < NDADDR) {
			obsize = ip->i_lfs_fragsize[blkp->bi_lbn];
		}
		if (obsize != blkp->bi_size) {
			DLOG((DLOG_CLEAN, "lfs_markv: ino %d lbn %lld wrong"
			      " size (%ld != %d), try again\n",
			      blkp->bi_inode, (long long)blkp->bi_lbn,
			      (long) obsize, blkp->bi_size));
			do_again++;
			continue;
		}

		/*
		 * If we get to here, then we are keeping the block.  If
		 * it is an indirect block, we want to actually put it
		 * in the buffer cache so that it can be updated in the
		 * finish_meta section.  If it's not, we need to
		 * allocate a fake buffer so that writeseg can perform
		 * the copyin and write the buffer.
		 */
		if (ip->i_number != LFS_IFILE_INUM && blkp->bi_lbn >= 0) {
			/* Data Block */
			bp = lfs_fakebuf(fs, vp, blkp->bi_lbn,
					 blkp->bi_size, blkp->bi_bp);
			/* Pretend we used bread() to get it */
			bp->b_blkno = fsbtodb(fs, blkp->bi_daddr);
		} else {
			/* Indirect block or ifile */
			if (blkp->bi_size != fs->lfs_bsize &&
			    ip->i_number != LFS_IFILE_INUM)
				panic("lfs_markv: partial indirect block?"
				    " size=%d\n", blkp->bi_size);
			bp = getblk(vp, blkp->bi_lbn, blkp->bi_size, 0, 0);
			if (!(bp->b_oflags & (BO_DONE|BO_DELWRI))) {
				/*
				 * The block in question was not found
				 * in the cache; i.e., the block that
				 * getblk() returned is empty.	So, we
				 * can (and should) copy in the
				 * contents, because we've already
				 * determined that this was the right
				 * version of this block on disk.
				 *
				 * And, it can't have changed underneath
				 * us, because we have the segment lock.
				 */
				error = copyin(blkp->bi_bp, bp->b_data, blkp->bi_size);
				if (error)
					goto err2;
			}
		}
		if ((error = lfs_bwrite_ext(bp, BW_CLEAN)) != 0)
			goto err2;

		nblkwritten++;
		/*
		 * XXX should account indirect blocks and ifile pages as well
		 */
		if (nblkwritten + lblkno(fs, ninowritten * sizeof (struct ufs1_dinode))
		    > LFS_MARKV_MAX_BLOCKS) {
			DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos\n",
			      nblkwritten, ninowritten));
			lfs_segwrite(mntp, SEGM_CLEAN);
			nblkwritten = ninowritten = 0;
		}
	}

	/*
	 * Finish the old file, if there was one
	 */
	if (v_daddr != LFS_UNUSED_DADDR) {
		lfs_vunref(vp);
		numrefed--;
	}

#ifdef DIAGNOSTIC
	if (numrefed != 0)
		panic("lfs_markv: numrefed=%d", numrefed);
#endif
	DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos (check point)\n",
	      nblkwritten, ninowritten));

	/*
	 * The last write has to be SEGM_SYNC, because of calling semantics.
	 * It also has to be SEGM_CKP, because otherwise we could write
	 * over the newly cleaned data contained in a checkpoint, and then
	 * we'd be unhappy at recovery time.
	 */
	lfs_segwrite(mntp, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);

	lfs_segunlock(fs);

	vfs_unbusy(mntp);
	if (error)
		return (error);
	else if (do_again)
		return EAGAIN;

	return 0;

err2:
	DLOG((DLOG_CLEAN, "lfs_markv err2\n"));

	/*
	 * XXX we're here because copyin() failed.
	 * XXX it means that we can't trust the cleanerd.  too bad.
	 * XXX how can we recover from this?
	 */

err3:
	/*
	 * XXX should do segwrite here anyway?
	 */

	if (v_daddr != LFS_UNUSED_DADDR) {
		lfs_vunref(vp);
		--numrefed;
	}

	lfs_segunlock(fs);
	vfs_unbusy(mntp);
#ifdef DIAGNOSTIC
	if (numrefed != 0)
		panic("lfs_markv: numrefed=%d", numrefed);
#endif

	return (error);
}

/*
 * sys_lfs_bmapv:
 *
 * This will fill in the current disk address for arrays of blocks.
 *
 * 0 on success
 * -1/errno is returned on error.
 */
#ifdef USE_64BIT_SYSCALLS
int
sys_lfs_bmapv(struct lwp *l, const struct sys_lfs_bmapv_args *uap, register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */
	BLOCK_INFO *blkiov;
	int blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
	    NULL)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOUFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
		return (EINVAL);
	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov,
			    blkcnt * sizeof(BLOCK_INFO))) != 0)
		goto out;

	if ((error = lfs_bmapv(l->l_proc, &fsid, blkiov, blkcnt)) == 0)
		copyout(blkiov, SCARG(uap, blkiov),
			blkcnt * sizeof(BLOCK_INFO));
    out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	return error;
}
#else
int
sys_lfs_bmapv(struct lwp *l, const struct sys_lfs_bmapv_args *uap, register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */
	BLOCK_INFO *blkiov;
	BLOCK_INFO_15 *blkiov15;
	int i, blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
	    NULL)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOUFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((size_t) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
		return (EINVAL);
	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov15,
			    blkcnt * sizeof(BLOCK_INFO_15))) != 0)
		goto out;

	for (i = 0; i < blkcnt; i++) {
		blkiov[i].bi_inode = blkiov15[i].bi_inode;
		blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
		blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
		blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
		blkiov[i].bi_version = blkiov15[i].bi_version;
		blkiov[i].bi_bp = blkiov15[i].bi_bp;
		blkiov[i].bi_size = blkiov15[i].bi_size;
	}

	if ((error = lfs_bmapv(l->l_proc, &fsid, blkiov, blkcnt)) == 0) {
		for (i = 0; i < blkcnt; i++) {
			blkiov15[i].bi_inode = blkiov[i].bi_inode;
			blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
			blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
			blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
			blkiov15[i].bi_version = blkiov[i].bi_version;
			blkiov15[i].bi_bp = blkiov[i].bi_bp;
			blkiov15[i].bi_size = blkiov[i].bi_size;
		}
		copyout(blkiov15, SCARG(uap, blkiov),
			blkcnt * sizeof(BLOCK_INFO_15));
	}
    out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	lfs_free(fs, blkiov15, LFS_NB_BLKIOV);
	return error;
}
#endif

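/*
 * Guts of the bmapv operation, shared by both flavours of sys_lfs_bmapv
 * above.  For each BLOCK_INFO entry, look up the inode (trying an unlocked
 * hash lookup first, to avoid deadlocking in VFS_VGET) and fill in
 * bi_daddr with the block's current disk address (or with the inode's own
 * disk address when bi_lbn is LFS_UNUSED_LBN), along with bi_size.
 * Entries whose inode has been freed, or whose block cannot be mapped,
 * get LFS_UNUSED_DADDR instead.
 */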
int
lfs_bmapv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
{
	BLOCK_INFO *blkp;
	IFILE *ifp;
	struct buf *bp;
	struct inode *ip = NULL;
	struct lfs *fs;
	struct mount *mntp;
	struct ufsmount *ump;
	struct vnode *vp;
	ino_t lastino;
	daddr_t v_daddr;
	int cnt, error;
	int numrefed = 0;

	lfs_cleaner_pid = p->p_pid;

	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		return (ENOENT);

	ump = VFSTOUFS(mntp);
	if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0)
		return (error);

	cnt = blkcnt;

	fs = VFSTOUFS(mntp)->um_lfs;

	error = 0;

	/* these were inside the initialization for the for loop */
	v_daddr = LFS_UNUSED_DADDR;
	lastino = LFS_UNUSED_INUM;
	for (blkp = blkiov; cnt--; ++blkp)
	{
		/*
		 * Get the IFILE entry (only once) and see if the file still
		 * exists.
		 */
		if (lastino != blkp->bi_inode) {
			/*
			 * Finish the old file, if there was one.  The presence
			 * of a usable vnode in vp is signaled by a valid
			 * v_daddr.
			 */
			if (v_daddr != LFS_UNUSED_DADDR) {
				lfs_vunref(vp);
				numrefed--;
			}

			/*
			 * Start a new file
			 */
			lastino = blkp->bi_inode;
			if (blkp->bi_inode == LFS_IFILE_INUM)
				v_daddr = fs->lfs_idaddr;
			else {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				v_daddr = ifp->if_daddr;
				brelse(bp, 0);
			}
			if (v_daddr == LFS_UNUSED_DADDR) {
				blkp->bi_daddr = LFS_UNUSED_DADDR;
				continue;
			}
			/*
			 * A regular call to VFS_VGET could deadlock
			 * here.  Instead, we try an unlocked access.
			 */
			mutex_enter(&ufs_ihash_lock);
			vp = ufs_ihashlookup(ump->um_dev, blkp->bi_inode);
			if (vp != NULL && !(vp->v_iflag & VI_XLOCK)) {
				ip = VTOI(vp);
				mutex_enter(&vp->v_interlock);
				mutex_exit(&ufs_ihash_lock);
				if (lfs_vref(vp)) {
					v_daddr = LFS_UNUSED_DADDR;
					continue;
				}
				numrefed++;
			} else {
				mutex_exit(&ufs_ihash_lock);
				/*
				 * Don't VFS_VGET if we're being unmounted,
				 * since we hold vfs_busy().
				 */
				if (mntp->mnt_iflag & IMNT_UNMOUNT) {
					v_daddr = LFS_UNUSED_DADDR;
					continue;
				}
				error = VFS_VGET(mntp, blkp->bi_inode, &vp);
				if (error) {
					DLOG((DLOG_CLEAN, "lfs_bmapv: vget ino"
					      " %d failed with %d\n",
					      blkp->bi_inode, error));
					v_daddr = LFS_UNUSED_DADDR;
					continue;
				} else {
					KASSERT(VOP_ISLOCKED(vp));
					VOP_UNLOCK(vp, 0);
					numrefed++;
				}
			}
			ip = VTOI(vp);
		} else if (v_daddr == LFS_UNUSED_DADDR) {
			/*
			 * This can only happen if the vnode is dead.
			 * Keep going.	Note that we DO NOT set the
			 * bi_addr to anything -- if we failed to get
			 * the vnode, for example, we want to assume
			 * conservatively that all of its blocks *are*
			 * located in the segment in question.
			 * lfs_markv will throw them out if we are
			 * wrong.
			 */
			/* blkp->bi_daddr = LFS_UNUSED_DADDR; */
			continue;
		}

		/* Past this point we are guaranteed that vp, ip are valid. */

		if (blkp->bi_lbn == LFS_UNUSED_LBN) {
			/*
			 * We just want the inode address, which is
			 * conveniently in v_daddr.
			 */
			blkp->bi_daddr = v_daddr;
		} else {
			daddr_t bi_daddr;

			/* XXX ondisk32 */
			error = VOP_BMAP(vp, blkp->bi_lbn, NULL,
					 &bi_daddr, NULL);
			if (error)
			{
				blkp->bi_daddr = LFS_UNUSED_DADDR;
				continue;
			}
			blkp->bi_daddr = dbtofsb(fs, bi_daddr);
			/* Fill in the block size, too */
			if (blkp->bi_lbn >= 0)
				blkp->bi_size = blksize(fs, ip, blkp->bi_lbn);
			else
				blkp->bi_size = fs->lfs_bsize;
		}
	}

	/*
	 * Finish the old file, if there was one.  The presence
	 * of a usable vnode in vp is signaled by a valid v_daddr.
	 */
	if (v_daddr != LFS_UNUSED_DADDR) {
		lfs_vunref(vp);
		numrefed--;
	}

#ifdef DIAGNOSTIC
	if (numrefed != 0)
		panic("lfs_bmapv: numrefed=%d", numrefed);
#endif

	vfs_unbusy(mntp);

	return 0;
}

/*
 * sys_lfs_segclean:
 *
 * Mark the segment clean.
 *
 * 0 on success
 * -1/errno is returned on error.
 */
int
sys_lfs_segclean(struct lwp *l, const struct sys_lfs_segclean_args *uap, register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(u_long) segment;
	} */
	struct lfs *fs;
	struct mount *mntp;
	fsid_t fsid;
	int error;
	unsigned long segnum;

	if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
	    NULL)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);
	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);

	fs = VFSTOUFS(mntp)->um_lfs;
	segnum = SCARG(uap, segment);

	if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0)
		return (error);

	lfs_seglock(fs, SEGM_PROT);
	error = lfs_do_segclean(fs, segnum);
	lfs_segunlock(fs);
	vfs_unbusy(mntp);
	return error;
}

/*
 * Actually mark the segment clean.
 * Must be called with the segment lock held.
 */
int
lfs_do_segclean(struct lfs *fs, unsigned long segnum)
{
	extern int lfs_dostats;
	struct buf *bp;
	CLEANERINFO *cip;
	SEGUSE *sup;

	if (dtosn(fs, fs->lfs_curseg) == segnum) {
		return (EBUSY);
	}

	LFS_SEGENTRY(sup, fs, segnum, bp);
	if (sup->su_nbytes) {
		DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
		      " %d live bytes\n", segnum, sup->su_nbytes));
		brelse(bp, 0);
		return (EBUSY);
	}
	if (sup->su_flags & SEGUSE_ACTIVE) {
		DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
		      " segment is active\n", segnum));
		brelse(bp, 0);
		return (EBUSY);
	}
	if (!(sup->su_flags & SEGUSE_DIRTY)) {
		DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
		      " segment is already clean\n", segnum));
		brelse(bp, 0);
		return (EALREADY);
	}

	fs->lfs_avail += segtod(fs, 1);
	if (sup->su_flags & SEGUSE_SUPERBLOCK)
		fs->lfs_avail -= btofsb(fs, LFS_SBPAD);
	if (fs->lfs_version > 1 && segnum == 0 &&
	    fs->lfs_start < btofsb(fs, LFS_LABELPAD))
		fs->lfs_avail -= btofsb(fs, LFS_LABELPAD) - fs->lfs_start;
	mutex_enter(&lfs_lock);
	fs->lfs_bfree += sup->su_nsums * btofsb(fs, fs->lfs_sumsize) +
		btofsb(fs, sup->su_ninos * fs->lfs_ibsize);
	fs->lfs_dmeta -= sup->su_nsums * btofsb(fs, fs->lfs_sumsize) +
		btofsb(fs, sup->su_ninos * fs->lfs_ibsize);
	if (fs->lfs_dmeta < 0)
		fs->lfs_dmeta = 0;
	mutex_exit(&lfs_lock);
	sup->su_flags &= ~SEGUSE_DIRTY;
	LFS_WRITESEGENTRY(sup, fs, segnum, bp);

	LFS_CLEANERINFO(cip, fs, bp);
	++cip->clean;
	--cip->dirty;
	fs->lfs_nclean = cip->clean;
	cip->bfree = fs->lfs_bfree;
	mutex_enter(&lfs_lock);
	cip->avail = fs->lfs_avail - fs->lfs_ravail - fs->lfs_favail;
	wakeup(&fs->lfs_avail);
	mutex_exit(&lfs_lock);
	(void) LFS_BWRITE_LOG(bp);

	if (lfs_dostats)
		++lfs_stats.segs_reclaimed;

	return (0);
}

/*
 * This will block until a segment in file system fsid is written.  A timeout
 * in milliseconds may be specified, which will wake the cleaner automatically.
 * An fsid of -1 means any file system, and a timeout of 0 means forever.
 */
int
lfs_segwait(fsid_t *fsidp, struct timeval *tv)
{
	struct mount *mntp;
	void *addr;
	u_long timeout;
	int error;

	if (fsidp == NULL || (mntp = vfs_getvfs(fsidp)) == NULL)
		addr = &lfs_allclean_wakeup;
	else
		addr = &VFSTOUFS(mntp)->um_lfs->lfs_nextseg;
	/*
	 * XXX THIS COULD SLEEP FOREVER IF TIMEOUT IS {0,0}!
	 * XXX IS THAT WHAT IS INTENDED?
	 */
	timeout = tvtohz(tv);
	error = tsleep(addr, PCATCH | PVFS, "segment", timeout);
	return (error == ERESTART ? EINTR : 0);
}

/*
 * sys_lfs_segwait:
 *
 * System call wrapper around lfs_segwait().
 *
 * 0 on success
 * 1 on timeout
 * -1/errno is returned on error.
 */
int
sys_lfs_segwait(struct lwp *l, const struct sys_lfs_segwait_args *uap, register_t *retval)
{
	/* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct timeval *) tv;
	} */
	struct timeval atv;
	fsid_t fsid;
	int error;

	/* XXX need we be su to segwait? */
	if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
	    NULL)) != 0)
		return (error);
	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if (SCARG(uap, tv)) {
		error = copyin(SCARG(uap, tv), &atv, sizeof(struct timeval));
		if (error)
			return (error);
		if (itimerfix(&atv))
			return (EINVAL);
	} else /* NULL or invalid */
		atv.tv_sec = atv.tv_usec = 0;
	return lfs_segwait(&fsid, &atv);
}

/*
 * VFS_VGET call specialized for the cleaner.  The cleaner already knows the
 * daddr from the ifile, so don't look it up again.  If the cleaner is
 * processing IINFO structures, it may have the ondisk inode already, so
 * don't go retrieving it again.
 *
 * we lfs_vref, and it is the caller's responsibility to lfs_vunref
 * when finished.
 */
extern kmutex_t ufs_hashlock;

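/*
 * Look the inode up in the ufs inode hash without touching the disk.
 * On a hit, take a reference with lfs_vref() and return the vnode in
 * *vpp; on a miss, *vpp is set to NULL.  Returns EAGAIN if the vnode
 * is being reclaimed or cannot be referenced right now.
 */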
int
lfs_fasthashget(dev_t dev, ino_t ino, struct vnode **vpp)
{
	struct vnode *vp;

	mutex_enter(&ufs_ihash_lock);
	if ((vp = ufs_ihashlookup(dev, ino)) != NULL) {
		mutex_enter(&vp->v_interlock);
		mutex_exit(&ufs_ihash_lock);
		if (vp->v_iflag & VI_XLOCK) {
			DLOG((DLOG_CLEAN, "lfs_fastvget: ino %d VI_XLOCK\n",
			      ino));
			lfs_stats.clean_vnlocked++;
			mutex_exit(&vp->v_interlock);
			return EAGAIN;
		}
		if (lfs_vref(vp)) {
			DLOG((DLOG_CLEAN, "lfs_fastvget: lfs_vref failed"
			      " for ino %d\n", ino));
			lfs_stats.clean_inlocked++;
			return EAGAIN;
		}
	} else {
		mutex_exit(&ufs_ihash_lock);
	}
	*vpp = vp;

	return (0);
}

int
lfs_fastvget(struct mount *mp, ino_t ino, daddr_t daddr, struct vnode **vpp,
	     struct ufs1_dinode *dinp)
{
	struct inode *ip;
	struct ufs1_dinode *dip;
	struct vnode *vp;
	struct ufsmount *ump;
	dev_t dev;
	int error, retries;
	struct buf *bp;
	struct lfs *fs;

	ump = VFSTOUFS(mp);
	dev = ump->um_dev;
	fs = ump->um_lfs;

	/*
	 * Wait until the filesystem is fully mounted before allowing vget
	 * to complete.	 This prevents possible problems with roll-forward.
	 */
	mutex_enter(&lfs_lock);
	while (fs->lfs_flags & LFS_NOTYET) {
		mtsleep(&fs->lfs_flags, PRIBIO+1, "lfs_fnotyet", 0,
			&lfs_lock);
	}
	mutex_exit(&lfs_lock);

	/*
	 * This is playing fast and loose.  Someone may have the inode
	 * locked, in which case they are going to be distinctly unhappy
	 * if we trash something.
	 */

	error = lfs_fasthashget(dev, ino, vpp);
	if (error != 0 || *vpp != NULL)
		return (error);

	/*
	 * getnewvnode(9) will call vfs_busy, which will block if the
	 * filesystem is being unmounted; but umount(9) is waiting for
	 * us because we're already holding the fs busy.
	 * XXXMP
	 */
	if (mp->mnt_iflag & IMNT_UNMOUNT) {
		*vpp = NULL;
		return EDEADLK;
	}
	if ((error = getnewvnode(VT_LFS, mp, lfs_vnodeop_p, &vp)) != 0) {
		*vpp = NULL;
		return (error);
	}

	mutex_enter(&ufs_hashlock);
	error = lfs_fasthashget(dev, ino, vpp);
	if (error != 0 || *vpp != NULL) {
		mutex_exit(&ufs_hashlock);
		ungetnewvnode(vp);
		return (error);
	}

	/* Allocate new vnode/inode. */
	lfs_vcreate(mp, ino, vp);

	/*
	 * Put it onto its hash chain and lock it so that other requests for
	 * this inode will block if they arrive while we are sleeping waiting
	 * for old data structures to be purged or for the contents of the
	 * disk portion of this inode to be read.
	 */
	ip = VTOI(vp);
	ufs_ihashins(ip);
	mutex_exit(&ufs_hashlock);

	/*
	 * XXX
	 * This may not need to be here, logically it should go down with
	 * the i_devvp initialization.
	 * Ask Kirk.
	 */
	ip->i_lfs = fs;

	/* Read in the disk contents for the inode, copy into the inode. */
	if (dinp) {
		error = copyin(dinp, ip->i_din.ffs1_din, sizeof (struct ufs1_dinode));
		if (error) {
			DLOG((DLOG_CLEAN, "lfs_fastvget: dinode copyin failed"
			      " for ino %d\n", ino));
			ufs_ihashrem(ip);

			/* Unlock and discard unneeded inode. */
			vlockmgr(&vp->v_lock, LK_RELEASE);
			lfs_vunref(vp);
			*vpp = NULL;
			return (error);
		}
		if (ip->i_number != ino)
			panic("lfs_fastvget: I was fed the wrong inode!");
	} else {
		retries = 0;
	    again:
		error = bread(ump->um_devvp, fsbtodb(fs, daddr), fs->lfs_ibsize,
			      NOCRED, &bp);
		if (error) {
			DLOG((DLOG_CLEAN, "lfs_fastvget: bread failed (%d)\n",
			      error));
			/*
			 * The inode does not contain anything useful, so it
			 * would be misleading to leave it on its hash chain.
			 * Iput() will return it to the free list.
			 */
			ufs_ihashrem(ip);

			/* Unlock and discard unneeded inode. */
			vlockmgr(&vp->v_lock, LK_RELEASE);
			lfs_vunref(vp);
			brelse(bp, 0);
			*vpp = NULL;
			return (error);
		}
		dip = lfs_ifind(ump->um_lfs, ino, bp);
		if (dip == NULL) {
			/* Assume write has not completed yet; try again */
			brelse(bp, BC_INVAL);
			++retries;
			if (retries > LFS_IFIND_RETRIES)
				panic("lfs_fastvget: dinode not found");
			DLOG((DLOG_CLEAN, "lfs_fastvget: dinode not found,"
			      " retrying...\n"));
			goto again;
		}
		*ip->i_din.ffs1_din = *dip;
		brelse(bp, 0);
	}
	lfs_vinit(mp, &vp);

	*vpp = vp;

	KASSERT(VOP_ISLOCKED(vp));
	VOP_UNLOCK(vp, 0);

	return (0);
}

/*
 * Make up a "fake" cleaner buffer, copy the data from userland into it.
 */
struct buf *
lfs_fakebuf(struct lfs *fs, struct vnode *vp, int lbn, size_t size, void *uaddr)
{
	struct buf *bp;
	int error;

	KASSERT(VTOI(vp)->i_number != LFS_IFILE_INUM);

	bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, size, LFS_NB_CLEAN);
	error = copyin(uaddr, bp->b_data, size);
	if (error) {
		lfs_freebuf(fs, bp);
		return NULL;
	}
	KDASSERT(bp->b_iodone == lfs_callback);

#if 0
	mutex_enter(&lfs_lock);
	++fs->lfs_iocount;
	mutex_exit(&lfs_lock);
#endif
	bp->b_bufsize = size;
	bp->b_bcount = size;
	return (bp);
}