/*	$NetBSD: lfs_syscalls.c,v 1.113 2006/05/14 21:32:45 elad Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Konrad E. Schroder <perseant (at) hhhh.org>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 1991, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)lfs_syscalls.c	8.10 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.113 2006/05/14 21:32:45 elad Exp $");

#ifndef LFS
# define LFS		/* for prototypes in syscallargs.h */
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/kernel.h>
#include <sys/kauth.h>

#include <sys/sa.h>
#include <sys/syscallargs.h>

#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_extern.h>

struct buf *lfs_fakebuf(struct lfs *, struct vnode *, int, size_t, caddr_t);
int lfs_fasthashget(dev_t, ino_t, struct vnode **);

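/* PID of the userland cleaner process, recorded by lfs_bmapv() below. */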
pid_t lfs_cleaner_pid = 0;

/*
 * sys_lfs_markv:
 *
 * This will mark inodes and blocks dirty, so they are written into the log.
 * It will block until all the blocks have been written.  The segment create
 * time passed in the block_info and inode_info structures is used to decide
 * if the data is valid for each block (in case some process dirtied a block
 * or inode that is being cleaned between the determination that a block is
 * live and the lfs_markv call).
 *
 *  0 on success
 * -1/errno is returned on error.
 */
#ifdef USE_64BIT_SYSCALLS
int
sys_lfs_markv(struct lwp *l, void *v, register_t *retval)
{
	struct sys_lfs_markv_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */ *uap = v;
	BLOCK_INFO *blkiov;
	struct proc *p = l->l_proc;
	int blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = kauth_authorize_generic(p->p_cred, KAUTH_GENERIC_ISSUSER,
	    &p->p_acflag)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOUFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
		return (EINVAL);

	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov,
			    blkcnt * sizeof(BLOCK_INFO))) != 0)
		goto out;

	if ((error = lfs_markv(p, &fsid, blkiov, blkcnt)) == 0)
		copyout(blkiov, SCARG(uap, blkiov),
			blkcnt * sizeof(BLOCK_INFO));
    out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	return error;
}
#else
int
sys_lfs_markv(struct lwp *l, void *v, register_t *retval)
{
	struct sys_lfs_markv_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */ *uap = v;
	BLOCK_INFO *blkiov;
	BLOCK_INFO_15 *blkiov15;
	struct proc *p = l->l_proc;
	int i, blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = kauth_authorize_generic(p->p_cred, KAUTH_GENERIC_ISSUSER,
	    &p->p_acflag)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOUFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
		return (EINVAL);

	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov15,
			    blkcnt * sizeof(BLOCK_INFO_15))) != 0)
		goto out;

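	/* Convert the cleaner's BLOCK_INFO_15 records into the kernel's BLOCK_INFO layout. */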
	for (i = 0; i < blkcnt; i++) {
		blkiov[i].bi_inode = blkiov15[i].bi_inode;
		blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
		blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
		blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
		blkiov[i].bi_version = blkiov15[i].bi_version;
		blkiov[i].bi_bp = blkiov15[i].bi_bp;
		blkiov[i].bi_size = blkiov15[i].bi_size;
	}

	if ((error = lfs_markv(p, &fsid, blkiov, blkcnt)) == 0) {
		for (i = 0; i < blkcnt; i++) {
			blkiov15[i].bi_inode = blkiov[i].bi_inode;
			blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
			blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
			blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
			blkiov15[i].bi_version = blkiov[i].bi_version;
			blkiov15[i].bi_bp = blkiov[i].bi_bp;
			blkiov15[i].bi_size = blkiov[i].bi_size;
		}
		copyout(blkiov15, SCARG(uap, blkiov),
			blkcnt * sizeof(BLOCK_INFO_15));
	}
    out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	lfs_free(fs, blkiov15, LFS_NB_BLKIOV);
	return error;
}
#endif

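/*
 * If lfs_markv() has dirtied this many blocks, flush a partial segment
 * before continuing, so that the cleaner cannot pin an unbounded number
 * of buffers (see the check at the bottom of the main loop below).
 */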
#define	LFS_MARKV_MAX_BLOCKS	(LFS_MAX_BUFS)

int
lfs_markv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
{
	BLOCK_INFO *blkp;
	IFILE *ifp;
	struct buf *bp;
	struct inode *ip = NULL;
	struct lfs *fs;
	struct mount *mntp;
	struct vnode *vp = NULL;
	ino_t lastino;
	daddr_t b_daddr, v_daddr;
	int cnt, error;
	int do_again = 0;
	int numrefed = 0;
	ino_t maxino;
	size_t obsize;

	/* number of blocks/inodes that we have already bwrite'ed */
	int nblkwritten, ninowritten;

	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		return (ENOENT);

	fs = VFSTOUFS(mntp)->um_lfs;

	if (fs->lfs_ronly)
		return EROFS;

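	/*
	 * Compute the largest inode number the Ifile can currently
	 * describe; it is used below to bounds-check the incoming
	 * block_info records.
	 */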
	maxino = (fragstoblks(fs, fsbtofrags(fs, VTOI(fs->lfs_ivnode)->i_ffs1_blocks)) -
		  fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb;

	cnt = blkcnt;

	if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0)
		return (error);

	/*
	 * Take the seglock, so that the blocks we are cleaning cannot
	 * become invalid while we may have to sleep.
	 *
	 * It is also important to note here that unless we specify SEGM_CKP,
	 * any Ifile blocks that we might be asked to clean will never get
	 * to the disk.
	 */
	lfs_seglock(fs, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);

	/* Mark blocks/inodes dirty.  */
	error = 0;

	/* these were inside the initialization for the for loop */
	v_daddr = LFS_UNUSED_DADDR;
	lastino = LFS_UNUSED_INUM;
	nblkwritten = ninowritten = 0;
	for (blkp = blkiov; cnt--; ++blkp)
	{
		/* Bounds-check incoming data, avoid panic for failed VGET */
		if (blkp->bi_inode <= 0 || blkp->bi_inode >= maxino) {
			error = EINVAL;
			goto err3;
		}
		/*
		 * Get the IFILE entry (only once) and see if the file still
		 * exists.
		 */
		if (lastino != blkp->bi_inode) {
			/*
			 * Finish the old file, if there was one.  The presence
			 * of a usable vnode in vp is signaled by a valid v_daddr.
			 */
			if (v_daddr != LFS_UNUSED_DADDR) {
				lfs_vunref(vp);
				numrefed--;
			}

			/*
			 * Start a new file
			 */
			lastino = blkp->bi_inode;
			if (blkp->bi_inode == LFS_IFILE_INUM)
				v_daddr = fs->lfs_idaddr;
			else {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				/* XXX fix for force write */
				v_daddr = ifp->if_daddr;
				brelse(bp);
			}
			if (v_daddr == LFS_UNUSED_DADDR)
				continue;

			/* Get the vnode/inode. */
			error = lfs_fastvget(mntp, blkp->bi_inode, v_daddr,
					     &vp,
					     (blkp->bi_lbn == LFS_UNUSED_LBN
					      ? blkp->bi_bp
					      : NULL));

			if (!error) {
				numrefed++;
			}
			if (error) {
				DLOG((DLOG_CLEAN, "lfs_markv: lfs_fastvget"
				      " failed with %d (ino %d, segment %d)\n",
				      error, blkp->bi_inode,
				      dtosn(fs, blkp->bi_daddr)));
				/*
				 * If we got EAGAIN, that means that the
				 * Inode was locked.  This is
				 * recoverable: just clean the rest of
				 * this segment, and let the cleaner try
				 * again with another.  (When the
				 * cleaner runs again, this segment will
				 * sort high on the list, since it is
				 * now almost entirely empty.)  But, we
				 * still set v_daddr = LFS_UNUSED_DADDR
				 * so as not to test this over and over
				 * again.
				 */
				if (error == EAGAIN) {
					error = 0;
					do_again++;
				}
#ifdef DIAGNOSTIC
				else if (error != ENOENT)
					panic("lfs_markv VFS_VGET FAILED");
#endif
				/* lastino = LFS_UNUSED_INUM; */
				v_daddr = LFS_UNUSED_DADDR;
				vp = NULL;
				ip = NULL;
				continue;
			}
			ip = VTOI(vp);
			ninowritten++;
		} else if (v_daddr == LFS_UNUSED_DADDR) {
			/*
			 * This can only happen if the vnode is dead (or
			 * in any case we can't get it...e.g., it is
			 * locked).  Keep going.
			 */
			continue;
		}

		/* Past this point we are guaranteed that vp, ip are valid. */

		/* If this BLOCK_INFO didn't contain a block, keep going. */
		if (blkp->bi_lbn == LFS_UNUSED_LBN) {
			/* XXX need to make sure that the inode gets written in this case */
			/* XXX but only write the inode if it's the right one */
			if (blkp->bi_inode != LFS_IFILE_INUM) {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				if (ifp->if_daddr == blkp->bi_daddr)
					LFS_SET_UINO(ip, IN_CLEANING);
				brelse(bp);
			}
			continue;
		}

		b_daddr = 0;
		if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &b_daddr, NULL) ||
		    dbtofsb(fs, b_daddr) != blkp->bi_daddr)
		{
			if (dtosn(fs, dbtofsb(fs, b_daddr)) ==
			    dtosn(fs, blkp->bi_daddr))
			{
				DLOG((DLOG_CLEAN, "lfs_markv: wrong da same seg: %llx vs %llx\n",
				      (long long)blkp->bi_daddr, (long long)dbtofsb(fs, b_daddr)));
			}
			do_again++;
			continue;
		}

		/*
		 * Check block sizes.  The blocks being cleaned come from
		 * disk, so they should have the same size as their on-disk
		 * counterparts.
		 */
		if (blkp->bi_lbn >= 0)
			obsize = blksize(fs, ip, blkp->bi_lbn);
		else
			obsize = fs->lfs_bsize;
		/* Check for fragment size change */
		if (blkp->bi_lbn >= 0 && blkp->bi_lbn < NDADDR) {
			obsize = ip->i_lfs_fragsize[blkp->bi_lbn];
		}
		if (obsize != blkp->bi_size) {
			DLOG((DLOG_CLEAN, "lfs_markv: ino %d lbn %lld wrong"
			      " size (%ld != %d), try again\n",
			      blkp->bi_inode, (long long)blkp->bi_lbn,
			      (long) obsize, blkp->bi_size));
			do_again++;
			continue;
		}

		/*
		 * If we get to here, then we are keeping the block.  If
		 * it is an indirect block, we want to actually put it
		 * in the buffer cache so that it can be updated in the
		 * finish_meta section.  If it's not, we need to
		 * allocate a fake buffer so that writeseg can perform
		 * the copyin and write the buffer.
		 */
		if (ip->i_number != LFS_IFILE_INUM && blkp->bi_lbn >= 0) {
			/* Data Block */
			bp = lfs_fakebuf(fs, vp, blkp->bi_lbn,
					 blkp->bi_size, blkp->bi_bp);
			/* Pretend we used bread() to get it */
			bp->b_blkno = fsbtodb(fs, blkp->bi_daddr);
		} else {
			/* Indirect block or ifile */
			if (blkp->bi_size != fs->lfs_bsize &&
			    ip->i_number != LFS_IFILE_INUM)
				panic("lfs_markv: partial indirect block?"
				      " size=%d\n", blkp->bi_size);
			bp = getblk(vp, blkp->bi_lbn, blkp->bi_size, 0, 0);
			if (!(bp->b_flags & (B_DONE|B_DELWRI))) { /* B_CACHE */
				/*
				 * The block in question was not found
				 * in the cache; i.e., the block that
				 * getblk() returned is empty.  So, we
				 * can (and should) copy in the
				 * contents, because we've already
				 * determined that this was the right
				 * version of this block on disk.
				 *
				 * And, it can't have changed underneath
				 * us, because we have the segment lock.
				 */
				error = copyin(blkp->bi_bp, bp->b_data, blkp->bi_size);
				if (error)
					goto err2;
			}
		}
		if ((error = lfs_bwrite_ext(bp, BW_CLEAN)) != 0)
			goto err2;

		nblkwritten++;
		/*
		 * XXX should account indirect blocks and ifile pages as well
		 */
		if (nblkwritten + lblkno(fs, ninowritten * sizeof (struct ufs1_dinode))
		    > LFS_MARKV_MAX_BLOCKS) {
			DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos\n",
			      nblkwritten, ninowritten));
			lfs_segwrite(mntp, SEGM_CLEAN);
			nblkwritten = ninowritten = 0;
		}
	}

	/*
	 * Finish the old file, if there was one
	 */
	if (v_daddr != LFS_UNUSED_DADDR) {
		lfs_vunref(vp);
		numrefed--;
	}

#ifdef DIAGNOSTIC
	if (numrefed != 0)
		panic("lfs_markv: numrefed=%d", numrefed);
#endif
	DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos (check point)\n",
	      nblkwritten, ninowritten));

	/*
	 * The last write has to be SEGM_SYNC, because of calling semantics.
	 * It also has to be SEGM_CKP, because otherwise we could write
	 * over the newly cleaned data contained in a checkpoint, and then
	 * we'd be unhappy at recovery time.
	 */
	lfs_segwrite(mntp, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);

	lfs_segunlock(fs);

	vfs_unbusy(mntp);
	if (error)
		return (error);
	else if (do_again)
		return EAGAIN;

	return 0;

err2:
	DLOG((DLOG_CLEAN, "lfs_markv err2\n"));

	/*
	 * XXX we're here because copyin() failed.
	 * XXX it means that we can't trust the cleanerd.  too bad.
	 * XXX how can we recover from this?
	 */

err3:
	/*
	 * XXX should do segwrite here anyway?
	 */

	if (v_daddr != LFS_UNUSED_DADDR) {
		lfs_vunref(vp);
		--numrefed;
	}

	lfs_segunlock(fs);
	vfs_unbusy(mntp);
#ifdef DIAGNOSTIC
	if (numrefed != 0)
		panic("lfs_markv: numrefed=%d", numrefed);
#endif

	return (error);
}

/*
 * sys_lfs_bmapv:
 *
 * This will fill in the current disk address for arrays of blocks.
 *
 *  0 on success
 * -1/errno is returned on error.
 */
#ifdef USE_64BIT_SYSCALLS
int
sys_lfs_bmapv(struct lwp *l, void *v, register_t *retval)
{
	struct sys_lfs_bmapv_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	BLOCK_INFO *blkiov;
	int blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = kauth_authorize_generic(p->p_cred, KAUTH_GENERIC_ISSUSER,
	    &p->p_acflag)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOUFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((u_int) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
		return (EINVAL);
	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov,
			    blkcnt * sizeof(BLOCK_INFO))) != 0)
		goto out;

	if ((error = lfs_bmapv(p, &fsid, blkiov, blkcnt)) == 0)
		copyout(blkiov, SCARG(uap, blkiov),
			blkcnt * sizeof(BLOCK_INFO));
    out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	return error;
}
#else
int
sys_lfs_bmapv(struct lwp *l, void *v, register_t *retval)
{
	struct sys_lfs_bmapv_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct block_info *) blkiov;
		syscallarg(int) blkcnt;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	BLOCK_INFO *blkiov;
	BLOCK_INFO_15 *blkiov15;
	int i, blkcnt, error;
	fsid_t fsid;
	struct lfs *fs;
	struct mount *mntp;

	if ((error = kauth_authorize_generic(p->p_cred, KAUTH_GENERIC_ISSUSER,
	    &p->p_acflag)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);
	fs = VFSTOUFS(mntp)->um_lfs;

	blkcnt = SCARG(uap, blkcnt);
	if ((size_t) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
		return (EINVAL);
	blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
	blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV);
	if ((error = copyin(SCARG(uap, blkiov), blkiov15,
			    blkcnt * sizeof(BLOCK_INFO_15))) != 0)
		goto out;

	for (i = 0; i < blkcnt; i++) {
		blkiov[i].bi_inode = blkiov15[i].bi_inode;
		blkiov[i].bi_lbn = blkiov15[i].bi_lbn;
		blkiov[i].bi_daddr = blkiov15[i].bi_daddr;
		blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
		blkiov[i].bi_version = blkiov15[i].bi_version;
		blkiov[i].bi_bp = blkiov15[i].bi_bp;
		blkiov[i].bi_size = blkiov15[i].bi_size;
	}

	if ((error = lfs_bmapv(p, &fsid, blkiov, blkcnt)) == 0) {
		for (i = 0; i < blkcnt; i++) {
			blkiov15[i].bi_inode = blkiov[i].bi_inode;
			blkiov15[i].bi_lbn = blkiov[i].bi_lbn;
			blkiov15[i].bi_daddr = blkiov[i].bi_daddr;
			blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
			blkiov15[i].bi_version = blkiov[i].bi_version;
			blkiov15[i].bi_bp = blkiov[i].bi_bp;
			blkiov15[i].bi_size = blkiov[i].bi_size;
		}
		copyout(blkiov15, SCARG(uap, blkiov),
			blkcnt * sizeof(BLOCK_INFO_15));
	}
    out:
	lfs_free(fs, blkiov, LFS_NB_BLKIOV);
	lfs_free(fs, blkiov15, LFS_NB_BLKIOV);
	return error;
}
#endif

int
lfs_bmapv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
{
	BLOCK_INFO *blkp;
	IFILE *ifp;
	struct buf *bp;
	struct inode *ip = NULL;
	struct lfs *fs;
	struct mount *mntp;
	struct ufsmount *ump;
	struct vnode *vp;
	ino_t lastino;
	daddr_t v_daddr;
	int cnt, error;
	int numrefed = 0;

	lfs_cleaner_pid = p->p_pid;

	if ((mntp = vfs_getvfs(fsidp)) == NULL)
		return (ENOENT);

	ump = VFSTOUFS(mntp);
	if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0)
		return (error);

	cnt = blkcnt;

	fs = VFSTOUFS(mntp)->um_lfs;

	error = 0;

	/* these were inside the initialization for the for loop */
	v_daddr = LFS_UNUSED_DADDR;
	lastino = LFS_UNUSED_INUM;
	for (blkp = blkiov; cnt--; ++blkp)
	{
		/*
		 * Get the IFILE entry (only once) and see if the file still
		 * exists.
		 */
		if (lastino != blkp->bi_inode) {
			/*
			 * Finish the old file, if there was one.  The presence
			 * of a usable vnode in vp is signaled by a valid
			 * v_daddr.
			 */
			if (v_daddr != LFS_UNUSED_DADDR) {
				lfs_vunref(vp);
				numrefed--;
			}

			/*
			 * Start a new file
			 */
			lastino = blkp->bi_inode;
			if (blkp->bi_inode == LFS_IFILE_INUM)
				v_daddr = fs->lfs_idaddr;
			else {
				LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
				v_daddr = ifp->if_daddr;
				brelse(bp);
			}
			if (v_daddr == LFS_UNUSED_DADDR) {
				blkp->bi_daddr = LFS_UNUSED_DADDR;
				continue;
			}
			/*
			 * A regular call to VFS_VGET could deadlock
			 * here.  Instead, we try an unlocked access.
			 */
			vp = ufs_ihashlookup(ump->um_dev, blkp->bi_inode);
			if (vp != NULL && !(vp->v_flag & VXLOCK)) {
				ip = VTOI(vp);
				if (lfs_vref(vp)) {
					v_daddr = LFS_UNUSED_DADDR;
					continue;
				}
				numrefed++;
			} else {
				/*
				 * Don't VFS_VGET if we're being unmounted,
				 * since we hold vfs_busy().
				 */
				if (mntp->mnt_iflag & IMNT_UNMOUNT) {
					v_daddr = LFS_UNUSED_DADDR;
					continue;
				}
				error = VFS_VGET(mntp, blkp->bi_inode, &vp);
				if (error) {
					DLOG((DLOG_CLEAN, "lfs_bmapv: vget ino"
					      " %d failed with %d\n",
					      blkp->bi_inode, error));
					v_daddr = LFS_UNUSED_DADDR;
					continue;
				} else {
					KASSERT(VOP_ISLOCKED(vp));
					VOP_UNLOCK(vp, 0);
					numrefed++;
				}
			}
			ip = VTOI(vp);
		} else if (v_daddr == LFS_UNUSED_DADDR) {
			/*
			 * This can only happen if the vnode is dead.
			 * Keep going.  Note that we DO NOT set the
			 * bi_addr to anything -- if we failed to get
			 * the vnode, for example, we want to assume
			 * conservatively that all of its blocks *are*
			 * located in the segment in question.
			 * lfs_markv will throw them out if we are
			 * wrong.
			 */
			/* blkp->bi_daddr = LFS_UNUSED_DADDR; */
			continue;
		}

		/* Past this point we are guaranteed that vp, ip are valid. */

		if (blkp->bi_lbn == LFS_UNUSED_LBN) {
			/*
			 * We just want the inode address, which is
			 * conveniently in v_daddr.
			 */
			blkp->bi_daddr = v_daddr;
		} else {
			daddr_t bi_daddr;

			/* XXX ondisk32 */
			error = VOP_BMAP(vp, blkp->bi_lbn, NULL,
					 &bi_daddr, NULL);
			if (error)
			{
				blkp->bi_daddr = LFS_UNUSED_DADDR;
				continue;
			}
			blkp->bi_daddr = dbtofsb(fs, bi_daddr);
			/* Fill in the block size, too */
			if (blkp->bi_lbn >= 0)
				blkp->bi_size = blksize(fs, ip, blkp->bi_lbn);
			else
				blkp->bi_size = fs->lfs_bsize;
		}
	}

	/*
	 * Finish the old file, if there was one.  The presence
	 * of a usable vnode in vp is signaled by a valid v_daddr.
	 */
	if (v_daddr != LFS_UNUSED_DADDR) {
		lfs_vunref(vp);
		numrefed--;
	}

#ifdef DIAGNOSTIC
	if (numrefed != 0)
		panic("lfs_bmapv: numrefed=%d", numrefed);
#endif

	vfs_unbusy(mntp);

	return 0;
}

/*
 * sys_lfs_segclean:
 *
 * Mark the segment clean.
 *
 *  0 on success
 * -1/errno is returned on error.
 */
int
sys_lfs_segclean(struct lwp *l, void *v, register_t *retval)
{
	struct sys_lfs_segclean_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(u_long) segment;
	} */ *uap = v;
	struct lfs *fs;
	struct mount *mntp;
	fsid_t fsid;
	int error;
	unsigned long segnum;
	struct proc *p = l->l_proc;

	if ((error = kauth_authorize_generic(p->p_cred, KAUTH_GENERIC_ISSUSER,
	    &p->p_acflag)) != 0)
		return (error);

	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);
	if ((mntp = vfs_getvfs(&fsid)) == NULL)
		return (ENOENT);

	fs = VFSTOUFS(mntp)->um_lfs;
	segnum = SCARG(uap, segment);

	if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0)
		return (error);

	lfs_seglock(fs, SEGM_PROT);
	error = lfs_do_segclean(fs, segnum);
	lfs_segunlock(fs);
	vfs_unbusy(mntp);
	return error;
}

/*
 * Actually mark the segment clean.
 * Must be called with the segment lock held.
 */
int
lfs_do_segclean(struct lfs *fs, unsigned long segnum)
{
	extern int lfs_dostats;
	struct buf *bp;
	CLEANERINFO *cip;
	SEGUSE *sup;

	if (dtosn(fs, fs->lfs_curseg) == segnum) {
		return (EBUSY);
	}

	LFS_SEGENTRY(sup, fs, segnum, bp);
	if (sup->su_nbytes) {
		DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
		      " %d live bytes\n", segnum, sup->su_nbytes));
		brelse(bp);
		return (EBUSY);
	}
	if (sup->su_flags & SEGUSE_ACTIVE) {
		DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
		      " segment is active\n", segnum));
		brelse(bp);
		return (EBUSY);
	}
	if (!(sup->su_flags & SEGUSE_DIRTY)) {
		DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
		      " segment is already clean\n", segnum));
		brelse(bp);
		return (EALREADY);
	}

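	/*
	 * Return the segment's space to the free and available counts,
	 * allowing for any superblock or disk-label padding that still
	 * occupies part of it.
	 */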
	fs->lfs_avail += segtod(fs, 1);
	if (sup->su_flags & SEGUSE_SUPERBLOCK)
		fs->lfs_avail -= btofsb(fs, LFS_SBPAD);
	if (fs->lfs_version > 1 && segnum == 0 &&
	    fs->lfs_start < btofsb(fs, LFS_LABELPAD))
		fs->lfs_avail -= btofsb(fs, LFS_LABELPAD) - fs->lfs_start;
	simple_lock(&fs->lfs_interlock);
	fs->lfs_bfree += sup->su_nsums * btofsb(fs, fs->lfs_sumsize) +
		btofsb(fs, sup->su_ninos * fs->lfs_ibsize);
	fs->lfs_dmeta -= sup->su_nsums * btofsb(fs, fs->lfs_sumsize) +
		btofsb(fs, sup->su_ninos * fs->lfs_ibsize);
	if (fs->lfs_dmeta < 0)
		fs->lfs_dmeta = 0;
	simple_unlock(&fs->lfs_interlock);
	sup->su_flags &= ~SEGUSE_DIRTY;
	LFS_WRITESEGENTRY(sup, fs, segnum, bp);

	LFS_CLEANERINFO(cip, fs, bp);
	++cip->clean;
	--cip->dirty;
	fs->lfs_nclean = cip->clean;
	cip->bfree = fs->lfs_bfree;
	simple_lock(&fs->lfs_interlock);
	cip->avail = fs->lfs_avail - fs->lfs_ravail - fs->lfs_favail;
	wakeup(&fs->lfs_avail);
	simple_unlock(&fs->lfs_interlock);
	(void) LFS_BWRITE_LOG(bp);

	if (lfs_dostats)
		++lfs_stats.segs_reclaimed;

	return (0);
}

/*
 * This will block until a segment in file system fsid is written.  A timeout
 * in milliseconds may be specified, which will wake the cleaner automatically.
 * An fsid of -1 means any file system, and a timeout of 0 means forever.
 */
int
lfs_segwait(fsid_t *fsidp, struct timeval *tv)
{
	struct mount *mntp;
	void *addr;
	u_long timeout;
	int error, s;

	if (fsidp == NULL || (mntp = vfs_getvfs(fsidp)) == NULL)
		addr = &lfs_allclean_wakeup;
	else
		addr = &VFSTOUFS(mntp)->um_lfs->lfs_nextseg;
	/*
	 * XXX THIS COULD SLEEP FOREVER IF TIMEOUT IS {0,0}!
	 * XXX IS THAT WHAT IS INTENDED?
	 */
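	/* Convert the relative timeout to an absolute time, then to ticks. */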
	s = splclock();
	timeradd(tv, &time, tv);
	timeout = hzto(tv);
	splx(s);
	error = tsleep(addr, PCATCH | PVFS, "segment", timeout);
	return (error == ERESTART ? EINTR : 0);
}

/*
 * sys_lfs_segwait:
 *
 * System call wrapper around lfs_segwait().
 *
 *  0 on success
 *  1 on timeout
 * -1/errno is returned on error.
 */
int
sys_lfs_segwait(struct lwp *l, void *v, register_t *retval)
{
	struct sys_lfs_segwait_args /* {
		syscallarg(fsid_t *) fsidp;
		syscallarg(struct timeval *) tv;
	} */ *uap = v;
	struct proc *p = l->l_proc;
	struct timeval atv;
	fsid_t fsid;
	int error;

	/* XXX need we be su to segwait? */
	if ((error = kauth_authorize_generic(p->p_cred, KAUTH_GENERIC_ISSUSER,
	    &p->p_acflag)) != 0) {
		return (error);
	}
	if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
		return (error);

	if (SCARG(uap, tv)) {
		error = copyin(SCARG(uap, tv), &atv, sizeof(struct timeval));
		if (error)
			return (error);
		if (itimerfix(&atv))
			return (EINVAL);
	} else /* NULL or invalid */
		atv.tv_sec = atv.tv_usec = 0;
	return lfs_segwait(&fsid, &atv);
}

/*
 * VFS_VGET call specialized for the cleaner.  The cleaner already knows the
 * daddr from the ifile, so don't look it up again.  If the cleaner is
 * processing IINFO structures, it may have the ondisk inode already, so
 * don't go retrieving it again.
 *
 * We lfs_vref() the vnode here; it is the caller's responsibility to
 * lfs_vunref() it when finished.
 */
extern struct lock ufs_hashlock;

int
lfs_fasthashget(dev_t dev, ino_t ino, struct vnode **vpp)
{
	if ((*vpp = ufs_ihashlookup(dev, ino)) != NULL) {
		if ((*vpp)->v_flag & VXLOCK) {
			DLOG((DLOG_CLEAN, "lfs_fastvget: ino %d VXLOCK\n",
			      ino));
			lfs_stats.clean_vnlocked++;
			return EAGAIN;
		}
		if (lfs_vref(*vpp)) {
			DLOG((DLOG_CLEAN, "lfs_fastvget: lfs_vref failed"
			      " for ino %d\n", ino));
			lfs_stats.clean_inlocked++;
			return EAGAIN;
		}
	} else
		*vpp = NULL;

	return (0);
}

int
lfs_fastvget(struct mount *mp, ino_t ino, daddr_t daddr, struct vnode **vpp, struct ufs1_dinode *dinp)
{
	struct inode *ip;
	struct ufs1_dinode *dip;
	struct vnode *vp;
	struct ufsmount *ump;
	dev_t dev;
	int error, retries;
	struct buf *bp;
	struct lfs *fs;

	ump = VFSTOUFS(mp);
	dev = ump->um_dev;
	fs = ump->um_lfs;

	/*
	 * Wait until the filesystem is fully mounted before allowing vget
	 * to complete.  This prevents possible problems with roll-forward.
	 */
	simple_lock(&fs->lfs_interlock);
	while (fs->lfs_flags & LFS_NOTYET) {
		ltsleep(&fs->lfs_flags, PRIBIO+1, "lfs_fnotyet", 0,
			&fs->lfs_interlock);
	}
	simple_unlock(&fs->lfs_interlock);

	/*
	 * This is playing fast and loose.  Someone may have the inode
	 * locked, in which case they are going to be distinctly unhappy
	 * if we trash something.
	 */

	error = lfs_fasthashget(dev, ino, vpp);
	if (error != 0 || *vpp != NULL)
		return (error);

	/*
	 * getnewvnode(9) will call vfs_busy, which will block if the
	 * filesystem is being unmounted; but umount(9) is waiting for
	 * us because we're already holding the fs busy.
	 * XXXMP
	 */
	if (mp->mnt_iflag & IMNT_UNMOUNT) {
		*vpp = NULL;
		return EDEADLK;
	}
	if ((error = getnewvnode(VT_LFS, mp, lfs_vnodeop_p, &vp)) != 0) {
		*vpp = NULL;
		return (error);
	}

	do {
		error = lfs_fasthashget(dev, ino, vpp);
		if (error != 0 || *vpp != NULL) {
			ungetnewvnode(vp);
			return (error);
		}
	} while (lockmgr(&ufs_hashlock, LK_EXCLUSIVE|LK_SLEEPFAIL, 0));

	/* Allocate new vnode/inode. */
	lfs_vcreate(mp, ino, vp);

	/*
	 * Put it onto its hash chain and lock it so that other requests for
	 * this inode will block if they arrive while we are sleeping waiting
	 * for old data structures to be purged or for the contents of the
	 * disk portion of this inode to be read.
	 */
	ip = VTOI(vp);
	ufs_ihashins(ip);
	lockmgr(&ufs_hashlock, LK_RELEASE, 0);

	/*
	 * XXX
	 * This may not need to be here, logically it should go down with
	 * the i_devvp initialization.
	 * Ask Kirk.
	 */
	ip->i_lfs = fs;

	/* Read in the disk contents for the inode, copy into the inode. */
	if (dinp) {
		error = copyin(dinp, ip->i_din.ffs1_din, sizeof (struct ufs1_dinode));
		if (error) {
			DLOG((DLOG_CLEAN, "lfs_fastvget: dinode copyin failed"
			      " for ino %d\n", ino));
			ufs_ihashrem(ip);

			/* Unlock and discard unneeded inode. */
			lockmgr(&vp->v_lock, LK_RELEASE, &vp->v_interlock);
			lfs_vunref(vp);
			*vpp = NULL;
			return (error);
		}
		if (ip->i_number != ino)
			panic("lfs_fastvget: I was fed the wrong inode!");
	} else {
		retries = 0;
	    again:
		error = bread(ump->um_devvp, fsbtodb(fs, daddr), fs->lfs_ibsize,
			      NOCRED, &bp);
		if (error) {
			DLOG((DLOG_CLEAN, "lfs_fastvget: bread failed (%d)\n",
			      error));
			/*
			 * The inode does not contain anything useful, so it
			 * would be misleading to leave it on its hash chain.
			 * Iput() will return it to the free list.
			 */
			ufs_ihashrem(ip);

			/* Unlock and discard unneeded inode. */
			lockmgr(&vp->v_lock, LK_RELEASE, &vp->v_interlock);
			lfs_vunref(vp);
			brelse(bp);
			*vpp = NULL;
			return (error);
		}
		dip = lfs_ifind(ump->um_lfs, ino, bp);
		if (dip == NULL) {
			/* Assume write has not completed yet; try again */
			bp->b_flags |= B_INVAL;
			brelse(bp);
			++retries;
			if (retries > LFS_IFIND_RETRIES)
				panic("lfs_fastvget: dinode not found");
			DLOG((DLOG_CLEAN, "lfs_fastvget: dinode not found,"
			      " retrying...\n"));
			goto again;
		}
		*ip->i_din.ffs1_din = *dip;
		brelse(bp);
	}
	lfs_vinit(mp, &vp);

	*vpp = vp;

	KASSERT(VOP_ISLOCKED(vp));
	VOP_UNLOCK(vp, 0);

	return (0);
}

/*
 * Make up a "fake" cleaner buffer, copy the data from userland into it.
 */
struct buf *
lfs_fakebuf(struct lfs *fs, struct vnode *vp, int lbn, size_t size, caddr_t uaddr)
{
	struct buf *bp;
	int error;

	KASSERT(VTOI(vp)->i_number != LFS_IFILE_INUM);

	bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, size, LFS_NB_CLEAN);
	error = copyin(uaddr, bp->b_data, size);
	if (error) {
		lfs_freebuf(fs, bp);
		return NULL;
	}
	KDASSERT(bp->b_iodone == lfs_callback);

#if 0
	simple_lock(&fs->lfs_interlock);
	++fs->lfs_iocount;
	simple_unlock(&fs->lfs_interlock);
#endif
	bp->b_bufsize = size;
	bp->b_bcount = size;
	return (bp);
}
