/* $NetBSD: lfs_vnops.c,v 1.195 2007/01/03 02:42:23 perseant Exp $ */
2
3 /*-
4 * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Konrad E. Schroder <perseant (at) hhhh.org>.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the NetBSD
21 * Foundation, Inc. and its contributors.
22 * 4. Neither the name of The NetBSD Foundation nor the names of its
23 * contributors may be used to endorse or promote products derived
24 * from this software without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
37 */
38 /*
39 * Copyright (c) 1986, 1989, 1991, 1993, 1995
40 * The Regents of the University of California. All rights reserved.
41 *
42 * Redistribution and use in source and binary forms, with or without
43 * modification, are permitted provided that the following conditions
44 * are met:
45 * 1. Redistributions of source code must retain the above copyright
46 * notice, this list of conditions and the following disclaimer.
47 * 2. Redistributions in binary form must reproduce the above copyright
48 * notice, this list of conditions and the following disclaimer in the
49 * documentation and/or other materials provided with the distribution.
50 * 3. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)lfs_vnops.c 8.13 (Berkeley) 6/10/95
67 */
68
69 #include <sys/cdefs.h>
70 __KERNEL_RCSID(0, "$NetBSD: lfs_vnops.c,v 1.195 2007/01/03 02:42:23 perseant Exp $");
71
72 #ifdef _KERNEL_OPT
73 #include "opt_compat_netbsd.h"
74 #endif
75
76 #include <sys/param.h>
77 #include <sys/systm.h>
78 #include <sys/namei.h>
79 #include <sys/resourcevar.h>
80 #include <sys/kernel.h>
81 #include <sys/file.h>
82 #include <sys/stat.h>
83 #include <sys/buf.h>
84 #include <sys/proc.h>
85 #include <sys/mount.h>
86 #include <sys/vnode.h>
87 #include <sys/pool.h>
88 #include <sys/signalvar.h>
89 #include <sys/kauth.h>
90 #include <sys/syslog.h>
91
92 #include <miscfs/fifofs/fifo.h>
93 #include <miscfs/genfs/genfs.h>
94 #include <miscfs/specfs/specdev.h>
95
96 #include <ufs/ufs/inode.h>
97 #include <ufs/ufs/dir.h>
98 #include <ufs/ufs/ufsmount.h>
99 #include <ufs/ufs/ufs_extern.h>
100
101 #include <uvm/uvm.h>
102 #include <uvm/uvm_pmap.h>
103 #include <uvm/uvm_stat.h>
104 #include <uvm/uvm_pager.h>
105
106 #include <ufs/lfs/lfs.h>
107 #include <ufs/lfs/lfs_extern.h>
108
109 extern pid_t lfs_writer_daemon;
110
111 /* Global vfs data structures for lfs. */
112 int (**lfs_vnodeop_p)(void *);
113 const struct vnodeopv_entry_desc lfs_vnodeop_entries[] = {
114 { &vop_default_desc, vn_default_error },
115 { &vop_lookup_desc, ufs_lookup }, /* lookup */
116 { &vop_create_desc, lfs_create }, /* create */
117 { &vop_whiteout_desc, ufs_whiteout }, /* whiteout */
118 { &vop_mknod_desc, lfs_mknod }, /* mknod */
119 { &vop_open_desc, ufs_open }, /* open */
120 { &vop_close_desc, lfs_close }, /* close */
121 { &vop_access_desc, ufs_access }, /* access */
122 { &vop_getattr_desc, lfs_getattr }, /* getattr */
123 { &vop_setattr_desc, lfs_setattr }, /* setattr */
124 { &vop_read_desc, lfs_read }, /* read */
125 { &vop_write_desc, lfs_write }, /* write */
126 { &vop_lease_desc, ufs_lease_check }, /* lease */
127 { &vop_ioctl_desc, ufs_ioctl }, /* ioctl */
128 { &vop_fcntl_desc, lfs_fcntl }, /* fcntl */
129 { &vop_poll_desc, ufs_poll }, /* poll */
130 { &vop_kqfilter_desc, genfs_kqfilter }, /* kqfilter */
131 { &vop_revoke_desc, ufs_revoke }, /* revoke */
132 { &vop_mmap_desc, lfs_mmap }, /* mmap */
133 { &vop_fsync_desc, lfs_fsync }, /* fsync */
134 { &vop_seek_desc, ufs_seek }, /* seek */
135 { &vop_remove_desc, lfs_remove }, /* remove */
136 { &vop_link_desc, lfs_link }, /* link */
137 { &vop_rename_desc, lfs_rename }, /* rename */
138 { &vop_mkdir_desc, lfs_mkdir }, /* mkdir */
139 { &vop_rmdir_desc, lfs_rmdir }, /* rmdir */
140 { &vop_symlink_desc, lfs_symlink }, /* symlink */
141 { &vop_readdir_desc, ufs_readdir }, /* readdir */
142 { &vop_readlink_desc, ufs_readlink }, /* readlink */
143 { &vop_abortop_desc, ufs_abortop }, /* abortop */
144 { &vop_inactive_desc, lfs_inactive }, /* inactive */
145 { &vop_reclaim_desc, lfs_reclaim }, /* reclaim */
146 { &vop_lock_desc, ufs_lock }, /* lock */
147 { &vop_unlock_desc, ufs_unlock }, /* unlock */
148 { &vop_bmap_desc, ufs_bmap }, /* bmap */
149 { &vop_strategy_desc, lfs_strategy }, /* strategy */
150 { &vop_print_desc, ufs_print }, /* print */
151 { &vop_islocked_desc, ufs_islocked }, /* islocked */
152 { &vop_pathconf_desc, ufs_pathconf }, /* pathconf */
153 { &vop_advlock_desc, ufs_advlock }, /* advlock */
154 { &vop_bwrite_desc, lfs_bwrite }, /* bwrite */
155 { &vop_getpages_desc, lfs_getpages }, /* getpages */
156 { &vop_putpages_desc, lfs_putpages }, /* putpages */
157 { NULL, NULL }
158 };
159 const struct vnodeopv_desc lfs_vnodeop_opv_desc =
160 { &lfs_vnodeop_p, lfs_vnodeop_entries };
161
162 int (**lfs_specop_p)(void *);
163 const struct vnodeopv_entry_desc lfs_specop_entries[] = {
164 { &vop_default_desc, vn_default_error },
165 { &vop_lookup_desc, spec_lookup }, /* lookup */
166 { &vop_create_desc, spec_create }, /* create */
167 { &vop_mknod_desc, spec_mknod }, /* mknod */
168 { &vop_open_desc, spec_open }, /* open */
169 { &vop_close_desc, lfsspec_close }, /* close */
170 { &vop_access_desc, ufs_access }, /* access */
171 { &vop_getattr_desc, lfs_getattr }, /* getattr */
172 { &vop_setattr_desc, lfs_setattr }, /* setattr */
173 { &vop_read_desc, ufsspec_read }, /* read */
174 { &vop_write_desc, ufsspec_write }, /* write */
175 { &vop_lease_desc, spec_lease_check }, /* lease */
176 { &vop_ioctl_desc, spec_ioctl }, /* ioctl */
177 { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */
178 { &vop_poll_desc, spec_poll }, /* poll */
179 { &vop_kqfilter_desc, spec_kqfilter }, /* kqfilter */
180 { &vop_revoke_desc, spec_revoke }, /* revoke */
181 { &vop_mmap_desc, spec_mmap }, /* mmap */
182 { &vop_fsync_desc, spec_fsync }, /* fsync */
183 { &vop_seek_desc, spec_seek }, /* seek */
184 { &vop_remove_desc, spec_remove }, /* remove */
185 { &vop_link_desc, spec_link }, /* link */
186 { &vop_rename_desc, spec_rename }, /* rename */
187 { &vop_mkdir_desc, spec_mkdir }, /* mkdir */
188 { &vop_rmdir_desc, spec_rmdir }, /* rmdir */
189 { &vop_symlink_desc, spec_symlink }, /* symlink */
190 { &vop_readdir_desc, spec_readdir }, /* readdir */
191 { &vop_readlink_desc, spec_readlink }, /* readlink */
192 { &vop_abortop_desc, spec_abortop }, /* abortop */
193 { &vop_inactive_desc, lfs_inactive }, /* inactive */
194 { &vop_reclaim_desc, lfs_reclaim }, /* reclaim */
195 { &vop_lock_desc, ufs_lock }, /* lock */
196 { &vop_unlock_desc, ufs_unlock }, /* unlock */
197 { &vop_bmap_desc, spec_bmap }, /* bmap */
198 { &vop_strategy_desc, spec_strategy }, /* strategy */
199 { &vop_print_desc, ufs_print }, /* print */
200 { &vop_islocked_desc, ufs_islocked }, /* islocked */
201 { &vop_pathconf_desc, spec_pathconf }, /* pathconf */
202 { &vop_advlock_desc, spec_advlock }, /* advlock */
203 { &vop_bwrite_desc, vn_bwrite }, /* bwrite */
204 { &vop_getpages_desc, spec_getpages }, /* getpages */
205 { &vop_putpages_desc, spec_putpages }, /* putpages */
206 { NULL, NULL }
207 };
208 const struct vnodeopv_desc lfs_specop_opv_desc =
209 { &lfs_specop_p, lfs_specop_entries };
210
211 int (**lfs_fifoop_p)(void *);
212 const struct vnodeopv_entry_desc lfs_fifoop_entries[] = {
213 { &vop_default_desc, vn_default_error },
214 { &vop_lookup_desc, fifo_lookup }, /* lookup */
215 { &vop_create_desc, fifo_create }, /* create */
216 { &vop_mknod_desc, fifo_mknod }, /* mknod */
217 { &vop_open_desc, fifo_open }, /* open */
218 { &vop_close_desc, lfsfifo_close }, /* close */
219 { &vop_access_desc, ufs_access }, /* access */
220 { &vop_getattr_desc, lfs_getattr }, /* getattr */
221 { &vop_setattr_desc, lfs_setattr }, /* setattr */
222 { &vop_read_desc, ufsfifo_read }, /* read */
223 { &vop_write_desc, ufsfifo_write }, /* write */
224 { &vop_lease_desc, fifo_lease_check }, /* lease */
225 { &vop_ioctl_desc, fifo_ioctl }, /* ioctl */
226 { &vop_fcntl_desc, ufs_fcntl }, /* fcntl */
227 { &vop_poll_desc, fifo_poll }, /* poll */
228 { &vop_kqfilter_desc, fifo_kqfilter }, /* kqfilter */
229 { &vop_revoke_desc, fifo_revoke }, /* revoke */
230 { &vop_mmap_desc, fifo_mmap }, /* mmap */
231 { &vop_fsync_desc, fifo_fsync }, /* fsync */
232 { &vop_seek_desc, fifo_seek }, /* seek */
233 { &vop_remove_desc, fifo_remove }, /* remove */
234 { &vop_link_desc, fifo_link }, /* link */
235 { &vop_rename_desc, fifo_rename }, /* rename */
236 { &vop_mkdir_desc, fifo_mkdir }, /* mkdir */
237 { &vop_rmdir_desc, fifo_rmdir }, /* rmdir */
238 { &vop_symlink_desc, fifo_symlink }, /* symlink */
239 { &vop_readdir_desc, fifo_readdir }, /* readdir */
240 { &vop_readlink_desc, fifo_readlink }, /* readlink */
241 { &vop_abortop_desc, fifo_abortop }, /* abortop */
242 { &vop_inactive_desc, lfs_inactive }, /* inactive */
243 { &vop_reclaim_desc, lfs_reclaim }, /* reclaim */
244 { &vop_lock_desc, ufs_lock }, /* lock */
245 { &vop_unlock_desc, ufs_unlock }, /* unlock */
246 { &vop_bmap_desc, fifo_bmap }, /* bmap */
247 { &vop_strategy_desc, fifo_strategy }, /* strategy */
248 { &vop_print_desc, ufs_print }, /* print */
249 { &vop_islocked_desc, ufs_islocked }, /* islocked */
250 { &vop_pathconf_desc, fifo_pathconf }, /* pathconf */
251 { &vop_advlock_desc, fifo_advlock }, /* advlock */
252 { &vop_bwrite_desc, lfs_bwrite }, /* bwrite */
253 { &vop_putpages_desc, fifo_putpages }, /* putpages */
254 { NULL, NULL }
255 };
256 const struct vnodeopv_desc lfs_fifoop_opv_desc =
257 { &lfs_fifoop_p, lfs_fifoop_entries };
258
259 static int check_dirty(struct lfs *, struct vnode *, off_t, off_t, off_t, int, int);
260
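/*
 * Descriptive note (added): lfs_read() and lfs_write() are generated by
 * including the shared UFS read/write code below with LFS_READWRITE
 * defined, rather than being written out in this file.
 */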
261 #define LFS_READWRITE
262 #include <ufs/ufs/ufs_readwrite.c>
263 #undef LFS_READWRITE
264
265 /*
266 * Synch an open file.
267 */
268 /* ARGSUSED */
269 int
270 lfs_fsync(void *v)
271 {
272 struct vop_fsync_args /* {
273 struct vnode *a_vp;
274 kauth_cred_t a_cred;
275 int a_flags;
off_t a_offlo;
off_t a_offhi;
278 struct lwp *a_l;
279 } */ *ap = v;
280 struct vnode *vp = ap->a_vp;
281 int error, wait;
282
283 /* If we're mounted read-only, don't try to sync. */
284 if (VTOI(vp)->i_lfs->lfs_ronly)
285 return 0;
286
287 /*
288 * Trickle sync checks for need to do a checkpoint after possible
289 * activity from the pagedaemon.
290 */
291 if (ap->a_flags & FSYNC_LAZY) {
292 simple_lock(&lfs_subsys_lock);
293 wakeup(&lfs_writer_daemon);
294 simple_unlock(&lfs_subsys_lock);
295 return 0;
296 }
297
298 /*
* If a vnode is being cleaned, flush it out before we try to
300 * reuse it. This prevents the cleaner from writing files twice
301 * in the same partial segment, causing an accounting underflow.
302 */
if (ap->a_flags & FSYNC_RECLAIM && VTOI(vp)->i_flag & IN_CLEANING) {
304 /* printf("avoiding VONWORKLIST panic\n"); */
305 lfs_vflush(vp);
306 }
307
308 wait = (ap->a_flags & FSYNC_WAIT);
309 simple_lock(&vp->v_interlock);
310 error = VOP_PUTPAGES(vp, trunc_page(ap->a_offlo),
311 round_page(ap->a_offhi),
312 PGO_CLEANIT | (wait ? PGO_SYNCIO : 0));
313 if (error)
314 return error;
315 error = lfs_update(vp, NULL, NULL, wait ? UPDATE_WAIT : 0);
316 if (error == 0 && ap->a_flags & FSYNC_CACHE) {
317 int l = 0;
318 error = VOP_IOCTL(VTOI(vp)->i_devvp, DIOCCACHESYNC, &l, FWRITE,
319 ap->a_l->l_cred, ap->a_l);
320 }
321 if (wait && !VPISEMPTY(vp))
322 LFS_SET_UINO(VTOI(vp), IN_MODIFIED);
323
324 return error;
325 }
326
327 /*
328 * Take IN_ADIROP off, then call ufs_inactive.
329 */
330 int
331 lfs_inactive(void *v)
332 {
333 struct vop_inactive_args /* {
334 struct vnode *a_vp;
335 struct lwp *a_l;
336 } */ *ap = v;
337
338 KASSERT(VTOI(ap->a_vp)->i_nlink == VTOI(ap->a_vp)->i_ffs_effnlink);
339
340 lfs_unmark_vnode(ap->a_vp);
341
342 /*
343 * The Ifile is only ever inactivated on unmount.
344 * Streamline this process by not giving it more dirty blocks.
345 */
346 if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM) {
347 LFS_CLR_UINO(VTOI(ap->a_vp), IN_ALLMOD);
348 VOP_UNLOCK(ap->a_vp, 0);
349 return 0;
350 }
351
352 return ufs_inactive(v);
353 }
354
355 /*
356 * These macros are used to bracket UFS directory ops, so that we can
357 * identify all the pages touched during directory ops which need to
358 * be ordered and flushed atomically, so that they may be recovered.
359 *
360 * Because we have to mark nodes VDIROP in order to prevent
361 * the cache from reclaiming them while a dirop is in progress, we must
362 * also manage the number of nodes so marked (otherwise we can run out).
363 * We do this by setting lfs_dirvcount to the number of marked vnodes; it
364 * is decremented during segment write, when VDIROP is taken off.
365 */
366 #define MARK_VNODE(vp) lfs_mark_vnode(vp)
367 #define UNMARK_VNODE(vp) lfs_unmark_vnode(vp)
368 #define SET_DIROP_CREATE(dvp, vpp) lfs_set_dirop_create((dvp), (vpp))
369 #define SET_DIROP_REMOVE(dvp, vp) lfs_set_dirop((dvp), (vp))
370 static int lfs_set_dirop_create(struct vnode *, struct vnode **);
371 static int lfs_set_dirop(struct vnode *, struct vnode *);
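
/*
 * Illustrative sketch (not part of the original source): each directory
 * operation below brackets its UFS counterpart with the macros above,
 * roughly as follows ("xxx" stands for create, mkdir, symlink, ...):
 *
 *	if ((error = SET_DIROP_CREATE(dvp, vpp)) != 0) {
 *		vput(dvp);
 *		return error;
 *	}
 *	error = ufs_xxx(ap);
 *	SET_ENDOP_CREATE_AP(ap, "xxx");
 *	return error;
 */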
372
373 static int
374 lfs_set_dirop(struct vnode *dvp, struct vnode *vp)
375 {
376 struct lfs *fs;
377 int error;
378
379 KASSERT(VOP_ISLOCKED(dvp));
380 KASSERT(vp == NULL || VOP_ISLOCKED(vp));
381
382 fs = VTOI(dvp)->i_lfs;
383
384 ASSERT_NO_SEGLOCK(fs);
385 /*
386 * LFS_NRESERVE calculates direct and indirect blocks as well
387 * as an inode block; an overestimate in most cases.
388 */
389 if ((error = lfs_reserve(fs, dvp, vp, LFS_NRESERVE(fs))) != 0)
390 return (error);
391
392 restart:
393 simple_lock(&fs->lfs_interlock);
394 if (fs->lfs_dirops == 0) {
395 simple_unlock(&fs->lfs_interlock);
396 lfs_check(dvp, LFS_UNUSED_LBN, 0);
397 simple_lock(&fs->lfs_interlock);
398 }
399 while (fs->lfs_writer) {
400 error = ltsleep(&fs->lfs_dirops, (PRIBIO + 1) | PCATCH,
401 "lfs_sdirop", 0, &fs->lfs_interlock);
402 if (error == EINTR) {
403 simple_unlock(&fs->lfs_interlock);
404 goto unreserve;
405 }
406 }
407 simple_lock(&lfs_subsys_lock);
408 if (lfs_dirvcount > LFS_MAX_DIROP && fs->lfs_dirops == 0) {
409 wakeup(&lfs_writer_daemon);
410 simple_unlock(&lfs_subsys_lock);
411 simple_unlock(&fs->lfs_interlock);
412 preempt(1);
413 goto restart;
414 }
415
416 if (lfs_dirvcount > LFS_MAX_DIROP) {
417 simple_unlock(&fs->lfs_interlock);
418 DLOG((DLOG_DIROP, "lfs_set_dirop: sleeping with dirops=%d, "
419 "dirvcount=%d\n", fs->lfs_dirops, lfs_dirvcount));
420 if ((error = ltsleep(&lfs_dirvcount,
421 PCATCH | PUSER | PNORELOCK, "lfs_maxdirop", 0,
422 &lfs_subsys_lock)) != 0) {
423 goto unreserve;
424 }
425 goto restart;
426 }
427 simple_unlock(&lfs_subsys_lock);
428
429 ++fs->lfs_dirops;
430 fs->lfs_doifile = 1;
431 simple_unlock(&fs->lfs_interlock);
432
433 /* Hold a reference so SET_ENDOP will be happy */
434 vref(dvp);
435 if (vp) {
436 vref(vp);
437 MARK_VNODE(vp);
438 }
439
440 MARK_VNODE(dvp);
441 return 0;
442
443 unreserve:
444 lfs_reserve(fs, dvp, vp, -LFS_NRESERVE(fs));
445 return error;
446 }
447
448 /*
449 * Get a new vnode *before* adjusting the dirop count, to avoid a deadlock
450 * in getnewvnode(), if we have a stacked filesystem mounted on top
451 * of us.
452 *
453 * NB: this means we have to clear the new vnodes on error. Fortunately
454 * SET_ENDOP is there to do that for us.
455 */
456 static int
457 lfs_set_dirop_create(struct vnode *dvp, struct vnode **vpp)
458 {
459 int error;
460 struct lfs *fs;
461
462 fs = VFSTOUFS(dvp->v_mount)->um_lfs;
463 ASSERT_NO_SEGLOCK(fs);
464 if (fs->lfs_ronly)
465 return EROFS;
466 if (vpp && (error = getnewvnode(VT_LFS, dvp->v_mount, lfs_vnodeop_p, vpp))) {
467 DLOG((DLOG_ALLOC, "lfs_set_dirop_create: dvp %p error %d\n",
468 dvp, error));
469 return error;
470 }
471 if ((error = lfs_set_dirop(dvp, NULL)) != 0) {
472 if (vpp) {
473 ungetnewvnode(*vpp);
474 *vpp = NULL;
475 }
476 return error;
477 }
478 return 0;
479 }
480
481 #define SET_ENDOP_BASE(fs, dvp, str) \
482 do { \
483 simple_lock(&(fs)->lfs_interlock); \
484 --(fs)->lfs_dirops; \
485 if (!(fs)->lfs_dirops) { \
486 if ((fs)->lfs_nadirop) { \
487 panic("SET_ENDOP: %s: no dirops but " \
488 " nadirop=%d", (str), \
489 (fs)->lfs_nadirop); \
490 } \
491 wakeup(&(fs)->lfs_writer); \
492 simple_unlock(&(fs)->lfs_interlock); \
493 lfs_check((dvp), LFS_UNUSED_LBN, 0); \
494 } else \
495 simple_unlock(&(fs)->lfs_interlock); \
496 } while(0)
497 #define SET_ENDOP_CREATE(fs, dvp, nvpp, str) \
498 do { \
499 UNMARK_VNODE(dvp); \
500 if (nvpp && *nvpp) \
501 UNMARK_VNODE(*nvpp); \
502 /* Check for error return to stem vnode leakage */ \
503 if (nvpp && *nvpp && !((*nvpp)->v_flag & VDIROP)) \
504 ungetnewvnode(*(nvpp)); \
505 SET_ENDOP_BASE((fs), (dvp), (str)); \
506 lfs_reserve((fs), (dvp), NULL, -LFS_NRESERVE(fs)); \
507 vrele(dvp); \
508 } while(0)
509 #define SET_ENDOP_CREATE_AP(ap, str) \
510 SET_ENDOP_CREATE(VTOI((ap)->a_dvp)->i_lfs, (ap)->a_dvp, \
511 (ap)->a_vpp, (str))
512 #define SET_ENDOP_REMOVE(fs, dvp, ovp, str) \
513 do { \
514 UNMARK_VNODE(dvp); \
515 if (ovp) \
516 UNMARK_VNODE(ovp); \
517 SET_ENDOP_BASE((fs), (dvp), (str)); \
518 lfs_reserve((fs), (dvp), (ovp), -LFS_NRESERVE(fs)); \
519 vrele(dvp); \
520 if (ovp) \
521 vrele(ovp); \
522 } while(0)
523
524 void
525 lfs_mark_vnode(struct vnode *vp)
526 {
527 struct inode *ip = VTOI(vp);
528 struct lfs *fs = ip->i_lfs;
529
530 simple_lock(&fs->lfs_interlock);
531 if (!(ip->i_flag & IN_ADIROP)) {
532 if (!(vp->v_flag & VDIROP)) {
533 (void)lfs_vref(vp);
534 simple_lock(&lfs_subsys_lock);
535 ++lfs_dirvcount;
536 ++fs->lfs_dirvcount;
537 simple_unlock(&lfs_subsys_lock);
538 TAILQ_INSERT_TAIL(&fs->lfs_dchainhd, ip, i_lfs_dchain);
539 vp->v_flag |= VDIROP;
540 }
541 ++fs->lfs_nadirop;
542 ip->i_flag |= IN_ADIROP;
543 } else
544 KASSERT(vp->v_flag & VDIROP);
545 simple_unlock(&fs->lfs_interlock);
546 }
547
548 void
549 lfs_unmark_vnode(struct vnode *vp)
550 {
551 struct inode *ip = VTOI(vp);
552
553 if (ip && (ip->i_flag & IN_ADIROP)) {
554 KASSERT(vp->v_flag & VDIROP);
555 simple_lock(&ip->i_lfs->lfs_interlock);
556 --ip->i_lfs->lfs_nadirop;
557 simple_unlock(&ip->i_lfs->lfs_interlock);
558 ip->i_flag &= ~IN_ADIROP;
559 }
560 }
561
562 int
563 lfs_symlink(void *v)
564 {
565 struct vop_symlink_args /* {
566 struct vnode *a_dvp;
567 struct vnode **a_vpp;
568 struct componentname *a_cnp;
569 struct vattr *a_vap;
570 char *a_target;
571 } */ *ap = v;
572 int error;
573
574 if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) {
575 vput(ap->a_dvp);
576 return error;
577 }
578 error = ufs_symlink(ap);
579 SET_ENDOP_CREATE_AP(ap, "symlink");
580 return (error);
581 }
582
583 int
584 lfs_mknod(void *v)
585 {
586 struct vop_mknod_args /* {
587 struct vnode *a_dvp;
588 struct vnode **a_vpp;
589 struct componentname *a_cnp;
590 struct vattr *a_vap;
591 } */ *ap = v;
592 struct vattr *vap = ap->a_vap;
593 struct vnode **vpp = ap->a_vpp;
594 struct inode *ip;
595 int error;
596 struct mount *mp;
597 ino_t ino;
598
599 if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) {
600 vput(ap->a_dvp);
601 return error;
602 }
603 error = ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode),
604 ap->a_dvp, vpp, ap->a_cnp);
605
606 /* Either way we're done with the dirop at this point */
607 SET_ENDOP_CREATE_AP(ap, "mknod");
608
609 if (error)
610 return (error);
611
612 ip = VTOI(*vpp);
613 mp = (*vpp)->v_mount;
614 ino = ip->i_number;
615 ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
616 if (vap->va_rdev != VNOVAL) {
617 /*
618 * Want to be able to use this to make badblock
619 * inodes, so don't truncate the dev number.
620 */
621 #if 0
622 ip->i_ffs1_rdev = ufs_rw32(vap->va_rdev,
623 UFS_MPNEEDSWAP((*vpp)->v_mount));
624 #else
625 ip->i_ffs1_rdev = vap->va_rdev;
626 #endif
627 }
628
629 /*
630 * Call fsync to write the vnode so that we don't have to deal with
631 * flushing it when it's marked VDIROP|VXLOCK.
632 *
633 * XXX KS - If we can't flush we also can't call vgone(), so must
634 * return. But, that leaves this vnode in limbo, also not good.
635 * Can this ever happen (barring hardware failure)?
636 */
637 if ((error = VOP_FSYNC(*vpp, NOCRED, FSYNC_WAIT, 0, 0,
638 curlwp)) != 0) {
639 panic("lfs_mknod: couldn't fsync (ino %llu)",
640 (unsigned long long)ino);
641 /* return (error); */
642 }
643 /*
644 * Remove vnode so that it will be reloaded by VFS_VGET and
645 * checked to see if it is an alias of an existing entry in
646 * the inode cache.
647 */
648 /* Used to be vput, but that causes us to call VOP_INACTIVE twice. */
649
650 VOP_UNLOCK(*vpp, 0);
651 lfs_vunref(*vpp);
652 (*vpp)->v_type = VNON;
653 vgone(*vpp);
654 error = VFS_VGET(mp, ino, vpp);
655
656 if (error != 0) {
657 *vpp = NULL;
658 return (error);
659 }
660 return (0);
661 }
662
663 int
664 lfs_create(void *v)
665 {
666 struct vop_create_args /* {
667 struct vnode *a_dvp;
668 struct vnode **a_vpp;
669 struct componentname *a_cnp;
670 struct vattr *a_vap;
671 } */ *ap = v;
672 int error;
673
674 if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) {
675 vput(ap->a_dvp);
676 return error;
677 }
678 error = ufs_create(ap);
679 SET_ENDOP_CREATE_AP(ap, "create");
680 return (error);
681 }
682
683 int
684 lfs_mkdir(void *v)
685 {
686 struct vop_mkdir_args /* {
687 struct vnode *a_dvp;
688 struct vnode **a_vpp;
689 struct componentname *a_cnp;
690 struct vattr *a_vap;
691 } */ *ap = v;
692 int error;
693
694 if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) {
695 vput(ap->a_dvp);
696 return error;
697 }
698 error = ufs_mkdir(ap);
699 SET_ENDOP_CREATE_AP(ap, "mkdir");
700 return (error);
701 }
702
703 int
704 lfs_remove(void *v)
705 {
706 struct vop_remove_args /* {
707 struct vnode *a_dvp;
708 struct vnode *a_vp;
709 struct componentname *a_cnp;
710 } */ *ap = v;
711 struct vnode *dvp, *vp;
712 struct inode *ip;
713 int error;
714
715 dvp = ap->a_dvp;
716 vp = ap->a_vp;
717 ip = VTOI(vp);
718 if ((error = SET_DIROP_REMOVE(dvp, vp)) != 0) {
719 if (dvp == vp)
720 vrele(vp);
721 else
722 vput(vp);
723 vput(dvp);
724 return error;
725 }
726 error = ufs_remove(ap);
727 if (ip->i_nlink == 0)
728 lfs_orphan(ip->i_lfs, ip->i_number);
729 SET_ENDOP_REMOVE(ip->i_lfs, dvp, ap->a_vp, "remove");
730 return (error);
731 }
732
733 int
734 lfs_rmdir(void *v)
735 {
736 struct vop_rmdir_args /* {
737 struct vnodeop_desc *a_desc;
738 struct vnode *a_dvp;
739 struct vnode *a_vp;
740 struct componentname *a_cnp;
741 } */ *ap = v;
742 struct vnode *vp;
743 struct inode *ip;
744 int error;
745
746 vp = ap->a_vp;
747 ip = VTOI(vp);
748 if ((error = SET_DIROP_REMOVE(ap->a_dvp, ap->a_vp)) != 0) {
749 if (ap->a_dvp == vp)
750 vrele(ap->a_dvp);
751 else
752 vput(ap->a_dvp);
753 vput(vp);
754 return error;
755 }
756 error = ufs_rmdir(ap);
757 if (ip->i_nlink == 0)
758 lfs_orphan(ip->i_lfs, ip->i_number);
759 SET_ENDOP_REMOVE(ip->i_lfs, ap->a_dvp, ap->a_vp, "rmdir");
760 return (error);
761 }
762
763 int
764 lfs_link(void *v)
765 {
766 struct vop_link_args /* {
767 struct vnode *a_dvp;
768 struct vnode *a_vp;
769 struct componentname *a_cnp;
770 } */ *ap = v;
771 int error;
772 struct vnode **vpp = NULL;
773
774 if ((error = SET_DIROP_CREATE(ap->a_dvp, vpp)) != 0) {
775 vput(ap->a_dvp);
776 return error;
777 }
778 error = ufs_link(ap);
779 SET_ENDOP_CREATE(VTOI(ap->a_dvp)->i_lfs, ap->a_dvp, vpp, "link");
780 return (error);
781 }
782
783 int
784 lfs_rename(void *v)
785 {
786 struct vop_rename_args /* {
787 struct vnode *a_fdvp;
788 struct vnode *a_fvp;
789 struct componentname *a_fcnp;
790 struct vnode *a_tdvp;
791 struct vnode *a_tvp;
792 struct componentname *a_tcnp;
793 } */ *ap = v;
794 struct vnode *tvp, *fvp, *tdvp, *fdvp;
795 struct componentname *tcnp, *fcnp;
796 int error;
797 struct lfs *fs;
798
799 fs = VTOI(ap->a_fdvp)->i_lfs;
800 tvp = ap->a_tvp;
801 tdvp = ap->a_tdvp;
802 tcnp = ap->a_tcnp;
803 fvp = ap->a_fvp;
804 fdvp = ap->a_fdvp;
805 fcnp = ap->a_fcnp;
806
807 /*
808 * Check for cross-device rename.
809 * If it is, we don't want to set dirops, just error out.
810 * (In particular note that MARK_VNODE(tdvp) will DTWT on
811 * a cross-device rename.)
812 *
813 * Copied from ufs_rename.
814 */
815 if ((fvp->v_mount != tdvp->v_mount) ||
816 (tvp && (fvp->v_mount != tvp->v_mount))) {
817 error = EXDEV;
818 goto errout;
819 }
820
821 /*
822 * Check to make sure we're not renaming a vnode onto itself
823 * (deleting a hard link by renaming one name onto another);
824 * if we are we can't recursively call VOP_REMOVE since that
825 * would leave us with an unaccounted-for number of live dirops.
826 *
827 * Inline the relevant section of ufs_rename here, *before*
828 * calling SET_DIROP_REMOVE.
829 */
830 if (tvp && ((VTOI(tvp)->i_flags & (IMMUTABLE | APPEND)) ||
831 (VTOI(tdvp)->i_flags & APPEND))) {
832 error = EPERM;
833 goto errout;
834 }
835 if (fvp == tvp) {
836 if (fvp->v_type == VDIR) {
837 error = EINVAL;
838 goto errout;
839 }
840
841 /* Release destination completely. */
842 VOP_ABORTOP(tdvp, tcnp);
843 vput(tdvp);
844 vput(tvp);
845
846 /* Delete source. */
847 vrele(fvp);
848 fcnp->cn_flags &= ~(MODMASK | SAVESTART);
849 fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
850 fcnp->cn_nameiop = DELETE;
851 vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY);
852 if ((error = relookup(fdvp, &fvp, fcnp))) {
853 vput(fdvp);
854 return (error);
855 }
856 return (VOP_REMOVE(fdvp, fvp, fcnp));
857 }
858
859 if ((error = SET_DIROP_REMOVE(tdvp, tvp)) != 0)
860 goto errout;
861 MARK_VNODE(fdvp);
862 MARK_VNODE(fvp);
863
864 error = ufs_rename(ap);
865 UNMARK_VNODE(fdvp);
866 UNMARK_VNODE(fvp);
867 SET_ENDOP_REMOVE(fs, tdvp, tvp, "rename");
868 return (error);
869
870 errout:
871 VOP_ABORTOP(tdvp, ap->a_tcnp); /* XXX, why not in NFS? */
872 if (tdvp == tvp)
873 vrele(tdvp);
874 else
875 vput(tdvp);
876 if (tvp)
877 vput(tvp);
878 VOP_ABORTOP(fdvp, ap->a_fcnp); /* XXX, why not in NFS? */
879 vrele(fdvp);
880 vrele(fvp);
881 return (error);
882 }
883
884 /* XXX hack to avoid calling ITIMES in getattr */
885 int
886 lfs_getattr(void *v)
887 {
888 struct vop_getattr_args /* {
889 struct vnode *a_vp;
890 struct vattr *a_vap;
891 kauth_cred_t a_cred;
892 struct lwp *a_l;
893 } */ *ap = v;
894 struct vnode *vp = ap->a_vp;
895 struct inode *ip = VTOI(vp);
896 struct vattr *vap = ap->a_vap;
897 struct lfs *fs = ip->i_lfs;
898 /*
899 * Copy from inode table
900 */
901 vap->va_fsid = ip->i_dev;
902 vap->va_fileid = ip->i_number;
903 vap->va_mode = ip->i_mode & ~IFMT;
904 vap->va_nlink = ip->i_nlink;
905 vap->va_uid = ip->i_uid;
906 vap->va_gid = ip->i_gid;
907 vap->va_rdev = (dev_t)ip->i_ffs1_rdev;
908 vap->va_size = vp->v_size;
909 vap->va_atime.tv_sec = ip->i_ffs1_atime;
910 vap->va_atime.tv_nsec = ip->i_ffs1_atimensec;
911 vap->va_mtime.tv_sec = ip->i_ffs1_mtime;
912 vap->va_mtime.tv_nsec = ip->i_ffs1_mtimensec;
913 vap->va_ctime.tv_sec = ip->i_ffs1_ctime;
914 vap->va_ctime.tv_nsec = ip->i_ffs1_ctimensec;
915 vap->va_flags = ip->i_flags;
916 vap->va_gen = ip->i_gen;
917 /* this doesn't belong here */
918 if (vp->v_type == VBLK)
919 vap->va_blocksize = BLKDEV_IOSIZE;
920 else if (vp->v_type == VCHR)
921 vap->va_blocksize = MAXBSIZE;
922 else
923 vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize;
924 vap->va_bytes = fsbtob(fs, (u_quad_t)ip->i_lfs_effnblks);
925 vap->va_type = vp->v_type;
926 vap->va_filerev = ip->i_modrev;
927 return (0);
928 }
929
930 /*
931 * Check to make sure the inode blocks won't choke the buffer
932 * cache, then call ufs_setattr as usual.
933 */
934 int
935 lfs_setattr(void *v)
936 {
937 struct vop_setattr_args /* {
938 struct vnode *a_vp;
939 struct vattr *a_vap;
940 kauth_cred_t a_cred;
941 struct lwp *a_l;
942 } */ *ap = v;
943 struct vnode *vp = ap->a_vp;
944
945 lfs_check(vp, LFS_UNUSED_LBN, 0);
946 return ufs_setattr(v);
947 }
948
949 /*
950 * Release the block we hold on lfs_newseg wrapping. Called on file close,
951 * or explicitly from LFCNWRAPGO. Called with the interlock held.
952 */
953 static int
954 lfs_wrapgo(struct lfs *fs, struct inode *ip, int waitfor)
955 {
956 if (lockstatus(&fs->lfs_stoplock) != LK_EXCLUSIVE)
957 return EBUSY;
958
959 lockmgr(&fs->lfs_stoplock, LK_RELEASE, &fs->lfs_interlock);
960
961 KASSERT(fs->lfs_nowrap > 0);
962 if (fs->lfs_nowrap <= 0) {
963 return 0;
964 }
965
966 if (--fs->lfs_nowrap == 0) {
967 log(LOG_NOTICE, "%s: re-enabled log wrap\n", fs->lfs_fsmnt);
968 wakeup(&fs->lfs_wrappass);
969 lfs_wakeup_cleaner(fs);
970 }
971 if (waitfor) {
972 ltsleep(&fs->lfs_nextseg, PCATCH | PUSER,
973 "segment", 0, &fs->lfs_interlock);
974 }
975
976 return 0;
977 }
978
979 /*
980 * Close called
981 */
982 /* ARGSUSED */
983 int
984 lfs_close(void *v)
985 {
986 struct vop_close_args /* {
987 struct vnode *a_vp;
988 int a_fflag;
989 kauth_cred_t a_cred;
990 struct lwp *a_l;
991 } */ *ap = v;
992 struct vnode *vp = ap->a_vp;
993 struct inode *ip = VTOI(vp);
994 struct lfs *fs = ip->i_lfs;
995
996 if ((ip->i_number == ROOTINO || ip->i_number == LFS_IFILE_INUM) &&
997 lockstatus(&fs->lfs_stoplock) == LK_EXCLUSIVE) {
998 simple_lock(&fs->lfs_interlock);
999 log(LOG_NOTICE, "lfs_close: releasing log wrap control\n");
1000 lfs_wrapgo(fs, ip, 0);
1001 simple_unlock(&fs->lfs_interlock);
1002 }
1003
1004 if (vp == ip->i_lfs->lfs_ivnode &&
1005 vp->v_mount->mnt_iflag & IMNT_UNMOUNT)
1006 return 0;
1007
1008 if (vp->v_usecount > 1 && vp != ip->i_lfs->lfs_ivnode) {
1009 LFS_ITIMES(ip, NULL, NULL, NULL);
1010 }
1011 return (0);
1012 }
1013
1014 /*
1015 * Close wrapper for special devices.
1016 *
1017 * Update the times on the inode then do device close.
1018 */
1019 int
1020 lfsspec_close(void *v)
1021 {
1022 struct vop_close_args /* {
1023 struct vnode *a_vp;
1024 int a_fflag;
1025 kauth_cred_t a_cred;
1026 struct lwp *a_l;
1027 } */ *ap = v;
1028 struct vnode *vp;
1029 struct inode *ip;
1030
1031 vp = ap->a_vp;
1032 ip = VTOI(vp);
1033 if (vp->v_usecount > 1) {
1034 LFS_ITIMES(ip, NULL, NULL, NULL);
1035 }
1036 return (VOCALL (spec_vnodeop_p, VOFFSET(vop_close), ap));
1037 }
1038
1039 /*
* Close wrapper for fifos.
1041 *
1042 * Update the times on the inode then do device close.
1043 */
1044 int
1045 lfsfifo_close(void *v)
1046 {
1047 struct vop_close_args /* {
1048 struct vnode *a_vp;
1049 int a_fflag;
kauth_cred_t a_cred;
1051 struct lwp *a_l;
1052 } */ *ap = v;
1053 struct vnode *vp;
1054 struct inode *ip;
1055
1056 vp = ap->a_vp;
1057 ip = VTOI(vp);
1058 if (ap->a_vp->v_usecount > 1) {
1059 LFS_ITIMES(ip, NULL, NULL, NULL);
1060 }
1061 return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_close), ap));
1062 }
1063
1064 /*
1065 * Reclaim an inode so that it can be used for other purposes.
1066 */
1067
1068 int
1069 lfs_reclaim(void *v)
1070 {
1071 struct vop_reclaim_args /* {
1072 struct vnode *a_vp;
1073 struct lwp *a_l;
1074 } */ *ap = v;
1075 struct vnode *vp = ap->a_vp;
1076 struct inode *ip = VTOI(vp);
1077 int error;
1078
1079 KASSERT(ip->i_nlink == ip->i_ffs_effnlink);
1080
1081 LFS_CLR_UINO(ip, IN_ALLMOD);
1082 if ((error = ufs_reclaim(vp, ap->a_l)))
1083 return (error);
1084 pool_put(&lfs_dinode_pool, ip->i_din.ffs1_din);
1085 lfs_deregister_all(vp);
1086 pool_put(&lfs_inoext_pool, ip->inode_ext.lfs);
1087 ip->inode_ext.lfs = NULL;
1088 pool_put(&lfs_inode_pool, vp->v_data);
1089 vp->v_data = NULL;
1090 return (0);
1091 }
1092
1093 /*
1094 * Read a block from a storage device.
1095 * In order to avoid reading blocks that are in the process of being
1096 * written by the cleaner---and hence are not mutexed by the normal
1097 * buffer cache / page cache mechanisms---check for collisions before
1098 * reading.
1099 *
1100 * We inline ufs_strategy to make sure that the VOP_BMAP occurs *before*
1101 * the active cleaner test.
1102 *
1103 * XXX This code assumes that lfs_markv makes synchronous checkpoints.
1104 */
1105 int
1106 lfs_strategy(void *v)
1107 {
1108 struct vop_strategy_args /* {
1109 struct vnode *a_vp;
1110 struct buf *a_bp;
1111 } */ *ap = v;
1112 struct buf *bp;
1113 struct lfs *fs;
1114 struct vnode *vp;
1115 struct inode *ip;
1116 daddr_t tbn;
1117 int i, sn, error, slept;
1118
1119 bp = ap->a_bp;
1120 vp = ap->a_vp;
1121 ip = VTOI(vp);
1122 fs = ip->i_lfs;
1123
1124 /* lfs uses its strategy routine only for read */
1125 KASSERT(bp->b_flags & B_READ);
1126
1127 if (vp->v_type == VBLK || vp->v_type == VCHR)
1128 panic("lfs_strategy: spec");
1129 KASSERT(bp->b_bcount != 0);
1130 if (bp->b_blkno == bp->b_lblkno) {
1131 error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,
1132 NULL);
1133 if (error) {
1134 bp->b_error = error;
1135 bp->b_flags |= B_ERROR;
1136 biodone(bp);
1137 return (error);
1138 }
1139 if ((long)bp->b_blkno == -1) /* no valid data */
1140 clrbuf(bp);
1141 }
1142 if ((long)bp->b_blkno < 0) { /* block is not on disk */
1143 biodone(bp);
1144 return (0);
1145 }
1146
1147 slept = 1;
1148 simple_lock(&fs->lfs_interlock);
1149 while (slept && fs->lfs_seglock) {
1150 simple_unlock(&fs->lfs_interlock);
1151 /*
1152 * Look through list of intervals.
1153 * There will only be intervals to look through
1154 * if the cleaner holds the seglock.
1155 * Since the cleaner is synchronous, we can trust
1156 * the list of intervals to be current.
1157 */
1158 tbn = dbtofsb(fs, bp->b_blkno);
1159 sn = dtosn(fs, tbn);
1160 slept = 0;
1161 for (i = 0; i < fs->lfs_cleanind; i++) {
1162 if (sn == dtosn(fs, fs->lfs_cleanint[i]) &&
1163 tbn >= fs->lfs_cleanint[i]) {
1164 DLOG((DLOG_CLEAN,
1165 "lfs_strategy: ino %d lbn %" PRId64
1166 " ind %d sn %d fsb %" PRIx32
1167 " given sn %d fsb %" PRIx64 "\n",
1168 ip->i_number, bp->b_lblkno, i,
1169 dtosn(fs, fs->lfs_cleanint[i]),
1170 fs->lfs_cleanint[i], sn, tbn));
1171 DLOG((DLOG_CLEAN,
1172 "lfs_strategy: sleeping on ino %d lbn %"
1173 PRId64 "\n", ip->i_number, bp->b_lblkno));
1174 simple_lock(&fs->lfs_interlock);
1175 if (LFS_SEGLOCK_HELD(fs) && fs->lfs_iocount) {
1176 /* Cleaner can't wait for itself */
1177 ltsleep(&fs->lfs_iocount,
1178 (PRIBIO + 1) | PNORELOCK,
1179 "clean2", 0,
1180 &fs->lfs_interlock);
1181 slept = 1;
1182 break;
1183 } else if (fs->lfs_seglock) {
1184 ltsleep(&fs->lfs_seglock,
1185 (PRIBIO + 1) | PNORELOCK,
1186 "clean1", 0,
1187 &fs->lfs_interlock);
1188 slept = 1;
1189 break;
1190 }
1191 simple_unlock(&fs->lfs_interlock);
1192 }
1193 }
1194 simple_lock(&fs->lfs_interlock);
1195 }
1196 simple_unlock(&fs->lfs_interlock);
1197
1198 vp = ip->i_devvp;
1199 VOP_STRATEGY(vp, bp);
1200 return (0);
1201 }
1202
1203 void
1204 lfs_flush_dirops(struct lfs *fs)
1205 {
1206 struct inode *ip, *nip;
1207 struct vnode *vp;
1208 extern int lfs_dostats;
1209 struct segment *sp;
1210 int waslocked;
1211
1212 ASSERT_MAYBE_SEGLOCK(fs);
1213 KASSERT(fs->lfs_nadirop == 0);
1214
1215 if (fs->lfs_ronly)
1216 return;
1217
1218 simple_lock(&fs->lfs_interlock);
1219 if (TAILQ_FIRST(&fs->lfs_dchainhd) == NULL) {
1220 simple_unlock(&fs->lfs_interlock);
1221 return;
1222 } else
1223 simple_unlock(&fs->lfs_interlock);
1224
1225 if (lfs_dostats)
1226 ++lfs_stats.flush_invoked;
1227
1228 /*
1229 * Inline lfs_segwrite/lfs_writevnodes, but just for dirops.
1230 * Technically this is a checkpoint (the on-disk state is valid)
1231 * even though we are leaving out all the file data.
1232 */
1233 lfs_imtime(fs);
1234 lfs_seglock(fs, SEGM_CKP);
1235 sp = fs->lfs_sp;
1236
1237 /*
1238 * lfs_writevnodes, optimized to get dirops out of the way.
1239 * Only write dirops, and don't flush files' pages, only
1240 * blocks from the directories.
1241 *
1242 * We don't need to vref these files because they are
1243 * dirops and so hold an extra reference until the
1244 * segunlock clears them of that status.
1245 *
1246 * We don't need to check for IN_ADIROP because we know that
1247 * no dirops are active.
1248 *
1249 */
1250 simple_lock(&fs->lfs_interlock);
1251 for (ip = TAILQ_FIRST(&fs->lfs_dchainhd); ip != NULL; ip = nip) {
1252 nip = TAILQ_NEXT(ip, i_lfs_dchain);
1253 simple_unlock(&fs->lfs_interlock);
1254 vp = ITOV(ip);
1255
1256 KASSERT((ip->i_flag & IN_ADIROP) == 0);
1257
1258 /*
1259 * All writes to directories come from dirops; all
1260 * writes to files' direct blocks go through the page
1261 * cache, which we're not touching. Reads to files
1262 * and/or directories will not be affected by writing
1263 * directory blocks inodes and file inodes. So we don't
1264 * really need to lock. If we don't lock, though,
1265 * make sure that we don't clear IN_MODIFIED
1266 * unnecessarily.
1267 */
1268 if (vp->v_flag & (VXLOCK | VFREEING)) {
1269 simple_lock(&fs->lfs_interlock);
1270 continue;
1271 }
1272 waslocked = VOP_ISLOCKED(vp);
1273 if (vp->v_type != VREG &&
1274 ((ip->i_flag & IN_ALLMOD) || !VPISEMPTY(vp))) {
1275 lfs_writefile(fs, sp, vp);
1276 if (!VPISEMPTY(vp) && !WRITEINPROG(vp) &&
1277 !(ip->i_flag & IN_ALLMOD)) {
1278 LFS_SET_UINO(ip, IN_MODIFIED);
1279 }
1280 }
1281 KDASSERT(ip->i_number != LFS_IFILE_INUM);
1282 (void) lfs_writeinode(fs, sp, ip);
1283 if (waslocked == LK_EXCLOTHER)
1284 LFS_SET_UINO(ip, IN_MODIFIED);
1285 simple_lock(&fs->lfs_interlock);
1286 }
1287 simple_unlock(&fs->lfs_interlock);
1288 /* We've written all the dirops there are */
1289 ((SEGSUM *)(sp->segsum))->ss_flags &= ~(SS_CONT);
1290 lfs_finalize_fs_seguse(fs);
1291 (void) lfs_writeseg(fs, sp);
1292 lfs_segunlock(fs);
1293 }
1294
1295 /*
1296 * Flush all vnodes for which the pagedaemon has requested pageouts.
* Skip over any files that are marked VDIROP (since lfs_flush_dirops()
1298 * has just run, this would be an error). If we have to skip a vnode
1299 * for any reason, just skip it; if we have to wait for the cleaner,
1300 * abort. The writer daemon will call us again later.
1301 */
1302 void
1303 lfs_flush_pchain(struct lfs *fs)
1304 {
1305 struct inode *ip, *nip;
1306 struct vnode *vp;
1307 extern int lfs_dostats;
1308 struct segment *sp;
1309 int error;
1310
1311 ASSERT_NO_SEGLOCK(fs);
1312
1313 if (fs->lfs_ronly)
1314 return;
1315
1316 simple_lock(&fs->lfs_interlock);
1317 if (TAILQ_FIRST(&fs->lfs_pchainhd) == NULL) {
1318 simple_unlock(&fs->lfs_interlock);
1319 return;
1320 } else
1321 simple_unlock(&fs->lfs_interlock);
1322
1323 /* Get dirops out of the way */
1324 lfs_flush_dirops(fs);
1325
1326 if (lfs_dostats)
1327 ++lfs_stats.flush_invoked;
1328
1329 /*
1330 * Inline lfs_segwrite/lfs_writevnodes, but just for pageouts.
1331 */
1332 lfs_imtime(fs);
1333 lfs_seglock(fs, 0);
1334 sp = fs->lfs_sp;
1335
1336 /*
1337 * lfs_writevnodes, optimized to clear pageout requests.
1338 * Only write non-dirop files that are in the pageout queue.
1339 * We're very conservative about what we write; we want to be
1340 * fast and async.
1341 */
1342 simple_lock(&fs->lfs_interlock);
1343 top:
1344 for (ip = TAILQ_FIRST(&fs->lfs_pchainhd); ip != NULL; ip = nip) {
1345 nip = TAILQ_NEXT(ip, i_lfs_pchain);
1346 vp = ITOV(ip);
1347
if (!(ip->i_flag & IN_PAGING))
1349 goto top;
1350
1351 if (vp->v_flag & (VXLOCK|VDIROP))
1352 continue;
1353 if (vp->v_type != VREG)
1354 continue;
1355 if (lfs_vref(vp))
1356 continue;
1357 simple_unlock(&fs->lfs_interlock);
1358
1359 if (VOP_ISLOCKED(vp)) {
1360 lfs_vunref(vp);
1361 simple_lock(&fs->lfs_interlock);
1362 continue;
1363 }
1364
1365 error = lfs_writefile(fs, sp, vp);
1366 if (!VPISEMPTY(vp) && !WRITEINPROG(vp) &&
1367 !(ip->i_flag & IN_ALLMOD)) {
1368 LFS_SET_UINO(ip, IN_MODIFIED);
1369 }
1370 KDASSERT(ip->i_number != LFS_IFILE_INUM);
1371 (void) lfs_writeinode(fs, sp, ip);
1372
1373 lfs_vunref(vp);
1374
1375 if (error == EAGAIN) {
1376 lfs_writeseg(fs, sp);
1377 simple_lock(&fs->lfs_interlock);
1378 break;
1379 }
1380 simple_lock(&fs->lfs_interlock);
1381 }
1382 simple_unlock(&fs->lfs_interlock);
1383 (void) lfs_writeseg(fs, sp);
1384 lfs_segunlock(fs);
1385 }
1386
1387 /*
1388 * Provide a fcntl interface to sys_lfs_{segwait,bmapv,markv}.
1389 */
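/*
 * Hypothetical userland sketch (not part of the original source): these
 * fcntls are reached through fcntl(2) on a descriptor for the filesystem
 * root or the Ifile.  Assuming an LFS mounted on /lfs and the LFCN*
 * constants from <ufs/lfs/lfs.h>, a cleaner-like tool might wait for the
 * next segment write along these lines:
 *
 *	#include <sys/types.h>
 *	#include <sys/time.h>
 *	#include <fcntl.h>
 *	#include <err.h>
 *	#include <ufs/lfs/lfs.h>
 *
 *	int fd = open("/lfs", O_RDONLY);
 *	struct timeval tv = { 600, 0 };
 *	if (fd == -1 || fcntl(fd, LFCNSEGWAIT, &tv) == -1)
 *		warn("LFCNSEGWAIT");
 */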
1390 int
1391 lfs_fcntl(void *v)
1392 {
1393 struct vop_fcntl_args /* {
1394 struct vnode *a_vp;
1395 u_long a_command;
1396 caddr_t a_data;
1397 int a_fflag;
1398 kauth_cred_t a_cred;
1399 struct lwp *a_l;
1400 } */ *ap = v;
1401 struct timeval *tvp;
1402 BLOCK_INFO *blkiov;
1403 CLEANERINFO *cip;
1404 SEGUSE *sup;
1405 int blkcnt, error, oclean;
1406 size_t fh_size;
1407 struct lfs_fcntl_markv blkvp;
1408 struct lwp *l;
1409 fsid_t *fsidp;
1410 struct lfs *fs;
1411 struct buf *bp;
1412 fhandle_t *fhp;
1413 daddr_t off;
1414
1415 /* Only respect LFS fcntls on fs root or Ifile */
1416 if (VTOI(ap->a_vp)->i_number != ROOTINO &&
1417 VTOI(ap->a_vp)->i_number != LFS_IFILE_INUM) {
1418 return ufs_fcntl(v);
1419 }
1420
1421 /* Avoid locking a draining lock */
1422 if (ap->a_vp->v_mount->mnt_iflag & IMNT_UNMOUNT) {
1423 return ESHUTDOWN;
1424 }
1425
1426 /* LFS control and monitoring fcntls are available only to root */
1427 l = ap->a_l;
1428 if (((ap->a_command & 0xff00) >> 8) == 'L' &&
1429 (error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
1430 &l->l_acflag)) != 0)
1431 return (error);
1432
1433 fs = VTOI(ap->a_vp)->i_lfs;
1434 fsidp = &ap->a_vp->v_mount->mnt_stat.f_fsidx;
1435
1436 error = 0;
1437 switch (ap->a_command) {
1438 case LFCNSEGWAITALL:
1439 case LFCNSEGWAITALL_COMPAT:
1440 fsidp = NULL;
/* FALLTHROUGH */
1442 case LFCNSEGWAIT:
1443 case LFCNSEGWAIT_COMPAT:
1444 tvp = (struct timeval *)ap->a_data;
1445 simple_lock(&fs->lfs_interlock);
1446 ++fs->lfs_sleepers;
1447 simple_unlock(&fs->lfs_interlock);
1448
1449 error = lfs_segwait(fsidp, tvp);
1450
1451 simple_lock(&fs->lfs_interlock);
1452 if (--fs->lfs_sleepers == 0)
1453 wakeup(&fs->lfs_sleepers);
1454 simple_unlock(&fs->lfs_interlock);
1455 return error;
1456
1457 case LFCNBMAPV:
1458 case LFCNMARKV:
1459 blkvp = *(struct lfs_fcntl_markv *)ap->a_data;
1460
1461 blkcnt = blkvp.blkcnt;
1462 if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
1463 return (EINVAL);
1464 blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
1465 if ((error = copyin(blkvp.blkiov, blkiov,
1466 blkcnt * sizeof(BLOCK_INFO))) != 0) {
1467 lfs_free(fs, blkiov, LFS_NB_BLKIOV);
1468 return error;
1469 }
1470
1471 simple_lock(&fs->lfs_interlock);
1472 ++fs->lfs_sleepers;
1473 simple_unlock(&fs->lfs_interlock);
1474 if (ap->a_command == LFCNBMAPV)
1475 error = lfs_bmapv(l->l_proc, fsidp, blkiov, blkcnt);
1476 else /* LFCNMARKV */
1477 error = lfs_markv(l->l_proc, fsidp, blkiov, blkcnt);
1478 if (error == 0)
1479 error = copyout(blkiov, blkvp.blkiov,
1480 blkcnt * sizeof(BLOCK_INFO));
1481 simple_lock(&fs->lfs_interlock);
1482 if (--fs->lfs_sleepers == 0)
1483 wakeup(&fs->lfs_sleepers);
1484 simple_unlock(&fs->lfs_interlock);
1485 lfs_free(fs, blkiov, LFS_NB_BLKIOV);
1486 return error;
1487
1488 case LFCNRECLAIM:
1489 /*
1490 * Flush dirops and write Ifile, allowing empty segments
1491 * to be immediately reclaimed.
1492 */
1493 lfs_writer_enter(fs, "pndirop");
1494 off = fs->lfs_offset;
1495 lfs_seglock(fs, SEGM_FORCE_CKP | SEGM_CKP);
1496 lfs_flush_dirops(fs);
1497 LFS_CLEANERINFO(cip, fs, bp);
1498 oclean = cip->clean;
1499 LFS_SYNC_CLEANERINFO(cip, fs, bp, 1);
1500 lfs_segwrite(ap->a_vp->v_mount, SEGM_FORCE_CKP);
1501 fs->lfs_sp->seg_flags |= SEGM_PROT;
1502 lfs_segunlock(fs);
1503 lfs_writer_leave(fs);
1504
1505 #ifdef DEBUG
1506 LFS_CLEANERINFO(cip, fs, bp);
1507 DLOG((DLOG_CLEAN, "lfs_fcntl: reclaim wrote %" PRId64
1508 " blocks, cleaned %" PRId32 " segments (activesb %d)\n",
1509 fs->lfs_offset - off, cip->clean - oclean,
1510 fs->lfs_activesb));
1511 LFS_SYNC_CLEANERINFO(cip, fs, bp, 0);
1512 #endif
1513
1514 return 0;
1515
1516 #ifdef COMPAT_30
1517 case LFCNIFILEFH_COMPAT:
1518 /* Return the filehandle of the Ifile */
1519 if ((error = kauth_authorize_generic(l->l_cred,
1520 KAUTH_GENERIC_ISSUSER, &l->l_acflag)) != 0)
1521 return (error);
1522 fhp = (struct fhandle *)ap->a_data;
1523 fhp->fh_fsid = *fsidp;
1524 fh_size = 16; /* former VFS_MAXFIDSIZ */
1525 return lfs_vptofh(fs->lfs_ivnode, &(fhp->fh_fid), &fh_size);
1526 #endif
1527
1528 case LFCNIFILEFH_COMPAT2:
1529 case LFCNIFILEFH:
1530 /* Return the filehandle of the Ifile */
1531 fhp = (struct fhandle *)ap->a_data;
1532 fhp->fh_fsid = *fsidp;
1533 fh_size = sizeof(struct lfs_fhandle) -
1534 offsetof(fhandle_t, fh_fid);
1535 return lfs_vptofh(fs->lfs_ivnode, &(fhp->fh_fid), &fh_size);
1536
1537 case LFCNREWIND:
1538 /* Move lfs_offset to the lowest-numbered segment */
1539 return lfs_rewind(fs, *(int *)ap->a_data);
1540
1541 case LFCNINVAL:
1542 /* Mark a segment SEGUSE_INVAL */
1543 LFS_SEGENTRY(sup, fs, *(int *)ap->a_data, bp);
1544 if (sup->su_nbytes > 0) {
1545 brelse(bp);
1546 lfs_unset_inval_all(fs);
1547 return EBUSY;
1548 }
1549 sup->su_flags |= SEGUSE_INVAL;
1550 VOP_BWRITE(bp);
1551 return 0;
1552
1553 case LFCNRESIZE:
1554 /* Resize the filesystem */
1555 return lfs_resize_fs(fs, *(int *)ap->a_data);
1556
1557 case LFCNWRAPSTOP:
1558 case LFCNWRAPSTOP_COMPAT:
1559 /*
1560 * Hold lfs_newseg at segment 0; if requested, sleep until
1561 * the filesystem wraps around. To support external agents
1562 * (dump, fsck-based regression test) that need to look at
1563 * a snapshot of the filesystem, without necessarily
1564 * requiring that all fs activity stops.
1565 */
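/*
 * Hypothetical agent sketch (not part of the original source), assuming
 * fd is a descriptor for the filesystem root as in the example above:
 * an external snapshot consumer would typically bracket its work with
 *
 *	int zero = 0;
 *	fcntl(fd, LFCNWRAPSTOP, &zero);    hold the log in place
 *	    ... dump, fsck, or otherwise inspect the snapshot ...
 *	fcntl(fd, LFCNWRAPGO, &zero);      resume normal wrapping
 *
 * LFCNWRAPPASS (handled below) instead lets the log wrap exactly once
 * while the hold stays in effect.
 */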
1566 if (lockstatus(&fs->lfs_stoplock))
1567 return EALREADY;
1568
1569 simple_lock(&fs->lfs_interlock);
1570 lockmgr(&fs->lfs_stoplock, LK_EXCLUSIVE, &fs->lfs_interlock);
1571 if (fs->lfs_nowrap == 0)
1572 log(LOG_NOTICE, "%s: disabled log wrap\n", fs->lfs_fsmnt);
1573 ++fs->lfs_nowrap;
1574 if (*(int *)ap->a_data == 1 ||
1575 ap->a_command == LFCNWRAPSTOP_COMPAT) {
log(LOG_NOTICE, "LFCNWRAPSTOP waiting for log wrap\n");
1577 error = ltsleep(&fs->lfs_nowrap, PCATCH | PUSER,
1578 "segwrap", 0, &fs->lfs_interlock);
log(LOG_NOTICE, "LFCNWRAPSTOP done waiting\n");
1580 if (error) {
1581 lfs_wrapgo(fs, VTOI(ap->a_vp), 0);
1582 }
1583 }
1584 simple_unlock(&fs->lfs_interlock);
1585 return 0;
1586
1587 case LFCNWRAPGO:
1588 case LFCNWRAPGO_COMPAT:
1589 /*
1590 * Having done its work, the agent wakes up the writer.
1591 * If the argument is 1, it sleeps until a new segment
1592 * is selected.
1593 */
1594 simple_lock(&fs->lfs_interlock);
1595 error = lfs_wrapgo(fs, VTOI(ap->a_vp),
1596 (ap->a_command == LFCNWRAPGO_COMPAT ? 1 :
1597 *((int *)ap->a_data)));
1598 simple_unlock(&fs->lfs_interlock);
1599 return error;
1600
1601 case LFCNWRAPPASS:
1602 if (lockstatus(&fs->lfs_stoplock) != LK_EXCLUSIVE)
1603 return EALREADY;
1604 if ((VTOI(ap->a_vp)->i_lfs_iflags & LFSI_WRAPWAIT))
1605 return EALREADY;
1606 simple_lock(&fs->lfs_interlock);
1607 if (fs->lfs_nowrap == 0) {
1608 simple_unlock(&fs->lfs_interlock);
1609 return EBUSY;
1610 }
1611 fs->lfs_wrappass = 1;
1612 wakeup(&fs->lfs_wrappass);
1613 /* Wait for the log to wrap, if asked */
1614 if (*(int *)ap->a_data) {
1615 lfs_vref(ap->a_vp);
1616 VTOI(ap->a_vp)->i_lfs_iflags |= LFSI_WRAPWAIT;
log(LOG_NOTICE, "LFCNWRAPPASS waiting for log wrap\n");
1618 error = ltsleep(&fs->lfs_nowrap, PCATCH | PUSER,
1619 "segwrap", 0, &fs->lfs_interlock);
log(LOG_NOTICE, "LFCNWRAPPASS done waiting\n");
1621 VTOI(ap->a_vp)->i_lfs_iflags &= ~LFSI_WRAPWAIT;
1622 lfs_vunref(ap->a_vp);
1623 }
1624 simple_unlock(&fs->lfs_interlock);
1625 return error;
1626
1627 case LFCNWRAPSTATUS:
1628 simple_lock(&fs->lfs_interlock);
1629 *(int *)ap->a_data = fs->lfs_wrapstatus;
1630 simple_unlock(&fs->lfs_interlock);
1631 return 0;
1632
1633 default:
1634 return ufs_fcntl(v);
1635 }
1636 return 0;
1637 }
1638
1639 int
1640 lfs_getpages(void *v)
1641 {
1642 struct vop_getpages_args /* {
1643 struct vnode *a_vp;
1644 voff_t a_offset;
1645 struct vm_page **a_m;
1646 int *a_count;
1647 int a_centeridx;
1648 vm_prot_t a_access_type;
1649 int a_advice;
1650 int a_flags;
1651 } */ *ap = v;
1652
1653 if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM &&
1654 (ap->a_access_type & VM_PROT_WRITE) != 0) {
1655 return EPERM;
1656 }
1657 if ((ap->a_access_type & VM_PROT_WRITE) != 0) {
1658 LFS_SET_UINO(VTOI(ap->a_vp), IN_MODIFIED);
1659 }
1660
1661 /*
* we're relying on the fact that genfs_getpages() always reads in
1663 * entire filesystem blocks.
1664 */
1665 return genfs_getpages(v);
1666 }
1667
1668 /*
1669 * Make sure that for all pages in every block in the given range,
1670 * either all are dirty or all are clean. If any of the pages
1671 * we've seen so far are dirty, put the vnode on the paging chain,
1672 * and mark it IN_PAGING.
1673 *
1674 * If checkfirst != 0, don't check all the pages but return at the
1675 * first dirty page.
1676 */
1677 static int
1678 check_dirty(struct lfs *fs, struct vnode *vp,
1679 off_t startoffset, off_t endoffset, off_t blkeof,
1680 int flags, int checkfirst)
1681 {
1682 int by_list;
1683 struct vm_page *curpg = NULL; /* XXX: gcc */
1684 struct vm_page *pgs[MAXBSIZE / PAGE_SIZE], *pg;
1685 off_t soff = 0; /* XXX: gcc */
1686 voff_t off;
1687 int i;
1688 int nonexistent;
1689 int any_dirty; /* number of dirty pages */
1690 int dirty; /* number of dirty pages in a block */
1691 int tdirty;
1692 int pages_per_block = fs->lfs_bsize >> PAGE_SHIFT;
1693 int pagedaemon = (curproc == uvm.pagedaemon_proc);
1694
1695 ASSERT_MAYBE_SEGLOCK(fs);
1696 top:
1697 by_list = (vp->v_uobj.uo_npages <=
1698 ((endoffset - startoffset) >> PAGE_SHIFT) *
1699 UVM_PAGE_HASH_PENALTY);
1700 any_dirty = 0;
1701
1702 if (by_list) {
1703 curpg = TAILQ_FIRST(&vp->v_uobj.memq);
1704 } else {
1705 soff = startoffset;
1706 }
1707 while (by_list || soff < MIN(blkeof, endoffset)) {
1708 if (by_list) {
1709 /*
1710 * Find the first page in a block. Skip
1711 * blocks outside our area of interest or beyond
1712 * the end of file.
1713 */
1714 if (pages_per_block > 1) {
1715 while (curpg &&
1716 ((curpg->offset & fs->lfs_bmask) ||
1717 curpg->offset >= vp->v_size ||
1718 curpg->offset >= endoffset))
1719 curpg = TAILQ_NEXT(curpg, listq);
1720 }
1721 if (curpg == NULL)
1722 break;
1723 soff = curpg->offset;
1724 }
1725
1726 /*
1727 * Mark all pages in extended range busy; find out if any
1728 * of them are dirty.
1729 */
1730 nonexistent = dirty = 0;
1731 for (i = 0; i == 0 || i < pages_per_block; i++) {
1732 if (by_list && pages_per_block <= 1) {
1733 pgs[i] = pg = curpg;
1734 } else {
1735 off = soff + (i << PAGE_SHIFT);
1736 pgs[i] = pg = uvm_pagelookup(&vp->v_uobj, off);
1737 if (pg == NULL) {
1738 ++nonexistent;
1739 continue;
1740 }
1741 }
1742 KASSERT(pg != NULL);
1743
1744 /*
1745 * If we're holding the segment lock, we can deadlock
1746 * against a process that has our page and is waiting
1747 * for the cleaner, while the cleaner waits for the
1748 * segment lock. Just bail in that case.
1749 */
1750 if ((pg->flags & PG_BUSY) &&
1751 (pagedaemon || LFS_SEGLOCK_HELD(fs))) {
1752 if (by_list && i > 0)
1753 uvm_page_unbusy(pgs, i);
1754 DLOG((DLOG_PAGE, "lfs_putpages: avoiding 3-way or pagedaemon deadlock\n"));
1755 return -1;
1756 }
1757
1758 while (pg->flags & PG_BUSY) {
1759 pg->flags |= PG_WANTED;
1760 UVM_UNLOCK_AND_WAIT(pg, &vp->v_interlock, 0,
1761 "lfsput", 0);
1762 simple_lock(&vp->v_interlock);
1763 if (by_list) {
1764 if (i > 0)
1765 uvm_page_unbusy(pgs, i);
1766 goto top;
1767 }
1768 }
1769 pg->flags |= PG_BUSY;
1770 UVM_PAGE_OWN(pg, "lfs_putpages");
1771
1772 pmap_page_protect(pg, VM_PROT_NONE);
1773 tdirty = (pmap_clear_modify(pg) ||
1774 (pg->flags & PG_CLEAN) == 0);
1775 dirty += tdirty;
1776 }
1777 if (pages_per_block > 0 && nonexistent >= pages_per_block) {
1778 if (by_list) {
1779 curpg = TAILQ_NEXT(curpg, listq);
1780 } else {
1781 soff += fs->lfs_bsize;
1782 }
1783 continue;
1784 }
1785
1786 any_dirty += dirty;
1787 KASSERT(nonexistent == 0);
1788
1789 /*
1790 * If any are dirty make all dirty; unbusy them,
1791 * but if we were asked to clean, wire them so that
1792 * the pagedaemon doesn't bother us about them while
1793 * they're on their way to disk.
1794 */
1795 for (i = 0; i == 0 || i < pages_per_block; i++) {
1796 pg = pgs[i];
1797 KASSERT(!((pg->flags & PG_CLEAN) && (pg->flags & PG_DELWRI)));
1798 if (dirty) {
1799 pg->flags &= ~PG_CLEAN;
1800 if (flags & PGO_FREE) {
1801 /*
1802 * Wire the page so that
1803 * pdaemon doesn't see it again.
1804 */
1805 uvm_lock_pageq();
1806 uvm_pagewire(pg);
1807 uvm_unlock_pageq();
1808
1809 /* Suspended write flag */
1810 pg->flags |= PG_DELWRI;
1811 }
1812 }
1813 if (pg->flags & PG_WANTED)
1814 wakeup(pg);
1815 pg->flags &= ~(PG_WANTED|PG_BUSY);
1816 UVM_PAGE_OWN(pg, NULL);
1817 }
1818
1819 if (checkfirst && any_dirty)
1820 break;
1821
1822 if (by_list) {
1823 curpg = TAILQ_NEXT(curpg, listq);
1824 } else {
1825 soff += MAX(PAGE_SIZE, fs->lfs_bsize);
1826 }
1827 }
1828
1829 return any_dirty;
1830 }
1831
1832 /*
1833 * lfs_putpages functions like genfs_putpages except that
1834 *
1835 * (1) It needs to bounds-check the incoming requests to ensure that
1836 * they are block-aligned; if they are not, expand the range and
1837 * do the right thing in case, e.g., the requested range is clean
1838 * but the expanded range is dirty.
1839 *
1840 * (2) It needs to explicitly send blocks to be written when it is done.
1841 * VOP_PUTPAGES is not ever called with the seglock held, so
1842 * we simply take the seglock and let lfs_segunlock wait for us.
1843 * XXX Actually we can be called with the seglock held, if we have
1844 * XXX to flush a vnode while lfs_markv is in operation. As of this
1845 * XXX writing we panic in this case.
1846 *
1847 * Assumptions:
1848 *
1849 * (1) The caller does not hold any pages in this vnode busy. If it does,
1850 * there is a danger that when we expand the page range and busy the
1851 * pages we will deadlock.
1852 *
1853 * (2) We are called with vp->v_interlock held; we must return with it
1854 * released.
1855 *
1856 * (3) We don't absolutely have to free pages right away, provided that
1857 * the request does not have PGO_SYNCIO. When the pagedaemon gives
1858 * us a request with PGO_FREE, we take the pages out of the paging
1859 * queue and wake up the writer, which will handle freeing them for us.
1860 *
1861 * We ensure that for any filesystem block, all pages for that
1862 * block are either resident or not, even if those pages are higher
1863 * than EOF; that means that we will be getting requests to free
1864 * "unused" pages above EOF all the time, and should ignore them.
1865 *
1866 * (4) If we are called with PGO_LOCKED, the finfo array we are to write
1867 * into has been set up for us by lfs_writefile. If not, we will
1868 * have to handle allocating and/or freeing an finfo entry.
1869 *
1870 * XXX note that we're (ab)using PGO_LOCKED as "seglock held".
1871 */
1872
1873 int
1874 lfs_putpages(void *v)
1875 {
1876 int error;
1877 struct vop_putpages_args /* {
1878 struct vnode *a_vp;
1879 voff_t a_offlo;
1880 voff_t a_offhi;
1881 int a_flags;
1882 } */ *ap = v;
1883 struct vnode *vp;
1884 struct inode *ip;
1885 struct lfs *fs;
1886 struct segment *sp;
1887 off_t origoffset, startoffset, endoffset, origendoffset, blkeof;
1888 off_t off, max_endoffset;
1889 int s;
1890 boolean_t seglocked, sync, pagedaemon;
1891 struct vm_page *pg;
1892 UVMHIST_FUNC("lfs_putpages"); UVMHIST_CALLED(ubchist);
1893
1894 vp = ap->a_vp;
1895 ip = VTOI(vp);
1896 fs = ip->i_lfs;
1897 sync = (ap->a_flags & PGO_SYNCIO) != 0;
1898 pagedaemon = (curproc == uvm.pagedaemon_proc);
1899
1900 /* Putpages does nothing for metadata. */
1901 if (vp == fs->lfs_ivnode || vp->v_type != VREG) {
1902 simple_unlock(&vp->v_interlock);
1903 return 0;
1904 }
1905
1906 /*
1907 * If there are no pages, don't do anything.
1908 */
1909 if (vp->v_uobj.uo_npages == 0) {
1910 s = splbio();
1911 if (TAILQ_EMPTY(&vp->v_uobj.memq) &&
1912 (vp->v_flag & VONWORKLST) &&
1913 LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
1914 vp->v_flag &= ~VWRITEMAPDIRTY;
1915 vn_syncer_remove_from_worklist(vp);
1916 }
1917 splx(s);
1918 simple_unlock(&vp->v_interlock);
1919
1920 /* Remove us from paging queue, if we were on it */
1921 simple_lock(&fs->lfs_interlock);
1922 if (ip->i_flags & IN_PAGING) {
1923 ip->i_flags &= ~IN_PAGING;
1924 TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain);
1925 }
1926 simple_unlock(&fs->lfs_interlock);
1927 return 0;
1928 }
1929
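	/*
	 * blkeof is the file size rounded up to a block boundary; pages
	 * in the range [i_size, blkeof) belong to the last block of the
	 * file but contain no file data.
	 */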
1930 blkeof = blkroundup(fs, ip->i_size);
1931
1932 /*
1933 * Ignore requests to free pages past EOF but in the same block
1934 * as EOF, unless the request is synchronous. (If the request is
1935 * sync, it comes from lfs_truncate.)
1936 * XXXUBC Make these pages look "active" so the pagedaemon won't
1937 * XXXUBC bother us with them again.
1938 */
1939 if (!sync && ap->a_offlo >= ip->i_size && ap->a_offlo < blkeof) {
1940 origoffset = ap->a_offlo;
1941 for (off = origoffset; off < blkeof; off += fs->lfs_bsize) {
1942 pg = uvm_pagelookup(&vp->v_uobj, off);
1943 KASSERT(pg != NULL);
1944 while (pg->flags & PG_BUSY) {
1945 pg->flags |= PG_WANTED;
1946 UVM_UNLOCK_AND_WAIT(pg, &vp->v_interlock, 0,
1947 "lfsput2", 0);
1948 simple_lock(&vp->v_interlock);
1949 }
1950 uvm_lock_pageq();
1951 uvm_pageactivate(pg);
1952 uvm_unlock_pageq();
1953 }
1954 ap->a_offlo = blkeof;
1955 if (ap->a_offhi > 0 && ap->a_offhi <= ap->a_offlo) {
1956 simple_unlock(&vp->v_interlock);
1957 return 0;
1958 }
1959 }
1960
1961 /*
1962 * Extend page range to start and end at block boundaries.
1963 * (For the purposes of VOP_PUTPAGES, fragments don't exist.)
1964 */
1965 origoffset = ap->a_offlo;
1966 origendoffset = ap->a_offhi;
1967 startoffset = origoffset & ~(fs->lfs_bmask);
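	/*
	 * (Assuming the usual definition lfs_bmask == lfs_bsize - 1, the
	 * mask above rounds offlo down to the start of its block: e.g.
	 * with an 8KB block size, an offlo of 0x2345 yields a
	 * startoffset of 0x2000.)
	 */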
1968 max_endoffset = (trunc_page(LLONG_MAX) >> fs->lfs_bshift)
1969 << fs->lfs_bshift;
1970
1971 if (origendoffset == 0 || ap->a_flags & PGO_ALLPAGES) {
1972 endoffset = max_endoffset;
1973 origendoffset = endoffset;
1974 } else {
1975 origendoffset = round_page(ap->a_offhi);
1976 endoffset = round_page(blkroundup(fs, origendoffset));
1977 }
1978
1979 KASSERT(startoffset > 0 || endoffset >= startoffset);
1980 if (startoffset == endoffset) {
1981 /* Nothing to do, why were we called? */
1982 simple_unlock(&vp->v_interlock);
1983 DLOG((DLOG_PAGE, "lfs_putpages: startoffset = endoffset = %"
1984 PRId64 "\n", startoffset));
1985 return 0;
1986 }
1987
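	/*
	 * Hand the block-aligned range back through the argument block,
	 * so that genfs_putpages (called below) sees whole blocks.
	 */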
1988 ap->a_offlo = startoffset;
1989 ap->a_offhi = endoffset;
1990
1991 if (!(ap->a_flags & PGO_CLEANIT))
1992 return genfs_putpages(v);
1993
1994 /*
1995 	 * If there is more than one page per block, we don't want
1996 * to get caught locking them backwards; so set PGO_BUSYFAIL
1997 * to avoid deadlocks.
1998 */
1999 ap->a_flags |= PGO_BUSYFAIL;
2000
2001 do {
2002 int r;
2003
2004 /* If no pages are dirty, we can just use genfs_putpages. */
2005 r = check_dirty(fs, vp, startoffset, endoffset, blkeof,
2006 ap->a_flags, 1);
2007 if (r < 0) {
2008 simple_unlock(&vp->v_interlock);
2009 return EDEADLK;
2010 }
2011 if (r > 0)
2012 break;
2013
2014 /*
2015 * Sometimes pages are dirtied between the time that
2016 * we check and the time we try to clean them.
2017 * Instruct lfs_gop_write to return EDEADLK in this case
2018 * so we can write them properly.
2019 */
2020 ip->i_lfs_iflags |= LFSI_NO_GOP_WRITE;
2021 r = genfs_putpages(v);
2022 ip->i_lfs_iflags &= ~LFSI_NO_GOP_WRITE;
2023 if (r != EDEADLK)
2024 return r;
2025
2026 /* Start over. */
2027 preempt(1);
2028 simple_lock(&vp->v_interlock);
2029 	} while (1);
2030
2031 /*
2032 * Dirty and asked to clean.
2033 *
2034 * Pagedaemon can't actually write LFS pages; wake up
2035 * the writer to take care of that. The writer will
2036 * notice the pager inode queue and act on that.
2037 */
2038 if (pagedaemon) {
2039 simple_lock(&fs->lfs_interlock);
2040 if (!(ip->i_flags & IN_PAGING)) {
2041 ip->i_flags |= IN_PAGING;
2042 TAILQ_INSERT_TAIL(&fs->lfs_pchainhd, ip, i_lfs_pchain);
2043 }
2044 simple_lock(&lfs_subsys_lock);
2045 wakeup(&lfs_writer_daemon);
2046 simple_unlock(&lfs_subsys_lock);
2047 simple_unlock(&fs->lfs_interlock);
2048 simple_unlock(&vp->v_interlock);
2049 preempt(1);
2050 return EWOULDBLOCK;
2051 }
2052
2053 /*
2054 * If this is a file created in a recent dirop, we can't flush its
2055 * inode until the dirop is complete. Drain dirops, then flush the
2056 * filesystem (taking care of any other pending dirops while we're
2057 * at it).
2058 */
2059 if ((ap->a_flags & (PGO_CLEANIT|PGO_LOCKED)) == PGO_CLEANIT &&
2060 (vp->v_flag & VDIROP)) {
2061 int locked;
2062
2063 DLOG((DLOG_PAGE, "lfs_putpages: flushing VDIROP\n"));
2064 locked = (VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
2065 simple_unlock(&vp->v_interlock);
2066 lfs_writer_enter(fs, "ppdirop");
2067 if (locked)
2068 VOP_UNLOCK(vp, 0);
2069
2070 simple_lock(&fs->lfs_interlock);
2071 lfs_flush_fs(fs, sync ? SEGM_SYNC : 0);
2072 simple_unlock(&fs->lfs_interlock);
2073
2074 simple_lock(&vp->v_interlock);
2075 if (locked) {
2076 VOP_LOCK(vp, LK_EXCLUSIVE | LK_INTERLOCK);
2077 simple_lock(&vp->v_interlock);
2078 }
2079 lfs_writer_leave(fs);
2080
2081 /* XXX the flush should have taken care of this one too! */
2082 }
2083
2084 /*
2085 * This is it. We are going to write some pages. From here on
2086 * down it's all just mechanics.
2087 *
2088 * Don't let genfs_putpages wait; lfs_segunlock will wait for us.
2089 */
2090 ap->a_flags &= ~PGO_SYNCIO;
2091
2092 /*
2093 * If we've already got the seglock, flush the node and return.
2094 * The FIP has already been set up for us by lfs_writefile,
2095 * and FIP cleanup and lfs_updatemeta will also be done there,
2096 * unless genfs_putpages returns EDEADLK; then we must flush
2097 * what we have, and correct FIP and segment header accounting.
2098 */
2099 get_seglock:
2100 seglocked = (ap->a_flags & PGO_LOCKED) != 0;
2101 if (!seglocked) {
2102 simple_unlock(&vp->v_interlock);
2103 /*
2104 * Take the seglock, because we are going to be writing pages.
2105 */
2106 error = lfs_seglock(fs, SEGM_PROT | (sync ? SEGM_SYNC : 0));
2107 if (error != 0)
2108 return error;
2109 simple_lock(&vp->v_interlock);
2110 }
2111
2112 /*
2113 * VOP_PUTPAGES should not be called while holding the seglock.
2114 * XXXUBC fix lfs_markv, or do this properly.
2115 */
2116 #ifdef notyet
2117 KASSERT(fs->lfs_seglock == 1);
2118 #endif /* notyet */
2119
2120 /*
2121 * We assume we're being called with sp->fip pointing at blank space.
2122 * Account for a new FIP in the segment header, and set sp->vp.
2123 * (This should duplicate the setup at the top of lfs_writefile().)
2124 */
2125 sp = fs->lfs_sp;
2126 if (!seglocked)
2127 lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);
2128 KASSERT(sp->vp == NULL);
2129 sp->vp = vp;
2130
2131 if (!seglocked) {
2132 if (vp->v_flag & VDIROP)
2133 ((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT);
2134 }
2135
2136 /*
2137 * Loop through genfs_putpages until all pages are gathered.
2138 * genfs_putpages() drops the interlock, so reacquire it if necessary.
2139 * Whenever we lose the interlock we have to rerun check_dirty, as
2140 * well.
2141 */
2142 again:
2143 if (check_dirty(fs, vp, startoffset, endoffset, blkeof,
2144 ap->a_flags, 0) < 0) {
2145 simple_unlock(&vp->v_interlock);
2146 sp->vp = NULL;
2147 if (!seglocked) {
2148 lfs_release_finfo(fs);
2149 lfs_segunlock(fs);
2150 }
2151 if (pagedaemon)
2152 return EDEADLK;
2153 /* else seglocked == 0 */
2154 preempt(1);
2155 simple_lock(&vp->v_interlock);
2156 goto get_seglock;
2157 }
2158
2159 error = genfs_putpages(v);
2160 if (error == EDEADLK || error == EAGAIN) {
2161 DLOG((DLOG_PAGE, "lfs_putpages: genfs_putpages returned"
2162 " EDEADLK [2] ino %d off %x (seg %d)\n",
2163 ip->i_number, fs->lfs_offset,
2164 dtosn(fs, fs->lfs_offset)));
2165 		/* Write what we have gathered, unless there is nothing to write */
2166 if (sp->cbpp - sp->bpp > 1) {
2167 /* Write gathered pages */
2168 lfs_updatemeta(sp);
2169 lfs_release_finfo(fs);
2170 (void) lfs_writeseg(fs, sp);
2171
2172 /*
2173 * Reinitialize brand new FIP and add us to it.
2174 */
2175 KASSERT(sp->vp == vp);
2176 lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);
2177 }
2178
2179 /* Give the write a chance to complete */
2180 preempt(1);
2181
2182 /* We've lost the interlock. Start over. */
2183 if (error == EDEADLK) {
2184 simple_lock(&vp->v_interlock);
2185 goto again;
2186 }
2187 }
2188
2189 KASSERT(sp->vp == vp);
2190 if (!seglocked) {
2191 sp->vp = NULL;
2192
2193 /* Write indirect blocks as well */
2194 lfs_gather(fs, fs->lfs_sp, vp, lfs_match_indir);
2195 lfs_gather(fs, fs->lfs_sp, vp, lfs_match_dindir);
2196 lfs_gather(fs, fs->lfs_sp, vp, lfs_match_tindir);
2197
2198 KASSERT(sp->vp == NULL);
2199 sp->vp = vp;
2200 }
2201
2202 /*
2203 * Blocks are now gathered into a segment waiting to be written.
2204 * All that's left to do is update metadata, and write them.
2205 */
2206 lfs_updatemeta(sp);
2207 KASSERT(sp->vp == vp);
2208 sp->vp = NULL;
2209
2210 if (seglocked) {
2211 /* we're called by lfs_writefile. */
2212 return error;
2213 }
2214
2215 /* Clean up FIP and send it to disk. */
2216 lfs_release_finfo(fs);
2217 lfs_writeseg(fs, fs->lfs_sp);
2218
2219 /*
2220 * Remove us from paging queue, since we've now written all our
2221 * pages.
2222 */
2223 simple_lock(&fs->lfs_interlock);
2224 if (ip->i_flags & IN_PAGING) {
2225 ip->i_flags &= ~IN_PAGING;
2226 TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain);
2227 }
2228 simple_unlock(&fs->lfs_interlock);
2229
2230 /*
2231 * XXX - with the malloc/copy writeseg, the pages are freed by now
2232 * even if we don't wait (e.g. if we hold a nested lock). This
2233 * will not be true if we stop using malloc/copy.
2234 */
2235 KASSERT(fs->lfs_sp->seg_flags & SEGM_PROT);
2236 lfs_segunlock(fs);
2237
2238 /*
2239 * Wait for v_numoutput to drop to zero. The seglock should
2240 * take care of this, but there is a slight possibility that
2241 * aiodoned might not have got around to our buffers yet.
2242 */
2243 if (sync) {
2244 s = splbio();
2245 simple_lock(&global_v_numoutput_slock);
2246 while (vp->v_numoutput > 0) {
2247 DLOG((DLOG_PAGE, "lfs_putpages: ino %d sleeping on"
2248 " num %d\n", ip->i_number, vp->v_numoutput));
2249 vp->v_flag |= VBWAIT;
2250 ltsleep(&vp->v_numoutput, PRIBIO + 1, "lfs_vn", 0,
2251 &global_v_numoutput_slock);
2252 }
2253 simple_unlock(&global_v_numoutput_slock);
2254 splx(s);
2255 }
2256 return error;
2257 }
2258
2259 /*
2260 * Return the last logical file offset that should be written for this file
2261 * if we're doing a write that ends at "size". If writing, we need to know
2262 * about sizes on disk, i.e. fragments if there are any; if reading, we need
2263 * to know about entire blocks.
2264 */
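/*
 * For example (hypothetical sizes): with an 8KB block size and a 1KB
 * fragment size, a write ending at offset 5000 within the direct-block
 * range (and not shrinking past the current last block) sets *eobp to
 * 5120 (fragroundup), while GOP_SIZE_MEM or a size beyond the direct
 * blocks rounds up to 8192 (blkroundup).
 */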
2265 void
2266 lfs_gop_size(struct vnode *vp, off_t size, off_t *eobp, int flags)
2267 {
2268 struct inode *ip = VTOI(vp);
2269 struct lfs *fs = ip->i_lfs;
2270 daddr_t olbn, nlbn;
2271
2272 olbn = lblkno(fs, ip->i_size);
2273 nlbn = lblkno(fs, size);
2274 if (!(flags & GOP_SIZE_MEM) && nlbn < NDADDR && olbn <= nlbn) {
2275 *eobp = fragroundup(fs, size);
2276 } else {
2277 *eobp = blkroundup(fs, size);
2278 }
2279 }
2280
2281 #ifdef DEBUG
2282 void lfs_dump_vop(void *);
2283
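/*
 * Debugging aid: print the vnode and the on-disk inode for a
 * vop_putpages argument block.
 */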
2284 void
2285 lfs_dump_vop(void *v)
2286 {
2287 struct vop_putpages_args /* {
2288 struct vnode *a_vp;
2289 voff_t a_offlo;
2290 voff_t a_offhi;
2291 int a_flags;
2292 } */ *ap = v;
2293
2294 #ifdef DDB
2295 vfs_vnode_print(ap->a_vp, 0, printf);
2296 #endif
2297 lfs_dump_dinode(VTOI(ap->a_vp)->i_din.ffs1_din);
2298 }
2299 #endif
2300
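/*
 * mmap is not supported on the ifile; everything else is handled by
 * the generic ufs_mmap.
 */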
2301 int
2302 lfs_mmap(void *v)
2303 {
2304 struct vop_mmap_args /* {
2305 const struct vnodeop_desc *a_desc;
2306 struct vnode *a_vp;
2307 int a_fflags;
2308 kauth_cred_t a_cred;
2309 struct lwp *a_l;
2310 } */ *ap = v;
2311
2312 if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM)
2313 return EOPNOTSUPP;
2314 return ufs_mmap(v);
2315 }
2316