genfs_vnops.c revision 1.46 1 /* $NetBSD: genfs_vnops.c,v 1.46 2002/01/26 02:44:27 chs Exp $ */
2
3 /*
4 * Copyright (c) 1982, 1986, 1989, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 * must display the following acknowledgement:
17 * This product includes software developed by the University of
18 * California, Berkeley and its contributors.
19 * 4. Neither the name of the University nor the names of its contributors
20 * may be used to endorse or promote products derived from this software
21 * without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 */
36
37 #include <sys/cdefs.h>
38 __KERNEL_RCSID(0, "$NetBSD: genfs_vnops.c,v 1.46 2002/01/26 02:44:27 chs Exp $");
39
40 #include "opt_nfsserver.h"
41
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/proc.h>
45 #include <sys/kernel.h>
46 #include <sys/mount.h>
47 #include <sys/namei.h>
48 #include <sys/vnode.h>
49 #include <sys/fcntl.h>
50 #include <sys/malloc.h>
51 #include <sys/poll.h>
52 #include <sys/mman.h>
53
54 #include <miscfs/genfs/genfs.h>
55 #include <miscfs/genfs/genfs_node.h>
56 #include <miscfs/specfs/specdev.h>
57
58 #include <uvm/uvm.h>
59 #include <uvm/uvm_pager.h>
60
61 #ifdef NFSSERVER
62 #include <nfs/rpcv2.h>
63 #include <nfs/nfsproto.h>
64 #include <nfs/nfs.h>
65 #include <nfs/nqnfs.h>
66 #include <nfs/nfs_var.h>
67 #endif
68
/*
 * Largest page count genfs_getpages() accepts in a single request (it
 * panics on anything larger); also sizes that function's on-stack page
 * array and caps the size of each read-ahead request it issues.
 */
#define MAX_READ_AHEAD	16	/* XXXUBC 16 */
70
71 int
72 genfs_poll(v)
73 void *v;
74 {
75 struct vop_poll_args /* {
76 struct vnode *a_vp;
77 int a_events;
78 struct proc *a_p;
79 } */ *ap = v;
80
81 return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
82 }
83
84 int
85 genfs_fsync(v)
86 void *v;
87 {
88 struct vop_fsync_args /* {
89 struct vnode *a_vp;
90 struct ucred *a_cred;
91 int a_flags;
92 off_t offlo;
93 off_t offhi;
94 struct proc *a_p;
95 } */ *ap = v;
96 struct vnode *vp = ap->a_vp;
97 int wait;
98
99 wait = (ap->a_flags & FSYNC_WAIT) != 0;
100 vflushbuf(vp, wait);
101 if ((ap->a_flags & FSYNC_DATAONLY) != 0)
102 return (0);
103 else
104 return (VOP_UPDATE(vp, NULL, NULL, wait ? UPDATE_WAIT : 0));
105 }
106
107 int
108 genfs_seek(v)
109 void *v;
110 {
111 struct vop_seek_args /* {
112 struct vnode *a_vp;
113 off_t a_oldoff;
114 off_t a_newoff;
115 struct ucred *a_ucred;
116 } */ *ap = v;
117
118 if (ap->a_newoff < 0)
119 return (EINVAL);
120
121 return (0);
122 }
123
124 int
125 genfs_abortop(v)
126 void *v;
127 {
128 struct vop_abortop_args /* {
129 struct vnode *a_dvp;
130 struct componentname *a_cnp;
131 } */ *ap = v;
132
133 if ((ap->a_cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF)
134 PNBUF_PUT(ap->a_cnp->cn_pnbuf);
135 return (0);
136 }
137
138 int
139 genfs_fcntl(v)
140 void *v;
141 {
142 struct vop_fcntl_args /* {
143 struct vnode *a_vp;
144 u_int a_command;
145 caddr_t a_data;
146 int a_fflag;
147 struct ucred *a_cred;
148 struct proc *a_p;
149 } */ *ap = v;
150
151 if (ap->a_command == F_SETFL)
152 return (0);
153 else
154 return (EOPNOTSUPP);
155 }
156
/*
 * Vnode operation that must never be invoked: panics with
 * "genfs: bad op".  The argument is ignored.
 */
/*ARGSUSED*/
int
genfs_badop(void *v)
{

	panic("genfs: bad op");
}
165
/*
 * Vnode operation that does nothing and reports success (0).
 * The argument is ignored.
 */
/*ARGSUSED*/
int
genfs_nullop(void *v)
{

	return 0;
}
174
/*
 * Vnode operation that unconditionally fails with EINVAL.
 * The argument is ignored.
 */
/*ARGSUSED*/
int
genfs_einval(void *v)
{

	return EINVAL;
}
183
/*
 * Vnode operation that unconditionally fails with EOPNOTSUPP.
 * The argument is ignored.
 */
/*ARGSUSED*/
int
genfs_eopnotsupp(void *v)
{

	return EOPNOTSUPP;
}
192
193 /*
194 * Called when an fs doesn't support a particular vop but the vop needs to
195 * vrele, vput, or vunlock passed in vnodes.
196 */
197 int
198 genfs_eopnotsupp_rele(v)
199 void *v;
200 {
201 struct vop_generic_args /*
202 struct vnodeop_desc *a_desc;
203 / * other random data follows, presumably * /
204 } */ *ap = v;
205 struct vnodeop_desc *desc = ap->a_desc;
206 struct vnode *vp;
207 int flags, i, j, offset;
208
209 flags = desc->vdesc_flags;
210 for (i = 0; i < VDESC_MAX_VPS; flags >>=1, i++) {
211 if ((offset = desc->vdesc_vp_offsets[i]) == VDESC_NO_OFFSET)
212 break; /* stop at end of list */
213 if ((j = flags & VDESC_VP0_WILLPUT)) {
214 vp = *VOPARG_OFFSETTO(struct vnode**,offset,ap);
215 switch (j) {
216 case VDESC_VP0_WILLPUT:
217 vput(vp);
218 break;
219 case VDESC_VP0_WILLUNLOCK:
220 VOP_UNLOCK(vp, 0);
221 break;
222 case VDESC_VP0_WILLRELE:
223 vrele(vp);
224 break;
225 }
226 }
227 }
228
229 return (EOPNOTSUPP);
230 }
231
/*
 * Vnode operation that unconditionally fails with EBADF.
 * The argument is ignored.
 */
/*ARGSUSED*/
int
genfs_ebadf(void *v)
{

	return EBADF;
}
240
/*
 * Vnode operation for objects that do not support ioctl:
 * unconditionally fails with ENOTTY.  The argument is ignored.
 */
/* ARGSUSED */
int
genfs_enoioctl(void *v)
{

	return ENOTTY;
}
249
250
251 /*
252 * Eliminate all activity associated with the requested vnode
253 * and with all vnodes aliased to the requested vnode.
254 */
255 int
256 genfs_revoke(v)
257 void *v;
258 {
259 struct vop_revoke_args /* {
260 struct vnode *a_vp;
261 int a_flags;
262 } */ *ap = v;
263 struct vnode *vp, *vq;
264 struct proc *p = curproc; /* XXX */
265
266 #ifdef DIAGNOSTIC
267 if ((ap->a_flags & REVOKEALL) == 0)
268 panic("genfs_revoke: not revokeall");
269 #endif
270
271 vp = ap->a_vp;
272 simple_lock(&vp->v_interlock);
273
274 if (vp->v_flag & VALIASED) {
275 /*
276 * If a vgone (or vclean) is already in progress,
277 * wait until it is done and return.
278 */
279 if (vp->v_flag & VXLOCK) {
280 vp->v_flag |= VXWANT;
281 simple_unlock(&vp->v_interlock);
282 tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
283 return (0);
284 }
285 /*
286 * Ensure that vp will not be vgone'd while we
287 * are eliminating its aliases.
288 */
289 vp->v_flag |= VXLOCK;
290 simple_unlock(&vp->v_interlock);
291 while (vp->v_flag & VALIASED) {
292 simple_lock(&spechash_slock);
293 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
294 if (vq->v_rdev != vp->v_rdev ||
295 vq->v_type != vp->v_type || vp == vq)
296 continue;
297 simple_unlock(&spechash_slock);
298 vgone(vq);
299 break;
300 }
301 if (vq == NULLVP)
302 simple_unlock(&spechash_slock);
303 }
304 /*
305 * Remove the lock so that vgone below will
306 * really eliminate the vnode after which time
307 * vgone will awaken any sleepers.
308 */
309 simple_lock(&vp->v_interlock);
310 vp->v_flag &= ~VXLOCK;
311 }
312 vgonel(vp, p);
313 return (0);
314 }
315
316 /*
317 * Lock the node.
318 */
319 int
320 genfs_lock(v)
321 void *v;
322 {
323 struct vop_lock_args /* {
324 struct vnode *a_vp;
325 int a_flags;
326 } */ *ap = v;
327 struct vnode *vp = ap->a_vp;
328
329 return (lockmgr(&vp->v_lock, ap->a_flags, &vp->v_interlock));
330 }
331
332 /*
333 * Unlock the node.
334 */
335 int
336 genfs_unlock(v)
337 void *v;
338 {
339 struct vop_unlock_args /* {
340 struct vnode *a_vp;
341 int a_flags;
342 } */ *ap = v;
343 struct vnode *vp = ap->a_vp;
344
345 return (lockmgr(&vp->v_lock, ap->a_flags | LK_RELEASE,
346 &vp->v_interlock));
347 }
348
349 /*
350 * Return whether or not the node is locked.
351 */
352 int
353 genfs_islocked(v)
354 void *v;
355 {
356 struct vop_islocked_args /* {
357 struct vnode *a_vp;
358 } */ *ap = v;
359 struct vnode *vp = ap->a_vp;
360
361 return (lockstatus(&vp->v_lock));
362 }
363
364 /*
365 * Stubs to use when there is no locking to be done on the underlying object.
366 */
367 int
368 genfs_nolock(v)
369 void *v;
370 {
371 struct vop_lock_args /* {
372 struct vnode *a_vp;
373 int a_flags;
374 struct proc *a_p;
375 } */ *ap = v;
376
377 /*
378 * Since we are not using the lock manager, we must clear
379 * the interlock here.
380 */
381 if (ap->a_flags & LK_INTERLOCK)
382 simple_unlock(&ap->a_vp->v_interlock);
383 return (0);
384 }
385
/*
 * Unlock stub for objects that do no locking: nothing to release,
 * always returns 0.  The argument is ignored.
 */
int
genfs_nounlock(void *v)
{

	return 0;
}
392
/*
 * Lock-status stub for objects that do no locking: always returns 0.
 * The argument is ignored.
 */
int
genfs_noislocked(void *v)
{

	return 0;
}
399
/*
 * Local lease check for NFS servers.  When the kernel is built with
 * NFSSERVER, set up the arguments and let nqsrv_getlease() do the work
 * (its result is deliberately discarded); otherwise this is a null
 * operation.  Always returns 0.
 */
int
genfs_lease_check(void *v)
{
#ifdef NFSSERVER
	struct vop_lease_args /* {
		struct vnode *a_vp;
		struct proc *a_p;
		struct ucred *a_cred;
		int a_flag;
	} */ *ap = v;
	u_int32_t duration = 0;
	int cache;
	u_quad_t frev;

	(void) nqsrv_getlease(ap->a_vp, &duration, ND_CHECK | ap->a_flag,
	    NQLOCALSLP, ap->a_p, (struct mbuf *)0, &cache, &frev, ap->a_cred);
#endif /* NFSSERVER */

	return 0;
}
427
/*
 * Generic mmap vnode operation: nothing to do, always returns 0.
 * The argument is ignored.
 */
int
genfs_mmap(void *v)
{

	return 0;
}
434
435 /*
436 * generic VM getpages routine.
437 * Return PG_BUSY pages for the given range,
438 * reading from backing store if necessary.
439 */
440
441 int
442 genfs_getpages(v)
443 void *v;
444 {
445 struct vop_getpages_args /* {
446 struct vnode *a_vp;
447 voff_t a_offset;
448 struct vm_page **a_m;
449 int *a_count;
450 int a_centeridx;
451 vm_prot_t a_access_type;
452 int a_advice;
453 int a_flags;
454 } */ *ap = v;
455
456 off_t newsize, diskeof, memeof;
457 off_t offset, origoffset, startoffset, endoffset, raoffset;
458 daddr_t lbn, blkno;
459 int s, i, error, npages, orignpages, npgs, run, ridx, pidx, pcount;
460 int fs_bshift, fs_bsize, dev_bshift;
461 int flags = ap->a_flags;
462 size_t bytes, iobytes, tailbytes, totalbytes, skipbytes;
463 vaddr_t kva;
464 struct buf *bp, *mbp;
465 struct vnode *vp = ap->a_vp;
466 struct vnode *devvp;
467 struct genfs_node *gp = VTOG(vp);
468 struct uvm_object *uobj = &vp->v_uobj;
469 struct vm_page *pg, *pgs[MAX_READ_AHEAD];
470 struct ucred *cred = curproc->p_ucred; /* XXXUBC curproc */
471 boolean_t async = (flags & PGO_SYNCIO) == 0;
472 boolean_t write = (ap->a_access_type & VM_PROT_WRITE) != 0;
473 boolean_t sawhole = FALSE;
474 boolean_t overwrite = (flags & PGO_OVERWRITE) != 0;
475 UVMHIST_FUNC("genfs_getpages"); UVMHIST_CALLED(ubchist);
476
477 UVMHIST_LOG(ubchist, "vp %p off 0x%x/%x count %d",
478 vp, ap->a_offset >> 32, ap->a_offset, *ap->a_count);
479
480 /* XXXUBC temp limit */
481 if (*ap->a_count > MAX_READ_AHEAD) {
482 panic("genfs_getpages: too many pages");
483 }
484
485 error = 0;
486 origoffset = ap->a_offset;
487 orignpages = *ap->a_count;
488 GOP_SIZE(vp, vp->v_size, &diskeof);
489 if (flags & PGO_PASTEOF) {
490 newsize = MAX(vp->v_size,
491 origoffset + (orignpages << PAGE_SHIFT));
492 GOP_SIZE(vp, newsize, &memeof);
493 } else {
494 memeof = diskeof;
495 }
496 KASSERT(ap->a_centeridx >= 0 || ap->a_centeridx <= orignpages);
497 KASSERT((origoffset & (PAGE_SIZE - 1)) == 0 && origoffset >= 0);
498 KASSERT(orignpages > 0);
499
500 /*
501 * Bounds-check the request.
502 */
503
504 if (origoffset + (ap->a_centeridx << PAGE_SHIFT) >= memeof) {
505 if ((flags & PGO_LOCKED) == 0) {
506 simple_unlock(&uobj->vmobjlock);
507 }
508 UVMHIST_LOG(ubchist, "off 0x%x count %d goes past EOF 0x%x",
509 origoffset, *ap->a_count, memeof,0);
510 return EINVAL;
511 }
512
513 /*
514 * For PGO_LOCKED requests, just return whatever's in memory.
515 */
516
517 if (flags & PGO_LOCKED) {
518 uvn_findpages(uobj, origoffset, ap->a_count, ap->a_m,
519 UFP_NOWAIT|UFP_NOALLOC|UFP_NORDONLY);
520
521 return ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0;
522 }
523
524 /* vnode is VOP_LOCKed, uobj is locked */
525
526 if (write && (vp->v_flag & VONWORKLST) == 0) {
527 vn_syncer_add_to_worklist(vp, filedelay);
528 }
529
530 /*
531 * find the requested pages and make some simple checks.
532 * leave space in the page array for a whole block.
533 */
534
535 if (vp->v_type == VREG) {
536 fs_bshift = vp->v_mount->mnt_fs_bshift;
537 dev_bshift = vp->v_mount->mnt_dev_bshift;
538 } else {
539 fs_bshift = DEV_BSHIFT;
540 dev_bshift = DEV_BSHIFT;
541 }
542 fs_bsize = 1 << fs_bshift;
543
544 orignpages = MIN(orignpages,
545 round_page(memeof - origoffset) >> PAGE_SHIFT);
546 npages = orignpages;
547 startoffset = origoffset & ~(fs_bsize - 1);
548 endoffset = round_page((origoffset + (npages << PAGE_SHIFT)
549 + fs_bsize - 1) & ~(fs_bsize - 1));
550 endoffset = MIN(endoffset, round_page(memeof));
551 ridx = (origoffset - startoffset) >> PAGE_SHIFT;
552
553 memset(pgs, 0, sizeof(pgs));
554 uvn_findpages(uobj, origoffset, &npages, &pgs[ridx], UFP_ALL);
555
556 /*
557 * if the pages are already resident, just return them.
558 */
559
560 for (i = 0; i < npages; i++) {
561 struct vm_page *pg = pgs[ridx + i];
562
563 if ((pg->flags & PG_FAKE) ||
564 (write && (pg->flags & PG_RDONLY))) {
565 break;
566 }
567 }
568 if (i == npages) {
569 UVMHIST_LOG(ubchist, "returning cached pages", 0,0,0,0);
570 raoffset = origoffset + (orignpages << PAGE_SHIFT);
571 npages += ridx;
572 goto raout;
573 }
574
575 /*
576 * if PGO_OVERWRITE is set, don't bother reading the pages.
577 */
578
579 if (flags & PGO_OVERWRITE) {
580 UVMHIST_LOG(ubchist, "PGO_OVERWRITE",0,0,0,0);
581
582 for (i = 0; i < npages; i++) {
583 struct vm_page *pg = pgs[ridx + i];
584
585 pg->flags &= ~(PG_RDONLY|PG_CLEAN);
586 }
587 npages += ridx;
588 goto out;
589 }
590
591 /*
592 * the page wasn't resident and we're not overwriting,
593 * so we're going to have to do some i/o.
594 * find any additional pages needed to cover the expanded range.
595 */
596
597 npages = (endoffset - startoffset) >> PAGE_SHIFT;
598 if (startoffset != origoffset || npages != orignpages) {
599
600 /*
601 * we need to avoid deadlocks caused by locking
602 * additional pages at lower offsets than pages we
603 * already have locked. unlock them all and start over.
604 */
605
606 for (i = 0; i < orignpages; i++) {
607 struct vm_page *pg = pgs[ridx + i];
608
609 if (pg->flags & PG_FAKE) {
610 pg->flags |= PG_RELEASED;
611 }
612 }
613 uvm_page_unbusy(&pgs[ridx], orignpages);
614 memset(pgs, 0, sizeof(pgs));
615
616 UVMHIST_LOG(ubchist, "reset npages start 0x%x end 0x%x",
617 startoffset, endoffset, 0,0);
618 npgs = npages;
619 uvn_findpages(uobj, startoffset, &npgs, pgs, UFP_ALL);
620 }
621 simple_unlock(&uobj->vmobjlock);
622
623 /*
624 * read the desired page(s).
625 */
626
627 totalbytes = npages << PAGE_SHIFT;
628 bytes = MIN(totalbytes, MAX(diskeof - startoffset, 0));
629 tailbytes = totalbytes - bytes;
630 skipbytes = 0;
631
632 kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WAITOK |
633 UVMPAGER_MAPIN_READ);
634
635 s = splbio();
636 mbp = pool_get(&bufpool, PR_WAITOK);
637 splx(s);
638 mbp->b_bufsize = totalbytes;
639 mbp->b_data = (void *)kva;
640 mbp->b_resid = mbp->b_bcount = bytes;
641 mbp->b_flags = B_BUSY|B_READ| (async ? B_CALL : 0);
642 mbp->b_iodone = (async ? uvm_aio_biodone : 0);
643 mbp->b_vp = vp;
644 LIST_INIT(&mbp->b_dep);
645
646 /*
647 * if EOF is in the middle of the range, zero the part past EOF.
648 * if the page including EOF is not PG_FAKE, skip over it since
649 * in that case it has valid data that we need to preserve.
650 */
651
652 if (tailbytes > 0) {
653 size_t tailstart = bytes;
654
655 if ((pgs[bytes >> PAGE_SHIFT]->flags & PG_FAKE) == 0) {
656 tailstart = round_page(tailstart);
657 tailbytes -= tailstart - bytes;
658 }
659 UVMHIST_LOG(ubchist, "tailbytes %p 0x%x 0x%x",
660 kva, tailstart, tailbytes,0);
661 memset((void *)(kva + tailstart), 0, tailbytes);
662 }
663
664 /*
665 * now loop over the pages, reading as needed.
666 */
667
668 if (write) {
669 lockmgr(&gp->g_glock, LK_EXCLUSIVE, NULL);
670 } else {
671 lockmgr(&gp->g_glock, LK_SHARED, NULL);
672 }
673
674 bp = NULL;
675 for (offset = startoffset;
676 bytes > 0;
677 offset += iobytes, bytes -= iobytes) {
678
679 /*
680 * skip pages which don't need to be read.
681 */
682
683 pidx = (offset - startoffset) >> PAGE_SHIFT;
684 while ((pgs[pidx]->flags & (PG_FAKE|PG_RDONLY)) == 0) {
685 size_t b;
686
687 KASSERT((offset & (PAGE_SIZE - 1)) == 0);
688 b = MIN(PAGE_SIZE, bytes);
689 offset += b;
690 bytes -= b;
691 skipbytes += b;
692 pidx++;
693 UVMHIST_LOG(ubchist, "skipping, new offset 0x%x",
694 offset, 0,0,0);
695 if (bytes == 0) {
696 goto loopdone;
697 }
698 }
699
700 /*
701 * bmap the file to find out the blkno to read from and
702 * how much we can read in one i/o. if bmap returns an error,
703 * skip the rest of the top-level i/o.
704 */
705
706 lbn = offset >> fs_bshift;
707 error = VOP_BMAP(vp, lbn, &devvp, &blkno, &run);
708 if (error) {
709 UVMHIST_LOG(ubchist, "VOP_BMAP lbn 0x%x -> %d\n",
710 lbn, error,0,0);
711 skipbytes += bytes;
712 goto loopdone;
713 }
714
715 /*
716 * see how many pages can be read with this i/o.
717 * reduce the i/o size if necessary to avoid
718 * overwriting pages with valid data.
719 */
720
721 iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset,
722 bytes);
723 if (offset + iobytes > round_page(offset)) {
724 pcount = 1;
725 while (pidx + pcount < npages &&
726 pgs[pidx + pcount]->flags & PG_FAKE) {
727 pcount++;
728 }
729 iobytes = MIN(iobytes, (pcount << PAGE_SHIFT) -
730 (offset - trunc_page(offset)));
731 }
732
733 /*
734 * if this block isn't allocated, zero it instead of reading it.
735 * if this is a read access, mark the pages we zeroed PG_RDONLY.
736 */
737
738 if (blkno < 0) {
739 int holepages = (round_page(offset + iobytes) -
740 trunc_page(offset)) >> PAGE_SHIFT;
741 UVMHIST_LOG(ubchist, "lbn 0x%x -> HOLE", lbn,0,0,0);
742
743 sawhole = TRUE;
744 memset((char *)kva + (offset - startoffset), 0,
745 iobytes);
746 skipbytes += iobytes;
747
748 for (i = 0; i < holepages; i++) {
749 if (write) {
750 pgs[pidx + i]->flags &= ~PG_CLEAN;
751 } else {
752 pgs[pidx + i]->flags |= PG_RDONLY;
753 }
754 }
755 continue;
756 }
757
758 /*
759 * allocate a sub-buf for this piece of the i/o
760 * (or just use mbp if there's only 1 piece),
761 * and start it going.
762 */
763
764 if (offset == startoffset && iobytes == bytes) {
765 bp = mbp;
766 } else {
767 s = splbio();
768 bp = pool_get(&bufpool, PR_WAITOK);
769 splx(s);
770 bp->b_data = (char *)kva + offset - startoffset;
771 bp->b_resid = bp->b_bcount = iobytes;
772 bp->b_flags = B_BUSY|B_READ|B_CALL;
773 bp->b_iodone = uvm_aio_biodone1;
774 bp->b_vp = vp;
775 bp->b_proc = NULL;
776 LIST_INIT(&bp->b_dep);
777 }
778 bp->b_lblkno = 0;
779 bp->b_private = mbp;
780 if (devvp->v_type == VBLK) {
781 bp->b_dev = devvp->v_rdev;
782 }
783
784 /* adjust physical blkno for partial blocks */
785 bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >>
786 dev_bshift);
787
788 UVMHIST_LOG(ubchist, "bp %p offset 0x%x bcount 0x%x blkno 0x%x",
789 bp, offset, iobytes, bp->b_blkno);
790
791 VOP_STRATEGY(bp);
792 }
793
794 loopdone:
795 if (skipbytes) {
796 s = splbio();
797 if (error) {
798 mbp->b_flags |= B_ERROR;
799 mbp->b_error = error;
800 }
801 mbp->b_resid -= skipbytes;
802 if (mbp->b_resid == 0) {
803 biodone(mbp);
804 }
805 splx(s);
806 }
807
808 if (async) {
809 UVMHIST_LOG(ubchist, "returning 0 (async)",0,0,0,0);
810 lockmgr(&gp->g_glock, LK_RELEASE, NULL);
811 return 0;
812 }
813 if (bp != NULL) {
814 error = biowait(mbp);
815 }
816 s = splbio();
817 pool_put(&bufpool, mbp);
818 splx(s);
819 uvm_pagermapout(kva, npages);
820 raoffset = startoffset + totalbytes;
821
822 /*
823 * if this we encountered a hole then we have to do a little more work.
824 * for read faults, we marked the page PG_RDONLY so that future
825 * write accesses to the page will fault again.
826 * for write faults, we must make sure that the backing store for
827 * the page is completely allocated while the pages are locked.
828 */
829
830 if (!error && sawhole && write) {
831 for (i = 0; i < npages; i++) {
832 if (pgs[i] == NULL) {
833 continue;
834 }
835 pgs[i]->flags &= ~PG_CLEAN;
836 UVMHIST_LOG(ubchist, "mark dirty pg %p", pgs[i],0,0,0);
837 }
838 error = GOP_ALLOC(vp, startoffset, npages << PAGE_SHIFT, 0,
839 cred);
840 UVMHIST_LOG(ubchist, "gop_alloc off 0x%x/0x%x -> %d",
841 startoffset, npages << PAGE_SHIFT, error,0);
842 }
843 lockmgr(&gp->g_glock, LK_RELEASE, NULL);
844 simple_lock(&uobj->vmobjlock);
845
846 /*
847 * see if we want to start any readahead.
848 * XXXUBC for now, just read the next 128k on 64k boundaries.
849 * this is pretty nonsensical, but it is 50% faster than reading
850 * just the next 64k.
851 */
852
853 raout:
854 if (!error && !async && !write && ((int)raoffset & 0xffff) == 0 &&
855 PAGE_SHIFT <= 16) {
856 off_t rasize;
857 int racount;
858
859 /* XXXUBC temp limit, from above */
860 racount = MIN(1 << (16 - PAGE_SHIFT), MAX_READ_AHEAD);
861 rasize = racount << PAGE_SHIFT;
862 (void) VOP_GETPAGES(vp, raoffset, NULL, &racount, 0,
863 VM_PROT_READ, 0, 0);
864 simple_lock(&uobj->vmobjlock);
865
866 /* XXXUBC temp limit, from above */
867 racount = MIN(1 << (16 - PAGE_SHIFT), MAX_READ_AHEAD);
868 (void) VOP_GETPAGES(vp, raoffset + rasize, NULL, &racount, 0,
869 VM_PROT_READ, 0, 0);
870 simple_lock(&uobj->vmobjlock);
871 }
872
873 /*
874 * we're almost done! release the pages...
875 * for errors, we free the pages.
876 * otherwise we activate them and mark them as valid and clean.
877 * also, unbusy pages that were not actually requested.
878 */
879
880 if (error) {
881 for (i = 0; i < npages; i++) {
882 if (pgs[i] == NULL) {
883 continue;
884 }
885 UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x",
886 pgs[i], pgs[i]->flags, 0,0);
887 if (pgs[i]->flags & PG_FAKE) {
888 pgs[i]->flags |= PG_RELEASED;
889 }
890 }
891 uvm_lock_pageq();
892 uvm_page_unbusy(pgs, npages);
893 uvm_unlock_pageq();
894 simple_unlock(&uobj->vmobjlock);
895 UVMHIST_LOG(ubchist, "returning error %d", error,0,0,0);
896 return error;
897 }
898
899 out:
900 UVMHIST_LOG(ubchist, "succeeding, npages %d", npages,0,0,0);
901 uvm_lock_pageq();
902 for (i = 0; i < npages; i++) {
903 pg = pgs[i];
904 if (pg == NULL) {
905 continue;
906 }
907 UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x",
908 pg, pg->flags, 0,0);
909 if (pg->flags & PG_FAKE && !overwrite) {
910 pg->flags &= ~(PG_FAKE);
911 pmap_clear_modify(pgs[i]);
912 }
913 if (write) {
914 pg->flags &= ~(PG_RDONLY);
915 }
916 if (i < ridx || i >= ridx + orignpages || async) {
917 UVMHIST_LOG(ubchist, "unbusy pg %p offset 0x%x",
918 pg, pg->offset,0,0);
919 if (pg->flags & PG_WANTED) {
920 wakeup(pg);
921 }
922 if (pg->flags & PG_FAKE) {
923 KASSERT(overwrite);
924 uvm_pagezero(pg);
925 }
926 if (pg->flags & PG_RELEASED) {
927 uvm_pagefree(pg);
928 continue;
929 }
930 uvm_pageactivate(pg);
931 pg->flags &= ~(PG_WANTED|PG_BUSY|PG_FAKE);
932 UVM_PAGE_OWN(pg, NULL);
933 }
934 }
935 uvm_unlock_pageq();
936 simple_unlock(&uobj->vmobjlock);
937 if (ap->a_m != NULL) {
938 memcpy(ap->a_m, &pgs[ridx],
939 orignpages * sizeof(struct vm_page *));
940 }
941 return 0;
942 }
943
944 /*
945 * generic VM putpages routine.
946 * Write the given range of pages to backing store.
947 *
948 * => "offhi == 0" means flush all pages at or after "offlo".
949 * => object should be locked by caller. we may _unlock_ the object
950 * if (and only if) we need to clean a page (PGO_CLEANIT), or
951 * if PGO_SYNCIO is set and there are pages busy.
952 * we return with the object locked.
953 * => if PGO_CLEANIT or PGO_SYNCIO is set, we may block (due to I/O).
954 * thus, a caller might want to unlock higher level resources
955 * (e.g. vm_map) before calling flush.
956 * => if neither PGO_CLEANIT nor PGO_SYNCIO is set, then we will neither
957 * unlock the object nor block.
958 * => if PGO_ALLPAGES is set, then all pages in the object will be processed.
959 * => NOTE: we rely on the fact that the object's memq is a TAILQ and
960 * that new pages are inserted on the tail end of the list. thus,
961 * we can make a complete pass through the object in one go by starting
962 * at the head and working towards the tail (new pages are put in
963 * front of us).
964 * => NOTE: we are allowed to lock the page queues, so the caller
965 * must not be holding the page queue lock.
966 *
967 * note on "cleaning" object and PG_BUSY pages:
968 * this routine is holding the lock on the object. the only time
969 * that it can run into a PG_BUSY page that it does not own is if
970 * some other process has started I/O on the page (e.g. either
971 * a pagein, or a pageout). if the PG_BUSY page is being paged
972 * in, then it can not be dirty (!PG_CLEAN) because no one has
973 * had a chance to modify it yet. if the PG_BUSY page is being
974 * paged out then it means that someone else has already started
975 * cleaning the page for us (how nice!). in this case, if we
976 * have syncio specified, then after we make our pass through the
977 * object we need to wait for the other PG_BUSY pages to clear
978 * off (i.e. we need to do an iosync). also note that once a
979 * page is PG_BUSY it must stay in its object until it is un-busyed.
980 *
981 * note on page traversal:
982 * we can traverse the pages in an object either by going down the
983 * linked list in "uobj->memq", or we can go over the address range
984 * by page doing hash table lookups for each address. depending
985 * on how many pages are in the object it may be cheaper to do one
986 * or the other. we set "by_list" to true if we are using memq.
987 * if the cost of a hash lookup was equal to the cost of the list
988 * traversal we could compare the number of pages in the start->stop
989 * range to the total number of pages in the object. however, it
990 * seems that a hash table lookup is more expensive than the linked
991 * list traversal, so we multiply the number of pages in the
992 * range by an estimate of the relatively higher cost of the hash lookup.
993 */
994
995 int
996 genfs_putpages(v)
997 void *v;
998 {
999 struct vop_putpages_args /* {
1000 struct vnode *a_vp;
1001 voff_t a_offlo;
1002 voff_t a_offhi;
1003 int a_flags;
1004 } */ *ap = v;
1005 struct vnode *vp = ap->a_vp;
1006 struct uvm_object *uobj = &vp->v_uobj;
1007 struct simplelock *slock = &uobj->vmobjlock;
1008 off_t startoff = ap->a_offlo;
1009 off_t endoff = ap->a_offhi;
1010 off_t off;
1011 int flags = ap->a_flags;
1012 int n = MAXBSIZE >> PAGE_SHIFT;
1013 int i, s, error, npages, nback;
1014 int freeflag;
1015 struct vm_page *pgs[n], *pg, *nextpg, *tpg, curmp, endmp;
1016 boolean_t wasclean, by_list, needs_clean;
1017 boolean_t async = (flags & PGO_SYNCIO) == 0;
1018 UVMHIST_FUNC("genfs_putpages"); UVMHIST_CALLED(ubchist);
1019
1020 KASSERT(flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE));
1021 KASSERT((startoff & PAGE_MASK) == 0 && (endoff & PAGE_MASK) == 0);
1022 KASSERT(startoff < endoff || endoff == 0);
1023
1024 UVMHIST_LOG(ubchist, "vp %p pages %d off 0x%x len 0x%x",
1025 vp, uobj->uo_npages, startoff, endoff - startoff);
1026 if (uobj->uo_npages == 0) {
1027 if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL &&
1028 (vp->v_flag & VONWORKLST)) {
1029 vp->v_flag &= ~VONWORKLST;
1030 LIST_REMOVE(vp, v_synclist);
1031 }
1032 simple_unlock(slock);
1033 return 0;
1034 }
1035
1036 /*
1037 * the vnode has pages, set up to process the request.
1038 */
1039
1040 error = 0;
1041 s = splbio();
1042 wasclean = (vp->v_numoutput == 0);
1043 splx(s);
1044 off = startoff;
1045 if (endoff == 0 || flags & PGO_ALLPAGES) {
1046 endoff = trunc_page(LLONG_MAX);
1047 }
1048 by_list = (uobj->uo_npages <=
1049 ((endoff - startoff) >> PAGE_SHIFT) * UVM_PAGE_HASH_PENALTY);
1050
1051 /*
1052 * start the loop. when scanning by list, hold the last page
1053 * in the list before we start. pages allocated after we start
1054 * will be added to the end of the list, so we can stop at the
1055 * current last page.
1056 */
1057
1058 freeflag = (curproc == uvm.pagedaemon_proc) ? PG_PAGEOUT : PG_RELEASED;
1059 curmp.uobject = uobj;
1060 curmp.offset = (voff_t)-1;
1061 curmp.flags = PG_BUSY;
1062 endmp.uobject = uobj;
1063 endmp.offset = (voff_t)-1;
1064 endmp.flags = PG_BUSY;
1065 if (by_list) {
1066 pg = TAILQ_FIRST(&uobj->memq);
1067 TAILQ_INSERT_TAIL(&uobj->memq, &endmp, listq);
1068 PHOLD(curproc);
1069 } else {
1070 pg = uvm_pagelookup(uobj, off);
1071 }
1072 nextpg = NULL;
1073 while (by_list || off < endoff) {
1074 if (curproc->p_cpu->ci_schedstate.spc_flags &
1075 SPCF_SHOULDYIELD) {
1076 simple_unlock(slock);
1077 preempt(NULL);
1078 simple_lock(slock);
1079 }
1080
1081 /*
1082 * if the current page is not interesting, move on to the next.
1083 */
1084
1085 KASSERT(pg == NULL || pg->uobject == uobj);
1086 KASSERT(pg == NULL ||
1087 (pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
1088 (pg->flags & PG_BUSY) != 0);
1089 if (by_list) {
1090 if (pg == &endmp) {
1091 break;
1092 }
1093 if (pg->offset < startoff || pg->offset >= endoff ||
1094 pg->flags & (PG_RELEASED|PG_PAGEOUT)) {
1095 pg = TAILQ_NEXT(pg, listq);
1096 continue;
1097 }
1098 off = pg->offset;
1099 } else if (pg == NULL || pg->flags & (PG_RELEASED|PG_PAGEOUT)) {
1100 off += PAGE_SIZE;
1101 if (off < endoff) {
1102 pg = uvm_pagelookup(uobj, off);
1103 }
1104 continue;
1105 }
1106
1107 /*
1108 * if the current page needs to be cleaned and it's busy,
1109 * wait for it to become unbusy.
1110 */
1111
1112 if (flags & PGO_FREE) {
1113 pmap_page_protect(pg, VM_PROT_NONE);
1114 }
1115 if (flags & PGO_CLEANIT) {
1116 needs_clean = pmap_clear_modify(pg) ||
1117 (pg->flags & PG_CLEAN) == 0;
1118 pg->flags |= PG_CLEAN;
1119 } else {
1120 needs_clean = FALSE;
1121 }
1122 if (needs_clean && pg->flags & PG_BUSY) {
1123 KASSERT(curproc != uvm.pagedaemon_proc);
1124 UVMHIST_LOG(ubchist, "busy %p", pg,0,0,0);
1125 if (by_list) {
1126 TAILQ_INSERT_BEFORE(pg, &curmp, listq);
1127 UVMHIST_LOG(ubchist, "curmp next %p",
1128 TAILQ_NEXT(&curmp, listq), 0,0,0);
1129 }
1130 pg->flags |= PG_WANTED;
1131 pg->flags &= ~PG_CLEAN;
1132 UVM_UNLOCK_AND_WAIT(pg, slock, 0, "genput", 0);
1133 simple_lock(slock);
1134 if (by_list) {
1135 UVMHIST_LOG(ubchist, "after next %p",
1136 TAILQ_NEXT(&curmp, listq), 0,0,0);
1137 pg = TAILQ_NEXT(&curmp, listq);
1138 TAILQ_REMOVE(&uobj->memq, &curmp, listq);
1139 } else {
1140 pg = uvm_pagelookup(uobj, off);
1141 }
1142 continue;
1143 }
1144
1145 /*
1146 * if we're cleaning, build a cluster.
1147 * the cluster will consist of pages which are currently dirty,
1148 * but they will be returned to us marked clean.
1149 * if not cleaning, just operate on the one page.
1150 */
1151
1152 if (needs_clean) {
1153 wasclean = FALSE;
1154 memset(pgs, 0, sizeof(pgs));
1155 pg->flags |= PG_BUSY;
1156 UVM_PAGE_OWN(pg, "genfs_putpages");
1157
1158 /*
1159 * first look backward.
1160 */
1161
1162 npages = MIN(n >> 1, off >> PAGE_SHIFT);
1163 nback = npages;
1164 uvn_findpages(uobj, off - PAGE_SIZE, &nback, &pgs[0],
1165 UFP_NOWAIT|UFP_NOALLOC|UFP_DIRTYONLY|UFP_BACKWARD);
1166 if (nback) {
1167 memmove(&pgs[0], &pgs[npages - nback],
1168 nback * sizeof(pgs[0]));
1169 }
1170 n -= nback;
1171
1172 /*
1173 * then plug in our page of interest.
1174 */
1175
1176 pgs[nback] = pg;
1177
1178 /*
1179 * then look forward to fill in the remaining space in
1180 * the array of pages.
1181 */
1182
1183 npages = MIN(n, (endoff - off) >> PAGE_SHIFT) - 1;
1184 uvn_findpages(uobj, off + PAGE_SIZE, &npages,
1185 &pgs[nback + 1],
1186 UFP_NOWAIT|UFP_NOALLOC|UFP_DIRTYONLY);
1187 npages += nback + 1;
1188 } else {
1189 pgs[0] = pg;
1190 npages = 1;
1191 }
1192
1193 /*
1194 * apply FREE or DEACTIVATE options if requested.
1195 */
1196
1197 if (flags & (PGO_DEACTIVATE|PGO_FREE)) {
1198 uvm_lock_pageq();
1199 }
1200 for (i = 0; i < npages; i++) {
1201 tpg = pgs[i];
1202 KASSERT(tpg->uobject == uobj);
1203 if (flags & PGO_DEACTIVATE &&
1204 (tpg->pqflags & PQ_INACTIVE) == 0 &&
1205 tpg->wire_count == 0) {
1206 (void) pmap_clear_reference(tpg);
1207 uvm_pagedeactivate(tpg);
1208 } else if (flags & PGO_FREE) {
1209 pmap_page_protect(tpg, VM_PROT_NONE);
1210 if (tpg->flags & PG_BUSY) {
1211 tpg->flags |= freeflag;
1212 if (freeflag == PG_PAGEOUT) {
1213 uvmexp.paging++;
1214 uvm_pagedequeue(tpg);
1215 }
1216 } else {
1217 nextpg = TAILQ_NEXT(tpg, listq);
1218 uvm_pagefree(tpg);
1219 }
1220 }
1221 }
1222 if (flags & (PGO_DEACTIVATE|PGO_FREE)) {
1223 uvm_unlock_pageq();
1224 }
1225 if (needs_clean) {
1226
1227 /*
1228 * start the i/o. if we're traversing by list,
1229 * keep our place in the list with a marker page.
1230 */
1231
1232 if (by_list) {
1233 TAILQ_INSERT_AFTER(&uobj->memq, pg, &curmp,
1234 listq);
1235 }
1236 simple_unlock(slock);
1237 error = GOP_WRITE(vp, pgs, npages, flags);
1238 simple_lock(slock);
1239 if (by_list) {
1240 pg = TAILQ_NEXT(&curmp, listq);
1241 TAILQ_REMOVE(&uobj->memq, &curmp, listq);
1242 }
1243 if (error == ENOMEM) {
1244 for (i = 0; i < npages; i++) {
1245 tpg = pgs[i];
1246 if (tpg->flags & PG_PAGEOUT) {
1247 tpg->flags &= ~PG_PAGEOUT;
1248 uvmexp.paging--;
1249 }
1250 tpg->flags &= ~PG_CLEAN;
1251 uvm_pageactivate(tpg);
1252 }
1253 uvm_page_unbusy(pgs, npages);
1254 }
1255 if (error) {
1256 break;
1257 }
1258 if (by_list) {
1259 continue;
1260 }
1261 }
1262
1263 /*
1264 * find the next page and continue if there was no error.
1265 */
1266
1267 if (by_list) {
1268 if (nextpg) {
1269 pg = nextpg;
1270 nextpg = NULL;
1271 } else {
1272 pg = TAILQ_NEXT(pg, listq);
1273 }
1274 } else {
1275 off += npages << PAGE_SHIFT;
1276 if (off < endoff) {
1277 pg = uvm_pagelookup(uobj, off);
1278 }
1279 }
1280 }
1281 if (by_list) {
1282 TAILQ_REMOVE(&uobj->memq, &endmp, listq);
1283 PRELE(curproc);
1284 }
1285
1286 /*
1287 * if we're cleaning and there was nothing to clean,
1288 * take us off the syncer list. if we started any i/o
1289 * and we're doing sync i/o, wait for all writes to finish.
1290 */
1291
1292 if ((flags & PGO_CLEANIT) && wasclean &&
1293 startoff == 0 && endoff == trunc_page(LLONG_MAX) &&
1294 LIST_FIRST(&vp->v_dirtyblkhd) == NULL &&
1295 (vp->v_flag & VONWORKLST)) {
1296 vp->v_flag &= ~VONWORKLST;
1297 LIST_REMOVE(vp, v_synclist);
1298 }
1299 if (!wasclean && !async) {
1300 s = splbio();
1301 while (vp->v_numoutput != 0) {
1302 vp->v_flag |= VBWAIT;
1303 UVM_UNLOCK_AND_WAIT(&vp->v_numoutput, slock, FALSE,
1304 "genput2", 0);
1305 simple_lock(slock);
1306 }
1307 splx(s);
1308 }
1309 simple_unlock(&uobj->vmobjlock);
1310 return error;
1311 }
1312
1313 int
1314 genfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
1315 {
1316 int s, error, run;
1317 int fs_bshift, dev_bshift;
1318 vaddr_t kva;
1319 off_t eof, offset, startoffset;
1320 size_t bytes, iobytes, skipbytes;
1321 daddr_t lbn, blkno;
1322 struct vm_page *pg;
1323 struct buf *mbp, *bp;
1324 struct vnode *devvp;
1325 boolean_t async = (flags & PGO_SYNCIO) == 0;
1326 UVMHIST_FUNC("genfs_gop_write"); UVMHIST_CALLED(ubchist);
1327
1328 UVMHIST_LOG(ubchist, "vp %p pgs %p npages %d flags 0x%x",
1329 vp, pgs, npages, flags);
1330
1331 GOP_SIZE(vp, vp->v_size, &eof);
1332 if (vp->v_type == VREG) {
1333 fs_bshift = vp->v_mount->mnt_fs_bshift;
1334 dev_bshift = vp->v_mount->mnt_dev_bshift;
1335 } else {
1336 fs_bshift = DEV_BSHIFT;
1337 dev_bshift = DEV_BSHIFT;
1338 }
1339 error = 0;
1340 pg = pgs[0];
1341 startoffset = pg->offset;
1342 bytes = MIN(npages << PAGE_SHIFT, eof - startoffset);
1343 skipbytes = 0;
1344 KASSERT(bytes != 0);
1345
1346 kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WRITE |
1347 UVMPAGER_MAPIN_WAITOK);
1348
1349 s = splbio();
1350 vp->v_numoutput += 2;
1351 mbp = pool_get(&bufpool, PR_WAITOK);
1352 UVMHIST_LOG(ubchist, "vp %p mbp %p num now %d bytes 0x%x",
1353 vp, mbp, vp->v_numoutput, bytes);
1354 splx(s);
1355 mbp->b_bufsize = npages << PAGE_SHIFT;
1356 mbp->b_data = (void *)kva;
1357 mbp->b_resid = mbp->b_bcount = bytes;
1358 mbp->b_flags = B_BUSY|B_WRITE|B_AGE| (async ? (B_CALL|B_ASYNC) : 0);
1359 mbp->b_iodone = uvm_aio_biodone;
1360 mbp->b_vp = vp;
1361 LIST_INIT(&mbp->b_dep);
1362
1363 bp = NULL;
1364 for (offset = startoffset;
1365 bytes > 0;
1366 offset += iobytes, bytes -= iobytes) {
1367 lbn = offset >> fs_bshift;
1368 error = VOP_BMAP(vp, lbn, &devvp, &blkno, &run);
1369 if (error) {
1370 UVMHIST_LOG(ubchist, "VOP_BMAP() -> %d", error,0,0,0);
1371 skipbytes += bytes;
1372 bytes = 0;
1373 break;
1374 }
1375
1376 iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset,
1377 bytes);
1378 if (blkno == (daddr_t)-1) {
1379 skipbytes += iobytes;
1380 continue;
1381 }
1382
1383 /* if it's really one i/o, don't make a second buf */
1384 if (offset == startoffset && iobytes == bytes) {
1385 bp = mbp;
1386 } else {
1387 s = splbio();
1388 vp->v_numoutput++;
1389 bp = pool_get(&bufpool, PR_WAITOK);
1390 UVMHIST_LOG(ubchist, "vp %p bp %p num now %d",
1391 vp, bp, vp->v_numoutput, 0);
1392 splx(s);
1393 bp->b_data = (char *)kva +
1394 (vaddr_t)(offset - pg->offset);
1395 bp->b_resid = bp->b_bcount = iobytes;
1396 bp->b_flags = B_BUSY|B_WRITE|B_CALL|B_ASYNC;
1397 bp->b_iodone = uvm_aio_biodone1;
1398 bp->b_vp = vp;
1399 LIST_INIT(&bp->b_dep);
1400 }
1401 bp->b_lblkno = 0;
1402 bp->b_private = mbp;
1403 if (devvp->v_type == VBLK) {
1404 bp->b_dev = devvp->v_rdev;
1405 }
1406
1407 /* adjust physical blkno for partial blocks */
1408 bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >>
1409 dev_bshift);
1410 UVMHIST_LOG(ubchist, "vp %p offset 0x%x bcount 0x%x blkno 0x%x",
1411 vp, offset, bp->b_bcount, bp->b_blkno);
1412 VOP_STRATEGY(bp);
1413 }
1414 if (skipbytes) {
1415 UVMHIST_LOG(ubchist, "skipbytes %d", skipbytes, 0,0,0);
1416 s = splbio();
1417 if (error) {
1418 mbp->b_flags |= B_ERROR;
1419 mbp->b_error = error;
1420 }
1421 mbp->b_resid -= skipbytes;
1422 if (mbp->b_resid == 0) {
1423 biodone(mbp);
1424 }
1425 splx(s);
1426 }
1427 if (async) {
1428 UVMHIST_LOG(ubchist, "returning 0 (async)", 0,0,0,0);
1429 return 0;
1430 }
1431 UVMHIST_LOG(ubchist, "waiting for mbp %p", mbp,0,0,0);
1432 error = biowait(mbp);
1433 uvm_aio_aiodone(mbp);
1434 UVMHIST_LOG(ubchist, "returning, error %d", error,0,0,0);
1435 return error;
1436 }
1437
1438 /*
1439 * VOP_PUTPAGES() for vnodes which never have pages.
1440 */
1441
1442 int
1443 genfs_null_putpages(void *v)
1444 {
1445 struct vop_putpages_args /* {
1446 struct vnode *a_vp;
1447 voff_t a_offlo;
1448 voff_t a_offhi;
1449 int a_flags;
1450 } */ *ap = v;
1451 struct vnode *vp = ap->a_vp;
1452
1453 KASSERT(vp->v_uobj.uo_npages == 0);
1454 simple_unlock(&vp->v_interlock);
1455 return (0);
1456 }
1457
1458 void
1459 genfs_node_init(struct vnode *vp, struct genfs_ops *ops)
1460 {
1461 struct genfs_node *gp = VTOG(vp);
1462
1463 lockinit(&gp->g_glock, PINOD, "glock", 0, 0);
1464 gp->g_op = ops;
1465 }
1466
1467 void
1468 genfs_size(struct vnode *vp, off_t size, off_t *eobp)
1469 {
1470 int bsize;
1471
1472 bsize = 1 << vp->v_mount->mnt_fs_bshift;
1473 *eobp = (size + bsize - 1) & ~(bsize - 1);
1474 }
1475
1476 int
1477 genfs_compat_getpages(void *v)
1478 {
1479 struct vop_getpages_args /* {
1480 struct vnode *a_vp;
1481 voff_t a_offset;
1482 struct vm_page **a_m;
1483 int *a_count;
1484 int a_centeridx;
1485 vm_prot_t a_access_type;
1486 int a_advice;
1487 int a_flags;
1488 } */ *ap = v;
1489
1490 off_t origoffset;
1491 struct vnode *vp = ap->a_vp;
1492 struct uvm_object *uobj = &vp->v_uobj;
1493 struct vm_page *pg, **pgs;
1494 vaddr_t kva;
1495 int i, error, orignpages, npages;
1496 struct iovec iov;
1497 struct uio uio;
1498 struct ucred *cred = curproc->p_ucred;
1499 boolean_t write = (ap->a_access_type & VM_PROT_WRITE) != 0;
1500
1501 error = 0;
1502 origoffset = ap->a_offset;
1503 orignpages = *ap->a_count;
1504 pgs = ap->a_m;
1505
1506 if (write && (vp->v_flag & VONWORKLST) == 0) {
1507 vn_syncer_add_to_worklist(vp, filedelay);
1508 }
1509 if (ap->a_flags & PGO_LOCKED) {
1510 uvn_findpages(uobj, origoffset, ap->a_count, ap->a_m,
1511 UFP_NOWAIT|UFP_NOALLOC|UFP_NORDONLY);
1512
1513 return ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0;
1514 }
1515 if (origoffset + (ap->a_centeridx << PAGE_SHIFT) >= vp->v_size) {
1516 simple_unlock(&uobj->vmobjlock);
1517 return EINVAL;
1518 }
1519 npages = orignpages;
1520 uvn_findpages(uobj, origoffset, &npages, pgs, UFP_ALL);
1521 simple_unlock(&uobj->vmobjlock);
1522 kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WAITOK |
1523 UVMPAGER_MAPIN_READ);
1524 for (i = 0; i < npages; i++) {
1525 pg = pgs[i];
1526 if ((pg->flags & PG_FAKE) == 0) {
1527 continue;
1528 }
1529 iov.iov_base = (char *)kva + (i << PAGE_SHIFT);
1530 iov.iov_len = PAGE_SIZE;
1531 uio.uio_iov = &iov;
1532 uio.uio_iovcnt = 1;
1533 uio.uio_offset = origoffset + (i << PAGE_SHIFT);
1534 uio.uio_segflg = UIO_SYSSPACE;
1535 uio.uio_rw = UIO_READ;
1536 uio.uio_resid = PAGE_SIZE;
1537 uio.uio_procp = curproc;
1538 error = VOP_READ(vp, &uio, 0, cred);
1539 if (error) {
1540 break;
1541 }
1542 }
1543 uvm_pagermapout(kva, npages);
1544 simple_lock(&uobj->vmobjlock);
1545 uvm_lock_pageq();
1546 for (i = 0; i < npages; i++) {
1547 pg = pgs[i];
1548 if (error && (pg->flags & PG_FAKE) != 0) {
1549 pg->flags |= PG_RELEASED;
1550 } else {
1551 pmap_clear_modify(pg);
1552 uvm_pageactivate(pg);
1553 }
1554 }
1555 if (error) {
1556 uvm_page_unbusy(pgs, npages);
1557 }
1558 uvm_unlock_pageq();
1559 simple_unlock(&uobj->vmobjlock);
1560 return error;
1561 }
1562
1563 int
1564 genfs_compat_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
1565 int flags)
1566 {
1567 off_t offset;
1568 struct iovec iov;
1569 struct uio uio;
1570 struct ucred *cred = curproc->p_ucred;
1571 struct buf *bp;
1572 vaddr_t kva;
1573 int s, error;
1574
1575 offset = pgs[0]->offset;
1576 kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WRITE |
1577 UVMPAGER_MAPIN_WAITOK);
1578
1579 iov.iov_base = (void *)kva;
1580 iov.iov_len = npages << PAGE_SHIFT;
1581 uio.uio_iov = &iov;
1582 uio.uio_iovcnt = npages;
1583 uio.uio_offset = offset;
1584 uio.uio_segflg = UIO_SYSSPACE;
1585 uio.uio_rw = UIO_WRITE;
1586 uio.uio_resid = npages << PAGE_SHIFT;
1587 uio.uio_procp = curproc;
1588 error = VOP_WRITE(vp, &uio, 0, cred);
1589
1590 s = splbio();
1591 vp->v_numoutput++;
1592 bp = pool_get(&bufpool, PR_WAITOK);
1593 splx(s);
1594
1595 bp->b_flags = B_BUSY | B_WRITE | B_AGE;
1596 bp->b_vp = vp;
1597 bp->b_lblkno = offset >> vp->v_mount->mnt_fs_bshift;
1598 bp->b_data = (char *)kva;
1599 bp->b_bcount = npages << PAGE_SHIFT;
1600 bp->b_bufsize = npages << PAGE_SHIFT;
1601 bp->b_resid = 0;
1602 LIST_INIT(&bp->b_dep);
1603 if (error) {
1604 bp->b_flags |= B_ERROR;
1605 bp->b_error = error;
1606 }
1607 uvm_aio_aiodone(bp);
1608 return error;
1609 }
1610