/*	$NetBSD: nfs_bio.c,v 1.31 1997/04/20 16:24:44 fvdl Exp $	*/

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/trace.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/namei.h>

#include <vm/vm.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>
#include <nfs/nfs_var.h>

extern int nfs_numasync;
extern struct nfsstats nfsstats;

/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred)
    register struct vnode *vp;
    register struct uio *uio;
    int ioflag;
    struct ucred *cred;
{
    register struct nfsnode *np = VTONFS(vp);
    register int biosize, diff, i;
    struct buf *bp = NULL, *rabp;
    struct vattr vattr;
    struct proc *p;
    struct nfsmount *nmp = VFSTONFS(vp->v_mount);
    daddr_t lbn, bn, rabn;
    caddr_t baddr;
    int got_buf = 0, nra, error = 0, n = 0, on = 0, not_readin;

#ifdef DIAGNOSTIC
    if (uio->uio_rw != UIO_READ)
        panic("nfs_read mode");
#endif
    if (uio->uio_resid == 0)
        return (0);
    if (uio->uio_offset < 0)
        return (EINVAL);
    p = uio->uio_procp;
    if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
        (void)nfs_fsinfo(nmp, vp, cred, p);
    biosize = nmp->nm_rsize;
    /*
     * For nfs, cache consistency can only be maintained approximately.
     * Although RFC1094 does not specify the criteria, the following is
     * believed to be compatible with the reference port.
     * For nqnfs, full cache consistency is maintained within the loop.
     * For nfs:
     * If the file's modify time on the server has changed since the
     * last read rpc, or you have written to the file, you may have lost
     * data cache consistency with the server, so flush all of the file's
     * data out of the cache.  Then force a getattr rpc to ensure that
     * you have up-to-date attributes.
     * NB: This implies that cache data can be read when up to
     * NFS_ATTRTIMEO seconds out of date.  If you need current attributes,
     * force a fetch by setting n_attrstamp to 0 before the VOP_GETATTR()
     * call.
     */
    if ((nmp->nm_flag & NFSMNT_NQNFS) == 0 && vp->v_type != VLNK) {
        if (np->n_flag & NMODIFIED) {
            if (vp->v_type != VREG) {
                if (vp->v_type != VDIR)
                    panic("nfs: bioread, not dir");
                nfs_invaldir(vp);
                error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                if (error)
                    return (error);
            }
            np->n_attrstamp = 0;
            error = VOP_GETATTR(vp, &vattr, cred, p);
            if (error)
                return (error);
            np->n_mtime = vattr.va_mtime.tv_sec;
        } else {
            error = VOP_GETATTR(vp, &vattr, cred, p);
            if (error)
                return (error);
            if (np->n_mtime != vattr.va_mtime.tv_sec) {
                if (vp->v_type == VDIR)
                    nfs_invaldir(vp);
                error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                if (error)
                    return (error);
                np->n_mtime = vattr.va_mtime.tv_sec;
            }
        }
    }
    do {

        /*
         * Get a valid lease. If cached data is stale, flush it.
         */
        if (nmp->nm_flag & NFSMNT_NQNFS) {
            if (NQNFS_CKINVALID(vp, np, ND_READ)) {
                do {
                    error = nqnfs_getlease(vp, ND_READ, cred, p);
                } while (error == NQNFS_EXPIRED);
                if (error)
                    return (error);
                if (np->n_lrev != np->n_brev ||
                    (np->n_flag & NQNFSNONCACHE) ||
                    ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
                    if (vp->v_type == VDIR)
                        nfs_invaldir(vp);
                    error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                    if (error)
                        return (error);
                    np->n_brev = np->n_lrev;
                }
            } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
                nfs_invaldir(vp);
                error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                if (error)
                    return (error);
            }
        }
        /*
         * Don't cache symlinks.
         */
        if (np->n_flag & NQNFSNONCACHE
            || ((vp->v_flag & VROOT) && vp->v_type == VLNK)) {
            switch (vp->v_type) {
            case VREG:
                return (nfs_readrpc(vp, uio, cred));
            case VLNK:
                return (nfs_readlinkrpc(vp, uio, cred));
            case VDIR:
                break;
            default:
                printf(" NQNFSNONCACHE: type %x unexpected\n",
                    vp->v_type);
            }
        }
        baddr = (caddr_t)0;
        switch (vp->v_type) {
        case VREG:
            nfsstats.biocache_reads++;
            lbn = uio->uio_offset / biosize;
            on = uio->uio_offset & (biosize - 1);
            bn = lbn * (biosize / DEV_BSIZE);
            not_readin = 1;
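
            /*
             * The logical block number (lbn), the offset within that
             * block (on) and the buffer cache block number (bn, in
             * DEV_BSIZE units) are all derived from the file offset.
             * For example, with a biosize of 8192 and DEV_BSIZE of 512,
             * uio_offset 20000 gives lbn 2, on 3616 and bn 32.  Note
             * that biosize is assumed to be a power of two here, since
             * "on" is computed by masking.
             */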

            /*
             * Start the read ahead(s), as required.
             */
            if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
                for (nra = 0; nra < nmp->nm_readahead &&
                    (lbn + 1 + nra) * biosize < np->n_size; nra++) {
                    rabn = (lbn + 1 + nra) * (biosize / DEV_BSIZE);
                    if (!incore(vp, rabn)) {
                        rabp = nfs_getcacheblk(vp, rabn, biosize, p);
                        if (!rabp)
                            return (EINTR);
                        if ((rabp->b_flags & (B_DELWRI | B_DONE)) == 0) {
                            rabp->b_flags |= (B_READ | B_ASYNC);
                            if (nfs_asyncio(rabp, cred)) {
                                rabp->b_flags |= B_INVAL;
                                brelse(rabp);
                            }
                        } else
                            brelse(rabp);
                    }
                }
            }
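
            /*
             * Note that when nfs_asyncio() fails (no nfsiod available)
             * the read-ahead buffer is simply invalidated and released;
             * read-ahead is only a hint, so nothing is read
             * synchronously on its behalf.
             */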

            /*
             * If the block is in the cache and has the required data
             * in a valid region, just copy it out.
             * Otherwise, get the block and write back/read in,
             * as required.
             */
            if ((bp = incore(vp, bn)) &&
                (bp->b_flags & (B_BUSY | B_WRITEINPROG)) ==
                (B_BUSY | B_WRITEINPROG))
                got_buf = 0;
            else {
again:
                bp = nfs_getcacheblk(vp, bn, biosize, p);
                if (!bp)
                    return (EINTR);
                got_buf = 1;
                if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
                    bp->b_flags |= B_READ;
                    not_readin = 0;
                    error = nfs_doio(bp, cred, p);
                    if (error) {
                        brelse(bp);
                        return (error);
                    }
                }
            }
            n = min((unsigned)(biosize - on), uio->uio_resid);
            diff = np->n_size - uio->uio_offset;
            if (diff < n)
                n = diff;
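
            /*
             * If the block was not read in above (not_readin is still
             * set), its valid region may not cover the bytes wanted.
             * In that case push any dirty data back to the server and
             * loop to re-fetch the whole block with a read rpc.
             */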
            if (not_readin && n > 0) {
                if (on < bp->b_validoff || (on + n) > bp->b_validend) {
                    if (!got_buf) {
                        bp = nfs_getcacheblk(vp, bn, biosize, p);
                        if (!bp)
                            return (EINTR);
                        got_buf = 1;
                    }
                    bp->b_flags |= B_INVAFTERWRITE;
                    if (bp->b_dirtyend > 0) {
                        if ((bp->b_flags & B_DELWRI) == 0)
                            panic("nfsbioread");
                        if (VOP_BWRITE(bp) == EINTR)
                            return (EINTR);
                    } else
                        brelse(bp);
                    goto again;
                }
            }
            vp->v_lastr = lbn;
            diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
            if (diff < n)
                n = diff;
            break;
        case VLNK:
            nfsstats.biocache_readlinks++;
            bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
            if (!bp)
                return (EINTR);
            if ((bp->b_flags & B_DONE) == 0) {
                bp->b_flags |= B_READ;
                error = nfs_doio(bp, cred, p);
                if (error) {
                    brelse(bp);
                    return (error);
                }
            }
            n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
            got_buf = 1;
            on = 0;
            break;
        case VDIR:
            if (uio->uio_resid < NFS_READDIRBLKSIZ)
                return (0);
            nfsstats.biocache_readdirs++;
            lbn = uio->uio_offset / NFS_DIRBLKSIZ;
            on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
            bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
            if (!bp)
                return (EINTR);
            if ((bp->b_flags & B_DONE) == 0) {
                bp->b_flags |= B_READ;
                error = nfs_doio(bp, cred, p);
                if (error) {
                    brelse(bp);
                    while (error == NFSERR_BAD_COOKIE) {
                        nfs_invaldir(vp);
                        error = nfs_vinvalbuf(vp, 0, cred, p, 1);
                        /*
                         * Yuck! The directory has been modified on the
                         * server. The only way to get the block is by
                         * reading from the beginning to get all the
                         * offset cookies.
                         */
                        for (i = 0; i <= lbn && !error; i++) {
                            bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
                            if (!bp)
                                return (EINTR);
                            if ((bp->b_flags & B_DONE) == 0) {
                                bp->b_flags |= B_READ;
                                error = nfs_doio(bp, cred, p);
                                if (error)
                                    brelse(bp);
                            }
                        }
                    }
                    if (error)
                        return (error);
                }
            }

            /*
             * If not eof and read aheads are enabled, start one.
             * (You need the current block first, so that you have the
             * directory offset cookie of the next block.)
             */
            if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
                (np->n_direofoffset == 0 ||
                (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
                !(np->n_flag & NQNFSNONCACHE) &&
                !incore(vp, lbn + 1)) {
                rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
                if (rabp) {
                    if ((rabp->b_flags & (B_DONE | B_DELWRI)) == 0) {
                        rabp->b_flags |= (B_READ | B_ASYNC);
                        if (nfs_asyncio(rabp, cred)) {
                            rabp->b_flags |= B_INVAL;
                            brelse(rabp);
                        }
                    } else
                        brelse(rabp);
                }
            }
            n = min(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
            got_buf = 1;
            break;
        default:
            printf(" nfsbioread: type %x unexpected\n", vp->v_type);
            break;
        }

        if (n > 0) {
            if (!baddr)
                baddr = bp->b_data;
            error = uiomove(baddr + on, (int)n, uio);
        }
        switch (vp->v_type) {
        case VREG:
            break;
        case VLNK:
            n = 0;
            break;
        case VDIR:
            if (np->n_flag & NQNFSNONCACHE)
                bp->b_flags |= B_INVAL;
            break;
        default:
            printf(" nfsbioread: type %x unexpected\n", vp->v_type);
        }
        if (got_buf)
            brelse(bp);
    } while (error == 0 && uio->uio_resid > 0 && n > 0);
    return (error);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(v)
    void *v;
{
    struct vop_write_args /* {
        struct vnode *a_vp;
        struct uio *a_uio;
        int a_ioflag;
        struct ucred *a_cred;
    } */ *ap = v;
    register int biosize;
    register struct uio *uio = ap->a_uio;
    struct proc *p = uio->uio_procp;
    register struct vnode *vp = ap->a_vp;
    struct nfsnode *np = VTONFS(vp);
    register struct ucred *cred = ap->a_cred;
    int ioflag = ap->a_ioflag;
    struct buf *bp;
    struct vattr vattr;
    struct nfsmount *nmp = VFSTONFS(vp->v_mount);
    daddr_t lbn, bn;
    int n, on, error = 0, iomode, must_commit;

#ifdef DIAGNOSTIC
    if (uio->uio_rw != UIO_WRITE)
        panic("nfs_write mode");
    if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
        panic("nfs_write proc");
#endif
    if (vp->v_type != VREG)
        return (EIO);
    if (np->n_flag & NWRITEERR) {
        np->n_flag &= ~NWRITEERR;
        return (np->n_error);
    }
    if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
        (void)nfs_fsinfo(nmp, vp, cred, p);
    if (ioflag & (IO_APPEND | IO_SYNC)) {
        if (np->n_flag & NMODIFIED) {
            np->n_attrstamp = 0;
            error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
            if (error)
                return (error);
        }
        if (ioflag & IO_APPEND) {
            np->n_attrstamp = 0;
            error = VOP_GETATTR(vp, &vattr, cred, p);
            if (error)
                return (error);
            uio->uio_offset = np->n_size;
        }
    }
    if (uio->uio_offset < 0)
        return (EINVAL);
    if (uio->uio_resid == 0)
        return (0);
    /*
     * Maybe this should be above the vnode op call, but so long as
     * file servers have no limits, I don't think it matters.
     */
    if (p && uio->uio_offset + uio->uio_resid >
        p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
        psignal(p, SIGXFSZ);
        return (EFBIG);
    }
    /*
     * I use nm_rsize, not nm_wsize, so that all buffer cache blocks
     * will be the same size within a filesystem.  nfs_writerpc will
     * still use nm_wsize when sizing the rpcs.
     */
    biosize = nmp->nm_rsize;
    do {

        /*
         * XXX make sure we aren't cached in the VM page cache
         */
        (void)vnode_pager_uncache(vp);

        /*
         * Check for a valid write lease.
         */
        if ((nmp->nm_flag & NFSMNT_NQNFS) &&
            NQNFS_CKINVALID(vp, np, ND_WRITE)) {
            do {
                error = nqnfs_getlease(vp, ND_WRITE, cred, p);
            } while (error == NQNFS_EXPIRED);
            if (error)
                return (error);
            if (np->n_lrev != np->n_brev ||
                (np->n_flag & NQNFSNONCACHE)) {
                error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                if (error)
                    return (error);
                np->n_brev = np->n_lrev;
            }
        }
        if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
            iomode = NFSV3WRITE_FILESYNC;
            error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
            if (must_commit)
                nfs_clearcommit(vp->v_mount);
            return (error);
        }
        nfsstats.biocache_writes++;
        lbn = uio->uio_offset / biosize;
        on = uio->uio_offset & (biosize - 1);
        n = min((unsigned)(biosize - on), uio->uio_resid);
        bn = lbn * (biosize / DEV_BSIZE);
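
        /*
         * Same block arithmetic as in nfs_bioread() above.  Note that
         * n is clipped to what remains of the current block, so a
         * write that spans block boundaries is handled one block per
         * loop iteration.
         */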
again:
        bp = nfs_getcacheblk(vp, bn, biosize, p);
        if (!bp)
            return (EINTR);
        if (bp->b_wcred == NOCRED) {
            crhold(cred);
            bp->b_wcred = cred;
        }
        np->n_flag |= NMODIFIED;
        if (uio->uio_offset + n > np->n_size) {
            np->n_size = uio->uio_offset + n;
            vnode_pager_setsize(vp, (u_long)np->n_size);
        }

        /*
         * If the new write will leave a contiguous dirty
         * area, just update the b_dirtyoff and b_dirtyend,
         * otherwise force a write rpc of the old dirty area.
         */
        if (bp->b_dirtyend > 0 &&
            (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
            bp->b_proc = p;
            if (VOP_BWRITE(bp) == EINTR)
                return (EINTR);
            goto again;
        }
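
        /*
         * For example, with an existing dirty region of [512, 1024) a
         * write at on == 1024 abuts it and simply extends the region
         * below, while a write at on == 2048 leaves a hole and forces
         * the old dirty area out first.  Only a single contiguous
         * dirty region per buffer is tracked.
         */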

        /*
         * Check for valid write lease and get one as required.
         * In case getblk() and/or bwrite() delayed us.
         */
        if ((nmp->nm_flag & NFSMNT_NQNFS) &&
            NQNFS_CKINVALID(vp, np, ND_WRITE)) {
            do {
                error = nqnfs_getlease(vp, ND_WRITE, cred, p);
            } while (error == NQNFS_EXPIRED);
            if (error) {
                brelse(bp);
                return (error);
            }
            if (np->n_lrev != np->n_brev ||
                (np->n_flag & NQNFSNONCACHE)) {
                brelse(bp);
                error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                if (error)
                    return (error);
                np->n_brev = np->n_lrev;
                goto again;
            }
        }
        error = uiomove((char *)bp->b_data + on, n, uio);
        if (error) {
            bp->b_flags |= B_ERROR;
            brelse(bp);
            return (error);
        }
        if (bp->b_dirtyend > 0) {
            bp->b_dirtyoff = min(on, bp->b_dirtyoff);
            bp->b_dirtyend = max((on + n), bp->b_dirtyend);
        } else {
            bp->b_dirtyoff = on;
            bp->b_dirtyend = on + n;
        }
        if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
            bp->b_validoff > bp->b_dirtyend) {
            bp->b_validoff = bp->b_dirtyoff;
            bp->b_validend = bp->b_dirtyend;
        } else {
            bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
            bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
        }
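
        /*
         * The valid region is widened to cover the dirty region; e.g.
         * merging dirty [1024, 2048) into valid [0, 1024) yields valid
         * [0, 2048).  If the old valid region is disjoint from the new
         * dirty bytes it is discarded rather than merged, since a
         * single [validoff, validend) range cannot describe two
         * separated extents.
         */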

        /*
         * Since this block is being modified, it must be written
         * again and not just committed.
         */
        bp->b_flags &= ~B_NEEDCOMMIT;

        /*
         * If the lease is non-cacheable or IO_SYNC was requested,
         * do bwrite().
         */
        if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
            bp->b_proc = p;
            error = VOP_BWRITE(bp);
            if (error)
                return (error);
            if (np->n_flag & NQNFSNONCACHE) {
                error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
                if (error)
                    return (error);
            }
        } else if ((n + on) == biosize &&
            (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
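            /*
             * The block was filled completely, so push it out
             * asynchronously; partially filled blocks fall through to
             * bdwrite() below so later writes can coalesce with them
             * before an rpc is needed.
             */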
            bp->b_proc = (struct proc *)0;
            bp->b_flags |= B_ASYNC;
            (void)nfs_writebp(bp, 0);
        } else {
            bdwrite(bp);
        }
    } while (uio->uio_resid > 0 && n > 0);
    return (0);
}

/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
struct buf *
nfs_getcacheblk(vp, bn, size, p)
    struct vnode *vp;
    daddr_t bn;
    int size;
    struct proc *p;
{
    register struct buf *bp;
    struct nfsmount *nmp = VFSTONFS(vp->v_mount);

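    /*
     * On an interruptible mount the first getblk() sleeps with
     * PCATCH, so it comes back NULL when a signal arrives; if the
     * signal is one nfs_sigintr() cares about, give up, otherwise
     * retry with a 2 second timeout instead.
     */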
    if (nmp->nm_flag & NFSMNT_INT) {
        bp = getblk(vp, bn, size, PCATCH, 0);
        while (bp == (struct buf *)0) {
            if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
                return ((struct buf *)0);
            bp = getblk(vp, bn, size, 0, 2 * hz);
        }
    } else
        bp = getblk(vp, bn, size, 0, 0);
    return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
    struct vnode *vp;
    int flags;
    struct ucred *cred;
    struct proc *p;
    int intrflg;
{
    register struct nfsnode *np = VTONFS(vp);
    struct nfsmount *nmp = VFSTONFS(vp->v_mount);
    int error = 0, slpflag, slptimeo;

    if ((nmp->nm_flag & NFSMNT_INT) == 0)
        intrflg = 0;
    if (intrflg) {
        slpflag = PCATCH;
        slptimeo = 2 * hz;
    } else {
        slpflag = 0;
        slptimeo = 0;
    }
    /*
     * First wait for any other process doing a flush to complete.
     */
    while (np->n_flag & NFLUSHINPROG) {
        np->n_flag |= NFLUSHWANT;
        error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
            slptimeo);
        if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
            return (EINTR);
    }

    /*
     * Now, flush as required.
     */
    np->n_flag |= NFLUSHINPROG;
    error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
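    /*
     * If vinvalbuf() fails, either bail out (a signal arrived on an
     * interruptible mount) or retry without PCATCH but with a
     * timeout, so the flush eventually completes even when it cannot
     * be interrupted.
     */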
    while (error) {
        if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
            np->n_flag &= ~NFLUSHINPROG;
            if (np->n_flag & NFLUSHWANT) {
                np->n_flag &= ~NFLUSHWANT;
                wakeup((caddr_t)&np->n_flag);
            }
            return (EINTR);
        }
        error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
    }
    np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
    if (np->n_flag & NFLUSHWANT) {
        np->n_flag &= ~NFLUSHWANT;
        wakeup((caddr_t)&np->n_flag);
    }
    return (0);
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
    register struct buf *bp;
    struct ucred *cred;
{
    register int i;
    register struct nfsmount *nmp;
    int gotiod, slpflag = 0, slptimeo = 0, error;

    if (nfs_numasync == 0)
        return (EIO);

    nmp = VFSTONFS(bp->b_vp->v_mount);
again:
    if (nmp->nm_flag & NFSMNT_INT)
        slpflag = PCATCH;
    gotiod = FALSE;

    /*
     * Find a free iod to process this request.
     */
    for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
        if (nfs_iodwant[i]) {
            /*
             * Found one, so wake it up and tell it which
             * mount to process.
             */
            nfs_iodwant[i] = (struct proc *)0;
            nfs_iodmount[i] = nmp;
            nmp->nm_bufqiods++;
            wakeup((caddr_t)&nfs_iodwant[i]);
            gotiod = TRUE;
            break;
        }
    /*
     * If none are free, we may already have an iod working on this mount
     * point. If so, it will process our request.
     */
    if (!gotiod && nmp->nm_bufqiods > 0)
        gotiod = TRUE;

    /*
     * If we have an iod which can process the request, then queue
     * the buffer.
     */
    if (gotiod) {
        /*
         * Ensure that the queue never grows too large.
         */
        while (nmp->nm_bufqlen >= 2 * nfs_numasync) {
            nmp->nm_bufqwant = TRUE;
            error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
                "nfsaio", slptimeo);
            if (error) {
                if (nfs_sigintr(nmp, NULL, bp->b_proc))
                    return (EINTR);
                if (slpflag == PCATCH) {
                    slpflag = 0;
                    slptimeo = 2 * hz;
                }
            }
            /*
             * We might have lost our iod while sleeping,
             * so check and loop if necessary.
             */
            if (nmp->nm_bufqiods == 0)
                goto again;
        }

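        /*
         * Stash the credential in the buffer so the iod that
         * eventually performs the I/O can use it for the rpc.
         */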
        if (bp->b_flags & B_READ) {
            if (bp->b_rcred == NOCRED && cred != NOCRED) {
                crhold(cred);
                bp->b_rcred = cred;
            }
        } else {
            bp->b_flags |= B_WRITEINPROG;
            if (bp->b_wcred == NOCRED && cred != NOCRED) {
                crhold(cred);
                bp->b_wcred = cred;
            }
        }

        TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
        nmp->nm_bufqlen++;
        return (0);
    }

    /*
     * All the iods are busy on other mounts, so return EIO to
     * force the caller to process the i/o synchronously.
     */
    return (EIO);
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
    register struct buf *bp;
    struct ucred *cr;
    struct proc *p;
{
    register struct uio *uiop;
    register struct vnode *vp;
    struct nfsnode *np;
    struct nfsmount *nmp;
    int error = 0, diff, len, iomode, must_commit = 0;
    struct uio uio;
    struct iovec io;

    vp = bp->b_vp;
    np = VTONFS(vp);
    nmp = VFSTONFS(vp->v_mount);
    uiop = &uio;
    uiop->uio_iov = &io;
    uiop->uio_iovcnt = 1;
    uiop->uio_segflg = UIO_SYSSPACE;
    uiop->uio_procp = p;

    /*
     * Historically, paging was done with physio, but no more...
     */
    if (bp->b_flags & B_PHYS) {
        /*
         * ...though reading /dev/drum still gets us here.
         */
        io.iov_len = uiop->uio_resid = bp->b_bcount;
        /* mapping was done by vmapbuf() */
        io.iov_base = bp->b_data;
        uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
        if (bp->b_flags & B_READ) {
            uiop->uio_rw = UIO_READ;
            nfsstats.read_physios++;
            error = nfs_readrpc(vp, uiop, cr);
        } else {
            iomode = NFSV3WRITE_DATASYNC;
            uiop->uio_rw = UIO_WRITE;
            nfsstats.write_physios++;
            error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
        }
        if (error) {
            bp->b_flags |= B_ERROR;
            bp->b_error = error;
        }
    } else if (bp->b_flags & B_READ) {
        io.iov_len = uiop->uio_resid = bp->b_bcount;
        io.iov_base = bp->b_data;
        uiop->uio_rw = UIO_READ;
        switch (vp->v_type) {
        case VREG:
            uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
            nfsstats.read_bios++;
            error = nfs_readrpc(vp, uiop, cr);
            if (!error) {
                bp->b_validoff = 0;
                if (uiop->uio_resid) {
                    /*
                     * The read came up short.  If the file extends
                     * beyond the point where it stopped (len > 0
                     * below), there is a hole in the file and no
                     * writes after the hole have been pushed to the
                     * server yet.
                     * Just zero fill the rest of the valid area.
                     */
                    diff = bp->b_bcount - uiop->uio_resid;
                    len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
                        + diff);
                    if (len > 0) {
                        len = min(len, uiop->uio_resid);
                        bzero((char *)bp->b_data + diff, len);
                        bp->b_validend = diff + len;
                    } else
                        bp->b_validend = diff;
                } else
                    bp->b_validend = bp->b_bcount;
            }
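
            /*
             * Example: an 8192 byte buffer for which the server
             * returns only 6000 bytes of a larger file leaves
             * uio_resid == 2192; the hole bytes are zeroed and
             * b_validend becomes 8192 (or less, if the file ends
             * inside the buffer).
             */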
            if (p && (vp->v_flag & VTEXT) &&
                (((nmp->nm_flag & NFSMNT_NQNFS) &&
                NQNFS_CKINVALID(vp, np, ND_READ) &&
                np->n_lrev != np->n_brev) ||
                (!(nmp->nm_flag & NFSMNT_NQNFS) &&
                np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
                uprintf("Process killed due to text file modification\n");
                psignal(p, SIGKILL);
                p->p_holdcnt++;
            }
            break;
        case VLNK:
            uiop->uio_offset = (off_t)0;
            nfsstats.readlink_bios++;
            error = nfs_readlinkrpc(vp, uiop, cr);
            break;
        case VDIR:
            nfsstats.readdir_bios++;
            uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
            if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
                error = nfs_readdirplusrpc(vp, uiop, cr);
                if (error == NFSERR_NOTSUPP)
                    nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
            }
            if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
                error = nfs_readdirrpc(vp, uiop, cr);
            break;
        default:
            printf("nfs_doio: type %x unexpected\n", vp->v_type);
            break;
        }
        if (error) {
            bp->b_flags |= B_ERROR;
            bp->b_error = error;
        }
    } else {
        io.iov_len = uiop->uio_resid = bp->b_dirtyend
            - bp->b_dirtyoff;
        uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
            + bp->b_dirtyoff;
        io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
        uiop->uio_rw = UIO_WRITE;
        nfsstats.write_bios++;
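        /*
         * Async writes may go out NFSV3WRITE_UNSTABLE, letting the
         * server reply before the data reaches stable storage; such
         * writes must be committed later (see B_NEEDCOMMIT below).
         * Everything else is written FILESYNC.
         */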
        if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == B_ASYNC)
            iomode = NFSV3WRITE_UNSTABLE;
        else
            iomode = NFSV3WRITE_FILESYNC;
        bp->b_flags |= B_WRITEINPROG;
#ifdef fvdl_debug
        printf("nfs_doio(%x): bp %x doff %d dend %d\n",
            vp, bp, bp->b_dirtyoff, bp->b_dirtyend);
#endif
        error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
        if (!error && iomode == NFSV3WRITE_UNSTABLE)
            bp->b_flags |= B_NEEDCOMMIT;
        else
            bp->b_flags &= ~B_NEEDCOMMIT;
        bp->b_flags &= ~B_WRITEINPROG;

        /*
         * For an interrupted write, the buffer is still valid and the
         * write hasn't been pushed to the server yet, so we can't set
         * B_ERROR; instead, report the interruption by setting B_EINTR.
         * For the B_ASYNC case, B_EINTR is not relevant, so the rpc
         * attempt is essentially a noop.
         * For the case of a V3 write rpc not being committed to stable
         * storage, the block is still dirty and requires either a commit
         * rpc or another write rpc with iomode == NFSV3WRITE_FILESYNC
         * before the block is reused. This is indicated by setting the
         * B_DELWRI and B_NEEDCOMMIT flags.
         */
        if (error == EINTR || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
            bp->b_flags |= B_DELWRI;

            /*
             * Since for the B_ASYNC case, nfs_bwrite() has reassigned the
             * buffer to the clean list, we have to reassign it back to the
             * dirty one. Ugh.
             */
            if (bp->b_flags & B_ASYNC)
                reassignbuf(bp, vp);
            else if (error)
                bp->b_flags |= B_EINTR;
        } else {
            if (error) {
                bp->b_flags |= B_ERROR;
                bp->b_error = np->n_error = error;
                np->n_flag |= NWRITEERR;
            }
            bp->b_dirtyoff = bp->b_dirtyend = 0;
        }
    }
    bp->b_resid = uiop->uio_resid;
    if (must_commit)
        nfs_clearcommit(vp->v_mount);
    biodone(bp);
    return (error);
}