Home | History | Annotate | Line # | Download | only in nfs
nfs_bio.c revision 1.34
      1 /*	$NetBSD: nfs_bio.c,v 1.34 1997/10/10 01:53:18 fvdl Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 1989, 1993
      5  *	The Regents of the University of California.  All rights reserved.
      6  *
      7  * This code is derived from software contributed to Berkeley by
      8  * Rick Macklem at The University of Guelph.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  * 3. All advertising materials mentioning features or use of this software
     19  *    must display the following acknowledgement:
     20  *	This product includes software developed by the University of
     21  *	California, Berkeley and its contributors.
     22  * 4. Neither the name of the University nor the names of its contributors
     23  *    may be used to endorse or promote products derived from this software
     24  *    without specific prior written permission.
     25  *
     26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     36  * SUCH DAMAGE.
     37  *
     38  *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
     39  */
     40 
     41 
     42 #include <sys/param.h>
     43 #include <sys/systm.h>
     44 #include <sys/resourcevar.h>
     45 #include <sys/signalvar.h>
     46 #include <sys/proc.h>
     47 #include <sys/buf.h>
     48 #include <sys/vnode.h>
     49 #include <sys/trace.h>
     50 #include <sys/mount.h>
     51 #include <sys/kernel.h>
     52 #include <sys/namei.h>
     53 #include <sys/dirent.h>
     54 
     55 #include <vm/vm.h>
     56 
     57 #include <nfs/rpcv2.h>
     58 #include <nfs/nfsproto.h>
     59 #include <nfs/nfs.h>
     60 #include <nfs/nfsmount.h>
     61 #include <nfs/nqnfs.h>
     62 #include <nfs/nfsnode.h>
     63 #include <nfs/nfs_var.h>
     64 
     65 extern int nfs_numasync;
     66 extern struct nfsstats nfsstats;
     67 
     68 /*
     69  * Vnode op for read using bio
     70  * Any similarity to readip() is purely coincidental
     71  */
/*
 * nfs_bioread: buffer-cache read path shared by read (VREG),
 * readlink (VLNK) and readdir (VDIR) operations.
 *
 * vp     - vnode being read
 * uio    - destination of the data; uio_offset/uio_resid drive the loop
 * ioflag - IO_* flags from the caller (passed in, not examined here)
 * cred   - credentials used for any read RPCs issued
 * cflag  - NFSBIO_* flags; NFSBIO_CACHECOOKIES makes the VDIR case
 *          cache every directory cookie handed back to the caller
 *
 * Returns 0 or an errno; EINTR when an interruptible buffer fetch is
 * aborted by a signal.
 */
      72 int
      73 nfs_bioread(vp, uio, ioflag, cred, cflag)
      74 	register struct vnode *vp;
      75 	register struct uio *uio;
      76 	int ioflag, cflag;
      77 	struct ucred *cred;
      78 {
      79 	register struct nfsnode *np = VTONFS(vp);
      80 	register int biosize, diff;
      81 	struct buf *bp = NULL, *rabp;
      82 	struct vattr vattr;
      83 	struct proc *p;
      84 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
      85 	struct nfsdircache *ndp = NULL;
      86 	daddr_t lbn, bn, rabn;
      87 	caddr_t baddr, ep, edp;
      88 	int got_buf = 0, nra, error = 0, n = 0, on = 0, not_readin, en, enn;
      89 	int enough = 0;
      90 	struct dirent *dp, *pdp;
      91 	off_t curoff = 0;
      92 
      93 #ifdef DIAGNOSTIC
      94 	if (uio->uio_rw != UIO_READ)
      95 		panic("nfs_read mode");
      96 #endif
	/* Trivial request / bad offset checks before touching the cache. */
      97 	if (uio->uio_resid == 0)
      98 		return (0);
      99 	if (vp->v_type != VDIR && uio->uio_offset < 0)
     100 		return (EINVAL);
     101 	p = uio->uio_procp;
	/* For v3 mounts, fetch fsinfo once if we don't have it yet. */
     102 	if ((nmp->nm_flag & NFSMNT_NFSV3) &&
     103 	    !(nmp->nm_iflag & NFSMNT_GOTFSINFO))
     104 		(void)nfs_fsinfo(nmp, vp, cred, p);
     105 	if (vp->v_type != VDIR &&
     106 	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
     107 		return (EFBIG);
     108 	biosize = nmp->nm_rsize;
     109 	/*
     110 	 * For nfs, cache consistency can only be maintained approximately.
     111 	 * Although RFC1094 does not specify the criteria, the following is
     112 	 * believed to be compatible with the reference port.
     113 	 * For nqnfs, full cache consistency is maintained within the loop.
     114 	 * For nfs:
     115 	 * If the file's modify time on the server has changed since the
     116 	 * last read rpc or you have written to the file,
     117 	 * you may have lost data cache consistency with the
     118 	 * server, so flush all of the file's data out of the cache.
     119 	 * Then force a getattr rpc to ensure that you have up to date
     120 	 * attributes.
     121 	 * NB: This implies that cache data can be read when up to
     122 	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
     123 	 * attributes this could be forced by setting n_attrstamp to 0 before
     124 	 * the VOP_GETATTR() call.
     125 	 */
     126 	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0 && vp->v_type != VLNK) {
     127 		if (np->n_flag & NMODIFIED) {
     128 			if (vp->v_type != VREG) {
     129 				if (vp->v_type != VDIR)
     130 					panic("nfs: bioread, not dir");
     131 				nfs_invaldircache(vp);
     132 				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
     133 				if (error)
     134 					return (error);
     135 			}
			/* Force a fresh getattr by invalidating cached attrs. */
     136 			np->n_attrstamp = 0;
     137 			error = VOP_GETATTR(vp, &vattr, cred, p);
     138 			if (error)
     139 				return (error);
     140 			np->n_mtime = vattr.va_mtime.tv_sec;
     141 		} else {
     142 			error = VOP_GETATTR(vp, &vattr, cred, p);
     143 			if (error)
     144 				return (error);
			/* Server mtime changed since our last look: flush. */
     145 			if (np->n_mtime != vattr.va_mtime.tv_sec) {
     146 				if (vp->v_type == VDIR)
     147 					nfs_invaldircache(vp);
     148 				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
     149 				if (error)
     150 					return (error);
     151 				np->n_mtime = vattr.va_mtime.tv_sec;
     152 			}
     153 		}
     154 	}
     155 	do {
     156 
     157 	    /*
     158 	     * Get a valid lease. If cached data is stale, flush it.
     159 	     */
     160 	    if (nmp->nm_flag & NFSMNT_NQNFS) {
     161 		if (NQNFS_CKINVALID(vp, np, ND_READ)) {
     162 		    do {
     163 			error = nqnfs_getlease(vp, ND_READ, cred, p);
     164 		    } while (error == NQNFS_EXPIRED);
     165 		    if (error)
     166 			return (error);
     167 		    if (np->n_lrev != np->n_brev ||
     168 			(np->n_flag & NQNFSNONCACHE) ||
     169 			((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
     170 			if (vp->v_type == VDIR)
     171 				nfs_invaldircache(vp);
     172 			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
     173 			if (error)
     174 			    return (error);
     175 			np->n_brev = np->n_lrev;
     176 		    }
     177 		} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
     178 		    nfs_invaldircache(vp);
     179 		    error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
     180 		    if (error)
     181 			return (error);
     182 		}
     183 	    }
     184 	    /*
     185 	     * Don't cache symlinks.
     186 	     */
     187 	    if (np->n_flag & NQNFSNONCACHE
     188 		|| ((vp->v_flag & VROOT) && vp->v_type == VLNK)) {
     189 		switch (vp->v_type) {
     190 		case VREG:
     191 			return (nfs_readrpc(vp, uio, cred));
     192 		case VLNK:
     193 			return (nfs_readlinkrpc(vp, uio, cred));
     194 		case VDIR:
     195 			break;
     196 		default:
     197 			printf(" NQNFSNONCACHE: type %x unexpected\n",
     198 			    vp->v_type);
     199 		};
     200 	    }
     201 	    baddr = (caddr_t)0;
     202 	    switch (vp->v_type) {
     203 	    case VREG:
     204 		nfsstats.biocache_reads++;
     205 		lbn = uio->uio_offset / biosize;
     206 		on = uio->uio_offset & (biosize - 1);
		/* Translate the logical block to DEV_BSIZE units for getblk. */
     207 		bn = lbn * (biosize / DEV_BSIZE);
     208 		not_readin = 1;
     209 
     210 		/*
     211 		 * Start the read ahead(s), as required.
     212 		 */
     213 		if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
     214 		    for (nra = 0; nra < nmp->nm_readahead &&
     215 			(lbn + 1 + nra) * biosize < np->n_size; nra++) {
     216 			rabn = (lbn + 1 + nra) * (biosize / DEV_BSIZE);
     217 			if (!incore(vp, rabn)) {
     218 			    rabp = nfs_getcacheblk(vp, rabn, biosize, p);
     219 			    if (!rabp)
     220 				return (EINTR);
     221 			    if ((rabp->b_flags & (B_DELWRI | B_DONE)) == 0) {
     222 				rabp->b_flags |= (B_READ | B_ASYNC);
     223 				if (nfs_asyncio(rabp, cred)) {
				    /* No iod available: drop the buffer. */
     224 				    rabp->b_flags |= B_INVAL;
     225 				    brelse(rabp);
     226 				}
     227 			    } else
     228 				brelse(rabp);
     229 			}
     230 		    }
     231 		}
     232 
     233 		/*
     234 		 * If the block is in the cache and has the required data
     235 		 * in a valid region, just copy it out.
     236 		 * Otherwise, get the block and write back/read in,
     237 		 * as required.
     238 		 */
		/*
		 * got_buf == 0 marks a buffer borrowed while busy with a
		 * write RPC in progress; it is not brelse'd at the bottom
		 * of the loop.
		 */
     239 		if ((bp = incore(vp, bn)) &&
     240 		    (bp->b_flags & (B_BUSY | B_WRITEINPROG)) ==
     241 		    (B_BUSY | B_WRITEINPROG))
     242 			got_buf = 0;
     243 		else {
     244 again:
     245 			bp = nfs_getcacheblk(vp, bn, biosize, p);
     246 			if (!bp)
     247 				return (EINTR);
     248 			got_buf = 1;
     249 			if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
     250 				bp->b_flags |= B_READ;
     251 				not_readin = 0;
     252 				error = nfs_doio(bp, cred, p);
     253 				if (error) {
     254 				    brelse(bp);
     255 				    return (error);
     256 				}
     257 			}
     258 		}
		/* Clamp the copy to the block and to the file size. */
     259 		n = min((unsigned)(biosize - on), uio->uio_resid);
     260 		diff = np->n_size - uio->uio_offset;
     261 		if (diff < n)
     262 			n = diff;
		/*
		 * If we didn't issue the read ourselves, the valid region
		 * may not cover [on, on+n); push any dirty data and retry
		 * with a full read.
		 */
     263 		if (not_readin && n > 0) {
     264 			if (on < bp->b_validoff || (on + n) > bp->b_validend) {
     265 				if (!got_buf) {
     266 				    bp = nfs_getcacheblk(vp, bn, biosize, p);
     267 				    if (!bp)
     268 					return (EINTR);
     269 				    got_buf = 1;
     270 				}
     271 				bp->b_flags |= B_INVAFTERWRITE;
     272 				if (bp->b_dirtyend > 0) {
     273 				    if ((bp->b_flags & B_DELWRI) == 0)
     274 					panic("nfsbioread");
     275 				    if (VOP_BWRITE(bp) == EINTR)
     276 					return (EINTR);
     277 				} else
     278 				    brelse(bp);
     279 				goto again;
     280 			}
     281 		}
     282 		vp->v_lastr = lbn;
		/* Re-clamp n to the buffer's valid region. */
     283 		diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
     284 		if (diff < n)
     285 			n = diff;
     286 		break;
     287 	    case VLNK:
     288 		nfsstats.biocache_readlinks++;
     289 		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
     290 		if (!bp)
     291 			return (EINTR);
     292 		if ((bp->b_flags & B_DONE) == 0) {
     293 			bp->b_flags |= B_READ;
     294 			error = nfs_doio(bp, cred, p);
     295 			if (error) {
     296 				brelse(bp);
     297 				return (error);
     298 			}
     299 		}
     300 		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
     301 		got_buf = 1;
     302 		on = 0;
     303 		break;
     304 	    case VDIR:
	/* Restarted here after flushing a stale directory cache. */
     305 diragain:
     306 		nfsstats.biocache_readdirs++;
     307 		if (uio->uio_offset != 0 &&
     308 		    uio->uio_offset == np->n_direofoffset)
     309 			return (0);
     310 		ndp = nfs_lookdircache(vp, uio->uio_offset, 0, 0, 1);
     311 #ifdef DIAGNOSTIC
     312 		if (!ndp)
     313 			panic("nfs_bioread: bad dir cache");
     314 #endif
     315 		bp = nfs_getcacheblk(vp, ndp->dc_blkno, NFS_DIRBLKSIZ, p);
     316 		if (!bp)
     317 		    return (EINTR);
     318 		if ((bp->b_flags & B_DONE) == 0) {
     319 		    bp->b_flags |= B_READ;
     320 		    bp->b_dcookie = ndp->dc_cookie;
     321 		    error = nfs_doio(bp, cred, p);
     322 		    if (error) {
     323 			/*
     324 			 * Yuck! The directory has been modified on the
     325 			 * server. Punt and let the userland code
     326 			 * deal with it.
     327 			 */
     328 			brelse(bp);
     329 			if (error == NFSERR_BAD_COOKIE) {
     330 			    nfs_invaldircache(vp);
     331 			    nfs_vinvalbuf(vp, 0, cred, p, 1);
     332 			    error = EINVAL;
     333 			}
     334 			return (error);
     335 		    }
     336 		}
     337 
     338 		/*
     339 		 * Find the entry we were looking for in the block.
     340 		 */
     341 
     342 		en = ndp->dc_entry;
     343 
     344 		pdp = dp = (struct dirent *)bp->b_data;
     345 		edp = bp->b_data + bp->b_validend;
     346 		enn = 0;
		/* Walk d_reclen-chained entries up to entry number en. */
     347 		while (enn < en && (caddr_t)dp < edp) {
     348 			pdp = dp;
     349 			dp = (struct dirent *)((caddr_t)dp + dp->d_reclen);
     350 			enn++;
     351 		}
     352 
     353 		/*
     354 		 * If the entry number was bigger than the number of
     355 		 * entries in the block, or the cookie of the previous
     356 		 * entry doesn't match, the directory cache is
     357 		 * stale. Flush it and try again (i.e. go to
     358 		 * the server).
     359 		 */
     360 		if ((caddr_t)dp >= edp || (caddr_t)dp + dp->d_reclen > edp ||
     361 		    (en > 0 && NFS_GETCOOKIE(pdp) != uio->uio_offset)) {
     362 #ifdef DEBUG
     363 		    	printf("invalid cache: %p %p %p len %u off %lx %lx\n",
     364 				pdp, dp, edp, dp->d_reclen,
     365 				(unsigned long)uio->uio_offset,
     366 				(unsigned long)NFS_GETCOOKIE(pdp));
     367 #endif
     368 			brelse(bp);
     369 			nfs_invaldircache(vp);
     370 			nfs_vinvalbuf(vp, 0, cred, p, 0);
     371 			goto diragain;
     372 		}
     373 
     374 		on = (caddr_t)dp - bp->b_data;
     375 
     376 		/*
     377 		 * Cache all entries that may be exported to the
     378 		 * user, as they may be thrown back at us. The
     379 		 * NFSBIO_CACHECOOKIES flag indicates that all
     380 		 * entries are being 'exported', so cache them all.
     381 		 */
     382 
     383 		if (en == 0 && pdp == dp) {
     384 			dp = (struct dirent *)
     385 			    ((caddr_t)dp + dp->d_reclen);
     386 			enn++;
     387 		}
     388 
     389 		if (uio->uio_resid < (bp->b_validend - on)) {
     390 			n = uio->uio_resid;
     391 			enough = 1;
     392 		} else
     393 			n = bp->b_validend - on;
     394 
     395 		ep = bp->b_data + on + n;
     396 
     397 		/*
     398 		 * Find last complete entry to copy, caching entries
     399 		 * (if requested) as we go.
     400 		 */
     401 
     402 		while ((caddr_t)dp < ep && (caddr_t)dp + dp->d_reclen <= ep) {
     403 			if (cflag & NFSBIO_CACHECOOKIES)
     404 				nfs_lookdircache(vp, NFS_GETCOOKIE(pdp), enn,
     405 				    bp->b_lblkno, 1);
     406 			pdp = dp;
     407 			dp = (struct dirent *)((caddr_t)dp + dp->d_reclen);
     408 			enn++;
     409 		}
     410 
     411 		/*
     412 		 * If the last requested entry was not the last in the
     413 		 * buffer (happens if NFS_DIRFRAGSIZ < NFS_DIRBLKSIZ),
     414 		 * cache the cookie of the last requested one, and
     415 		 * set of the offset to it.
     416 		 */
     417 
     418 		if ((on + n) < bp->b_validend) {
     419 			curoff = NFS_GETCOOKIE(pdp);
     420 			nfs_lookdircache(vp, curoff, enn, bp->b_lblkno, 1);
     421 		} else
     422 			curoff = bp->b_dcookie;
     423 
		/* Trim n to end exactly at the last complete entry. */
     424 		n = ((caddr_t)pdp + pdp->d_reclen) - (bp->b_data + on);
     425 
     426 		/*
     427 		 * If not eof and read aheads are enabled, start one.
     428 		 * (You need the current block first, so that you have the
     429 		 *  directory offset cookie of the next block.)
     430 		 */
     431 		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
     432 		    np->n_direofoffset == 0 && !(np->n_flag & NQNFSNONCACHE)) {
     433 			ndp = nfs_lookdircache(vp, bp->b_dcookie, 0, 0, 1);
     434 			rabp = nfs_getcacheblk(vp, ndp->dc_blkno,
     435 						NFS_DIRBLKSIZ, p);
     436 			if (rabp) {
     437 			    if ((rabp->b_flags & (B_DONE | B_DELWRI)) == 0) {
     438 				rabp->b_dcookie = ndp->dc_cookie;
     439 				rabp->b_flags |= (B_READ | B_ASYNC);
     440 				if (nfs_asyncio(rabp, cred)) {
     441 				    rabp->b_flags |= B_INVAL;
     442 				    brelse(rabp);
     443 				}
     444 			    } else
     445 				brelse(rabp);
     446 			}
     447 		}
     448 		got_buf = 1;
     449 		break;
     450 	    default:
     451 		printf(" nfsbioread: type %x unexpected\n",vp->v_type);
     452 		break;
     453 	    };
     454 
	    /* Copy the selected span out to the user's buffer. */
     455 	    if (n > 0) {
     456 		if (!baddr)
     457 			baddr = bp->b_data;
     458 		error = uiomove(baddr + on, (int)n, uio);
     459 	    }
     460 	    switch (vp->v_type) {
     461 	    case VREG:
     462 		break;
     463 	    case VLNK:
		/* Symlinks are read in one shot: force loop exit. */
     464 		n = 0;
     465 		break;
     466 	    case VDIR:
     467 		if (np->n_flag & NQNFSNONCACHE)
     468 			bp->b_flags |= B_INVAL;
     469 		uio->uio_offset = curoff;
     470 		if (enough)
     471 			n = 0;
     472 		break;
     473 	    default:
     474 		printf(" nfsbioread: type %x unexpected\n",vp->v_type);
     475 	    }
     476 	    if (got_buf)
     477 		brelse(bp);
     478 	} while (error == 0 && uio->uio_resid > 0 && n > 0);
     479 	return (error);
     480 }
    481 
    482 /*
    483  * Vnode op for write using bio
    484  */
/*
 * nfs_write: VOP_WRITE for regular files, via the buffer cache.
 *
 * v points at a struct vop_write_args (vnode, uio, ioflag, cred).
 * Data is copied into cache blocks; depending on NQNFSNONCACHE /
 * IO_SYNC / full-block status the block is written synchronously,
 * asynchronously, or delayed.  Returns 0 or an errno.
 */
     485 int
     486 nfs_write(v)
     487 	void *v;
     488 {
     489 	struct vop_write_args /* {
     490 		struct vnode *a_vp;
     491 		struct uio *a_uio;
     492 		int  a_ioflag;
     493 		struct ucred *a_cred;
     494 	} */ *ap = v;
     495 	register int biosize;
     496 	register struct uio *uio = ap->a_uio;
     497 	struct proc *p = uio->uio_procp;
     498 	register struct vnode *vp = ap->a_vp;
     499 	struct nfsnode *np = VTONFS(vp);
     500 	register struct ucred *cred = ap->a_cred;
     501 	int ioflag = ap->a_ioflag;
     502 	struct buf *bp;
     503 	struct vattr vattr;
     504 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
     505 	daddr_t lbn, bn;
     506 	int n, on, error = 0, iomode, must_commit;
     507 
     508 #ifdef DIAGNOSTIC
     509 	if (uio->uio_rw != UIO_WRITE)
     510 		panic("nfs_write mode");
     511 	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
     512 		panic("nfs_write proc");
     513 #endif
     514 	if (vp->v_type != VREG)
     515 		return (EIO);
	/* Report (once) an async write error recorded earlier. */
     516 	if (np->n_flag & NWRITEERR) {
     517 		np->n_flag &= ~NWRITEERR;
     518 		return (np->n_error);
     519 	}
     520 	if ((nmp->nm_flag & NFSMNT_NFSV3) &&
     521 	    !(nmp->nm_iflag & NFSMNT_GOTFSINFO))
     522 		(void)nfs_fsinfo(nmp, vp, cred, p);
	/*
	 * For append or sync writes, flush cached data first and, for
	 * append, refetch attributes so n_size (the append offset) is
	 * current.
	 */
     523 	if (ioflag & (IO_APPEND | IO_SYNC)) {
     524 		if (np->n_flag & NMODIFIED) {
     525 			np->n_attrstamp = 0;
     526 			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
     527 			if (error)
     528 				return (error);
     529 		}
     530 		if (ioflag & IO_APPEND) {
     531 			np->n_attrstamp = 0;
     532 			error = VOP_GETATTR(vp, &vattr, cred, p);
     533 			if (error)
     534 				return (error);
     535 			uio->uio_offset = np->n_size;
     536 		}
     537 	}
     538 	if (uio->uio_offset < 0)
     539 		return (EINVAL);
     540 	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
     541 		return (EFBIG);
     542 	if (uio->uio_resid == 0)
     543 		return (0);
     544 	/*
     545 	 * Maybe this should be above the vnode op call, but so long as
     546 	 * file servers have no limits, i don't think it matters
     547 	 */
     548 	if (p && uio->uio_offset + uio->uio_resid >
     549 	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
     550 		psignal(p, SIGXFSZ);
     551 		return (EFBIG);
     552 	}
     553 	/*
     554 	 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
     555 	 * will be the same size within a filesystem. nfs_writerpc will
     556 	 * still use nm_wsize when sizing the rpc's.
     557 	 */
     558 	biosize = nmp->nm_rsize;
     559 	do {
     560 
     561 		/*
     562 		 * XXX make sure we aren't cached in the VM page cache
     563 		 */
     564 		(void)vnode_pager_uncache(vp);
     565 
     566 		/*
     567 		 * Check for a valid write lease.
     568 		 */
     569 		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
     570 		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
     571 			do {
     572 				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
     573 			} while (error == NQNFS_EXPIRED);
     574 			if (error)
     575 				return (error);
     576 			if (np->n_lrev != np->n_brev ||
     577 			    (np->n_flag & NQNFSNONCACHE)) {
     578 				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
     579 				if (error)
     580 					return (error);
     581 				np->n_brev = np->n_lrev;
     582 			}
     583 		}
		/* Non-cachable lease: bypass the cache, write through. */
     584 		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
     585 		    iomode = NFSV3WRITE_FILESYNC;
     586 		    error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
     587 		    if (must_commit)
     588 			nfs_clearcommit(vp->v_mount);
     589 		    return (error);
     590 		}
     591 		nfsstats.biocache_writes++;
     592 		lbn = uio->uio_offset / biosize;
     593 		on = uio->uio_offset & (biosize-1);
     594 		n = min((unsigned)(biosize - on), uio->uio_resid);
		/* Translate the logical block to DEV_BSIZE units for getblk. */
     595 		bn = lbn * (biosize / DEV_BSIZE);
     596 again:
     597 		bp = nfs_getcacheblk(vp, bn, biosize, p);
     598 		if (!bp)
     599 			return (EINTR);
     600 		if (bp->b_wcred == NOCRED) {
     601 			crhold(cred);
     602 			bp->b_wcred = cred;
     603 		}
     604 		np->n_flag |= NMODIFIED;
     605 		if (uio->uio_offset + n > np->n_size) {
     606 			np->n_size = uio->uio_offset + n;
     607 			vnode_pager_setsize(vp, np->n_size);
     608 		}
     609 
     610 		/*
     611 		 * If the new write will leave a contiguous dirty
     612 		 * area, just update the b_dirtyoff and b_dirtyend,
     613 		 * otherwise force a write rpc of the old dirty area.
     614 		 */
     615 		if (bp->b_dirtyend > 0 &&
     616 		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
     617 			bp->b_proc = p;
     618 			if (VOP_BWRITE(bp) == EINTR)
     619 				return (EINTR);
     620 			goto again;
     621 		}
     622 
     623 		/*
     624 		 * Check for valid write lease and get one as required.
     625 		 * In case getblk() and/or bwrite() delayed us.
     626 		 */
     627 		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
     628 		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
     629 			do {
     630 				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
     631 			} while (error == NQNFS_EXPIRED);
     632 			if (error) {
     633 				brelse(bp);
     634 				return (error);
     635 			}
     636 			if (np->n_lrev != np->n_brev ||
     637 			    (np->n_flag & NQNFSNONCACHE)) {
     638 				brelse(bp);
     639 				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
     640 				if (error)
     641 					return (error);
     642 				np->n_brev = np->n_lrev;
     643 				goto again;
     644 			}
     645 		}
		/* Copy user data into the cache block. */
     646 		error = uiomove((char *)bp->b_data + on, n, uio);
     647 		if (error) {
     648 			bp->b_flags |= B_ERROR;
     649 			brelse(bp);
     650 			return (error);
     651 		}
		/* Extend (or establish) the dirty region to cover [on, on+n). */
     652 		if (bp->b_dirtyend > 0) {
     653 			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
     654 			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
     655 		} else {
     656 			bp->b_dirtyoff = on;
     657 			bp->b_dirtyend = on + n;
     658 		}
		/* Keep the valid region consistent with the dirty region. */
     659 		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
     660 		    bp->b_validoff > bp->b_dirtyend) {
     661 			bp->b_validoff = bp->b_dirtyoff;
     662 			bp->b_validend = bp->b_dirtyend;
     663 		} else {
     664 			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
     665 			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
     666 		}
     667 
     668 		/*
     669 		 * Since this block is being modified, it must be written
     670 		 * again and not just committed.
     671 		 */
     672 		bp->b_flags &= ~B_NEEDCOMMIT;
     673 
     674 		/*
     675 		 * If the lease is non-cachable or IO_SYNC do bwrite().
     676 		 */
     677 		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
     678 			bp->b_proc = p;
     679 			error = VOP_BWRITE(bp);
     680 			if (error)
     681 				return (error);
     682 			if (np->n_flag & NQNFSNONCACHE) {
     683 				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
     684 				if (error)
     685 					return (error);
     686 			}
		/* Full block written: push it asynchronously now. */
     687 		} else if ((n + on) == biosize &&
     688 			(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
     689 			bp->b_proc = (struct proc *)0;
     690 			bp->b_flags |= B_ASYNC;
     691 			(void)nfs_writebp(bp, 0);
     692 		} else {
			/* Partial block: leave it as a delayed write. */
     693 			bdwrite(bp);
     694 		}
     695 	} while (uio->uio_resid > 0 && n > 0);
     696 	return (0);
     697 }
    698 
    699 /*
    700  * Get an nfs cache block.
    701  * Allocate a new one if the block isn't currently in the cache
    702  * and return the block marked busy. If the calling process is
    703  * interrupted by a signal for an interruptible mount point, return
    704  * NULL.
    705  */
    706 struct buf *
    707 nfs_getcacheblk(vp, bn, size, p)
    708 	struct vnode *vp;
    709 	daddr_t bn;
    710 	int size;
    711 	struct proc *p;
    712 {
    713 	register struct buf *bp;
    714 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
    715 
    716 	if (nmp->nm_flag & NFSMNT_INT) {
    717 		bp = getblk(vp, bn, size, PCATCH, 0);
    718 		while (bp == (struct buf *)0) {
    719 			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
    720 				return ((struct buf *)0);
    721 			bp = getblk(vp, bn, size, 0, 2 * hz);
    722 		}
    723 	} else
    724 		bp = getblk(vp, bn, size, 0, 0);
    725 	return (bp);
    726 }
    727 
    728 /*
    729  * Flush and invalidate all dirty buffers. If another process is already
    730  * doing the flush, just wait for completion.
    731  */
/*
 * nfs_vinvalbuf: flush and invalidate all dirty buffers for a vnode.
 *
 * Only one flush runs per nfsnode at a time: NFLUSHINPROG marks the
 * flush in progress and NFLUSHWANT marks waiters to be woken.  With
 * intrflg set (and only on NFSMNT_INT mounts) the sleeps are
 * interruptible and EINTR is returned on a signal; otherwise the
 * routine waits indefinitely.  Clears NMODIFIED on success.
 */
     732 int
     733 nfs_vinvalbuf(vp, flags, cred, p, intrflg)
     734 	struct vnode *vp;
     735 	int flags;
     736 	struct ucred *cred;
     737 	struct proc *p;
     738 	int intrflg;
     739 {
     740 	register struct nfsnode *np = VTONFS(vp);
     741 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
     742 	int error = 0, slpflag, slptimeo;
     743 
	/* Interruptible flush only makes sense on an NFSMNT_INT mount. */
     744 	if ((nmp->nm_flag & NFSMNT_INT) == 0)
     745 		intrflg = 0;
     746 	if (intrflg) {
     747 		slpflag = PCATCH;
     748 		slptimeo = 2 * hz;
     749 	} else {
     750 		slpflag = 0;
     751 		slptimeo = 0;
     752 	}
     753 	/*
     754 	 * First wait for any other process doing a flush to complete.
     755 	 */
     756 	while (np->n_flag & NFLUSHINPROG) {
     757 		np->n_flag |= NFLUSHWANT;
     758 		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
     759 			slptimeo);
     760 		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
     761 			return (EINTR);
     762 	}
     763 
     764 	/*
     765 	 * Now, flush as required.
     766 	 */
     767 	np->n_flag |= NFLUSHINPROG;
     768 	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	/*
	 * Retry vinvalbuf() until it succeeds; on an interruptible
	 * flush a pending signal aborts the retry loop, after waking
	 * any processes queued behind us.
	 */
     769 	while (error) {
     770 		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
     771 			np->n_flag &= ~NFLUSHINPROG;
     772 			if (np->n_flag & NFLUSHWANT) {
     773 				np->n_flag &= ~NFLUSHWANT;
     774 				wakeup((caddr_t)&np->n_flag);
     775 			}
     776 			return (EINTR);
     777 		}
     778 		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
     779 	}
	/* Done: clear modified state and wake any waiting flushers. */
     780 	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
     781 	if (np->n_flag & NFLUSHWANT) {
     782 		np->n_flag &= ~NFLUSHWANT;
     783 		wakeup((caddr_t)&np->n_flag);
     784 	}
     785 	return (0);
     786 }
    787 
    788 /*
    789  * Initiate asynchronous I/O. Return an error if no nfsiods are available.
    790  * This is mainly to avoid queueing async I/O requests when the nfsiods
    791  * are all hung on a dead server.
    792  */
/*
 * nfs_asyncio: hand a buffer to an nfsiod for asynchronous I/O.
 *
 * bp   - the buffer to queue (direction given by B_READ)
 * cred - credentials to attach to the buffer if it has none yet
 *
 * Returns 0 if the buffer was queued on the mount's nm_bufq, or EIO
 * when no iod can service it, so the caller falls back to doing the
 * I/O synchronously.
 */
     793 int
     794 nfs_asyncio(bp, cred)
     795 	register struct buf *bp;
     796 	struct ucred *cred;
     797 {
     798 	register int i;
     799 	register struct nfsmount *nmp;
     800 	int gotiod, slpflag = 0, slptimeo = 0, error;
     801 
	/* No iods running at all: caller must do the I/O itself. */
     802 	if (nfs_numasync == 0)
     803 		return (EIO);
     804 
     805 
     806 	nmp = VFSTONFS(bp->b_vp->v_mount);
     807 again:
     808 	if (nmp->nm_flag & NFSMNT_INT)
     809 		slpflag = PCATCH;
     810 	gotiod = FALSE;
     811 
     812 	/*
     813 	 * Find a free iod to process this request.
     814 	 */
     815 
     816 	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
     817 		if (nfs_iodwant[i]) {
     818 			/*
     819 			 * Found one, so wake it up and tell it which
     820 			 * mount to process.
     821 			 */
     822 			nfs_iodwant[i] = (struct proc *)0;
     823 			nfs_iodmount[i] = nmp;
     824 			nmp->nm_bufqiods++;
     825 			wakeup((caddr_t)&nfs_iodwant[i]);
     826 			gotiod = TRUE;
     827 			break;
     828 		}
     829 	/*
     830 	 * If none are free, we may already have an iod working on this mount
     831 	 * point.  If so, it will process our request.
     832 	 */
     833 	if (!gotiod && nmp->nm_bufqiods > 0)
     834 		gotiod = TRUE;
     835 
     836 	/*
     837 	 * If we have an iod which can process the request, then queue
     838 	 * the buffer.
     839 	 */
     840 	if (gotiod) {
     841 		/*
     842 		 * Ensure that the queue never grows too large.
     843 		 */
		/* Cap the backlog at 2 buffers per running iod. */
     844 		while (nmp->nm_bufqlen >= 2*nfs_numasync) {
     845 			nmp->nm_bufqwant = TRUE;
     846 			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
     847 				"nfsaio", slptimeo);
     848 			if (error) {
     849 				if (nfs_sigintr(nmp, NULL, bp->b_proc))
     850 					return (EINTR);
				/*
				 * First interruption was not a fatal
				 * signal: stop catching and switch to a
				 * 2*hz polling sleep instead.
				 */
     851 				if (slpflag == PCATCH) {
     852 					slpflag = 0;
     853 					slptimeo = 2 * hz;
     854 				}
     855 			}
     856 			/*
     857 			 * We might have lost our iod while sleeping,
     858 			 * so check and loop if nescessary.
     859 			 */
     860 			if (nmp->nm_bufqiods == 0)
     861 				goto again;
     862 		}
     863 
		/* Attach credentials for the direction of the transfer. */
     864 		if (bp->b_flags & B_READ) {
     865 			if (bp->b_rcred == NOCRED && cred != NOCRED) {
     866 				crhold(cred);
     867 				bp->b_rcred = cred;
     868 			}
     869 		} else {
     870 			bp->b_flags |= B_WRITEINPROG;
     871 			if (bp->b_wcred == NOCRED && cred != NOCRED) {
     872 				crhold(cred);
     873 				bp->b_wcred = cred;
     874 			}
     875 		}
     876 
     877 		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
     878 		nmp->nm_bufqlen++;
     879 		return (0);
     880 	    }
     881 
     882 	/*
     883 	 * All the iods are busy on other mounts, so return EIO to
     884 	 * force the caller to process the i/o synchronously.
     885 	 */
     886 	return (EIO);
     887 }
    888 
/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 *
 * bp - buffer describing the I/O: B_PHYS/B_READ select the operation;
 *      b_blkno/b_bcount (reads) or b_dirtyoff/b_dirtyend (writes)
 *      locate the data.
 * cr - credentials used for the RPCs.
 * p  - process context (stored in uio_procp; also the process that is
 *      signalled on text-file modification).  May come from an nfsiod
 *      rather than the originating process -- see nfs_asyncio.
 *
 * Returns 0 or an errno from the underlying RPC.  The buffer is always
 * biodone()'d before returning.
 */
int
nfs_doio(bp, cr, p)
	register struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	register struct uio *uiop;
	register struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	/* Build a single-iovec kernel-space uio over the buffer's data. */
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	/*
	 * Historically, paging was done with physio, but no more...
	 */
	if (bp->b_flags & B_PHYS) {
	    /*
	     * ...though reading /dev/drum still gets us here.
	     */
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    /* mapping was done by vmapbuf() */
	    io.iov_base = bp->b_data;
	    /* b_blkno is in DEV_BSIZE units; convert to a byte offset. */
	    uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
	    if (bp->b_flags & B_READ) {
		uiop->uio_rw = UIO_READ;
		nfsstats.read_physios++;
		error = nfs_readrpc(vp, uiop, cr);
	    } else {
		/* Swap-style writes only require data-sync commitment. */
		iomode = NFSV3WRITE_DATASYNC;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_physios++;
		error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
	    }
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else if (bp->b_flags & B_READ) {
	    /* Cache-block read: fill the entire buffer. */
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    io.iov_base = bp->b_data;
	    uiop->uio_rw = UIO_READ;
	    switch (vp->v_type) {
	    case VREG:
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
		nfsstats.read_bios++;
		error = nfs_readrpc(vp, uiop, cr);
		if (!error) {
		    bp->b_validoff = 0;
		    if (uiop->uio_resid) {
			/*
			 * If len > 0, there is a hole in the file and
			 * no writes after the hole have been pushed to
			 * the server yet.
			 * Just zero fill the rest of the valid area.
			 */
			diff = bp->b_bcount - uiop->uio_resid;
			len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
				+ diff);
			if (len > 0) {
			    len = min(len, uiop->uio_resid);
			    bzero((char *)bp->b_data + diff, len);
			    bp->b_validend = diff + len;
			} else
			    bp->b_validend = diff;
		    } else
			bp->b_validend = bp->b_bcount;
		}
		/*
		 * If the file backs a running executable (VTEXT) and
		 * appears to have changed on the server, kill the
		 * process: its text image is now inconsistent.  Under
		 * NQNFS the lease revisions are compared; otherwise the
		 * cached modify time vs. fresh attributes decides.
		 */
		if (p && (vp->v_flag & VTEXT) &&
			(((nmp->nm_flag & NFSMNT_NQNFS) &&
			  NQNFS_CKINVALID(vp, np, ND_READ) &&
			  np->n_lrev != np->n_brev) ||
			 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			  np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
			uprintf("Process killed due to text file modification\n");
			psignal(p, SIGKILL);
			/*
			 * NOTE(review): bumping p_holdcnt presumably
			 * pins the process in core until the SIGKILL is
			 * delivered; nothing visible here ever releases
			 * the hold -- confirm against kern_sig/vm code.
			 */
			p->p_holdcnt++;
		}
		break;
	    case VLNK:
		/* Symlink contents are read in full from offset 0. */
		uiop->uio_offset = (off_t)0;
		nfsstats.readlink_bios++;
		error = nfs_readlinkrpc(vp, uiop, cr);
		break;
	    case VDIR:
		nfsstats.readdir_bios++;
		/* Resume the directory read at the saved server cookie. */
		uiop->uio_offset = bp->b_dcookie;
		if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
			error = nfs_readdirplusrpc(vp, uiop, cr);
			/*
			 * Server lacks READDIRPLUS: turn it off for the
			 * whole mount and fall through to plain readdir.
			 */
			if (error == NFSERR_NOTSUPP)
				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
		}
		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
			error = nfs_readdirrpc(vp, uiop, cr);
		if (!error) {
			/* Remember where the next block should resume. */
			bp->b_dcookie = uiop->uio_offset;
			bp->b_validoff = 0;
			bp->b_validend = bp->b_bcount - uiop->uio_resid;
		}
		break;
	    default:
		printf("nfs_doio:  type %x unexpected\n",vp->v_type);
		break;
	    };
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else {
	    /*
	     * Cache-block write: push only the dirty byte range
	     * [b_dirtyoff, b_dirtyend) back to the server.
	     */
	    io.iov_len = uiop->uio_resid = bp->b_dirtyend
		- bp->b_dirtyoff;
	    uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
		+ bp->b_dirtyoff;
	    io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
	    uiop->uio_rw = UIO_WRITE;
	    nfsstats.write_bios++;
	    /*
	     * A plain async write (not already commit-pending and not
	     * being invalidated) may go out UNSTABLE and be made
	     * permanent by a later commit RPC; anything else must be
	     * FILESYNC.
	     */
	    if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == B_ASYNC)
		iomode = NFSV3WRITE_UNSTABLE;
	    else
		iomode = NFSV3WRITE_FILESYNC;
	    bp->b_flags |= B_WRITEINPROG;
#ifdef fvdl_debug
	    printf("nfs_doio(%x): bp %x doff %d dend %d\n",
		vp, bp, bp->b_dirtyoff, bp->b_dirtyend);
#endif
	    error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
	    /*
	     * iomode is in/out: if it still reads UNSTABLE the data is
	     * not yet on stable storage and a commit is owed.
	     */
	    if (!error && iomode == NFSV3WRITE_UNSTABLE)
		bp->b_flags |= B_NEEDCOMMIT;
	    else
		bp->b_flags &= ~B_NEEDCOMMIT;
	    bp->b_flags &= ~B_WRITEINPROG;

	    /*
	     * For an interrupted write, the buffer is still valid and the
	     * write hasn't been pushed to the server yet, so we can't set
	     * B_ERROR and report the interruption by setting B_EINTR. For
	     * the B_ASYNC case, B_EINTR is not relevant, so the rpc attempt
	     * is essentially a noop.
	     * For the case of a V3 write rpc not being committed to stable
	     * storage, the block is still dirty and requires either a commit
	     * rpc or another write rpc with iomode == NFSV3WRITE_FILESYNC
	     * before the block is reused. This is indicated by setting the
	     * B_DELWRI and B_NEEDCOMMIT flags.
	     */
	    if (error == EINTR || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
		bp->b_flags |= B_DELWRI;

		/*
		 * Since for the B_ASYNC case, nfs_bwrite() has reassigned the
		 * buffer to the clean list, we have to reassign it back to the
		 * dirty one. Ugh.
		 */
		if (bp->b_flags & B_ASYNC)
		    reassignbuf(bp, vp);
		else if (error)
		    bp->b_flags |= B_EINTR;
	    } else {
		if (error) {
		    /*
		     * Latch the error on the nfsnode so a later
		     * close/fsync can report it (NWRITEERR).
		     */
		    bp->b_flags |= B_ERROR;
		    bp->b_error = np->n_error = error;
		    np->n_flag |= NWRITEERR;
		}
		/* Write resolved one way or the other: nothing dirty left. */
		bp->b_dirtyoff = bp->b_dirtyend = 0;
	    }
	}
	bp->b_resid = uiop->uio_resid;
	/*
	 * NOTE(review): must_commit is set by nfs_writerpc, presumably
	 * when the server's write verifier changed (server reboot), in
	 * which case all pending commit state for this mount must be
	 * discarded -- confirm against nfs_writerpc/nfs_clearcommit.
	 */
	if (must_commit)
		nfs_clearcommit(vp->v_mount);
	biodone(bp);
	return (error);
}
   1074