/*	$NetBSD: nfs_nfsdcache.c,v 1.5 2024/07/05 04:31:52 rin Exp $	*/
/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
/* __FBSDID("FreeBSD: head/sys/fs/nfsserver/nfs_nfsdcache.c 304026 2016-08-12 22:44:59Z rmacklem "); */
__RCSID("$NetBSD: nfs_nfsdcache.c,v 1.5 2024/07/05 04:31:52 rin Exp $");

/*
 * Here is the basic algorithm:
 * First, some design criteria I used:
 * - I think a false hit is more serious than a false miss
 * - A false hit for an RPC that has Op(s) that order via seqid# must be
 *   avoided at all cost
 * - A valid hit will probably happen a long time after the original reply
 *   and the TCP socket that the original request was received on will no
 *   longer be active
 *   (The long time delay implies to me that LRU is not appropriate.)
 * - The mechanism will satisfy the requirements of ordering Ops with seqid#s
 *   in them as well as minimizing the risk of redoing retried non-idempotent
 *   Ops.
 * Because it is biased towards avoiding false hits, multiple entries with
 * the same xid are to be expected, especially for the case of the entry
 * in the cache being related to a seqid# sequenced Op.
 *
 * The basic algorithm I'm about to code up:
 * - Null RPCs bypass the cache and are just done
 * For TCP
 * 	- key on <xid, NFS version> (as noted above, there can be several
 * 				     entries with the same key)
 * 	When a request arrives:
 * 		For all that match key
 * 		- if RPC# != OR request_size !=
 * 			- not a match with this one
 * 		- if NFSv4 and received on same TCP socket OR
 *			received on a TCP connection created before the
 *			entry was cached
 * 			- not a match with this one
 * 			(V2,3 clients might retry on same TCP socket)
 * 		- calculate checksum on first N bytes of NFS XDR
 * 		- if checksum !=
 * 			- not a match for this one
 * 		If any of the remaining ones that match has a
 * 			seqid_refcnt > 0
 * 			- not a match (go do RPC, using new cache entry)
 * 		If one match left
 * 			- a hit (reply from cache)
 * 		else
 * 			- miss (go do RPC, using new cache entry)
 *
 * 	During processing of NFSv4 request:
 * 		- set a flag when a non-idempotent Op is processed
 * 		- when an Op that uses a seqid# (Open,...) is processed
 * 			- if same seqid# as referenced entry in cache
 * 				- free new cache entry
 * 				- reply from referenced cache entry
 * 			  else if next seqid# in order
 * 				- free referenced cache entry
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 * 			  else if first seqid# in sequence
 * 				- increment seqid_refcnt on new cache entry
 * 				- set pointer from Openowner/Lockowner to
 * 					new cache entry (aka reference it)
 *
 * 	At end of RPC processing:
 * 		- if seqid_refcnt > 0 OR flagged non-idempotent on new
 * 			cache entry
 * 			- save reply in cache entry
 * 			- calculate checksum on first N bytes of NFS XDR
 * 				request
 * 			- note op and length of XDR request (in bytes)
 * 			- timestamp it
 * 		  else
 * 			- free new cache entry
 * 		- Send reply (noting info for socket activity check, below)
 *
 * 	For cache entries saved above:
 * 		- if saved since seqid_refcnt was > 0
 * 			- free when seqid_refcnt decrements to 0
 * 			  (when next one in sequence is processed above, or
 * 			   when Openowner/Lockowner is discarded)
 * 		  else { non-idempotent Op(s) }
 * 			- free when
 * 				- some further activity observed on same
 * 					socket
 * 				  (I'm not yet sure how I'm going to do
 * 				   this. Maybe look at the TCP connection
 * 				   to see if the send_tcp_sequence# is well
 * 				   past sent reply OR K additional RPCs
 * 				   replied on same socket OR?)
 * 			  OR
 * 				- when very old (hours, days, weeks?)
 *
 * For UDP (v2, 3 only), pretty much the old way:
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 *
 * When a Request arrives:
 * - if a match with entry via key
 * 	- if RPC marked In_progress
 * 		- discard request (don't send reply)
 * 	  else
 * 		- reply from cache
 * 		- timestamp cache entry
 *   else
 * 	- add entry to cache, marked In_progress
 * 	- do RPC
 * 	- when RPC done
 * 		- if RPC# non-idempotent
 * 			- mark entry Done (not In_progress)
 * 			- save reply
 * 			- timestamp cache entry
 * 		  else
 * 			- free cache entry
 * 		- send reply
 *
 * Later, entries with saved replies are freed a short time (few minutes)
 * after the reply is sent (timestamp).
 * Reference for the UDP case: Chet Juszczak, "Improving the Performance
 *	and Correctness of an NFS Server", in Proc. Winter 1989 USENIX
 *	Conference, pages 53-63, San Diego, February 1989.
 * nfsrc_floodlevel is set to the allowable upper limit for saved replies
 *	for TCP. For V3, a reply won't be saved when the flood level is
 *	hit. For V4, the non-idempotent Op will return NFSERR_RESOURCE in
 *	that case. This level should be set high enough that this almost
 *	never happens.
 */
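
/*
 * A minimal sketch (not compiled in; the struct and function names here
 * are hypothetical, not part of this file) of the exact-match portion of
 * the TCP lookup described above: an entry is only a candidate hit when
 * the xid, NFS version, procedure number, request length and checksum all
 * agree.  The socket and seqid# screening is layered on top of this in
 * nfsrc_gettcp() below.
 */
#if 0
struct drc_key {
	uint32_t	xid;	/* RPC transaction id */
	int		vers;	/* NFS protocol version (2, 3 or 4) */
	int		proc;	/* RPC procedure number */
	int		reqlen;	/* length of the XDR request in bytes */
	uint16_t	cksum;	/* checksum of first NFSRVCACHE_CHECKLEN bytes */
};

static int
drc_keymatch(const struct drc_key *nkey, const struct drc_key *okey)
{

	return (nkey->xid == okey->xid && nkey->vers == okey->vers &&
	    nkey->proc == okey->proc && nkey->reqlen == okey->reqlen &&
	    nkey->cksum == okey->cksum);
}
#endif
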
#ifndef APPLEKEXT
#include <fs/nfs/common/nfsport.h>

extern struct nfsstatsv1 nfsstatsv1;
extern struct mtx nfsrc_udpmtx;
extern struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
extern struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE];
int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
#endif	/* !APPLEKEXT */

SYSCTL_DECL(_vfs_nfsd);

static u_int	nfsrc_tcphighwater = 0;
static int
sysctl_tcphighwater(SYSCTL_HANDLER_ARGS)
{
	int error, newhighwater;

	newhighwater = nfsrc_tcphighwater;
	error = sysctl_handle_int(oidp, &newhighwater, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (newhighwater < 0)
		return (EINVAL);
	if (newhighwater >= nfsrc_floodlevel)
		nfsrc_floodlevel = newhighwater + newhighwater / 5;
	nfsrc_tcphighwater = newhighwater;
	return (0);
}
SYSCTL_PROC(_vfs_nfsd, OID_AUTO, tcphighwater, CTLTYPE_UINT | CTLFLAG_RW, 0,
    sizeof(nfsrc_tcphighwater), sysctl_tcphighwater, "IU",
    "High water mark for TCP cache entries");
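
/*
 * Usage note: the handler above keeps the flood level 20% above the high
 * water mark whenever the latter is raised past it.  The knob is reached
 * through the usual sysctl(8) interface, e.g. (value purely illustrative):
 *
 *	sysctl vfs.nfsd.tcphighwater=100000
 */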

static u_int	nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
    &nfsrc_udphighwater, 0,
    "High water mark for UDP cache entries");
static u_int	nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
    &nfsrc_tcptimeout, 0,
    "Timeout for TCP entries in the DRC");
static u_int nfsrc_tcpnonidempotent = 1;
SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
    &nfsrc_tcpnonidempotent, 0,
    "Enable the DRC for NFS over TCP");

static int nfsrc_udpcachesize = 0;
static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
static struct nfsrvhashhead nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];

/*
 * The reverse mapping from generic to Version 2 procedure numbers.
 */
static int newnfsv2_procid[NFS_V3NPROCS] = {
	NFSV2PROC_NULL,
	NFSV2PROC_GETATTR,
	NFSV2PROC_SETATTR,
	NFSV2PROC_LOOKUP,
	NFSV2PROC_NOOP,
	NFSV2PROC_READLINK,
	NFSV2PROC_READ,
	NFSV2PROC_WRITE,
	NFSV2PROC_CREATE,
	NFSV2PROC_MKDIR,
	NFSV2PROC_SYMLINK,
	NFSV2PROC_CREATE,
	NFSV2PROC_REMOVE,
	NFSV2PROC_RMDIR,
	NFSV2PROC_RENAME,
	NFSV2PROC_LINK,
	NFSV2PROC_READDIR,
	NFSV2PROC_NOOP,
	NFSV2PROC_STATFS,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
	NFSV2PROC_NOOP,
};

#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
#define	NFSRCUDPHASH(xid) \
	(&nfsrvudphashtbl[nfsrc_hash(xid)])
#define	NFSRCHASH(xid) \
	(&nfsrchash_table[nfsrc_hash(xid)].tbl)
#define	NFSRCAHASH(xid) (&nfsrcahash_table[nfsrc_hash(xid)])
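
/*
 * Worked example (hypothetical xid): nfsrc_hash(0x12345678) evaluates to
 * (0x12345678 + 0x12) % NFSRVCACHE_HASHSIZE, since (0x12345678 >> 24) is
 * 0x12; i.e., the top byte of the xid is folded into the bucket index.
 */
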
#define	TRUE	1
#define	FALSE	0
#define	NFSRVCACHE_CHECKLEN	100

/* True iff the rpc reply is an nfs status ONLY! */
static int nfsv2_repstat[NFS_V3NPROCS] = {
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	TRUE,
	TRUE,
	TRUE,
	TRUE,
	FALSE,
	TRUE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
	FALSE,
};

/*
 * Will NFS want to work over IPv6 someday?
 */
#define	NETFAMILY(rp) \
		(((rp)->rc_flag & RC_INETIPV6) ? AF_INET6 : AF_INET)

/* local functions */
static int nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static int nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp);
static void nfsrc_lock(struct nfsrvcache *rp);
static void nfsrc_unlock(struct nfsrvcache *rp);
static void nfsrc_wanted(struct nfsrvcache *rp);
static void nfsrc_freecache(struct nfsrvcache *rp);
static int nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum);
static void nfsrc_marksametcpconn(u_int64_t);

/*
 * Return the correct mutex for this cache entry.
 */
static __inline struct mtx *
nfsrc_cachemutex(struct nfsrvcache *rp)
{

	if ((rp->rc_flag & RC_UDP) != 0)
		return (&nfsrc_udpmtx);
	return (&nfsrchash_table[nfsrc_hash(rp->rc_xid)].mtx);
}

/*
 * Initialize the server request cache list
 */
APPLESTATIC void
nfsrvd_initcache(void)
{
	int i;
	static int inited = 0;

	if (inited)
		return;
	inited = 1;
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_INIT(&nfsrvudphashtbl[i]);
		LIST_INIT(&nfsrchash_table[i].tbl);
		LIST_INIT(&nfsrcahash_table[i].tbl);
	}
	TAILQ_INIT(&nfsrvudplru);
	nfsrc_tcpsavedreplies = 0;
	nfsrc_udpcachesize = 0;
	nfsstatsv1.srvcache_tcppeak = 0;
	nfsstatsv1.srvcache_size = 0;
}

/*
 * Get a cache entry for this request. Basically just malloc a new one
 * and then call nfsrc_getudp() or nfsrc_gettcp() to do the rest.
 */
APPLESTATIC int
nfsrvd_getcache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *newrp;
	int ret;

	if (nd->nd_procnum == NFSPROC_NULL)
		panic("nfsd cache null");
	MALLOC(newrp, struct nfsrvcache *, sizeof (struct nfsrvcache),
	    M_NFSRVCACHE, M_WAITOK);
	NFSBZERO((caddr_t)newrp, sizeof (struct nfsrvcache));
	if (nd->nd_flag & ND_NFSV4)
		newrp->rc_flag = RC_NFSV4;
	else if (nd->nd_flag & ND_NFSV3)
		newrp->rc_flag = RC_NFSV3;
	else
		newrp->rc_flag = RC_NFSV2;
	newrp->rc_xid = nd->nd_retxid;
	newrp->rc_proc = nd->nd_procnum;
	newrp->rc_sockref = nd->nd_sockref;
	newrp->rc_cachetime = nd->nd_tcpconntime;
	if (nd->nd_flag & ND_SAMETCPCONN)
		newrp->rc_flag |= RC_SAMETCPCONN;
	if (nd->nd_nam2 != NULL) {
		newrp->rc_flag |= RC_UDP;
		ret = nfsrc_getudp(nd, newrp);
	} else {
		ret = nfsrc_gettcp(nd, newrp);
	}
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * For UDP (v2, v3):
 * - key on <xid, NFS version, RPC#, Client host ip#>
 *   (at most one entry for each key)
 */
static int
nfsrc_getudp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp;
	struct sockaddr_in *saddr;
	struct sockaddr_in6 *saddr6;
	struct nfsrvhashhead *hp;
	int ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCUDPHASH(newrp->rc_xid);
loop:
	mtx_lock(mutex);
	LIST_FOREACH(rp, hp, rc_hash) {
	    if (newrp->rc_xid == rp->rc_xid &&
		newrp->rc_proc == rp->rc_proc &&
		(newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
			if ((rp->rc_flag & RC_LOCKED) != 0) {
				rp->rc_flag |= RC_WANTED;
				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
				    "nfsrc", 10 * hz);
				goto loop;
			}
			if (rp->rc_flag == 0)
				panic("nfs udp cache0");
			rp->rc_flag |= RC_LOCKED;
			TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
			if (rp->rc_flag & RC_INPROG) {
				nfsstatsv1.srvcache_inproghits++;
				mtx_unlock(mutex);
				ret = RC_DROPIT;
			} else if (rp->rc_flag & RC_REPSTATUS) {
				/*
				 * V2 only.
				 */
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nfsrvd_rephead(nd);
				*(nd->nd_errp) = rp->rc_status;
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else if (rp->rc_flag & RC_REPMBUF) {
				nfsstatsv1.srvcache_nonidemdonehits++;
				mtx_unlock(mutex);
				nd->nd_mreq = m_copym(rp->rc_reply, 0,
					M_COPYALL, M_WAITOK);
				ret = RC_REPLY;
				rp->rc_timestamp = NFSD_MONOSEC +
					NFSRVCACHE_UDPTIMEOUT;
			} else {
				panic("nfs udp cache1");
			}
			nfsrc_unlock(rp);
			free((caddr_t)newrp, M_NFSRVCACHE);
			goto out;
		}
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);
	nfsrc_udpcachesize++;

	newrp->rc_flag |= RC_INPROG;
	saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
	if (saddr->sin_family == AF_INET)
		newrp->rc_inet = saddr->sin_addr.s_addr;
	else if (saddr->sin_family == AF_INET6) {
		saddr6 = (struct sockaddr_in6 *)saddr;
		NFSBCOPY((caddr_t)&saddr6->sin6_addr, (caddr_t)&newrp->rc_inet6,
		    sizeof (struct in6_addr));
		newrp->rc_flag |= RC_INETIPV6;
	}
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Update a request cache entry after the rpc has been done
 */
APPLESTATIC struct nfsrvcache *
nfsrvd_updatecache(struct nfsrv_descript *nd)
{
	struct nfsrvcache *rp;
	struct nfsrvcache *retrp = NULL;
	mbuf_t m;
	struct mtx *mutex;

	rp = nd->nd_rp;
	if (!rp)
		panic("nfsrvd_updatecache null rp");
	nd->nd_rp = NULL;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	nfsrc_lock(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_updatecache not inprog");
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
	}

	/*
	 * Reply from cache is a special case returned by nfsrv_checkseqid().
	 */
	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
		nfsstatsv1.srvcache_nonidemdonehits++;
		mtx_unlock(mutex);
		nd->nd_repstat = 0;
		mbuf_freem(nd->nd_mreq);
		if (!(rp->rc_flag & RC_REPMBUF))
			panic("reply from cache");
		nd->nd_mreq = m_copym(rp->rc_reply, 0,
		    M_COPYALL, M_WAITOK);
		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		nfsrc_unlock(rp);
		goto out;
	}

	/*
	 * If rc_refcnt > 0, save it
	 * For UDP, save it if ND_SAVEREPLY is set
	 * For TCP, save it if ND_SAVEREPLY and nfsrc_tcpnonidempotent is set
	 */
	if (nd->nd_repstat != NFSERR_DONTREPLY &&
	    (rp->rc_refcnt > 0 ||
	     ((nd->nd_flag & ND_SAVEREPLY) && (rp->rc_flag & RC_UDP)) ||
	     ((nd->nd_flag & ND_SAVEREPLY) && !(rp->rc_flag & RC_UDP) &&
	      nfsrc_tcpsavedreplies <= nfsrc_floodlevel &&
	      nfsrc_tcpnonidempotent))) {
		if (rp->rc_refcnt > 0) {
			if (!(rp->rc_flag & RC_NFSV4))
				panic("update_cache refcnt");
			rp->rc_flag |= RC_REFCNT;
		}
		if ((nd->nd_flag & ND_NFSV2) &&
		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
			rp->rc_status = nd->nd_repstat;
			rp->rc_flag |= RC_REPSTATUS;
			mtx_unlock(mutex);
		} else {
			if (!(rp->rc_flag & RC_UDP)) {
			    atomic_add_int(&nfsrc_tcpsavedreplies, 1);
			    if (nfsrc_tcpsavedreplies >
				nfsstatsv1.srvcache_tcppeak)
				nfsstatsv1.srvcache_tcppeak =
				    nfsrc_tcpsavedreplies;
			}
			mtx_unlock(mutex);
			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
			mtx_lock(mutex);
			rp->rc_reply = m;
			rp->rc_flag |= RC_REPMBUF;
			mtx_unlock(mutex);
		}
		if (rp->rc_flag & RC_UDP) {
			rp->rc_timestamp = NFSD_MONOSEC +
			    NFSRVCACHE_UDPTIMEOUT;
			nfsrc_unlock(rp);
		} else {
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
			if (rp->rc_refcnt > 0)
				nfsrc_unlock(rp);
			else
				retrp = rp;
		}
	} else {
		nfsrc_freecache(rp);
		mtx_unlock(mutex);
	}

out:
	NFSEXITCODE2(0, nd);
	return (retrp);
}

/*
 * Invalidate and, if possible, free an in-progress cache entry.
 * Must not sleep.
 */
APPLESTATIC void
nfsrvd_delcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	if (!(rp->rc_flag & RC_INPROG))
		panic("nfsrvd_delcache not in prog");
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_INPROG;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Called after nfsrvd_updatecache() once the reply is sent, to update
 * the entry's sequence number and unlock it. The argument is
 * the pointer returned by nfsrvd_updatecache().
 */
APPLESTATIC void
nfsrvd_sentcache(struct nfsrvcache *rp, int have_seq, uint32_t seq)
{
	struct nfsrchash_bucket *hbp;

	KASSERT(rp->rc_flag & RC_LOCKED, ("nfsrvd_sentcache not locked"));
	if (have_seq) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		rp->rc_tcpseq = seq;
		if (rp->rc_acked != RC_NO_ACK)
			LIST_INSERT_HEAD(&hbp->tbl, rp, rc_ahash);
		rp->rc_acked = RC_NO_ACK;
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_unlock(rp);
}

/*
 * Get a cache entry for TCP
 * - key on <xid, nfs version>
 *   (allow multiple entries for a given key)
 */
static int
nfsrc_gettcp(struct nfsrv_descript *nd, struct nfsrvcache *newrp)
{
	struct nfsrvcache *rp, *nextrp;
	int i;
	struct nfsrvcache *hitrp;
	struct nfsrvhashhead *hp, nfsrc_templist;
	int hit, ret = 0;
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(newrp);
	hp = NFSRCHASH(newrp->rc_xid);
	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
tryagain:
	mtx_lock(mutex);
	hit = 1;
	LIST_INIT(&nfsrc_templist);
	/*
	 * Get all the matches and put them on the temp list.
	 */
	rp = LIST_FIRST(hp);
	while (rp != NULL) {
		nextrp = LIST_NEXT(rp, rc_hash);
		if (newrp->rc_xid == rp->rc_xid &&
		    (!(rp->rc_flag & RC_INPROG) ||
		     ((newrp->rc_flag & RC_SAMETCPCONN) &&
		      newrp->rc_sockref == rp->rc_sockref)) &&
		    (newrp->rc_flag & rp->rc_flag & RC_NFSVERS) &&
		    newrp->rc_proc == rp->rc_proc &&
		    ((newrp->rc_flag & RC_NFSV4) &&
		     newrp->rc_sockref != rp->rc_sockref &&
		     newrp->rc_cachetime >= rp->rc_cachetime)
		    && newrp->rc_reqlen == rp->rc_reqlen &&
		    newrp->rc_cksum == rp->rc_cksum) {
			LIST_REMOVE(rp, rc_hash);
			LIST_INSERT_HEAD(&nfsrc_templist, rp, rc_hash);
		}
		rp = nextrp;
	}

	/*
	 * Now, use nfsrc_templist to decide if there is a match.
	 */
	i = 0;
	LIST_FOREACH(rp, &nfsrc_templist, rc_hash) {
		i++;
		if (rp->rc_refcnt > 0) {
			hit = 0;
			break;
		}
	}
	/*
	 * Can be a hit only if one entry left.
	 * Note possible hit entry and put nfsrc_templist back on hash
	 * list.
	 */
	if (i != 1)
		hit = 0;
	hitrp = rp = LIST_FIRST(&nfsrc_templist);
	while (rp != NULL) {
		nextrp = LIST_NEXT(rp, rc_hash);
		LIST_REMOVE(rp, rc_hash);
		LIST_INSERT_HEAD(hp, rp, rc_hash);
		rp = nextrp;
	}
	if (LIST_FIRST(&nfsrc_templist) != NULL)
		panic("nfs gettcp cache templist");

	if (hit) {
		rp = hitrp;
		if ((rp->rc_flag & RC_LOCKED) != 0) {
			rp->rc_flag |= RC_WANTED;
			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
			    "nfsrc", 10 * hz);
			goto tryagain;
		}
		if (rp->rc_flag == 0)
			panic("nfs tcp cache0");
		rp->rc_flag |= RC_LOCKED;
		if (rp->rc_flag & RC_INPROG) {
			nfsstatsv1.srvcache_inproghits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_DROPIT;
		} else if (rp->rc_flag & RC_REPSTATUS) {
			/*
			 * V2 only.
			 */
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nfsrvd_rephead(nd);
			*(nd->nd_errp) = rp->rc_status;
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else if (rp->rc_flag & RC_REPMBUF) {
			nfsstatsv1.srvcache_nonidemdonehits++;
			mtx_unlock(mutex);
			if (newrp->rc_sockref == rp->rc_sockref)
				nfsrc_marksametcpconn(rp->rc_sockref);
			ret = RC_REPLY;
			nd->nd_mreq = m_copym(rp->rc_reply, 0,
				M_COPYALL, M_WAITOK);
			rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
		} else {
			panic("nfs tcp cache1");
		}
		nfsrc_unlock(rp);
		free((caddr_t)newrp, M_NFSRVCACHE);
		goto out;
	}
	nfsstatsv1.srvcache_misses++;
	atomic_add_int(&nfsstatsv1.srvcache_size, 1);

	/*
	 * For TCP, multiple entries for a key are allowed, so don't
	 * chain it into the hash table until done.
	 */
	newrp->rc_cachetime = NFSD_MONOSEC;
	newrp->rc_flag |= RC_INPROG;
	LIST_INSERT_HEAD(hp, newrp, rc_hash);
	mtx_unlock(mutex);
	nd->nd_rp = newrp;
	ret = RC_DOIT;

out:
	NFSEXITCODE2(0, nd);
	return (ret);
}

/*
 * Lock a cache entry.
 */
static void
nfsrc_lock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_assert(mutex, MA_OWNED);
	while ((rp->rc_flag & RC_LOCKED) != 0) {
		rp->rc_flag |= RC_WANTED;
		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
	}
	rp->rc_flag |= RC_LOCKED;
}

/*
 * Unlock a cache entry.
 */
static void
nfsrc_unlock(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	rp->rc_flag &= ~RC_LOCKED;
	nfsrc_wanted(rp);
	mtx_unlock(mutex);
}

/*
 * Wakeup anyone wanting entry.
 */
static void
nfsrc_wanted(struct nfsrvcache *rp)
{
	if (rp->rc_flag & RC_WANTED) {
		rp->rc_flag &= ~RC_WANTED;
		wakeup((caddr_t)rp);
	}
}

/*
 * Free up the entry.
 * Must not sleep.
 */
static void
nfsrc_freecache(struct nfsrvcache *rp)
{
	struct nfsrchash_bucket *hbp;

	LIST_REMOVE(rp, rc_hash);
	if (rp->rc_flag & RC_UDP) {
		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
		nfsrc_udpcachesize--;
	} else if (rp->rc_acked != RC_NO_SEQ) {
		hbp = NFSRCAHASH(rp->rc_sockref);
		mtx_lock(&hbp->mtx);
		if (rp->rc_acked == RC_NO_ACK)
			LIST_REMOVE(rp, rc_ahash);
		mtx_unlock(&hbp->mtx);
	}
	nfsrc_wanted(rp);
	if (rp->rc_flag & RC_REPMBUF) {
		mbuf_freem(rp->rc_reply);
		if (!(rp->rc_flag & RC_UDP))
			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
	}
	FREE((caddr_t)rp, M_NFSRVCACHE);
	atomic_add_int(&nfsstatsv1.srvcache_size, -1);
}

/*
 * Clean out the cache. Called when nfsserver module is unloaded.
 */
APPLESTATIC void
nfsrvd_cleancache(void)
{
	struct nfsrvcache *rp, *nextrp;
	int i;

	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		mtx_lock(&nfsrchash_table[i].mtx);
		LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash, nextrp)
			nfsrc_freecache(rp);
		mtx_unlock(&nfsrchash_table[i].mtx);
	}
	mtx_lock(&nfsrc_udpmtx);
	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
			nfsrc_freecache(rp);
		}
	}
	nfsstatsv1.srvcache_size = 0;
	mtx_unlock(&nfsrc_udpmtx);
	nfsrc_tcpsavedreplies = 0;
}

#define HISTSIZE	16
/*
 * The basic rule is to get rid of entries that are expired.
 */
void
nfsrc_trimcache(u_int64_t sockref, uint32_t snd_una, int final)
{
	struct nfsrchash_bucket *hbp;
	struct nfsrvcache *rp, *nextrp;
	int force, lastslot, i, j, k, tto, time_histo[HISTSIZE];
	time_t thisstamp;
	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
	static int onethread = 0, oneslot = 0;

	if (sockref != 0) {
		hbp = NFSRCAHASH(sockref);
		mtx_lock(&hbp->mtx);
		LIST_FOREACH_SAFE(rp, &hbp->tbl, rc_ahash, nextrp) {
			if (sockref == rp->rc_sockref) {
				if (SEQ_GEQ(snd_una, rp->rc_tcpseq)) {
					rp->rc_acked = RC_ACK;
					LIST_REMOVE(rp, rc_ahash);
				} else if (final) {
					rp->rc_acked = RC_NACK;
					LIST_REMOVE(rp, rc_ahash);
				}
			}
		}
		mtx_unlock(&hbp->mtx);
	}

	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
		return;
	if (NFSD_MONOSEC != udp_lasttrim ||
	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
	    nfsrc_udphighwater / 2)) {
		mtx_lock(&nfsrc_udpmtx);
		udp_lasttrim = NFSD_MONOSEC;
		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
			     && rp->rc_refcnt == 0
			     && ((rp->rc_flag & RC_REFCNT) ||
				 udp_lasttrim > rp->rc_timestamp ||
				 nfsrc_udpcachesize > nfsrc_udphighwater))
				nfsrc_freecache(rp);
		}
		mtx_unlock(&nfsrc_udpmtx);
	}
	if (NFSD_MONOSEC != tcp_lasttrim ||
	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
		force = nfsrc_tcphighwater / 4;
		if (force > 0 &&
		    nfsrc_tcpsavedreplies + force >= nfsrc_tcphighwater) {
			for (i = 0; i < HISTSIZE; i++)
				time_histo[i] = 0;
			i = 0;
			lastslot = NFSRVCACHE_HASHSIZE - 1;
		} else {
			force = 0;
			if (NFSD_MONOSEC != tcp_lasttrim) {
				i = 0;
				lastslot = NFSRVCACHE_HASHSIZE - 1;
			} else {
				lastslot = i = oneslot;
				if (++oneslot >= NFSRVCACHE_HASHSIZE)
					oneslot = 0;
			}
		}
		tto = nfsrc_tcptimeout;
		tcp_lasttrim = NFSD_MONOSEC;
		for (; i <= lastslot; i++) {
			mtx_lock(&nfsrchash_table[i].mtx);
			LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl, rc_hash,
			    nextrp) {
				if (!(rp->rc_flag &
				     (RC_INPROG|RC_LOCKED|RC_WANTED))
				     && rp->rc_refcnt == 0) {
					if ((rp->rc_flag & RC_REFCNT) ||
					    tcp_lasttrim > rp->rc_timestamp ||
					    rp->rc_acked == RC_ACK) {
						nfsrc_freecache(rp);
						continue;
					}

					if (force == 0)
						continue;
					/*
					 * The timestamps range from roughly the
					 * present (tcp_lasttrim) to the present
					 * + nfsrc_tcptimeout. Generate a simple
					 * histogram of where the timeouts fall.
					 */
					j = rp->rc_timestamp - tcp_lasttrim;
					if (j >= tto)
						j = HISTSIZE - 1;
					else if (j < 0)
						j = 0;
					else
						j = j * HISTSIZE / tto;
					time_histo[j]++;
				}
			}
			mtx_unlock(&nfsrchash_table[i].mtx);
		}
		if (force) {
			/*
			 * Trim some more with a smaller timeout of as little
			 * as 20% of nfsrc_tcptimeout to try and get below
			 * 80% of the nfsrc_tcphighwater.
			 */
			k = 0;
			for (i = 0; i < (HISTSIZE - 2); i++) {
				k += time_histo[i];
				if (k > force)
					break;
			}
			k = tto * (i + 1) / HISTSIZE;
			if (k < 1)
				k = 1;
			thisstamp = tcp_lasttrim + k;
			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
				mtx_lock(&nfsrchash_table[i].mtx);
				LIST_FOREACH_SAFE(rp, &nfsrchash_table[i].tbl,
				    rc_hash, nextrp) {
					if (!(rp->rc_flag &
					     (RC_INPROG|RC_LOCKED|RC_WANTED))
					     && rp->rc_refcnt == 0
					     && ((rp->rc_flag & RC_REFCNT) ||
						 thisstamp > rp->rc_timestamp ||
						 rp->rc_acked == RC_ACK))
						nfsrc_freecache(rp);
				}
				mtx_unlock(&nfsrchash_table[i].mtx);
			}
		}
	}
	atomic_store_rel_int(&onethread, 0);
}

/*
 * Add a seqid# reference to the cache entry.
 */
APPLESTATIC void
nfsrvd_refcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	if (rp == NULL)
		/* For NFSv4.1, there is no cache entry. */
		return;
	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt < 0)
		panic("nfs cache refcnt");
	rp->rc_refcnt++;
	mtx_unlock(mutex);
}

/*
 * Dereference a seqid# cache entry.
 */
APPLESTATIC void
nfsrvd_derefcache(struct nfsrvcache *rp)
{
	struct mtx *mutex;

	mutex = nfsrc_cachemutex(rp);
	mtx_lock(mutex);
	if (rp->rc_refcnt <= 0)
		panic("nfs cache derefcnt");
	rp->rc_refcnt--;
	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
		nfsrc_freecache(rp);
	mtx_unlock(mutex);
}

/*
 * Calculate the length of the mbuf list and a checksum on the first up to
 * NFSRVCACHE_CHECKLEN bytes.
 */
static int
nfsrc_getlenandcksum(mbuf_t m1, u_int16_t *cksum)
{
	int len = 0, cklen;
	mbuf_t m;

	m = m1;
	while (m) {
		len += mbuf_len(m);
		m = mbuf_next(m);
	}
	cklen = (len > NFSRVCACHE_CHECKLEN) ? NFSRVCACHE_CHECKLEN : len;
	*cksum = in_cksum(m1, cklen);
	return (len);
}

/*
 * Mark a TCP connection that is seeing retries. Should never happen for
 * NFSv4.
 */
static void
nfsrc_marksametcpconn(u_int64_t sockref)
{
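	/* Currently a no-op. */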
}
   1039