Home | History | Annotate | Line # | Download | only in netinet
tcp_sack.c revision 1.32
      1 /* $NetBSD: tcp_sack.c,v 1.32 2015/08/24 22:21:26 pooka Exp $ */
      2 
      3 /*
      4  * Copyright (c) 2005 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Kentaro A. Kurahone.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
     34  *	The Regents of the University of California.  All rights reserved.
     35  *
     36  * Redistribution and use in source and binary forms, with or without
     37  * modification, are permitted provided that the following conditions
     38  * are met:
     39  * 1. Redistributions of source code must retain the above copyright
     40  *    notice, this list of conditions and the following disclaimer.
     41  * 2. Redistributions in binary form must reproduce the above copyright
     42  *    notice, this list of conditions and the following disclaimer in the
     43  *    documentation and/or other materials provided with the distribution.
     44  * 4. Neither the name of the University nor the names of its contributors
     45  *    may be used to endorse or promote products derived from this software
     46  *    without specific prior written permission.
     47  *
     48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     58  * SUCH DAMAGE.
     59  *
     60  *	@(#)tcp_sack.c	8.12 (Berkeley) 5/24/95
     61  * $FreeBSD: src/sys/netinet/tcp_sack.c,v 1.3.2.2 2004/12/25 23:02:57 rwatson Exp $
     62  */
     63 
     64 /*
     65  *	@@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
     66  *
     67  * NRL grants permission for redistribution and use in source and binary
     68  * forms, with or without modification, of the software and documentation
     69  * created at NRL provided that the following conditions are met:
     70  *
     71  * 1. Redistributions of source code must retain the above copyright
     72  *    notice, this list of conditions and the following disclaimer.
     73  * 2. Redistributions in binary form must reproduce the above copyright
     74  *    notice, this list of conditions and the following disclaimer in the
     75  *    documentation and/or other materials provided with the distribution.
     76  * 3. All advertising materials mentioning features or use of this software
     77  *    must display the following acknowledgements:
     78  *	This product includes software developed by the University of
     79  *	California, Berkeley and its contributors.
     80  *	This product includes software developed at the Information
     81  *	Technology Division, US Naval Research Laboratory.
     82  * 4. Neither the name of the NRL nor the names of its contributors
     83  *    may be used to endorse or promote products derived from this software
     84  *    without specific prior written permission.
     85  *
     86  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
     87  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     88  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
     89  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
     90  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     91  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     92  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     93  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     94  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     95  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     96  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     97  *
     98  * The views and conclusions contained in the software and documentation
     99  * are those of the authors and should not be interpreted as representing
    100  * official policies, either expressed or implied, of the US Naval
    101  * Research Laboratory (NRL).
    102  */
    103 
    104 #include <sys/cdefs.h>
    105 __KERNEL_RCSID(0, "$NetBSD: tcp_sack.c,v 1.32 2015/08/24 22:21:26 pooka Exp $");
    106 
    107 #ifdef _KERNEL_OPT
    108 #include "opt_inet.h"
    109 #include "opt_inet_csum.h"
    110 #include "opt_tcp_debug.h"
    111 #include "opt_ddb.h"
    112 #endif
    113 
    114 #include <sys/param.h>
    115 #include <sys/systm.h>
    116 #include <sys/mbuf.h>
    117 #include <sys/protosw.h>
    118 #include <sys/socket.h>
    119 #include <sys/socketvar.h>
    120 #include <sys/errno.h>
    121 #include <sys/syslog.h>
    122 #include <sys/pool.h>
    123 #include <sys/domain.h>
    124 #include <sys/kernel.h>
    125 
    126 #include <net/if.h>
    127 #include <net/route.h>
    128 #include <net/if_types.h>
    129 
    130 #include <netinet/in.h>
    131 #include <netinet/in_systm.h>
    132 #include <netinet/ip.h>
    133 #include <netinet/in_pcb.h>
    134 #include <netinet/in_var.h>
    135 #include <netinet/ip_var.h>
    136 
    137 #ifdef INET6
    138 #ifndef INET
    139 #include <netinet/in.h>
    140 #endif
    141 #include <netinet/ip6.h>
    142 #include <netinet6/ip6_var.h>
    143 #include <netinet6/in6_pcb.h>
    144 #include <netinet6/ip6_var.h>
    145 #include <netinet6/in6_var.h>
    146 #include <netinet/icmp6.h>
    147 #include <netinet6/nd6.h>
    148 #endif
    149 
    150 #ifndef INET6
    151 /* always need ip6.h for IP6_EXTHDR_GET */
    152 #include <netinet/ip6.h>
    153 #endif
    154 
    155 #include <netinet/tcp.h>
    156 #include <netinet/tcp_fsm.h>
    157 #include <netinet/tcp_seq.h>
    158 #include <netinet/tcp_timer.h>
    159 #include <netinet/tcp_var.h>
    160 #include <netinet/tcpip.h>
    161 #include <netinet/tcp_debug.h>
    162 
/*
 * SACK block pool.
 *
 * Backing store for every struct sackhole in this file: initialized by
 * tcp_sack_init(), allocated from in sack_allochole() and returned to in
 * sack_removehole().
 */
static struct pool sackhole_pool;
    165 
    166 void
    167 tcp_sack_init(void)
    168 {
    169 
    170 	pool_init(&sackhole_pool, sizeof(struct sackhole), 0, 0, 0,
    171 	    "sackholepl", NULL, IPL_SOFTNET);
    172 }
    173 
    174 static struct sackhole *
    175 sack_allochole(struct tcpcb *tp)
    176 {
    177 	struct sackhole *hole;
    178 
    179 	if (tp->snd_numholes >= tcp_sack_tp_maxholes ||
    180 	    tcp_sack_globalholes >= tcp_sack_globalmaxholes) {
    181 		return NULL;
    182 	}
    183 	hole = pool_get(&sackhole_pool, PR_NOWAIT);
    184 	if (hole == NULL) {
    185 		return NULL;
    186 	}
    187 	tp->snd_numholes++;
    188 	tcp_sack_globalholes++;
    189 
    190 	return hole;
    191 }
    192 
    193 static struct sackhole *
    194 sack_inserthole(struct tcpcb *tp, tcp_seq start, tcp_seq end,
    195     struct sackhole *prev)
    196 {
    197 	struct sackhole *hole;
    198 
    199 	hole = sack_allochole(tp);
    200 	if (hole == NULL) {
    201 		return NULL;
    202 	}
    203 	hole->start = hole->rxmit = start;
    204 	hole->end = end;
    205 	if (prev != NULL) {
    206 		TAILQ_INSERT_AFTER(&tp->snd_holes, prev, hole, sackhole_q);
    207 	} else {
    208 		TAILQ_INSERT_TAIL(&tp->snd_holes, hole, sackhole_q);
    209 	}
    210 	return hole;
    211 }
    212 
    213 static struct sackhole *
    214 sack_removehole(struct tcpcb *tp, struct sackhole *hole)
    215 {
    216 	struct sackhole *next;
    217 
    218 	next = TAILQ_NEXT(hole, sackhole_q);
    219 	tp->snd_numholes--;
    220 	tcp_sack_globalholes--;
    221 	TAILQ_REMOVE(&tp->snd_holes, hole, sackhole_q);
    222 	pool_put(&sackhole_pool, hole);
    223 
    224 	return next;
    225 }
    226 
    227 /*
    228  * tcp_new_dsack: record the reception of a duplicated segment.
    229  */
    230 
    231 void
    232 tcp_new_dsack(struct tcpcb *tp, tcp_seq seq, u_int32_t len)
    233 {
    234 
    235 	if (TCP_SACK_ENABLED(tp)) {
    236 		tp->rcv_dsack_block.left = seq;
    237 		tp->rcv_dsack_block.right = seq + len;
    238 		tp->rcv_sack_flags |= TCPSACK_HAVED;
    239 	}
    240 }
    241 
    242 /*
    243  * tcp_sack_option: parse the given SACK option and update the scoreboard.
    244  */
    245 
    246 void
    247 tcp_sack_option(struct tcpcb *tp, const struct tcphdr *th, const u_char *cp,
    248     int optlen)
    249 {
    250 	struct sackblk
    251 	    t_sack_block[(MAX_TCPOPTLEN - 2) / (sizeof(u_int32_t) * 2)];
    252 	struct sackblk *sack = NULL;
    253 	struct sackhole *cur = NULL;
    254 	struct sackhole *tmp = NULL;
    255 	const char *lp = cp + 2;
    256 	int i, j, num_sack_blks;
    257 	tcp_seq left, right, acked;
    258 
    259 	/*
    260 	 * If we aren't processing SACK responses, this is not an ACK
    261 	 * or the peer sends us a sack option with invalid length, don't
    262 	 * update the scoreboard.
    263 	 */
    264 	if (!TCP_SACK_ENABLED(tp) || ((th->th_flags & TH_ACK) == 0) ||
    265 			(optlen % 8 != 2 || optlen < 10)) {
    266 		return;
    267 	}
    268 
    269 	/*
    270 	 * If we don't want any SACK holes to be allocated, just return.
    271 	 */
    272 	if (tcp_sack_globalmaxholes == 0 || tcp_sack_tp_maxholes == 0) {
    273 		return;
    274 	}
    275 
    276 	/* If the ACK is outside [snd_una, snd_max], ignore the SACK options. */
    277 	if (SEQ_LT(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))
    278 		return;
    279 
    280 	/*
    281 	 * Extract SACK blocks.
    282 	 *
    283 	 * Note that t_sack_block is sorted so that we only need to do
    284 	 * one pass over the sequence number space. (SACK "fast-path")
    285 	 */
    286 	num_sack_blks = optlen / 8;
    287 	acked = (SEQ_GT(th->th_ack, tp->snd_una)) ? th->th_ack : tp->snd_una;
    288 	for (i = 0; i < num_sack_blks; i++, lp += sizeof(uint32_t) * 2) {
    289 		memcpy(&left, lp, sizeof(uint32_t));
    290 		memcpy(&right, lp + sizeof(uint32_t), sizeof(uint32_t));
    291 		left = ntohl(left);
    292 		right = ntohl(right);
    293 
    294 		if (SEQ_LEQ(right, acked) || SEQ_GT(right, tp->snd_max) ||
    295 		    SEQ_GEQ(left, right)) {
    296 			/* SACK entry that's old, or invalid. */
    297 			i--;
    298 			num_sack_blks--;
    299 			continue;
    300 		}
    301 
    302 		/* Insertion sort. */
    303 		for (j = i; (j > 0) && SEQ_LT(left, t_sack_block[j - 1].left);
    304 		    j--) {
    305 			t_sack_block[j].left = t_sack_block[j - 1].left;
    306 			t_sack_block[j].right = t_sack_block[j - 1].right;
    307 		}
    308 		t_sack_block[j].left = left;
    309 		t_sack_block[j].right = right;
    310 	}
    311 
    312 	/* Update the scoreboard. */
    313 	cur = TAILQ_FIRST(&tp->snd_holes);
    314 	for (i = 0; i < num_sack_blks; i++) {
    315 		sack = &t_sack_block[i];
    316 		/*
    317 		 * FACK TCP.  Update snd_fack so we can enter Fast
    318 		 * Recovery early.
    319 		 */
    320 		if (SEQ_GEQ(sack->right, tp->snd_fack))
    321 			tp->snd_fack = sack->right;
    322 
    323 		if (TAILQ_EMPTY(&tp->snd_holes)) {
    324 			/* First hole. */
    325 			cur = sack_inserthole(tp, th->th_ack, sack->left, NULL);
    326 			if (cur == NULL) {
    327 				/* ENOBUFS, bail out*/
    328 				return;
    329 			}
    330 			tp->rcv_lastsack = sack->right;
    331 			continue; /* With next sack block */
    332 		}
    333 
    334 		/* Go through the list of holes. */
    335 		while (cur) {
    336 			if (SEQ_LEQ(sack->right, cur->start))
    337 				/* SACKs data before the current hole */
    338 				break; /* No use going through more holes */
    339 
    340 			if (SEQ_GEQ(sack->left, cur->end)) {
    341 				/* SACKs data beyond the current hole */
    342 				cur = TAILQ_NEXT(cur, sackhole_q);
    343 				continue;
    344 			}
    345 
    346 			if (SEQ_LEQ(sack->left, cur->start)) {
    347 				/* Data acks at least the beginning of hole */
    348 				if (SEQ_GEQ(sack->right, cur->end)) {
    349 					/* Acks entire hole, so delete hole */
    350 					cur = sack_removehole(tp, cur);
    351 					break;
    352 				}
    353 
    354 				/* Otherwise, move start of hole forward */
    355 				cur->start = sack->right;
    356 				cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
    357 				break;
    358 			}
    359 
    360 			if (SEQ_GEQ(sack->right, cur->end)) {
    361 				/* Move end of hole backward. */
    362 				cur->end = sack->left;
    363 				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
    364 				cur = TAILQ_NEXT(cur, sackhole_q);
    365 				break;
    366 			}
    367 
    368 			if (SEQ_LT(cur->start, sack->left) &&
    369 			    SEQ_GT(cur->end, sack->right)) {
    370 				/*
    371 				 * ACKs some data in middle of a hole; need to
    372 				 * split current hole
    373 				 */
    374 				tmp = sack_inserthole(tp, sack->right, cur->end,
    375 				    cur);
    376 				if (tmp == NULL) {
    377 					return;
    378 				}
    379 				tmp->rxmit = SEQ_MAX(cur->rxmit, tmp->start);
    380 				cur->end = sack->left;
    381 				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
    382 				cur = tmp;
    383 				break;
    384 			}
    385 		}
    386 
    387 		/* At this point, we have reached the tail of the list. */
    388 		if (SEQ_LT(tp->rcv_lastsack, sack->left)) {
    389 			/*
    390 			 * Need to append new hole at end.
    391 			 */
    392 			cur = sack_inserthole(tp, tp->rcv_lastsack, sack->left,
    393 			    NULL);
    394 			if (cur == NULL) {
    395 				return;
    396 			}
    397 		}
    398 		if (SEQ_LT(tp->rcv_lastsack, sack->right)) {
    399 			tp->rcv_lastsack = sack->right;
    400 		}
    401 	}
    402 }
    403 
    404 /*
    405  * tcp_del_sackholes: remove holes covered by a cumulative ACK.
    406  */
    407 
    408 void
    409 tcp_del_sackholes(struct tcpcb *tp, const struct tcphdr *th)
    410 {
    411 	/* Max because this could be an older ack that just arrived. */
    412 	tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
    413 		th->th_ack : tp->snd_una;
    414 	struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes);
    415 
    416 	while (cur) {
    417 		if (SEQ_LEQ(cur->end, lastack)) {
    418 			cur = sack_removehole(tp, cur);
    419 		} else if (SEQ_LT(cur->start, lastack)) {
    420 			cur->start = lastack;
    421 			if (SEQ_LT(cur->rxmit, cur->start))
    422 				cur->rxmit = cur->start;
    423 			break;
    424 		} else
    425 			break;
    426 	}
    427 }
    428 
    429 /*
    430  * tcp_free_sackholes: clear the scoreboard.
    431  */
    432 
    433 void
    434 tcp_free_sackholes(struct tcpcb *tp)
    435 {
    436 	struct sackhole *sack;
    437 
    438 	/* Free up the SACK hole list. */
    439 	while ((sack = TAILQ_FIRST(&tp->snd_holes)) != NULL) {
    440 		sack_removehole(tp, sack);
    441 	}
    442 	KASSERT(tp->snd_numholes == 0);
    443 }
    444 
    445 /*
    446  * Returns pointer to a sackhole if there are any pending retransmissions;
    447  * NULL otherwise.
    448  */
    449 struct sackhole *
    450 tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt)
    451 {
    452 	struct sackhole *cur = NULL;
    453 
    454 	if (!TCP_SACK_ENABLED(tp))
    455 		return (NULL);
    456 
    457 	*sack_bytes_rexmt = 0;
    458 	TAILQ_FOREACH(cur, &tp->snd_holes, sackhole_q) {
    459 		if (SEQ_LT(cur->rxmit, cur->end)) {
    460 			if (SEQ_LT(cur->rxmit, tp->snd_una)) {
    461 				/* old SACK hole */
    462 				continue;
    463 			}
    464 			*sack_bytes_rexmt += (cur->rxmit - cur->start);
    465 			break;
    466 		}
    467 		*sack_bytes_rexmt += (cur->rxmit - cur->start);
    468 	}
    469 
    470 	return (cur);
    471 }
    472 
    473 /*
    474  * After a timeout, the SACK list may be rebuilt.  This SACK information
    475  * should be used to avoid retransmitting SACKed data.  This function
    476  * traverses the SACK list to see if snd_nxt should be moved forward.
    477  */
    478 void
    479 tcp_sack_adjust(struct tcpcb *tp)
    480 {
    481 	struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes);
    482 	struct sackhole *n = NULL;
    483 
    484 	if (TAILQ_EMPTY(&tp->snd_holes))
    485 		return; /* No holes */
    486 	if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack))
    487 		return; /* We're already beyond any SACKed blocks */
    488 
    489 	/*
    490 	 * Two cases for which we want to advance snd_nxt:
    491 	 * i) snd_nxt lies between end of one hole and beginning of another
    492 	 * ii) snd_nxt lies between end of last hole and rcv_lastsack
    493 	 */
    494 	while ((n = TAILQ_NEXT(cur, sackhole_q)) != NULL) {
    495 		if (SEQ_LT(tp->snd_nxt, cur->end))
    496 			return;
    497 		if (SEQ_GEQ(tp->snd_nxt, n->start))
    498 			cur = n;
    499 		else {
    500 			tp->snd_nxt = n->start;
    501 			return;
    502 		}
    503 	}
    504 	if (SEQ_LT(tp->snd_nxt, cur->end))
    505 		return;
    506 	tp->snd_nxt = tp->rcv_lastsack;
    507 
    508 	return;
    509 }
    510 
    511 /*
    512  * tcp_sack_numblks: return the number of SACK blocks to send.
    513  */
    514 
    515 int
    516 tcp_sack_numblks(const struct tcpcb *tp)
    517 {
    518 	int numblks;
    519 
    520 	if (!TCP_SACK_ENABLED(tp)) {
    521 		return 0;
    522 	}
    523 
    524 	numblks = (((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) ? 1 : 0) +
    525 	    tp->t_segqlen;
    526 
    527 	if (numblks == 0) {
    528 		return 0;
    529 	}
    530 
    531 	if (numblks > TCP_SACK_MAX) {
    532 		numblks = TCP_SACK_MAX;
    533 	}
    534 
    535 	return numblks;
    536 }
    537 
#if defined(DDB)
void sack_dump(const struct tcpcb *);

/*
 * sack_dump: print tp's SACK sender state.
 *
 * Prints snd_una, snd_max, rcv_lastsack, snd_fack and the hole count,
 * followed by one line per hole giving its start, end and rxmit values.
 * Only built when DDB is defined.
 */
void
sack_dump(const struct tcpcb *tp)
{
	const struct sackhole *hole;

	printf("snd_una=%" PRIu32 ", snd_max=%" PRIu32 "\n",
	    tp->snd_una, tp->snd_max);
	printf("rcv_lastsack=%" PRIu32 ", snd_fack=%" PRIu32 "\n",
	    tp->rcv_lastsack, tp->snd_fack);
	printf("numholes=%d\n", tp->snd_numholes);

	/* Walk the hole list from head to tail. */
	for (hole = TAILQ_FIRST(&tp->snd_holes); hole != NULL;
	    hole = TAILQ_NEXT(hole, sackhole_q)) {
		printf("\t%" PRIu32 "-%" PRIu32 ", rxmit=%" PRIu32 "\n",
		    hole->start, hole->end, hole->rxmit);
	}
}
#endif /* defined(DDB) */
    557