tcp_sack.c revision 1.9 1 /* $NetBSD: tcp_sack.c,v 1.9 2005/03/16 00:38:27 yamt Exp $ */
2
3 /*
4 * Copyright (c) 2005 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Kentaro A. Kurahone.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the NetBSD
21 * Foundation, Inc. and its contributors.
22 * 4. Neither the name of The NetBSD Foundation nor the names of its
23 * contributors may be used to endorse or promote products derived
24 * from this software without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
37 */
38
39 /*
40 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
41 * The Regents of the University of California. All rights reserved.
42 *
43 * Redistribution and use in source and binary forms, with or without
44 * modification, are permitted provided that the following conditions
45 * are met:
46 * 1. Redistributions of source code must retain the above copyright
47 * notice, this list of conditions and the following disclaimer.
48 * 2. Redistributions in binary form must reproduce the above copyright
49 * notice, this list of conditions and the following disclaimer in the
50 * documentation and/or other materials provided with the distribution.
51 * 4. Neither the name of the University nor the names of its contributors
52 * may be used to endorse or promote products derived from this software
53 * without specific prior written permission.
54 *
55 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
56 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
57 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
58 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
59 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
60 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
61 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
62 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
63 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65 * SUCH DAMAGE.
66 *
67 * @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95
68 * $FreeBSD: src/sys/netinet/tcp_sack.c,v 1.3.2.2 2004/12/25 23:02:57 rwatson Exp $
69 */
70
71 /*
72 * @@(#)COPYRIGHT 1.1 (NRL) 17 January 1995
73 *
74 * NRL grants permission for redistribution and use in source and binary
75 * forms, with or without modification, of the software and documentation
76 * created at NRL provided that the following conditions are met:
77 *
78 * 1. Redistributions of source code must retain the above copyright
79 * notice, this list of conditions and the following disclaimer.
80 * 2. Redistributions in binary form must reproduce the above copyright
81 * notice, this list of conditions and the following disclaimer in the
82 * documentation and/or other materials provided with the distribution.
83 * 3. All advertising materials mentioning features or use of this software
84 * must display the following acknowledgements:
85 * This product includes software developed by the University of
86 * California, Berkeley and its contributors.
87 * This product includes software developed at the Information
88 * Technology Division, US Naval Research Laboratory.
89 * 4. Neither the name of the NRL nor the names of its contributors
90 * may be used to endorse or promote products derived from this software
91 * without specific prior written permission.
92 *
93 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
94 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
95 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
96 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
97 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
98 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
99 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
100 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
101 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
102 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
103 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
104 *
105 * The views and conclusions contained in the software and documentation
106 * are those of the authors and should not be interpreted as representing
107 * official policies, either expressed or implied, of the US Naval
108 * Research Laboratory (NRL).
109 */
110
111 #include <sys/cdefs.h>
112 __KERNEL_RCSID(0, "$NetBSD: tcp_sack.c,v 1.9 2005/03/16 00:38:27 yamt Exp $");
113
114 #include "opt_inet.h"
115 #include "opt_ipsec.h"
116 #include "opt_inet_csum.h"
117 #include "opt_tcp_debug.h"
118
119 #include <sys/param.h>
120 #include <sys/systm.h>
121 #include <sys/malloc.h>
122 #include <sys/mbuf.h>
123 #include <sys/protosw.h>
124 #include <sys/socket.h>
125 #include <sys/socketvar.h>
126 #include <sys/errno.h>
127 #include <sys/syslog.h>
128 #include <sys/pool.h>
129 #include <sys/domain.h>
130 #include <sys/kernel.h>
131
132 #include <net/if.h>
133 #include <net/route.h>
134 #include <net/if_types.h>
135
136 #include <netinet/in.h>
137 #include <netinet/in_systm.h>
138 #include <netinet/ip.h>
139 #include <netinet/in_pcb.h>
140 #include <netinet/in_var.h>
141 #include <netinet/ip_var.h>
142
143 #ifdef INET6
144 #ifndef INET
145 #include <netinet/in.h>
146 #endif
147 #include <netinet/ip6.h>
148 #include <netinet6/ip6_var.h>
149 #include <netinet6/in6_pcb.h>
150 #include <netinet6/ip6_var.h>
151 #include <netinet6/in6_var.h>
152 #include <netinet/icmp6.h>
153 #include <netinet6/nd6.h>
154 #endif
155
156 #ifndef INET6
157 /* always need ip6.h for IP6_EXTHDR_GET */
158 #include <netinet/ip6.h>
159 #endif
160
161 #include <netinet/tcp.h>
162 #include <netinet/tcp_fsm.h>
163 #include <netinet/tcp_seq.h>
164 #include <netinet/tcp_timer.h>
165 #include <netinet/tcp_var.h>
166 #include <netinet/tcpip.h>
167 #include <netinet/tcp_debug.h>
168
169 #include <machine/stdarg.h>
170
171 #define SEQ_MIN(a, b) ((SEQ_LT(a, b)) ? (a) : (b))
172 #define SEQ_MAX(a, b) ((SEQ_GT(a, b)) ? (a) : (b))
173
174 /* SACK block pool. */
175 POOL_INIT(sackhole_pool, sizeof(struct sackhole), 0, 0, 0, "sackholepl", NULL);
176
177 void
178 tcp_update_sack_list(struct tcpcb *tp)
179 {
180 int i = 0;
181 struct ipqent *tiqe = NULL;
182
183 if (!TCP_SACK_ENABLED(tp) || (tp->t_flags & TF_SIGNATURE)) {
184 /* Can't SACK this connection. */
185 return;
186 }
187
188 /*
189 * If possible, tack on the D-SACK block. (RFC2883)
190 */
191 if (tp->rcv_sack_flags & TCPSACK_HAVED) {
192 tp->rcv_sack_block[0].left = tp->rcv_dsack_block.left;
193 tp->rcv_sack_block[0].right = tp->rcv_dsack_block.right;
194 i++;
195 }
196
197 /*
198 * Build up a list of holes in the TCP space. Note that
199 * the first SACK block is always the most recent segment
200 * received.
201 */
202 TAILQ_FOREACH(tiqe, &tp->timeq, ipqe_timeq) {
203 tp->rcv_sack_block[i].left = tiqe->ipqe_seq;
204 tp->rcv_sack_block[i].right = tiqe->ipqe_seq + tiqe->ipqe_len;
205 i++;
206 if (i >= TCP_SACK_MAX) {
207 break;
208 }
209 }
210
211 /* If we can SACK, do so. */
212 tp->rcv_sack_num = i;
213 }
214
215 void
216 tcp_new_dsack(struct tcpcb *tp, tcp_seq seq, u_int32_t len)
217 {
218 if (TCP_SACK_ENABLED(tp)) {
219 tp->rcv_dsack_block.left = seq;
220 tp->rcv_dsack_block.right = seq + len;
221 tp->rcv_sack_flags |= TCPSACK_HAVED;
222 }
223 }
224
225 void
226 tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen)
227 {
228 struct sackblk
229 t_sack_block[(MAX_TCPOPTLEN - 2) / (sizeof(u_int32_t) * 2)];
230 struct sackblk *sack = NULL;
231 struct sackhole *cur = NULL;
232 struct sackhole *tmp = NULL;
233 u_int32_t *lp = (u_int32_t *) (cp + 2);
234 int i, j, num_sack_blks;
235 tcp_seq left, right, acked;
236
237 /*
238 * If we aren't processing SACK responses, or the peer
239 * sends us a sack option with invalid length, don't
240 * update the scoreboard.
241 */
242 if (!TCP_SACK_ENABLED(tp) || (optlen % 8 != 2 || optlen < 10)) {
243 return;
244 }
245
246 /*
247 * Extract SACK blocks.
248 *
249 * Note that t_sack_block is sorted so that we only need to do
250 * one pass over the sequence number space. (SACK "fast-path")
251 */
252 num_sack_blks = optlen / 8;
253 acked = (SEQ_GT(th->th_ack, tp->snd_una)) ? th->th_ack : tp->snd_una;
254 for (i = 0; i < num_sack_blks; i++, lp += 2) {
255 memcpy(&left, lp, sizeof(*lp));
256 memcpy(&right, lp + 1, sizeof(*lp));
257 left = ntohl(left);
258 right = ntohl(right);
259
260 if (SEQ_LEQ(right, acked) || SEQ_GEQ(left, tp->snd_max) ||
261 SEQ_GEQ(left, right)) {
262 /* SACK entry that's old, or invalid. */
263 i--;
264 num_sack_blks--;
265 continue;
266 }
267
268 /* Insertion sort. */
269 for (j = i; (j > 0) && SEQ_LT(left, t_sack_block[j - 1].left);
270 j--) {
271 t_sack_block[j].left = t_sack_block[j - 1].left;
272 t_sack_block[j].right = t_sack_block[j - 1].right;
273 }
274 t_sack_block[j].left = left;
275 t_sack_block[j].right = right;
276 }
277
278 /* Update the scoreboard. */
279 cur = TAILQ_FIRST(&tp->snd_holes);
280 for (i = 0; i < num_sack_blks; i++) {
281 sack = &t_sack_block[i];
282 /*
283 * FACK TCP. Update snd_fack so we can enter Fast
284 * Recovery early.
285 */
286 if (SEQ_GEQ(sack->right, tp->snd_fack))
287 tp->snd_fack = sack->right;
288
289 if (TAILQ_EMPTY(&tp->snd_holes)) {
290 /* First hole. */
291 cur = (struct sackhole *)
292 pool_get(&sackhole_pool, PR_NOWAIT);
293 if (cur == NULL) {
294 /* ENOBUFS, bail out*/
295 return;
296 }
297 cur->start = th->th_ack;
298 cur->end = sack->left;
299 cur->rxmit = cur->start;
300 tp->rcv_lastsack = sack->right;
301 TAILQ_INSERT_HEAD(&tp->snd_holes, cur, sackhole_q);
302 continue; /* With next sack block */
303 }
304
305 /* Go through the list of holes. */
306 while (cur) {
307 if (SEQ_LEQ(sack->right, cur->start))
308 /* SACKs data before the current hole */
309 break; /* No use going through more holes */
310
311 if (SEQ_GEQ(sack->left, cur->end)) {
312 /* SACKs data beyond the current hole */
313 cur = TAILQ_NEXT(cur, sackhole_q);
314 continue;
315 }
316
317 if (SEQ_LEQ(sack->left, cur->start)) {
318 /* Data acks at least the beginning of hole */
319 if (SEQ_GEQ(sack->right, cur->end)) {
320 /* Acks entire hole, so delete hole */
321 tmp = cur;
322 cur = TAILQ_NEXT(cur, sackhole_q);
323 TAILQ_REMOVE(&tp->snd_holes, tmp,
324 sackhole_q);
325 pool_put(&sackhole_pool, tmp);
326 break;
327 }
328
329 /* Otherwise, move start of hole forward */
330 cur->start = sack->right;
331 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
332 break;
333 }
334
335 if (SEQ_GEQ(sack->right, cur->end)) {
336 /* Move end of hole backward. */
337 cur->end = sack->left;
338 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
339 cur = TAILQ_NEXT(cur, sackhole_q);
340 break;
341 }
342
343 if (SEQ_LT(cur->start, sack->left) &&
344 SEQ_GT(cur->end, sack->right)) {
345 /*
346 * ACKs some data in middle of a hole; need to
347 * split current hole
348 */
349 tmp = (struct sackhole *)
350 pool_get(&sackhole_pool, PR_NOWAIT);
351 if (tmp == NULL) {
352 /* ENOBUFS, bail out. */
353 return;
354 }
355 tmp->start = sack->right;
356 tmp->end = cur->end;
357 tmp->rxmit = SEQ_MAX(cur->rxmit, tmp->start);
358 cur->end = sack->left;
359 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
360 TAILQ_INSERT_AFTER(&tp->snd_holes, cur, tmp,
361 sackhole_q);
362 cur = tmp;
363 break;
364 }
365 }
366
367 /* At this point, we have reached the tail of the list. */
368 if (SEQ_LT(tp->rcv_lastsack, sack->left)) {
369 /*
370 * Need to append new hole at end.
371 */
372 tmp = (struct sackhole *)
373 pool_get(&sackhole_pool, PR_NOWAIT);
374 if (tmp == NULL)
375 continue; /* ENOBUFS */
376 tmp->start = tp->rcv_lastsack;
377 tmp->end = sack->left;
378 tmp->rxmit = tmp->start;
379 TAILQ_INSERT_TAIL(&tp->snd_holes, tmp, sackhole_q);
380 cur = tmp;
381 }
382 if (SEQ_LT(tp->rcv_lastsack, sack->right)) {
383 tp->rcv_lastsack = sack->right;
384 }
385 }
386 }
387
388 void
389 tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th)
390 {
391 /* Max because this could be an older ack that just arrived. */
392 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
393 th->th_ack : tp->snd_una;
394 struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes);
395 struct sackhole *tmp;
396
397 while (cur) {
398 if (SEQ_LEQ(cur->end, lastack)) {
399 tmp = cur;
400 cur = TAILQ_NEXT(cur, sackhole_q);
401 TAILQ_REMOVE(&tp->snd_holes, tmp, sackhole_q);
402 pool_put(&sackhole_pool, tmp);
403 } else if (SEQ_LT(cur->start, lastack)) {
404 cur->start = lastack;
405 if (SEQ_LT(cur->rxmit, cur->start))
406 cur->rxmit = cur->start;
407 break;
408 } else
409 break;
410
411 }
412 }
413
414 void
415 tcp_free_sackholes(struct tcpcb *tp)
416 {
417 struct sackhole *sack;
418
419 /* Free up the SACK hole list. */
420 while (!TAILQ_EMPTY(&tp->snd_holes)) {
421 sack = TAILQ_FIRST(&tp->snd_holes);
422 TAILQ_REMOVE(&tp->snd_holes, sack, sackhole_q);
423 pool_put(&sackhole_pool, sack);
424 }
425 }
426
427 /*
428 * Implements the SACK response to a new ack, checking for partial acks
429 * in fast recovery.
430 */
431 void
432 tcp_sack_newack(struct tcpcb *tp, struct tcphdr *th)
433 {
434 if (tp->t_partialacks < 0) {
435 /*
436 * Not in fast recovery. Reset the duplicate ack
437 * counter.
438 */
439 tp->t_dupacks = 0;
440 } else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
441 /*
442 * Partial ack handling within a sack recovery episode.
443 * Keeping this very simple for now. When a partial ack
444 * is received, force snd_cwnd to a value that will allow
445 * the sender to transmit no more than 2 segments.
446 * If necessary, a fancier scheme can be adopted at a
447 * later point, but for now, the goal is to prevent the
448 * sender from bursting a large amount of data in the midst
449 * of sack recovery.
450 */
451 int num_segs = 1;
452 int sack_bytes_rxmt = 0;
453
454 tp->t_partialacks++;
455 TCP_TIMER_DISARM(tp, TCPT_REXMT);
456 tp->t_rtttime = 0;
457
458 /*
459 * send one or 2 segments based on how much new data was acked
460 */
461 if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2)
462 num_segs = 2;
463 (void)tcp_sack_output(tp, &sack_bytes_rxmt);
464 tp->snd_cwnd = sack_bytes_rxmt +
465 (tp->snd_nxt - tp->sack_newdata) + num_segs * tp->t_segsz;
466 tp->t_flags |= TF_ACKNOW;
467 (void) tcp_output(tp);
468 } else {
469 /*
470 * Complete ack, inflate the congestion window to
471 * ssthresh and exit fast recovery.
472 *
473 * Window inflation should have left us with approx.
474 * snd_ssthresh outstanding data. But in case we
475 * would be inclined to send a burst, better to do
476 * it via the slow start mechanism.
477 */
478 if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
479 tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
480 + tp->t_segsz;
481 else
482 tp->snd_cwnd = tp->snd_ssthresh;
483 tp->t_partialacks = -1;
484 tp->t_dupacks = 0;
485 if (SEQ_GT(th->th_ack, tp->snd_fack))
486 tp->snd_fack = th->th_ack;
487 }
488 }
489
490 /*
491 * Returns pointer to a sackhole if there are any pending retransmissions;
492 * NULL otherwise.
493 */
494 struct sackhole *
495 tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt)
496 {
497 struct sackhole *cur = NULL;
498
499 if(!TCP_SACK_ENABLED(tp))
500 return (NULL);
501
502 *sack_bytes_rexmt = 0;
503 TAILQ_FOREACH(cur, &tp->snd_holes, sackhole_q) {
504 if (SEQ_LT(cur->rxmit, cur->end)) {
505 if (SEQ_LT(cur->rxmit, tp->snd_una)) {
506 /* old SACK hole */
507 continue;
508 }
509 *sack_bytes_rexmt += (cur->rxmit - cur->start);
510 break;
511 }
512 *sack_bytes_rexmt += (cur->rxmit - cur->start);
513 }
514
515 return (cur);
516 }
517
518 /*
519 * After a timeout, the SACK list may be rebuilt. This SACK information
520 * should be used to avoid retransmitting SACKed data. This function
521 * traverses the SACK list to see if snd_nxt should be moved forward.
522 */
523 void
524 tcp_sack_adjust(struct tcpcb *tp)
525 {
526 struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes);
527 struct sackhole *n = NULL;
528
529 if (TAILQ_EMPTY(&tp->snd_holes))
530 return; /* No holes */
531 if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack))
532 return; /* We're already beyond any SACKed blocks */
533
534 /*
535 * Two cases for which we want to advance snd_nxt:
536 * i) snd_nxt lies between end of one hole and beginning of another
537 * ii) snd_nxt lies between end of last hole and rcv_lastsack
538 */
539 while ((n = TAILQ_NEXT(cur, sackhole_q)) != NULL) {
540 if (SEQ_LT(tp->snd_nxt, cur->end))
541 return;
542 if (SEQ_GEQ(tp->snd_nxt, n->start))
543 cur = n;
544 else {
545 tp->snd_nxt = n->start;
546 return;
547 }
548 }
549 if (SEQ_LT(tp->snd_nxt, cur->end))
550 return;
551 tp->snd_nxt = tp->rcv_lastsack;
552
553 return;
554 }
555
556 int
557 tcp_sack_optlen(struct tcpcb *tp)
558 {
559
560 if (!TCP_SACK_ENABLED(tp) || tp->rcv_sack_num == 0) {
561 return 0;
562 }
563
564 return tp->rcv_sack_num * 8 + 2 + 2;
565 }
566