/*	$NetBSD: tcp_congctl.c,v 1.29 2024/05/14 19:00:44 andvar Exp $	*/

/*-
 * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
 * Facility, NASA Ames Research Center.
 * This code is derived from software contributed to The NetBSD Foundation
 * by Charles M. Hannum.
 * This code is derived from software contributed to The NetBSD Foundation
 * by Rui Paulo.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the project nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * @(#)COPYRIGHT	1.1 (NRL) 17 January 1995
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tcp_congctl.c,v 1.29 2024/05/14 19:00:44 andvar Exp $");

#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_tcp_debug.h"
#include "opt_tcp_congctl.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/mutex.h>

#include <net/if.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>

#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_var.h>
#include <netinet/icmp6.h>
#endif

#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_congctl.h>
#ifdef TCP_DEBUG
#include <netinet/tcp_debug.h>
#endif

/*
 * TODO:
 * consider separating the actual implementations into another file.
 */

static void tcp_common_congestion_exp(struct tcpcb *, int, int);

static int  tcp_reno_do_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static int  tcp_reno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_reno_slow_retransmit(struct tcpcb *);
static void tcp_reno_fast_retransmit_newack(struct tcpcb *,
    const struct tcphdr *);
static void tcp_reno_newack(struct tcpcb *, const struct tcphdr *);
static void tcp_reno_congestion_exp(struct tcpcb *tp);

static int  tcp_newreno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_newreno_fast_retransmit_newack(struct tcpcb *,
    const struct tcphdr *);
static void tcp_newreno_newack(struct tcpcb *, const struct tcphdr *);

static int  tcp_cubic_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_cubic_slow_retransmit(struct tcpcb *tp);
static void tcp_cubic_newack(struct tcpcb *, const struct tcphdr *);
static void tcp_cubic_congestion_exp(struct tcpcb *);

static void tcp_congctl_fillnames(void);

extern int tcprexmtthresh;

MALLOC_DEFINE(M_TCPCONGCTL, "tcpcongctl", "TCP congestion control structures");

/* currently selected global congestion control */
char tcp_congctl_global_name[TCPCC_MAXLEN];

/* available global congestion control algorithms */
char tcp_congctl_avail[10 * TCPCC_MAXLEN];

/*
 * Used to list the available congestion control algorithms.
 */
TAILQ_HEAD(, tcp_congctlent) tcp_congctlhd =
    TAILQ_HEAD_INITIALIZER(tcp_congctlhd);

static struct tcp_congctlent * tcp_congctl_global;

static kmutex_t tcp_congctl_mtx;

void
tcp_congctl_init(void)
{
	int r __diagused;

	mutex_init(&tcp_congctl_mtx, MUTEX_DEFAULT, IPL_NONE);

	/* Base algorithms. */
	r = tcp_congctl_register("reno", &tcp_reno_ctl);
	KASSERT(r == 0);
	r = tcp_congctl_register("newreno", &tcp_newreno_ctl);
	KASSERT(r == 0);
	r = tcp_congctl_register("cubic", &tcp_cubic_ctl);
	KASSERT(r == 0);

	/* NewReno is the default. */
#ifndef TCP_CONGCTL_DEFAULT
#define TCP_CONGCTL_DEFAULT "newreno"
#endif

	r = tcp_congctl_select(NULL, TCP_CONGCTL_DEFAULT);
	KASSERT(r == 0);
}
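
/*
 * Selection notes (a sketch, not authoritative): TCP_CONGCTL_DEFAULT is
 * only defaulted here when opt_tcp_congctl.h did not define it, so a
 * kernel config should be able to override the boot-time default, and
 * the running default is exposed through sysctl.  The exact spellings
 * below are assumptions, not verified against config(5):
 *
 *	options TCP_CONGCTL_DEFAULT="\"cubic\""
 *	# sysctl -w net.inet.tcp.congctl.selected=cubic
 */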

/*
 * Register a congestion algorithm and select it if we have none.
 */
int
tcp_congctl_register(const char *name, const struct tcp_congctl *tcc)
{
	struct tcp_congctlent *ntcc, *tccp;

	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent)
		if (!strcmp(name, tccp->congctl_name)) {
			/* name already registered */
			return EEXIST;
		}

	ntcc = malloc(sizeof(*ntcc), M_TCPCONGCTL, M_WAITOK|M_ZERO);

	strlcpy(ntcc->congctl_name, name, sizeof(ntcc->congctl_name) - 1);
	ntcc->congctl_ctl = tcc;

	TAILQ_INSERT_TAIL(&tcp_congctlhd, ntcc, congctl_ent);
	tcp_congctl_fillnames();

	if (TAILQ_FIRST(&tcp_congctlhd) == ntcc)
		tcp_congctl_select(NULL, name);

	return 0;
}

int
tcp_congctl_unregister(const char *name)
{
	struct tcp_congctlent *tccp, *rtccp;
	unsigned int size;

	rtccp = NULL;
	size = 0;
	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
		if (!strcmp(name, tccp->congctl_name))
			rtccp = tccp;
		size++;
	}

	if (!rtccp)
		return ENOENT;

	if (size <= 1 || tcp_congctl_global == rtccp || rtccp->congctl_refcnt)
		return EBUSY;

	TAILQ_REMOVE(&tcp_congctlhd, rtccp, congctl_ent);
	free(rtccp, M_TCPCONGCTL);
	tcp_congctl_fillnames();

	return 0;
}
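
/*
 * Sketch of how an additional algorithm would plug in.  The names below
 * ("vegas", tcp_vegas_ctl and its methods) are hypothetical and do not
 * exist in this file; they only illustrate the registration contract:
 *
 *	static const struct tcp_congctl tcp_vegas_ctl = {
 *		.fast_retransmit        = tcp_vegas_fast_retransmit,
 *		.slow_retransmit        = tcp_vegas_slow_retransmit,
 *		.fast_retransmit_newack = tcp_vegas_fast_retransmit_newack,
 *		.newack                 = tcp_vegas_newack,
 *		.cong_exp               = tcp_vegas_congestion_exp,
 *	};
 *
 *	error = tcp_congctl_register("vegas", &tcp_vegas_ctl);
 *	...
 *	error = tcp_congctl_unregister("vegas");
 *	    (EBUSY while selected globally or while congctl_refcnt != 0)
 */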

/*
 * Select a congestion algorithm by name.
 */
int
tcp_congctl_select(struct tcpcb *tp, const char *name)
{
	struct tcp_congctlent *tccp, *old_tccp, *new_tccp;
	bool old_found, new_found;

	KASSERT(name);

	old_found = (tp == NULL || tp->t_congctl == NULL);
	old_tccp = NULL;
	new_found = false;
	new_tccp = NULL;

	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
		if (!old_found && tccp->congctl_ctl == tp->t_congctl) {
			old_tccp = tccp;
			old_found = true;
		}

		if (!new_found && !strcmp(name, tccp->congctl_name)) {
			new_tccp = tccp;
			new_found = true;
		}

		if (new_found && old_found) {
			if (tp) {
				mutex_enter(&tcp_congctl_mtx);
				if (old_tccp)
					old_tccp->congctl_refcnt--;
				tp->t_congctl = new_tccp->congctl_ctl;
				new_tccp->congctl_refcnt++;
				mutex_exit(&tcp_congctl_mtx);
			} else {
				tcp_congctl_global = new_tccp;
				strlcpy(tcp_congctl_global_name,
				    new_tccp->congctl_name,
				    sizeof(tcp_congctl_global_name) - 1);
			}
			return 0;
		}
	}

	return EINVAL;
}

void
tcp_congctl_release(struct tcpcb *tp)
{
	struct tcp_congctlent *tccp;

	KASSERT(tp->t_congctl);

	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
		if (tccp->congctl_ctl == tp->t_congctl) {
			tccp->congctl_refcnt--;
			return;
		}
	}
}
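
/*
 * Usage sketch (illustrative; `tp' stands for any live tcpcb): passing
 * a tcpcb rebinds just that connection and adjusts refcounts under
 * tcp_congctl_mtx, while passing NULL switches the global default used
 * by new connections:
 *
 *	error = tcp_congctl_select(tp, "reno");		(one connection)
 *	error = tcp_congctl_select(NULL, "cubic");	(global default)
 *
 * Both return EINVAL if the name was never registered.
 */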

/*
 * Returns the name of a congestion algorithm.
 */
const char *
tcp_congctl_bystruct(const struct tcp_congctl *tcc)
{
	struct tcp_congctlent *tccp;

	KASSERT(tcc);

	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent)
		if (tccp->congctl_ctl == tcc)
			return tccp->congctl_name;

	return NULL;
}

static void
tcp_congctl_fillnames(void)
{
	struct tcp_congctlent *tccp;
	const char *delim = " ";

	tcp_congctl_avail[0] = '\0';
	TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
		strlcat(tcp_congctl_avail, tccp->congctl_name,
		    sizeof(tcp_congctl_avail) - 1);
		if (TAILQ_NEXT(tccp, congctl_ent))
			strlcat(tcp_congctl_avail, delim,
			    sizeof(tcp_congctl_avail) - 1);
	}

}

/* ------------------------------------------------------------------------ */

/*
 * Common stuff
 */

/* Window reduction (1-beta) for [New]Reno: 0.5 */
#define RENO_BETAA 1
#define RENO_BETAB 2
/* Window reduction (1-beta) for Cubic: 0.8 */
#define CUBIC_BETAA 4
#define CUBIC_BETAB 5
/* Draft Rhee Section 4.1 */
#define CUBIC_CA 4
#define CUBIC_CB 10

static void
tcp_common_congestion_exp(struct tcpcb *tp, int betaa, int betab)
{
	u_long win;

	/*
	 * Reduce the congestion window and the slow start threshold.
	 */
	win = ulmin(tp->snd_wnd, tp->snd_cwnd) * betaa / betab / tp->t_segsz;
	if (win < 2)
		win = 2;

	tp->snd_ssthresh = win * tp->t_segsz;
	tp->snd_recover = tp->snd_max;
	tp->snd_cwnd = tp->snd_ssthresh;

	/*
	 * When using TCP ECN, notify the peer that
	 * we reduced the cwnd.
	 */
	if (TCP_ECN_ALLOWED(tp))
		tp->t_flags |= TF_ECN_SND_CWR;
}

/* ------------------------------------------------------------------------ */

/*
 * TCP/Reno congestion control.
 */
static void
tcp_reno_congestion_exp(struct tcpcb *tp)
{

	tcp_common_congestion_exp(tp, RENO_BETAA, RENO_BETAB);
}
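
/*
 * Worked example of the reduction above (illustrative numbers only):
 * with t_segsz = 1000 and min(snd_wnd, snd_cwnd) = 20000, Reno's beta
 * of 1/2 gives win = 20000 * 1 / 2 / 1000 = 10 segments, so ssthresh
 * and cwnd both become 10000 bytes; CUBIC's 4/5 gives 16 segments
 * instead.  The clamp to a minimum of 2 segments keeps exponential
 * growth possible after a loss in a very small window.
 */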

static int
tcp_reno_do_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{
	/*
	 * Dup acks mean that packets have left the
	 * network (they're now cached at the receiver)
	 * so bump cwnd by the amount in the receiver
	 * to keep a constant cwnd packets in the
	 * network.
	 *
	 * If we are using TCP/SACK, then enter
	 * Fast Recovery if the receiver SACKs
	 * data that is tcprexmtthresh * MSS
	 * bytes past the last ACKed segment,
	 * irrespective of the number of DupAcks.
	 */

	tcp_seq onxt = tp->snd_nxt;

	tp->t_partialacks = 0;
	TCP_TIMER_DISARM(tp, TCPT_REXMT);
	tp->t_rtttime = 0;
	if (TCP_SACK_ENABLED(tp)) {
		tp->t_dupacks = tcprexmtthresh;
		tp->sack_newdata = tp->snd_nxt;
		tp->snd_cwnd = tp->t_segsz;
		(void) tcp_output(tp);
		return 0;
	}
	tp->snd_nxt = th->th_ack;
	tp->snd_cwnd = tp->t_segsz;
	(void) tcp_output(tp);
	tp->snd_cwnd = tp->snd_ssthresh + tp->t_segsz * tp->t_dupacks;
	if (SEQ_GT(onxt, tp->snd_nxt))
		tp->snd_nxt = onxt;

	return 0;
}

static int
tcp_reno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{

	/*
	 * We know we're losing at the current
	 * window size so do congestion avoidance
	 * (set ssthresh to half the current window
	 * and pull our congestion window back to
	 * the new ssthresh).
	 */

	tcp_reno_congestion_exp(tp);
	return tcp_reno_do_fast_retransmit(tp, th);
}
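
/*
 * Worked example of the non-SACK path above (illustrative numbers):
 * after tcprexmtthresh = 3 duplicate ACKs with t_segsz = 1000 and a
 * freshly halved ssthresh of 10000, cwnd is momentarily forced to one
 * segment so tcp_output() retransmits only the missing segment, then
 * re-inflated to ssthresh + segsz * dupacks = 10000 + 3000 = 13000,
 * crediting the three segments the dup ACKs say have left the network.
 */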

static void
tcp_reno_slow_retransmit(struct tcpcb *tp)
{
	u_long win;

	/*
	 * Close the congestion window down to one segment
	 * (we'll open it by one segment for each ack we get).
	 * Since we probably have a window's worth of unacked
	 * data accumulated, this "slow start" keeps us from
	 * dumping all that data as back-to-back packets (which
	 * might overwhelm an intermediate gateway).
	 *
	 * There are two phases to the opening: Initially we
	 * open by one mss on each ack.  This makes the window
	 * size increase exponentially with time.  If the
	 * window is larger than the path can handle, this
	 * exponential growth results in dropped packet(s)
	 * almost immediately.  To get more time between
	 * drops but still "push" the network to take advantage
	 * of improving conditions, we switch from exponential
	 * to linear window opening at some threshold size.
	 * For a threshold, we use half the current window
	 * size, truncated to a multiple of the mss.
	 *
	 * (the minimum cwnd that will give us exponential
	 * growth is 2 mss.  We don't allow the threshold
	 * to go below this.)
	 */

	win = ulmin(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz;
	if (win < 2)
		win = 2;
	/* Loss Window MUST be one segment. */
	tp->snd_cwnd = tp->t_segsz;
	tp->snd_ssthresh = win * tp->t_segsz;
	tp->t_partialacks = -1;
	tp->t_dupacks = 0;
	tp->t_bytes_acked = 0;

	if (TCP_ECN_ALLOWED(tp))
		tp->t_flags |= TF_ECN_SND_CWR;
}

static void
tcp_reno_fast_retransmit_newack(struct tcpcb *tp,
    const struct tcphdr *th)
{
	if (tp->t_partialacks < 0) {
		/*
		 * We were not in fast recovery.  Reset the duplicate ack
		 * counter.
		 */
		tp->t_dupacks = 0;
	} else {
		/*
		 * Clamp the congestion window to the crossover point and
		 * exit fast recovery.
		 */
		if (tp->snd_cwnd > tp->snd_ssthresh)
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_partialacks = -1;
		tp->t_dupacks = 0;
		tp->t_bytes_acked = 0;
		if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
			tp->snd_fack = th->th_ack;
	}
}

static void
tcp_reno_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	/*
	 * When new data is acked, open the congestion window.
	 */

	u_int cw = tp->snd_cwnd;
	u_int incr = tp->t_segsz;

	if (tcp_do_abc) {

		/*
		 * RFC 3465 Appropriate Byte Counting (ABC)
		 */

		int acked = th->th_ack - tp->snd_una;

		if (cw >= tp->snd_ssthresh) {
			tp->t_bytes_acked += acked;
			if (tp->t_bytes_acked >= cw) {
				/* Time to increase the window. */
				tp->t_bytes_acked -= cw;
			} else {
				/* No need to increase yet. */
				incr = 0;
			}
		} else {
			/*
			 * use 2*SMSS or 1*SMSS for the "L" param,
			 * depending on sysctl setting.
			 *
			 * (See RFC 3465 2.3 Choosing the Limit)
			 */
			u_int abc_lim;

			abc_lim = (tcp_abc_aggressive == 0 ||
			    tp->snd_nxt != tp->snd_max) ? incr : incr * 2;
			incr = uimin(acked, abc_lim);
		}
	} else {

		/*
		 * If the window gives us less than ssthresh packets
		 * in flight, open exponentially (segsz per packet).
		 * Otherwise open linearly: segsz per window
		 * (segsz^2 / cwnd per packet).
		 */

		if (cw >= tp->snd_ssthresh) {
			incr = incr * incr / cw;
		}
	}

	tp->snd_cwnd = uimin(cw + incr, TCP_MAXWIN << tp->snd_scale);
}

const struct tcp_congctl tcp_reno_ctl = {
	.fast_retransmit = tcp_reno_fast_retransmit,
	.slow_retransmit = tcp_reno_slow_retransmit,
	.fast_retransmit_newack = tcp_reno_fast_retransmit_newack,
	.newack = tcp_reno_newack,
	.cong_exp = tcp_reno_congestion_exp,
};
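
/*
 * Worked ABC example (illustrative numbers): in congestion avoidance
 * with cw = 10000 and t_segsz = 1000, each full-sized ACK adds 1000 to
 * t_bytes_acked; only on the 10th ACK does t_bytes_acked reach cw, so
 * cwnd grows by one segment per window rather than one per ACK.  In
 * slow start with tcp_abc_aggressive set and no retransmit pending, a
 * single stretch ACK covering 5000 bytes still grows cwnd by at most
 * L = 2*SMSS = 2000, which is ABC's defense against ACK-splitting
 * attacks and stretch-ACK overshoot.
 */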

/*
 * TCP/NewReno Congestion control.
 */
static int
tcp_newreno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{

	if (SEQ_LT(th->th_ack, tp->snd_high)) {
		/*
		 * False fast retransmit after timeout.
		 * Do not enter fast recovery.
		 */
		tp->t_dupacks = 0;
		return 1;
	}
	/*
	 * Fast retransmit is the same as in Reno.
	 */
	return tcp_reno_fast_retransmit(tp, th);
}
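
/*
 * Illustrative scenario for the snd_high guard above (a sketch of the
 * usual RFC 6582 reasoning, not a trace from this stack): after a
 * timeout the sender retransmits from snd_una while snd_high remembers
 * the highest sequence sent before the timeout.  Duplicate ACKs for
 * data still below snd_high are echoes of the pre-timeout
 * transmissions, not evidence of a new loss, so entering fast recovery
 * for them would cut cwnd twice for a single loss event.
 */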

/*
 * Implement the NewReno response to a new ack, checking for partial acks in
 * fast recovery.
 */
static void
tcp_newreno_fast_retransmit_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	if (tp->t_partialacks < 0) {
		/*
		 * We were not in fast recovery.  Reset the duplicate ack
		 * counter.
		 */
		tp->t_dupacks = 0;
	} else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
		/*
		 * This is a partial ack.  Retransmit the first unacknowledged
		 * segment and deflate the congestion window by the amount of
		 * acknowledged data.  Do not exit fast recovery.
		 */
		tcp_seq onxt = tp->snd_nxt;
		u_long ocwnd = tp->snd_cwnd;
		int sack_num_segs = 1, sack_bytes_rxmt = 0;

		/*
		 * snd_una has not yet been updated and the socket's send
		 * buffer has not yet drained off the ACK'd data, so we
		 * have to leave snd_una as it was to get the correct data
		 * offset in tcp_output().
		 */
		tp->t_partialacks++;
		TCP_TIMER_DISARM(tp, TCPT_REXMT);
		tp->t_rtttime = 0;

		if (TCP_SACK_ENABLED(tp)) {
			/*
			 * Partial ack handling within a sack recovery episode.
			 * Keeping this very simple for now.  When a partial ack
			 * is received, force snd_cwnd to a value that will
			 * allow the sender to transmit no more than 2 segments.
			 * If necessary, a fancier scheme can be adopted at a
			 * later point, but for now, the goal is to prevent the
			 * sender from bursting a large amount of data in the
			 * midst of sack recovery.
			 */

			/*
			 * Send one or two segments based on how much
			 * new data was acked.
			 */
			if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2)
				sack_num_segs = 2;
			(void)tcp_sack_output(tp, &sack_bytes_rxmt);
			tp->snd_cwnd = sack_bytes_rxmt +
			    (tp->snd_nxt - tp->sack_newdata) +
			    sack_num_segs * tp->t_segsz;
			tp->t_flags |= TF_ACKNOW;
			(void) tcp_output(tp);
		} else {
			tp->snd_nxt = th->th_ack;
			/*
			 * Set snd_cwnd to one segment beyond the ACK'd offset;
			 * snd_una is not yet updated when we're called.
			 */
			tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una);
			(void) tcp_output(tp);
			tp->snd_cwnd = ocwnd;
			if (SEQ_GT(onxt, tp->snd_nxt))
				tp->snd_nxt = onxt;
			/*
			 * Partial window deflation.  Relies on fact that
			 * tp->snd_una not updated yet.
			 */
			tp->snd_cwnd -= (th->th_ack - tp->snd_una -
			    tp->t_segsz);
		}
	} else {
		/*
		 * Complete ack.  Inflate the congestion window to ssthresh
		 * and exit fast recovery.
		 *
		 * Window inflation should have left us with approx.
		 * snd_ssthresh outstanding data.  But in case we
		 * would be inclined to send a burst, better to do
		 * it via the slow start mechanism.
		 */
		if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
			tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
			    + tp->t_segsz;
		else
			tp->snd_cwnd = tp->snd_ssthresh;
		tp->t_partialacks = -1;
		tp->t_dupacks = 0;
		tp->t_bytes_acked = 0;
		if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
			tp->snd_fack = th->th_ack;
	}
}
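
/*
 * Worked example of the non-SACK partial-ack arithmetic (illustrative
 * numbers): with t_segsz = 1000, ocwnd = 13000 and a partial ack for
 * 3000 new bytes, cwnd is pinned to 1000 + 3000 = 4000 just long enough
 * for tcp_output() to retransmit the next hole, then restored to 13000
 * and deflated by 3000 - 1000 = 2000, leaving 11000: the window shrinks
 * by the data acked, minus one segment's allowance for the retransmit.
 */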

static void
tcp_newreno_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	/*
	 * If we are still in fast recovery (meaning we are using
	 * NewReno and we have only received partial acks), do not
	 * inflate the window yet.
	 */
	if (tp->t_partialacks < 0)
		tcp_reno_newack(tp, th);
}


const struct tcp_congctl tcp_newreno_ctl = {
	.fast_retransmit = tcp_newreno_fast_retransmit,
	.slow_retransmit = tcp_reno_slow_retransmit,
	.fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
	.newack = tcp_newreno_newack,
	.cong_exp = tcp_reno_congestion_exp,
};

/*
 * CUBIC - http://tools.ietf.org/html/draft-rhee-tcpm-cubic-02
 */

/* Cubic prototypes */
static void	tcp_cubic_update_ctime(struct tcpcb *tp);
static uint32_t	tcp_cubic_diff_ctime(struct tcpcb *);
static uint32_t	tcp_cubic_cbrt(uint32_t);
static ulong	tcp_cubic_getW(struct tcpcb *, uint32_t, uint32_t);

/* Cubic TIME functions - XXX I don't like using timevals and microuptime */
/*
 * Set congestion timer to now
 */
static void
tcp_cubic_update_ctime(struct tcpcb *tp)
{
	struct timeval now_timeval;

	getmicrouptime(&now_timeval);
	tp->snd_cubic_ctime = now_timeval.tv_sec * 1000 +
	    now_timeval.tv_usec / 1000;
}

/*
 * Milliseconds since the last congestion event
 */
static uint32_t
tcp_cubic_diff_ctime(struct tcpcb *tp)
{
	struct timeval now_timeval;

	getmicrouptime(&now_timeval);
	return now_timeval.tv_sec * 1000 + now_timeval.tv_usec / 1000 -
	    tp->snd_cubic_ctime;
}

/*
 * Approximate cubic root
 */
#define CBRT_ROUNDS 30
static uint32_t
tcp_cubic_cbrt(uint32_t v)
{
	int i, rounds = CBRT_ROUNDS;
	uint64_t x = v / 3;

	/* We fail to calculate correctly for small numbers */
	if (v == 0)
		return 0;
	else if (v < 4)
		return 1;

	/*
	 * Largest x for which 2*x^3 + 3*x fits in 64 bits.
	 * Avoid overflow for a time cost.
	 */
	if (x > 2097151)
		rounds += 10;

	for (i = 0; i < rounds; i++)
		if (rounds == CBRT_ROUNDS)
			x = (v + 2 * x * x * x) / (3 * x * x);
		else
			/* Avoid overflow */
			x = v / (3 * x * x) + 2 * x / 3;

	return (uint32_t)x;
}
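
/*
 * The loop above is Newton's iteration for the root of f(x) = x^3 - v:
 * x' = x - f(x)/f'(x) = (v + 2*x^3) / (3*x^2), switched to a rearranged
 * form past 2097151 (~2^21) so the numerator stays inside 64 bits.
 * Sanity checks (illustrative): tcp_cubic_cbrt(27) converges to 3 and
 * tcp_cubic_cbrt(1000000) to 100; inputs below 4 are special-cased
 * because the initial guess v/3 would otherwise be 0 and divide by zero.
 */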

/* Draft Rhee Section 3.1 - get W(t+rtt) - Eq. 1 */
static ulong
tcp_cubic_getW(struct tcpcb *tp, uint32_t ms_elapsed, uint32_t rtt)
{
	uint32_t K;
	long tK3;

	/* Section 3.1 Eq. 2 */
	K = tcp_cubic_cbrt(tp->snd_cubic_wmax / CUBIC_BETAB *
	    CUBIC_CB / CUBIC_CA);
	/* (t-K)^3 - it is not clear why the unit of measure matters */
	tK3 = (long)(ms_elapsed + rtt) - (long)K;
	tK3 = tK3 * tK3 * tK3;

	return CUBIC_CA * tK3 / CUBIC_CB + tp->snd_cubic_wmax;
}

static void
tcp_cubic_congestion_exp(struct tcpcb *tp)
{

	/*
	 * Congestion - Set WMax and shrink cwnd
	 */
	tcp_cubic_update_ctime(tp);

	/* Section 3.6 - Fast Convergence */
	if (tp->snd_cubic_wmax < tp->snd_cubic_wmax_last) {
		tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
		tp->snd_cubic_wmax = tp->snd_cubic_wmax / 2 +
		    tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB / 2;
	} else {
		tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
		tp->snd_cubic_wmax = tp->snd_cwnd;
	}

	tp->snd_cubic_wmax = uimax(tp->t_segsz, tp->snd_cubic_wmax);

	/* Shrink CWND */
	tcp_common_congestion_exp(tp, CUBIC_BETAA, CUBIC_BETAB);
}

static int
tcp_cubic_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{

	if (SEQ_LT(th->th_ack, tp->snd_high)) {
		/* See newreno */
		tp->t_dupacks = 0;
		return 1;
	}

	/*
	 * Mark WMax
	 */
	tcp_cubic_congestion_exp(tp);

	/* Do fast retransmit */
	return tcp_reno_do_fast_retransmit(tp, th);
}

static void
tcp_cubic_newack(struct tcpcb *tp, const struct tcphdr *th)
{
	uint32_t ms_elapsed, rtt;
	u_long w_tcp;

	/* Congestion avoidance and not in fast recovery and usable rtt */
	if (tp->snd_cwnd > tp->snd_ssthresh && tp->t_partialacks < 0 &&
	    /*
	     * t_srtt is in units of 1/32 slow ticks; converting it
	     * to ms would be
	     * (t_srtt >> 5) * 1000 / PR_SLOWHZ ~= (t_srtt << 5) / PR_SLOWHZ
	     */
	    (rtt = (tp->t_srtt << 5) / PR_SLOWHZ) > 0) {
		ms_elapsed = tcp_cubic_diff_ctime(tp);

		/* Compute W_tcp(t) */
		w_tcp = tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB +
		    ms_elapsed / rtt / 3;

		if (tp->snd_cwnd > w_tcp) {
			/* Not in TCP friendly mode */
			tp->snd_cwnd += (tcp_cubic_getW(tp, ms_elapsed, rtt) -
			    tp->snd_cwnd) / tp->snd_cwnd;
		} else {
			/* friendly TCP mode */
			tp->snd_cwnd = w_tcp;
		}

		/* Make sure we are within limits */
		tp->snd_cwnd = uimax(tp->snd_cwnd, tp->t_segsz);
		tp->snd_cwnd = uimin(tp->snd_cwnd, TCP_MAXWIN << tp->snd_scale);
	} else {
		/* Use New Reno */
		tcp_newreno_newack(tp, th);
	}
}
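
/*
 * Worked sketch of the two regimes above (illustrative numbers): with
 * snd_cubic_wmax = 100000 bytes, the CUBIC beta of 4/5 leaves a cwnd of
 * about 80000 at the loss, and K = cbrt(wmax/CUBIC_BETAB * CUBIC_CB /
 * CUBIC_CA) is where the curve W(t) = CUBIC_CA/CUBIC_CB * (t-K)^3 + wmax
 * crosses wmax again.  While cwnd exceeds the Reno estimate W_tcp, the
 * window creeps toward W(t+rtt) by (W - cwnd)/cwnd per ACK, concave
 * below wmax and convex above it; otherwise it snaps to W_tcp, so CUBIC
 * is never less aggressive than NewReno over the same interval.
 */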

static void
tcp_cubic_slow_retransmit(struct tcpcb *tp)
{

	/* Timeout - Mark new congestion */
	tcp_cubic_congestion_exp(tp);

	/* Loss Window MUST be one segment. */
	tp->snd_cwnd = tp->t_segsz;
	tp->t_partialacks = -1;
	tp->t_dupacks = 0;
	tp->t_bytes_acked = 0;

	if (TCP_ECN_ALLOWED(tp))
		tp->t_flags |= TF_ECN_SND_CWR;
}

const struct tcp_congctl tcp_cubic_ctl = {
	.fast_retransmit = tcp_cubic_fast_retransmit,
	.slow_retransmit = tcp_cubic_slow_retransmit,
	.fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
	.newack = tcp_cubic_newack,
	.cong_exp = tcp_cubic_congestion_exp,
};
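
/*
 * Dispatch sketch (hedged; simplified, not the literal tcp_input.c
 * code): callers in the input path never invoke an algorithm directly,
 * they go through the per-connection vtable, e.g.
 *
 *	if (tp->t_congctl->fast_retransmit(tp, th) == 0)
 *		... entered fast recovery ...
 *	tp->t_congctl->newack(tp, th);
 *
 * which is why filling in a struct tcp_congctl and registering it above
 * is all a new algorithm needs to do.
 */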