Home | History | Annotate | Line # | Download | only in altq
altq_red.c revision 1.11
      1 /*	$NetBSD: altq_red.c,v 1.11 2005/02/26 23:04:16 perry Exp $	*/
      2 /*	$KAME: altq_red.c,v 1.9 2002/01/07 11:25:40 kjc Exp $	*/
      3 
      4 /*
      5  * Copyright (C) 1997-2000
      6  *	Sony Computer Science Laboratories Inc.  All rights reserved.
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  * 1. Redistributions of source code must retain the above copyright
     12  *    notice, this list of conditions and the following disclaimer.
     13  * 2. Redistributions in binary form must reproduce the above copyright
     14  *    notice, this list of conditions and the following disclaimer in the
     15  *    documentation and/or other materials provided with the distribution.
     16  *
     17  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
     18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     20  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
     21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     27  * SUCH DAMAGE.
     28  *
     29  */
     30 /*
     31  * Copyright (c) 1990-1994 Regents of the University of California.
     32  * All rights reserved.
     33  *
     34  * Redistribution and use in source and binary forms, with or without
     35  * modification, are permitted provided that the following conditions
     36  * are met:
     37  * 1. Redistributions of source code must retain the above copyright
     38  *    notice, this list of conditions and the following disclaimer.
     39  * 2. Redistributions in binary form must reproduce the above copyright
     40  *    notice, this list of conditions and the following disclaimer in the
     41  *    documentation and/or other materials provided with the distribution.
     42  * 3. All advertising materials mentioning features or use of this software
     43  *    must display the following acknowledgement:
     44  *	This product includes software developed by the Computer Systems
     45  *	Engineering Group at Lawrence Berkeley Laboratory.
     46  * 4. Neither the name of the University nor of the Laboratory may be used
     47  *    to endorse or promote products derived from this software without
     48  *    specific prior written permission.
     49  *
     50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     60  * SUCH DAMAGE.
     61  */
     62 
     63 #include <sys/cdefs.h>
     64 __KERNEL_RCSID(0, "$NetBSD: altq_red.c,v 1.11 2005/02/26 23:04:16 perry Exp $");
     65 
     66 #if defined(__FreeBSD__) || defined(__NetBSD__)
     67 #include "opt_altq.h"
     68 #if (__FreeBSD__ != 2)
     69 #include "opt_inet.h"
     70 #ifdef __FreeBSD__
     71 #include "opt_inet6.h"
     72 #endif
     73 #endif
     74 #endif /* __FreeBSD__ || __NetBSD__ */
     75 #ifdef ALTQ_RED	/* red is enabled by ALTQ_RED option in opt_altq.h */
     76 
     77 #include <sys/param.h>
     78 #include <sys/malloc.h>
     79 #include <sys/mbuf.h>
     80 #include <sys/socket.h>
     81 #include <sys/sockio.h>
     82 #include <sys/systm.h>
     83 #include <sys/proc.h>
     84 #include <sys/errno.h>
     85 #include <sys/kernel.h>
     86 #ifdef ALTQ_FLOWVALVE
     87 #include <sys/queue.h>
     88 #include <sys/time.h>
     89 #endif
     90 
     91 #include <net/if.h>
     92 #include <net/if_types.h>
     93 
     94 #include <netinet/in.h>
     95 #include <netinet/in_systm.h>
     96 #include <netinet/ip.h>
     97 #ifdef INET6
     98 #include <netinet/ip6.h>
     99 #endif
    100 
    101 #include <altq/altq.h>
    102 #include <altq/altq_conf.h>
    103 #include <altq/altq_red.h>
    104 #ifdef ALTQ_FLOWVALVE
    105 #include <altq/altq_flowvalve.h>
    106 #endif
    107 
    108 /*
    109  * ALTQ/RED (Random Early Detection) implementation using 32-bit
    110  * fixed-point calculation.
    111  *
    112  * written by kjc using the ns code as a reference.
    113  * you can learn more about red and ns from Sally's home page at
    114  * http://www-nrg.ee.lbl.gov/floyd/
    115  *
    116  * most of the red parameter values are fixed in this implementation
    117  * to prevent fixed-point overflow/underflow.
    118  * if you change the parameters, watch out for overflow/underflow!
    119  *
    120  * the parameters used are recommended values by Sally.
    121  * the corresponding ns config looks:
    122  *	q_weight=0.00195
    123  *	minthresh=5 maxthresh=15 queue-size=60
    124  *	linterm=30
    125  *	dropmech=drop-tail
    126  *	bytes=false (can't be handled by 32-bit fixed-point)
    127  *	doubleq=false dqthresh=false
    128  *	wait=true
    129  */
    130 /*
    131  * alternative red parameters for a slow link.
    132  *
    133  * assume the queue length becomes from zero to L and keeps L, it takes
    134  * N packets for q_avg to reach 63% of L.
    135  * when q_weight is 0.002, N is about 500 packets.
    136  * for a slow link like dial-up, 500 packets takes more than 1 minute!
    137  * when q_weight is 0.008, N is about 127 packets.
    138  * when q_weight is 0.016, N is about 63 packets.
    139  * bursts of 50 packets are allowd for 0.002, bursts of 25 packets
    140  * are allowed for 0.016.
    141  * see Sally's paper for more details.
    142  */
    143 /* normal red parameters */
    144 #define	W_WEIGHT	512	/* inverse of weight of EWMA (511/512) */
    145 				/* q_weight = 0.00195 */
    146 
    147 /* red parameters for a slow link */
    148 #define	W_WEIGHT_1	128	/* inverse of weight of EWMA (127/128) */
    149 				/* q_weight = 0.0078125 */
    150 
    151 /* red parameters for a very slow link (e.g., dialup) */
    152 #define	W_WEIGHT_2	64	/* inverse of weight of EWMA (63/64) */
    153 				/* q_weight = 0.015625 */
    154 
    155 /* fixed-point uses 12-bit decimal places */
    156 #define	FP_SHIFT	12	/* fixed-point shift */
    157 
    158 /* red parameters for drop probability */
    159 #define	INV_P_MAX	10	/* inverse of max drop probability */
    160 #define	TH_MIN		5	/* min threshold */
    161 #define	TH_MAX		15	/* max threshold */
    162 
    163 #define	RED_LIMIT	60	/* default max queue length */
    164 
    165 /*
    166  * our default policy for forced-drop is drop-tail.
    167  * (in altq-1.1.2 or earlier, the default was random-drop.
    168  * but it makes more sense to punish the cause of the surge.)
    169  * to switch to the random-drop policy, define "RED_RANDOM_DROP".
    170  */
    171 
    172 #ifdef ALTQ_FLOWVALVE
    173 /*
    174  * flow-valve is an extension to protect red from unresponsive flows
    175  * and to promote end-to-end congestion control.
    176  * flow-valve observes the average drop rates of the flows that have
    177  * experienced packet drops in the recent past.
    178  * when the average drop rate exceeds the threshold, the flow is
    179  * blocked by the flow-valve.  the trapped flow should back off
    180  * exponentially to escape from the flow-valve.
    181  */
    182 #ifdef RED_RANDOM_DROP
    183 #error "random-drop can't be used with flow-valve!"
    184 #endif
    185 #endif /* ALTQ_FLOWVALVE */
    186 
    187 /* red_list keeps all red_queue_t's allocated. */
    188 static red_queue_t *red_list = NULL;
    189 
    190 /* default red parameter values */
    191 static int default_th_min = TH_MIN;
    192 static int default_th_max = TH_MAX;
    193 static int default_inv_pmax = INV_P_MAX;
    194 
    195 /* internal function prototypes */
    196 static int red_enqueue __P((struct ifaltq *, struct mbuf *,
    197 			    struct altq_pktattr *));
    198 static struct mbuf *red_dequeue __P((struct ifaltq *, int));
    199 static int red_request __P((struct ifaltq *, int, void *));
    200 static void red_purgeq __P((red_queue_t *));
    201 static int red_detach __P((red_queue_t *));
    202 #ifdef ALTQ_FLOWVALVE
    203 static __inline struct fve *flowlist_lookup __P((struct flowvalve *,
    204 			 struct altq_pktattr *, struct timeval *));
    205 static __inline struct fve *flowlist_reclaim __P((struct flowvalve *,
    206 						  struct altq_pktattr *));
    207 static __inline void flowlist_move_to_head __P((struct flowvalve *,
    208 						struct fve *));
    209 static __inline int fv_p2f __P((struct flowvalve *, int));
    210 static struct flowvalve *fv_alloc __P((struct red *));
    211 static void fv_destroy __P((struct flowvalve *));
    212 static int fv_checkflow __P((struct flowvalve *, struct altq_pktattr *,
    213 			     struct fve **));
    214 static void fv_dropbyred __P((struct flowvalve *fv, struct altq_pktattr *,
    215 			      struct fve *));
    216 #endif
    217 
    218 /*
    219  * red device interface
    220  */
    221 altqdev_decl(red);
    222 
/*
 * device open handler for the red control device.
 * no per-open state is needed; all real setup happens later via the
 * RED_IF_ATTACH ioctl, so open always succeeds.
 */
int
redopen(dev, flag, fmt, p)
	dev_t dev;
	int flag, fmt;
	struct proc *p;
{
	/* everything will be done when the queueing scheme is attached. */
	return 0;
}
    232 
    233 int
    234 redclose(dev, flag, fmt, p)
    235 	dev_t dev;
    236 	int flag, fmt;
    237 	struct proc *p;
    238 {
    239 	red_queue_t *rqp;
    240 	int err, error = 0;
    241 
    242 	while ((rqp = red_list) != NULL) {
    243 		/* destroy all */
    244 		err = red_detach(rqp);
    245 		if (err != 0 && error == 0)
    246 			error = err;
    247 	}
    248 
    249 	return error;
    250 }
    251 
/*
 * redioctl: handle the RED control device ioctls.
 *
 * RED_GETSTATS is permitted for any user; every other command requires
 * super-user privilege.  the commands attach/detach a RED state to an
 * interface, enable/disable it, export statistics, reconfigure the red
 * parameters, and set the global defaults used by red_alloc().
 *
 * returns 0 on success or an errno value.
 */
int
redioctl(dev, cmd, addr, flag, p)
	dev_t dev;
	ioctlcmd_t cmd;
	caddr_t addr;
	int flag;
	struct proc *p;
{
	red_queue_t *rqp;
	struct red_interface *ifacep;
	struct ifnet *ifp;
	int	error = 0;

	/* check super-user privilege */
	switch (cmd) {
	case RED_GETSTATS:
		/* statistics are readable without privilege */
		break;
	default:
#if (__FreeBSD_version > 400000)
		if ((error = suser(p)) != 0)
#else
		if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
#endif
			return (error);
		break;
	}

	switch (cmd) {

	case RED_ENABLE:
		/* start red on an interface previously set up by RED_IF_ATTACH */
		ifacep = (struct red_interface *)addr;
		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
			error = EBADF;
			break;
		}
		error = altq_enable(rqp->rq_ifq);
		break;

	case RED_DISABLE:
		ifacep = (struct red_interface *)addr;
		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
			error = EBADF;
			break;
		}
		error = altq_disable(rqp->rq_ifq);
		break;

	case RED_IF_ATTACH:
		ifp = ifunit(((struct red_interface *)addr)->red_ifname);
		if (ifp == NULL) {
			error = ENXIO;
			break;
		}

		/* allocate and initialize red_queue_t */
		MALLOC(rqp, red_queue_t *, sizeof(red_queue_t), M_DEVBUF, M_WAITOK);
		if (rqp == NULL) {
			error = ENOMEM;
			break;
		}
		(void)memset(rqp, 0, sizeof(red_queue_t));

		MALLOC(rqp->rq_q, class_queue_t *, sizeof(class_queue_t),
		       M_DEVBUF, M_WAITOK);
		if (rqp->rq_q == NULL) {
			FREE(rqp, M_DEVBUF);
			error = ENOMEM;
			break;
		}
		(void)memset(rqp->rq_q, 0, sizeof(class_queue_t));

		/* all-zero arguments make red_alloc() pick its defaults */
		rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0);
		if (rqp->rq_red == NULL) {
			FREE(rqp->rq_q, M_DEVBUF);
			FREE(rqp, M_DEVBUF);
			error = ENOMEM;
			break;
		}

		rqp->rq_ifq = &ifp->if_snd;
		qtail(rqp->rq_q) = NULL;
		qlen(rqp->rq_q) = 0;
		qlimit(rqp->rq_q) = RED_LIMIT;
		qtype(rqp->rq_q) = Q_RED;

		/*
		 * set RED to this ifnet structure.
		 */
		error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp,
				    red_enqueue, red_dequeue, red_request,
				    NULL, NULL);
		if (error) {
			/* undo the three allocations made above */
			red_destroy(rqp->rq_red);
			FREE(rqp->rq_q, M_DEVBUF);
			FREE(rqp, M_DEVBUF);
			break;
		}

		/* add this state to the red list */
		rqp->rq_next = red_list;
		red_list = rqp;
		break;

	case RED_IF_DETACH:
		ifacep = (struct red_interface *)addr;
		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
			error = EBADF;
			break;
		}
		error = red_detach(rqp);
		break;

	case RED_GETSTATS:
		do {
			struct red_stats *q_stats;
			red_t *rp;

			q_stats = (struct red_stats *)addr;
			if ((rqp = altq_lookup(q_stats->iface.red_ifname,
					     ALTQT_RED)) == NULL) {
				error = EBADF;
				break;
			}

			q_stats->q_len 	   = qlen(rqp->rq_q);
			q_stats->q_limit   = qlimit(rqp->rq_q);

			rp = rqp->rq_red;
			/* export avg unscaled from its fixed-point representation */
			q_stats->q_avg 	   = rp->red_avg >> rp->red_wshift;
			q_stats->xmit_cnt  = rp->red_stats.xmit_cnt;
			q_stats->drop_cnt  = rp->red_stats.drop_cnt;
			q_stats->drop_forced   = rp->red_stats.drop_forced;
			q_stats->drop_unforced = rp->red_stats.drop_unforced;
			q_stats->marked_packets = rp->red_stats.marked_packets;

			q_stats->weight		= rp->red_weight;
			q_stats->inv_pmax	= rp->red_inv_pmax;
			q_stats->th_min		= rp->red_thmin;
			q_stats->th_max		= rp->red_thmax;

#ifdef ALTQ_FLOWVALVE
			if (rp->red_flowvalve != NULL) {
				struct flowvalve *fv = rp->red_flowvalve;
				q_stats->fv_flows    = fv->fv_flows;
				q_stats->fv_pass     = fv->fv_stats.pass;
				q_stats->fv_predrop  = fv->fv_stats.predrop;
				q_stats->fv_alloc    = fv->fv_stats.alloc;
				q_stats->fv_escape   = fv->fv_stats.escape;
			} else {
#endif /* ALTQ_FLOWVALVE */
				/* no flow-valve attached: report zeros */
				q_stats->fv_flows    = 0;
				q_stats->fv_pass     = 0;
				q_stats->fv_predrop  = 0;
				q_stats->fv_alloc    = 0;
				q_stats->fv_escape   = 0;
#ifdef ALTQ_FLOWVALVE
			}
#endif /* ALTQ_FLOWVALVE */
		} while (0);
		break;

	case RED_CONFIG:
		do {
			struct red_conf *fc;
			red_t *new;
			int s, limit;

			fc = (struct red_conf *)addr;
			if ((rqp = altq_lookup(fc->iface.red_ifname,
					       ALTQT_RED)) == NULL) {
				error = EBADF;
				break;
			}
			/* build the replacement state before touching the old one */
			new = red_alloc(fc->red_weight,
					fc->red_inv_pmax,
					fc->red_thmin,
					fc->red_thmax,
					fc->red_flags,
					fc->red_pkttime);
			if (new == NULL) {
				error = ENOMEM;
				break;
			}

			/* block network interrupts while swapping in the new state */
			s = splnet();
			red_purgeq(rqp);
			limit = fc->red_limit;
			/* the hard queue limit may not fall below th_max */
			if (limit < fc->red_thmax)
				limit = fc->red_thmax;
			qlimit(rqp->rq_q) = limit;
			fc->red_limit = limit;	/* write back the new value */

			red_destroy(rqp->rq_red);
			rqp->rq_red = new;

			splx(s);

			/* write back new values */
			fc->red_limit = limit;
			fc->red_inv_pmax = rqp->rq_red->red_inv_pmax;
			fc->red_thmin = rqp->rq_red->red_thmin;
			fc->red_thmax = rqp->rq_red->red_thmax;

		} while (0);
		break;

	case RED_SETDEFAULTS:
		do {
			struct redparams *rp;

			rp = (struct redparams *)addr;

			/* these defaults affect subsequent red_alloc() calls only */
			default_th_min = rp->th_min;
			default_th_max = rp->th_max;
			default_inv_pmax = rp->inv_pmax;
		} while (0);
		break;

	default:
		error = EINVAL;
		break;
	}
	return error;
}
    476 
    477 static int
    478 red_detach(rqp)
    479 	red_queue_t *rqp;
    480 {
    481 	red_queue_t *tmp;
    482 	int error = 0;
    483 
    484 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
    485 		altq_disable(rqp->rq_ifq);
    486 
    487 	if ((error = altq_detach(rqp->rq_ifq)))
    488 		return (error);
    489 
    490 	if (red_list == rqp)
    491 		red_list = rqp->rq_next;
    492 	else {
    493 		for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next)
    494 			if (tmp->rq_next == rqp) {
    495 				tmp->rq_next = rqp->rq_next;
    496 				break;
    497 			}
    498 		if (tmp == NULL)
    499 			printf("red_detach: no state found in red_list!\n");
    500 	}
    501 
    502 	red_destroy(rqp->rq_red);
    503 	FREE(rqp->rq_q, M_DEVBUF);
    504 	FREE(rqp, M_DEVBUF);
    505 	return (error);
    506 }
    507 
    508 /*
    509  * red support routines
    510  */
    511 
/*
 * red_alloc: allocate and initialize a red state.
 *
 * any argument passed as 0 selects a default value:
 *	weight	 - inverse of the EWMA weight (auto-tuned for slow links)
 *	inv_pmax - inverse of the max drop probability (default_inv_pmax)
 *	th_min	 - minimum threshold in packets (default_th_min)
 *	th_max	 - maximum threshold in packets (default_th_max)
 *	pkttime	 - average packet transmission time in usec (800)
 * flags is stored as-is (REDF_* bits).
 *
 * returns the new red_t, or NULL when memory allocation fails.
 */
red_t *
red_alloc(weight, inv_pmax, th_min, th_max, flags, pkttime)
	int	weight, inv_pmax, th_min, th_max;
	int	flags, pkttime;
{
	red_t 	*rp;
	int	w, i;
	int	npkts_per_sec;

	MALLOC(rp, red_t *, sizeof(red_t), M_DEVBUF, M_WAITOK);
	if (rp == NULL)
		return (NULL);
	(void)memset(rp, 0, sizeof(red_t));

	/* start idle with an empty average queue length */
	rp->red_avg = 0;
	rp->red_idle = 1;

	if (weight == 0)
		rp->red_weight = W_WEIGHT;
	else
		rp->red_weight = weight;
	if (inv_pmax == 0)
		rp->red_inv_pmax = default_inv_pmax;
	else
		rp->red_inv_pmax = inv_pmax;
	if (th_min == 0)
		rp->red_thmin = default_th_min;
	else
		rp->red_thmin = th_min;
	if (th_max == 0)
		rp->red_thmax = default_th_max;
	else
		rp->red_thmax = th_max;

	rp->red_flags = flags;

	if (pkttime == 0)
		/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
		rp->red_pkttime = 800;
	else
		rp->red_pkttime = pkttime;

	if (weight == 0) {
		/* when the link is very slow, adjust red parameters */
		npkts_per_sec = 1000000 / rp->red_pkttime;
		if (npkts_per_sec < 50) {
			/* up to about 400Kbps */
			rp->red_weight = W_WEIGHT_2;
		} else if (npkts_per_sec < 300) {
			/* up to about 2.4Mbps */
			rp->red_weight = W_WEIGHT_1;
		}
	}

	/* calculate wshift.  weight must be power of 2 */
	w = rp->red_weight;
	for (i = 0; w > 1; i++)
		w = w >> 1;
	rp->red_wshift = i;
	w = 1 << rp->red_wshift;
	if (w != rp->red_weight) {
		/* a non-power-of-2 weight is rounded down with a warning */
		printf("invalid weight value %d for red! use %d\n",
		       rp->red_weight, w);
		rp->red_weight = w;
	}

	/*
	 * thmin_s and thmax_s are scaled versions of th_min and th_max
	 * to be compared with avg.
	 */
	rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
	rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);

	/*
	 * precompute probability denominator
	 *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
	 */
	rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
			 * rp->red_inv_pmax) << FP_SHIFT;

	/* allocate weight table */
	rp->red_wtab = wtab_alloc(rp->red_weight);

	/* record the current time as the start of the initial idle period */
	microtime(&rp->red_last);
#ifdef ALTQ_FLOWVALVE
	if (flags & REDF_FLOWVALVE)
		rp->red_flowvalve = fv_alloc(rp);
	/* if fv_alloc failes, flowvalve is just disabled */
#endif
	return (rp);
}
    603 
/*
 * red_destroy: free a red state created by red_alloc().
 * tears down the optional flow-valve, releases this state's reference
 * on the shared weight table, and frees the red_t itself.
 */
void
red_destroy(rp)
	red_t *rp;
{
#ifdef ALTQ_FLOWVALVE
	if (rp->red_flowvalve != NULL)
		fv_destroy(rp->red_flowvalve);
#endif
	wtab_destroy(rp->red_wtab);
	FREE(rp, M_DEVBUF);
}
    615 
    616 void
    617 red_getstats(rp, sp)
    618 	red_t *rp;
    619 	struct redstats *sp;
    620 {
    621 	sp->q_avg 		= rp->red_avg >> rp->red_wshift;
    622 	sp->xmit_cnt		= rp->red_stats.xmit_cnt;
    623 	sp->drop_cnt		= rp->red_stats.drop_cnt;
    624 	sp->drop_forced		= rp->red_stats.drop_forced;
    625 	sp->drop_unforced	= rp->red_stats.drop_unforced;
    626 	sp->marked_packets	= rp->red_stats.marked_packets;
    627 }
    628 
    629 /*
    630  * enqueue routine:
    631  *
    632  *	returns: 0 when successfully queued.
    633  *		 ENOBUFS when drop occurs.
    634  */
    635 static int
    636 red_enqueue(ifq, m, pktattr)
    637 	struct ifaltq *ifq;
    638 	struct mbuf *m;
    639 	struct altq_pktattr *pktattr;
    640 {
    641 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
    642 
    643 	if (red_addq(rqp->rq_red, rqp->rq_q, m, pktattr) < 0)
    644 		return ENOBUFS;
    645 	ifq->ifq_len++;
    646 	return 0;
    647 }
    648 
/*
 * red_addq: the RED enqueue decision for one arriving packet.
 *
 * updates the EWMA average queue length, then either enqueues the
 * packet, ECN-marks it, or drops it (early drop or forced drop),
 * maintaining the drop statistics along the way.
 *
 * returns 0 if the packet was queued, -1 if one was dropped (the
 * dropped mbuf has already been freed by the time we return).
 */
int
red_addq(rp, q, m, pktattr)
	red_t *rp;
	class_queue_t *q;
	struct mbuf *m;
	struct altq_pktattr *pktattr;
{
	int avg, droptype;
	int n;
#ifdef ALTQ_FLOWVALVE
	struct fve *fve = NULL;

	/* packets of flows currently trapped by the flow-valve are
	 * dropped up front, before the red machinery runs */
	if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0)
		if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) {
			m_freem(m);
			return (-1);
		}
#endif

	avg = rp->red_avg;

	/*
	 * if we were idle, we pretend that n packets arrived during
	 * the idle period.
	 */
	if (rp->red_idle) {
		struct timeval now;
		int t;

		rp->red_idle = 0;
		microtime(&now);
		t = (now.tv_sec - rp->red_last.tv_sec);
		if (t > 60) {
			/*
			 * being idle for more than 1 minute, set avg to zero.
			 * this prevents t from overflow.
			 */
			avg = 0;
		} else {
			t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
			n = t / rp->red_pkttime - 1;

			/* the following line does (avg = (1 - Wq)^n * avg) */
			if (n > 0)
				avg = (avg >> FP_SHIFT) *
				    pow_w(rp->red_wtab, n);
		}
	}

	/* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
	avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
	rp->red_avg = avg;		/* save the new value */

	/*
	 * red_count keeps a tally of arriving traffic that has not
	 * been dropped.
	 */
	rp->red_count++;

	/* see if we drop early */
	droptype = DTYPE_NODROP;
	if (avg >= rp->red_thmin_s && qlen(q) > 1) {
		if (avg >= rp->red_thmax_s) {
			/* avg >= th_max: forced drop */
			droptype = DTYPE_FORCED;
		} else if (rp->red_old == 0) {
			/* first exceeds th_min */
			rp->red_count = 1;
			rp->red_old = 1;
		} else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
				      rp->red_probd, rp->red_count)) {
			/* mark or drop by red */
			if ((rp->red_flags & REDF_ECN) &&
			    mark_ecn(m, pktattr, rp->red_flags)) {
				/* successfully marked.  do not drop. */
				rp->red_count = 0;
#ifdef RED_STATS
				rp->red_stats.marked_packets++;
#endif
			} else {
				/* unforced drop by red */
				droptype = DTYPE_EARLY;
			}
		}
	} else {
		/* avg < th_min */
		rp->red_old = 0;
	}

	/*
	 * if the queue length hits the hard limit, it's a forced drop.
	 */
	if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
		droptype = DTYPE_FORCED;

#ifdef RED_RANDOM_DROP
	/* if successful or forced drop, enqueue this packet. */
	if (droptype != DTYPE_EARLY)
		_addq(q, m);
#else
	/* if successful, enqueue this packet. */
	if (droptype == DTYPE_NODROP)
		_addq(q, m);
#endif
	if (droptype != DTYPE_NODROP) {
		if (droptype == DTYPE_EARLY) {
			/* drop the incoming packet */
#ifdef RED_STATS
			rp->red_stats.drop_unforced++;
#endif
		} else {
			/* forced drop, select a victim packet in the queue. */
#ifdef RED_RANDOM_DROP
			m = _getq_random(q);
#endif
#ifdef RED_STATS
			rp->red_stats.drop_forced++;
#endif
		}
#ifdef RED_STATS
		PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
#endif
		rp->red_count = 0;
#ifdef ALTQ_FLOWVALVE
		if (rp->red_flowvalve != NULL)
			fv_dropbyred(rp->red_flowvalve, pktattr, fve);
#endif
		m_freem(m);
		return (-1);
	}
	/* successfully queued */
#ifdef RED_STATS
	PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
#endif
	return (0);
}
    785 
    786 /*
    787  * early-drop probability is calculated as follows:
    788  *   prob = p_max * (avg - th_min) / (th_max - th_min)
    789  *   prob_a = prob / (2 - count*prob)
    790  *	    = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
    791  * here prob_a increases as successive undrop count increases.
    792  * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
    793  * becomes 1 when (count >= (2 / prob))).
    794  */
int
drop_early(fp_len, fp_probd, count)
	int fp_len;	/* (avg - TH_MIN) in fixed-point */
	int fp_probd;	/* (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point */
	int count;	/* how many successive undropped packets */
{
	int denom;

	/*
	 * the drop probability is fp_len / denom; the denominator
	 * shrinks as the run of undropped packets grows, so the drop
	 * chance increases with count (see the comment above).
	 */
	denom = fp_probd - count * fp_len;
	if (denom <= 0) {
		/* count exceeds the hard limit: drop or mark */
		return (1);
	}

	/*
	 * probabilistic decision: drop or mark the packet with
	 * probability fp_len / denom, otherwise let it through.
	 */
	return ((random() % denom) < fp_len) ? 1 : 0;
}
    821 
    822 /*
    823  * try to mark CE bit to the packet.
    824  *    returns 1 if successfully marked, 0 otherwise.
    825  */
int
mark_ecn(m, pktattr, flags)
	struct mbuf *m;
	struct altq_pktattr *pktattr;
	int flags;
{
	struct mbuf *m0;

	/* only IPv4 and IPv6 headers can carry ECN bits */
	if (pktattr == NULL ||
	    (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6))
		return (0);

	/* verify that pattr_hdr is within the mbuf data */
	for (m0 = m; m0 != NULL; m0 = m0->m_next)
		if ((pktattr->pattr_hdr >= m0->m_data) &&
		    (pktattr->pattr_hdr < m0->m_data + m0->m_len))
			break;
	if (m0 == NULL) {
		/* ick, pattr_hdr is stale */
		pktattr->pattr_af = AF_UNSPEC;
		return (0);
	}

	switch (pktattr->pattr_af) {
	case AF_INET:
		if (flags & REDF_ECN4) {
			struct ip *ip = (struct ip *)pktattr->pattr_hdr;
			u_int8_t otos;
			int sum;

			if (ip->ip_v != 4)
				return (0);	/* version mismatch! */

			/* the two low-order tos bits are the ECN field */
			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
				return (0);	/* not-ECT */
			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
				return (1);	/* already marked */

			/*
			 * ecn-capable but not marked,
			 * mark CE and update checksum
			 */
			otos = ip->ip_tos;
			ip->ip_tos |= IPTOS_ECN_CE;
			/*
			 * update checksum (from RFC1624)
			 *	   HC' = ~(~HC + ~m + m')
			 */
			sum = ~ntohs(ip->ip_sum) & 0xffff;
			sum += (~otos & 0xffff) + ip->ip_tos;
			sum = (sum >> 16) + (sum & 0xffff);
			sum += (sum >> 16);  /* add carry */
			ip->ip_sum = htons(~sum & 0xffff);
			return (1);
		}
		break;
#ifdef INET6
	case AF_INET6:
		if (flags & REDF_ECN6) {
			struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
			u_int32_t flowlabel;

			/* the ECN field sits at bits 20-27 of the flow word;
			 * IPv6 has no header checksum to update */
			flowlabel = ntohl(ip6->ip6_flow);
			if ((flowlabel >> 28) != 6)
				return (0);	/* version mismatch! */
			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
			    (IPTOS_ECN_NOTECT << 20))
				return (0);	/* not-ECT */
			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
			    (IPTOS_ECN_CE << 20))
				return (1);	/* already marked */
			/*
			 * ecn-capable but not marked,  mark CE
			 */
			flowlabel |= (IPTOS_ECN_CE << 20);
			ip6->ip6_flow = htonl(flowlabel);
			return (1);
		}
		break;
#endif  /* INET6 */
	}

	/* not marked */
	return (0);
}
    911 
    912 /*
    913  * dequeue routine:
    914  *	must be called in splnet.
    915  *
    916  *	returns: mbuf dequeued.
    917  *		 NULL when no packet is available in the queue.
    918  */
    919 
    920 static struct mbuf *
    921 red_dequeue(ifq, op)
    922 	struct ifaltq *ifq;
    923 	int op;
    924 {
    925 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
    926 	struct mbuf *m;
    927 
    928 	if (op == ALTDQ_POLL)
    929 		return qhead(rqp->rq_q);
    930 
    931 	/* op == ALTDQ_REMOVE */
    932 	m =  red_getq(rqp->rq_red, rqp->rq_q);
    933 	if (m != NULL)
    934 		ifq->ifq_len--;
    935 	return (m);
    936 }
    937 
    938 struct mbuf *
    939 red_getq(rp, q)
    940 	red_t *rp;
    941 	class_queue_t *q;
    942 {
    943 	struct mbuf *m;
    944 
    945 	if ((m = _getq(q)) == NULL) {
    946 		if (rp->red_idle == 0) {
    947 			rp->red_idle = 1;
    948 			microtime(&rp->red_last);
    949 		}
    950 		return NULL;
    951 	}
    952 
    953 	rp->red_idle = 0;
    954 	return (m);
    955 }
    956 
    957 static int
    958 red_request(ifq, req, arg)
    959 	struct ifaltq *ifq;
    960 	int req;
    961 	void *arg;
    962 {
    963 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
    964 
    965 	switch (req) {
    966 	case ALTRQ_PURGE:
    967 		red_purgeq(rqp);
    968 		break;
    969 	}
    970 	return (0);
    971 }
    972 
    973 static void
    974 red_purgeq(rqp)
    975 	red_queue_t *rqp;
    976 {
    977 	_flushq(rqp->rq_q);
    978 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
    979 		rqp->rq_ifq->ifq_len = 0;
    980 }
    981 
    982 
    983 /*
    984  * helper routine to calibrate avg during idle.
    985  * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
    986  * here Wq = 1/weight and the code assumes Wq is close to zero.
    987  *
    988  * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
    989  */
    990 static struct wtab *wtab_list = NULL;	/* pointer to wtab list */
    991 
    992 struct wtab *
    993 wtab_alloc(weight)
    994 	int weight;
    995 {
    996 	struct wtab *w;
    997 	int i;
    998 
    999 	for (w = wtab_list; w != NULL; w = w->w_next)
   1000 		if (w->w_weight == weight) {
   1001 			w->w_refcount++;
   1002 			return (w);
   1003 		}
   1004 
   1005 	MALLOC(w, struct wtab *, sizeof(struct wtab), M_DEVBUF, M_WAITOK);
   1006 	if (w == NULL)
   1007 		panic("wtab_alloc: malloc failed!");
   1008 	(void)memset(w, 0, sizeof(struct wtab));
   1009 	w->w_weight = weight;
   1010 	w->w_refcount = 1;
   1011 	w->w_next = wtab_list;
   1012 	wtab_list = w;
   1013 
   1014 	/* initialize the weight table */
   1015 	w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
   1016 	for (i = 1; i < 32; i++) {
   1017 		w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
   1018 		if (w->w_tab[i] == 0 && w->w_param_max == 0)
   1019 			w->w_param_max = 1 << i;
   1020 	}
   1021 
   1022 	return (w);
   1023 }
   1024 
   1025 int
   1026 wtab_destroy(w)
   1027 	struct wtab *w;
   1028 {
   1029 	struct wtab *prev;
   1030 
   1031 	if (--w->w_refcount > 0)
   1032 		return (0);
   1033 
   1034 	if (wtab_list == w)
   1035 		wtab_list = w->w_next;
   1036 	else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
   1037 		if (prev->w_next == w) {
   1038 			prev->w_next = w->w_next;
   1039 			break;
   1040 		}
   1041 
   1042 	FREE(w, M_DEVBUF);
   1043 	return (0);
   1044 }
   1045 
   1046 int32_t
   1047 pow_w(w, n)
   1048 	struct wtab *w;
   1049 	int n;
   1050 {
   1051 	int	i, bit;
   1052 	int32_t	val;
   1053 
   1054 	if (n >= w->w_param_max)
   1055 		return (0);
   1056 
   1057 	val = 1 << FP_SHIFT;
   1058 	if (n <= 0)
   1059 		return (val);
   1060 
   1061 	bit = 1;
   1062 	i = 0;
   1063 	while (n) {
   1064 		if (n & bit) {
   1065 			val = (val * w->w_tab[i]) >> FP_SHIFT;
   1066 			n &= ~bit;
   1067 		}
   1068 		i++;
   1069 		bit <<=  1;
   1070 	}
   1071 	return (val);
   1072 }
   1073 
   1074 #ifdef ALTQ_FLOWVALVE
   1075 
#define	FV_PSHIFT	7	/* weight of average drop rate -- 1/128 */
#define	FV_PSCALE(x)	((x) << FV_PSHIFT)	/* to scaled fixed-point p */
#define	FV_PUNSCALE(x)	((x) >> FV_PSHIFT)	/* back from scaled p */
#define	FV_FSHIFT	5	/* weight of average fraction -- 1/32 */
#define	FV_FSCALE(x)	((x) << FV_FSHIFT)	/* to scaled fixed-point f */
#define	FV_FUNSCALE(x)	((x) >> FV_FSHIFT)	/* back from scaled f */

#define	FV_TIMER	(3 * hz)	/* timer value for garbage collector */
#define	FV_FLOWLISTSIZE		64	/* how many flows in flowlist */

#define	FV_N			10	/* update fve_f every FV_N packets */

#define	FV_BACKOFFTHRESH	1  /* backoff threshold interval in second */
#define	FV_TTHRESH		3  /* time threshold to delete fve */
#define	FV_ALPHA		5  /* extra packet count */

/* take a timestamp; newer FreeBSD provides the cheaper getmicrotime() */
#if (__FreeBSD_version > 300000)
#define	FV_TIMESTAMP(tp)	getmicrotime(tp)
#else
#define	FV_TIMESTAMP(tp)	{ (*(tp)) = time; }
#endif
   1097 
   1098 /*
   1099  * Brtt table: 127 entry table to convert drop rate (p) to
   1100  * the corresponding bandwidth fraction (f)
   1101  * the following equation is implemented to use scaled values,
   1102  * fve_p and fve_f, in the fixed point format.
   1103  *
   1104  *   Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p))
   1105  *   f = Brtt(p) / (max_th + alpha)
   1106  */
#define	BRTT_SIZE	128	/* number of entries in brtt_tab */
#define	BRTT_SHIFT	12	/* bits below the table-index field of scaled p */
#define	BRTT_MASK	0x0007f000	/* table-index bits of scaled p */
#define	BRTT_PMAX	(1 << (FV_PSHIFT + FP_SHIFT))	/* max scaled p value */

/* precomputed Brtt(p) values in fixed-point, indexed by scaled p */
const int brtt_tab[BRTT_SIZE] = {
	0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728,
	392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361,
	225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333,
	145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612,
	98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957,
	67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440,
	47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184,
	33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611,
	24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062,
	18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487,
	14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222,
	10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844,
	8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079,
	6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746,
	5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722,
	4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924
};
   1130 
/*
 * find the flow entry matching the src/dst addresses of the arriving
 * packet.  The flowlist is kept in LRU order, so the scan can stop at
 * the first unused or stale (older than FV_TTHRESH) entry.  Returns
 * NULL when there is no live match; on a full scan, fv_flows is
 * updated with the number of active entries seen.
 */
static __inline struct fve *
flowlist_lookup(fv, pktattr, now)
	struct flowvalve *fv;
	struct altq_pktattr *pktattr;
	struct timeval *now;
{
	struct fve *fve;
	int flows;
	struct ip *ip;
#ifdef INET6
	struct ip6_hdr *ip6;
#endif
	struct timeval tthresh;

	if (pktattr == NULL)
		return (NULL);

	/* entries last hit before this time are considered stale */
	tthresh.tv_sec = now->tv_sec - FV_TTHRESH;
	flows = 0;
	/*
	 * search the flow list
	 */
	switch (pktattr->pattr_af) {
	case AF_INET:
		ip = (struct ip *)pktattr->pattr_hdr;
		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
			/* unused entry: the rest of the LRU list is idle too */
			if (fve->fve_lastdrop.tv_sec == 0)
				break;
			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
				/* stale entry: invalidate it and stop */
				fve->fve_lastdrop.tv_sec = 0;
				break;
			}
			if (fve->fve_flow.flow_af == AF_INET &&
			    fve->fve_flow.flow_ip.ip_src.s_addr ==
			    ip->ip_src.s_addr &&
			    fve->fve_flow.flow_ip.ip_dst.s_addr ==
			    ip->ip_dst.s_addr)
				return (fve);
			flows++;
		}
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
			/* unused entry: the rest of the LRU list is idle too */
			if (fve->fve_lastdrop.tv_sec == 0)
				break;
			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
				/* stale entry: invalidate it and stop */
				fve->fve_lastdrop.tv_sec = 0;
				break;
			}
			if (fve->fve_flow.flow_af == AF_INET6 &&
			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src,
					       &ip6->ip6_src) &&
			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst,
					       &ip6->ip6_dst))
				return (fve);
			flows++;
		}
		break;
#endif /* INET6 */

	default:
		/* unknown protocol.  no drop. */
		return (NULL);
	}
	fv->fv_flows = flows;	/* save the number of active fve's */
	return (NULL);
}
   1200 
   1201 static __inline struct fve *
   1202 flowlist_reclaim(fv, pktattr)
   1203 	struct flowvalve *fv;
   1204 	struct altq_pktattr *pktattr;
   1205 {
   1206 	struct fve *fve;
   1207 	struct ip *ip;
   1208 #ifdef INET6
   1209 	struct ip6_hdr *ip6;
   1210 #endif
   1211 
   1212 	/*
   1213 	 * get an entry from the tail of the LRU list.
   1214 	 */
   1215 	fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead);
   1216 
   1217 	switch (pktattr->pattr_af) {
   1218 	case AF_INET:
   1219 		ip = (struct ip *)pktattr->pattr_hdr;
   1220 		fve->fve_flow.flow_af = AF_INET;
   1221 		fve->fve_flow.flow_ip.ip_src = ip->ip_src;
   1222 		fve->fve_flow.flow_ip.ip_dst = ip->ip_dst;
   1223 		break;
   1224 #ifdef INET6
   1225 	case AF_INET6:
   1226 		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
   1227 		fve->fve_flow.flow_af = AF_INET6;
   1228 		fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src;
   1229 		fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst;
   1230 		break;
   1231 #endif
   1232 	}
   1233 
   1234 	fve->fve_state = Green;
   1235 	fve->fve_p = 0.0;
   1236 	fve->fve_f = 0.0;
   1237 	fve->fve_ifseq = fv->fv_ifseq - 1;
   1238 	fve->fve_count = 0;
   1239 
   1240 	fv->fv_flows++;
   1241 #ifdef FV_STATS
   1242 	fv->fv_stats.alloc++;
   1243 #endif
   1244 	return (fve);
   1245 }
   1246 
   1247 static __inline void
   1248 flowlist_move_to_head(fv, fve)
   1249 	struct flowvalve *fv;
   1250 	struct fve *fve;
   1251 {
   1252 	if (TAILQ_FIRST(&fv->fv_flowlist) != fve) {
   1253 		TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru);
   1254 		TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru);
   1255 	}
   1256 }
   1257 
   1258 /*
   1259  * allocate flowvalve structure
   1260  */
/*
 * allocate and initialize a flowvalve: the LRU flow list, the drop
 * rate threshold derived from the RED parameters, and the p-to-f
 * lookup table.  Returns NULL when any allocation fails (earlier
 * allocations are released on the failure paths).
 */
static struct flowvalve *
fv_alloc(rp)
	struct red *rp;
{
	struct flowvalve *fv;
	struct fve *fve;
	int i, num;

	num = FV_FLOWLISTSIZE;
	MALLOC(fv, struct flowvalve *, sizeof(struct flowvalve),
	       M_DEVBUF, M_WAITOK);
	if (fv == NULL)
		return (NULL);
	(void)memset(fv, 0, sizeof(struct flowvalve));

	MALLOC(fv->fv_fves, struct fve *, sizeof(struct fve) * num,
	       M_DEVBUF, M_WAITOK);
	if (fv->fv_fves == NULL) {
		FREE(fv, M_DEVBUF);
		return (NULL);
	}
	(void)memset(fv->fv_fves, 0, sizeof(struct fve) * num);

	/* chain all entries into the LRU list, marked unused (tv_sec == 0) */
	fv->fv_flows = 0;
	TAILQ_INIT(&fv->fv_flowlist);
	for (i = 0; i < num; i++) {
		fve = &fv->fv_fves[i];
		fve->fve_lastdrop.tv_sec = 0;
		TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru);
	}

	/* initialize drop rate threshold in scaled fixed-point */
	fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax;

	/* initialize drop rate to fraction table */
	MALLOC(fv->fv_p2ftab, int *, sizeof(int) * BRTT_SIZE,
	       M_DEVBUF, M_WAITOK);
	if (fv->fv_p2ftab == NULL) {
		FREE(fv->fv_fves, M_DEVBUF);
		FREE(fv, M_DEVBUF);
		return (NULL);
	}
	/*
	 * create the p2f table.
	 * (shift is used to keep the precision)
	 */
	for (i = 1; i < BRTT_SIZE; i++) {
		int f;

		f = brtt_tab[i] << 8;
		fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8;
	}

	return (fv);
}
   1316 
   1317 static void fv_destroy(fv)
   1318 	struct flowvalve *fv;
   1319 {
   1320 	FREE(fv->fv_p2ftab, M_DEVBUF);
   1321 	FREE(fv->fv_fves, M_DEVBUF);
   1322 	FREE(fv, M_DEVBUF);
   1323 }
   1324 
   1325 static __inline int
   1326 fv_p2f(fv, p)
   1327 	struct flowvalve	*fv;
   1328 	int	p;
   1329 {
   1330 	int val, f;
   1331 
   1332 	if (p >= BRTT_PMAX)
   1333 		f = fv->fv_p2ftab[BRTT_SIZE-1];
   1334 	else if ((val = (p & BRTT_MASK)))
   1335 		f = fv->fv_p2ftab[(val >> BRTT_SHIFT)];
   1336 	else
   1337 		f = fv->fv_p2ftab[1];
   1338 	return (f);
   1339 }
   1340 
   1341 /*
   1342  * check if an arriving packet should be pre-dropped.
   1343  * called from red_addq() when a packet arrives.
   1344  * returns 1 when the packet should be pre-dropped.
   1345  * should be called in splnet.
   1346  */
/*
 * per-packet flowvalve check.  Updates the flow's bandwidth-fraction
 * estimate f, runs the overpumping test (Green -> Red when f exceeds
 * the threshold derived from drop rate p), and either pre-drops the
 * packet (returns 1) or decays p and lets it through (returns 0).
 * *fcache is set to the matched entry so red_addq() can pass it to
 * fv_dropbyred() without a second lookup.
 */
static int
fv_checkflow(fv, pktattr, fcache)
	struct flowvalve *fv;
	struct altq_pktattr *pktattr;
	struct fve **fcache;
{
	struct fve *fve;
	struct timeval now;

	fv->fv_ifseq++;
	FV_TIMESTAMP(&now);

	if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
		/* no matching entry in the flowlist */
		return (0);

	*fcache = fve;

	/* update fraction f for every FV_N packets */
	if (++fve->fve_count == FV_N) {
		/*
		 * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f
		 */
		fve->fve_f =
			(FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq)
			+ fve->fve_f - FV_FUNSCALE(fve->fve_f);
		fve->fve_ifseq = fv->fv_ifseq;
		fve->fve_count = 0;
	}

	/*
	 * overpumping test
	 */
	if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) {
		int fthresh;

		/* calculate a threshold */
		fthresh = fv_p2f(fv, fve->fve_p);
		if (fve->fve_f > fthresh)
			fve->fve_state = Red;
	}

	if (fve->fve_state == Red) {
		/*
		 * backoff test
		 */
		if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) {
			/* no drop for at least FV_BACKOFFTHRESH sec */
			fve->fve_p = 0;
			fve->fve_state = Green;
#ifdef FV_STATS
			fv->fv_stats.escape++;
#endif
		} else {
			/* block this flow */
			flowlist_move_to_head(fv, fve);
			fve->fve_lastdrop = now;
#ifdef FV_STATS
			fv->fv_stats.predrop++;
#endif
			return (1);
		}
	}

	/*
	 * p = (1 - Wp) * p
	 */
	fve->fve_p -= FV_PUNSCALE(fve->fve_p);
	if (fve->fve_p < 0)
		fve->fve_p = 0;
#ifdef FV_STATS
	fv->fv_stats.pass++;
#endif
	return (0);
}
   1422 
   1423 /*
   1424  * called from red_addq when a packet is dropped by red.
   1425  * should be called in splnet.
   1426  */
/*
 * account a RED drop against the flow: look up (or reclaim) the flow
 * entry, move it to the head of the LRU list, bump its drop-rate
 * estimate p, and timestamp the drop.  fcache, when non-NULL, is the
 * entry already found by fv_checkflow() for this packet.
 */
static void fv_dropbyred(fv, pktattr, fcache)
	struct flowvalve *fv;
	struct altq_pktattr *pktattr;
	struct fve *fcache;
{
	struct fve *fve;
	struct timeval now;

	if (pktattr == NULL)
		return;
	FV_TIMESTAMP(&now);

	if (fcache != NULL)
		/* the fve of this packet is already cached */
		fve = fcache;
	else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
		fve = flowlist_reclaim(fv, pktattr);

	flowlist_move_to_head(fv, fve);

	/*
	 * update p:  the following line cancels the update
	 *	      in fv_checkflow() and calculate
	 *	p = Wp + (1 - Wp) * p
	 */
	fve->fve_p = (1 << FP_SHIFT) + fve->fve_p;

	fve->fve_lastdrop = now;
}
   1456 
   1457 #endif /* ALTQ_FLOWVALVE */
   1458 
   1459 #ifdef KLD_MODULE
   1460 
/* device-switch entry and module registration for loadable-module builds */
static struct altqsw red_sw =
	{"red", redopen, redclose, redioctl};

ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw);
   1465 
   1466 #endif /* KLD_MODULE */
   1467 
   1468 #endif /* ALTQ_RED */
   1469