Home | History | Annotate | Line # | Download | only in altq
altq_red.c revision 1.13.10.1
      1 /*	$NetBSD: altq_red.c,v 1.13.10.1 2006/03/08 00:23:46 elad Exp $	*/
      2 /*	$KAME: altq_red.c,v 1.9 2002/01/07 11:25:40 kjc Exp $	*/
      3 
      4 /*
      5  * Copyright (C) 1997-2000
      6  *	Sony Computer Science Laboratories Inc.  All rights reserved.
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  * 1. Redistributions of source code must retain the above copyright
     12  *    notice, this list of conditions and the following disclaimer.
     13  * 2. Redistributions in binary form must reproduce the above copyright
     14  *    notice, this list of conditions and the following disclaimer in the
     15  *    documentation and/or other materials provided with the distribution.
     16  *
     17  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
     18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     20  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
     21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     27  * SUCH DAMAGE.
     28  *
     29  */
     30 /*
     31  * Copyright (c) 1990-1994 Regents of the University of California.
     32  * All rights reserved.
     33  *
     34  * Redistribution and use in source and binary forms, with or without
     35  * modification, are permitted provided that the following conditions
     36  * are met:
     37  * 1. Redistributions of source code must retain the above copyright
     38  *    notice, this list of conditions and the following disclaimer.
     39  * 2. Redistributions in binary form must reproduce the above copyright
     40  *    notice, this list of conditions and the following disclaimer in the
     41  *    documentation and/or other materials provided with the distribution.
     42  * 3. All advertising materials mentioning features or use of this software
     43  *    must display the following acknowledgement:
     44  *	This product includes software developed by the Computer Systems
     45  *	Engineering Group at Lawrence Berkeley Laboratory.
     46  * 4. Neither the name of the University nor of the Laboratory may be used
     47  *    to endorse or promote products derived from this software without
     48  *    specific prior written permission.
     49  *
     50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     60  * SUCH DAMAGE.
     61  */
     62 
     63 #include <sys/cdefs.h>
     64 __KERNEL_RCSID(0, "$NetBSD: altq_red.c,v 1.13.10.1 2006/03/08 00:23:46 elad Exp $");
     65 
     66 #if defined(__FreeBSD__) || defined(__NetBSD__)
     67 #include "opt_altq.h"
     68 #if (__FreeBSD__ != 2)
     69 #include "opt_inet.h"
     70 #ifdef __FreeBSD__
     71 #include "opt_inet6.h"
     72 #endif
     73 #endif
     74 #endif /* __FreeBSD__ || __NetBSD__ */
     75 #ifdef ALTQ_RED	/* red is enabled by ALTQ_RED option in opt_altq.h */
     76 
     77 #include <sys/param.h>
     78 #include <sys/malloc.h>
     79 #include <sys/mbuf.h>
     80 #include <sys/socket.h>
     81 #include <sys/sockio.h>
     82 #include <sys/systm.h>
     83 #include <sys/proc.h>
     84 #include <sys/errno.h>
     85 #include <sys/kernel.h>
     86 #ifdef ALTQ_FLOWVALVE
     87 #include <sys/queue.h>
     88 #include <sys/time.h>
     89 #endif
     90 
     91 #include <net/if.h>
     92 #include <net/if_types.h>
     93 
     94 #include <netinet/in.h>
     95 #include <netinet/in_systm.h>
     96 #include <netinet/ip.h>
     97 #ifdef INET6
     98 #include <netinet/ip6.h>
     99 #endif
    100 
    101 #include <altq/altq.h>
    102 #include <altq/altq_conf.h>
    103 #include <altq/altq_red.h>
    104 #ifdef ALTQ_FLOWVALVE
    105 #include <altq/altq_flowvalve.h>
    106 #endif
    107 
    108 /*
    109  * ALTQ/RED (Random Early Detection) implementation using 32-bit
    110  * fixed-point calculation.
    111  *
    112  * written by kjc using the ns code as a reference.
    113  * you can learn more about red and ns from Sally's home page at
    114  * http://www-nrg.ee.lbl.gov/floyd/
    115  *
    116  * most of the red parameter values are fixed in this implementation
    117  * to prevent fixed-point overflow/underflow.
    118  * if you change the parameters, watch out for overflow/underflow!
    119  *
    120  * the parameters used are recommended values by Sally.
    121  * the corresponding ns config looks:
    122  *	q_weight=0.00195
    123  *	minthresh=5 maxthresh=15 queue-size=60
    124  *	linterm=30
    125  *	dropmech=drop-tail
    126  *	bytes=false (can't be handled by 32-bit fixed-point)
    127  *	doubleq=false dqthresh=false
    128  *	wait=true
    129  */
    130 /*
    131  * alternative red parameters for a slow link.
    132  *
    133  * assume the queue length becomes from zero to L and keeps L, it takes
    134  * N packets for q_avg to reach 63% of L.
    135  * when q_weight is 0.002, N is about 500 packets.
    136  * for a slow link like dial-up, 500 packets takes more than 1 minute!
    137  * when q_weight is 0.008, N is about 127 packets.
    138  * when q_weight is 0.016, N is about 63 packets.
    139  * bursts of 50 packets are allowd for 0.002, bursts of 25 packets
    140  * are allowed for 0.016.
    141  * see Sally's paper for more details.
    142  */
    143 /* normal red parameters */
    144 #define	W_WEIGHT	512	/* inverse of weight of EWMA (511/512) */
    145 				/* q_weight = 0.00195 */
    146 
    147 /* red parameters for a slow link */
    148 #define	W_WEIGHT_1	128	/* inverse of weight of EWMA (127/128) */
    149 				/* q_weight = 0.0078125 */
    150 
    151 /* red parameters for a very slow link (e.g., dialup) */
    152 #define	W_WEIGHT_2	64	/* inverse of weight of EWMA (63/64) */
    153 				/* q_weight = 0.015625 */
    154 
    155 /* fixed-point uses 12-bit decimal places */
    156 #define	FP_SHIFT	12	/* fixed-point shift */
    157 
    158 /* red parameters for drop probability */
    159 #define	INV_P_MAX	10	/* inverse of max drop probability */
    160 #define	TH_MIN		5	/* min threshold */
    161 #define	TH_MAX		15	/* max threshold */
    162 
    163 #define	RED_LIMIT	60	/* default max queue length */
    164 
    165 /*
    166  * our default policy for forced-drop is drop-tail.
    167  * (in altq-1.1.2 or earlier, the default was random-drop.
    168  * but it makes more sense to punish the cause of the surge.)
    169  * to switch to the random-drop policy, define "RED_RANDOM_DROP".
    170  */
    171 
    172 #ifdef ALTQ_FLOWVALVE
    173 /*
    174  * flow-valve is an extension to protect red from unresponsive flows
    175  * and to promote end-to-end congestion control.
    176  * flow-valve observes the average drop rates of the flows that have
    177  * experienced packet drops in the recent past.
    178  * when the average drop rate exceeds the threshold, the flow is
    179  * blocked by the flow-valve.  the trapped flow should back off
    180  * exponentially to escape from the flow-valve.
    181  */
    182 #ifdef RED_RANDOM_DROP
    183 #error "random-drop can't be used with flow-valve!"
    184 #endif
    185 #endif /* ALTQ_FLOWVALVE */
    186 
    187 /* red_list keeps all red_queue_t's allocated. */
    188 static red_queue_t *red_list = NULL;
    189 
    190 /* default red parameter values */
    191 static int default_th_min = TH_MIN;
    192 static int default_th_max = TH_MAX;
    193 static int default_inv_pmax = INV_P_MAX;
    194 
    195 /* internal function prototypes */
    196 static int red_enqueue __P((struct ifaltq *, struct mbuf *,
    197 			    struct altq_pktattr *));
    198 static struct mbuf *red_dequeue __P((struct ifaltq *, int));
    199 static int red_request __P((struct ifaltq *, int, void *));
    200 static void red_purgeq __P((red_queue_t *));
    201 static int red_detach __P((red_queue_t *));
    202 #ifdef ALTQ_FLOWVALVE
    203 static inline struct fve *flowlist_lookup __P((struct flowvalve *,
    204 			 struct altq_pktattr *, struct timeval *));
    205 static inline struct fve *flowlist_reclaim __P((struct flowvalve *,
    206 						  struct altq_pktattr *));
    207 static inline void flowlist_move_to_head __P((struct flowvalve *,
    208 						struct fve *));
    209 static inline int fv_p2f __P((struct flowvalve *, int));
    210 static struct flowvalve *fv_alloc __P((struct red *));
    211 static void fv_destroy __P((struct flowvalve *));
    212 static int fv_checkflow __P((struct flowvalve *, struct altq_pktattr *,
    213 			     struct fve **));
    214 static void fv_dropbyred __P((struct flowvalve *fv, struct altq_pktattr *,
    215 			      struct fve *));
    216 #endif
    217 
    218 /*
    219  * red device interface
    220  */
    221 altqdev_decl(red);
    222 
    223 int
    224 redopen(dev, flag, fmt, l)
    225 	dev_t dev;
    226 	int flag, fmt;
    227 	struct lwp *l;
    228 {
    229 	/* everything will be done when the queueing scheme is attached. */
    230 	return 0;
    231 }
    232 
    233 int
    234 redclose(dev, flag, fmt, l)
    235 	dev_t dev;
    236 	int flag, fmt;
    237 	struct lwp *l;
    238 {
    239 	red_queue_t *rqp;
    240 	int err, error = 0;
    241 
    242 	while ((rqp = red_list) != NULL) {
    243 		/* destroy all */
    244 		err = red_detach(rqp);
    245 		if (err != 0 && error == 0)
    246 			error = err;
    247 	}
    248 
    249 	return error;
    250 }
    251 
    252 int
    253 redioctl(dev, cmd, addr, flag, l)
    254 	dev_t dev;
    255 	ioctlcmd_t cmd;
    256 	caddr_t addr;
    257 	int flag;
    258 	struct lwp *l;
    259 {
    260 	red_queue_t *rqp;
    261 	struct red_interface *ifacep;
    262 	struct ifnet *ifp;
    263 	struct proc *p = l->l_proc;
    264 	int	error = 0;
    265 
    266 	/* check super-user privilege */
    267 	switch (cmd) {
    268 	case RED_GETSTATS:
    269 		break;
    270 	default:
    271 #if (__FreeBSD_version > 400000)
    272 		if ((error = suser(p)) != 0)
    273 #else
    274 		if ((error = generic_authorize(p->p_cred,
    275 					       KAUTH_GENERIC_ISSUSER,
    276 					       &p->p_acflag)) != 0)
    277 #endif
    278 			return (error);
    279 		break;
    280 	}
    281 
    282 	switch (cmd) {
    283 
    284 	case RED_ENABLE:
    285 		ifacep = (struct red_interface *)addr;
    286 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
    287 			error = EBADF;
    288 			break;
    289 		}
    290 		error = altq_enable(rqp->rq_ifq);
    291 		break;
    292 
    293 	case RED_DISABLE:
    294 		ifacep = (struct red_interface *)addr;
    295 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
    296 			error = EBADF;
    297 			break;
    298 		}
    299 		error = altq_disable(rqp->rq_ifq);
    300 		break;
    301 
    302 	case RED_IF_ATTACH:
    303 		ifp = ifunit(((struct red_interface *)addr)->red_ifname);
    304 		if (ifp == NULL) {
    305 			error = ENXIO;
    306 			break;
    307 		}
    308 
    309 		/* allocate and initialize red_queue_t */
    310 		MALLOC(rqp, red_queue_t *, sizeof(red_queue_t), M_DEVBUF, M_WAITOK);
    311 		if (rqp == NULL) {
    312 			error = ENOMEM;
    313 			break;
    314 		}
    315 		(void)memset(rqp, 0, sizeof(red_queue_t));
    316 
    317 		MALLOC(rqp->rq_q, class_queue_t *, sizeof(class_queue_t),
    318 		       M_DEVBUF, M_WAITOK);
    319 		if (rqp->rq_q == NULL) {
    320 			FREE(rqp, M_DEVBUF);
    321 			error = ENOMEM;
    322 			break;
    323 		}
    324 		(void)memset(rqp->rq_q, 0, sizeof(class_queue_t));
    325 
    326 		rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0);
    327 		if (rqp->rq_red == NULL) {
    328 			FREE(rqp->rq_q, M_DEVBUF);
    329 			FREE(rqp, M_DEVBUF);
    330 			error = ENOMEM;
    331 			break;
    332 		}
    333 
    334 		rqp->rq_ifq = &ifp->if_snd;
    335 		qtail(rqp->rq_q) = NULL;
    336 		qlen(rqp->rq_q) = 0;
    337 		qlimit(rqp->rq_q) = RED_LIMIT;
    338 		qtype(rqp->rq_q) = Q_RED;
    339 
    340 		/*
    341 		 * set RED to this ifnet structure.
    342 		 */
    343 		error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp,
    344 				    red_enqueue, red_dequeue, red_request,
    345 				    NULL, NULL);
    346 		if (error) {
    347 			red_destroy(rqp->rq_red);
    348 			FREE(rqp->rq_q, M_DEVBUF);
    349 			FREE(rqp, M_DEVBUF);
    350 			break;
    351 		}
    352 
    353 		/* add this state to the red list */
    354 		rqp->rq_next = red_list;
    355 		red_list = rqp;
    356 		break;
    357 
    358 	case RED_IF_DETACH:
    359 		ifacep = (struct red_interface *)addr;
    360 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
    361 			error = EBADF;
    362 			break;
    363 		}
    364 		error = red_detach(rqp);
    365 		break;
    366 
    367 	case RED_GETSTATS:
    368 		do {
    369 			struct red_stats *q_stats;
    370 			red_t *rp;
    371 
    372 			q_stats = (struct red_stats *)addr;
    373 			if ((rqp = altq_lookup(q_stats->iface.red_ifname,
    374 					     ALTQT_RED)) == NULL) {
    375 				error = EBADF;
    376 				break;
    377 			}
    378 
    379 			q_stats->q_len 	   = qlen(rqp->rq_q);
    380 			q_stats->q_limit   = qlimit(rqp->rq_q);
    381 
    382 			rp = rqp->rq_red;
    383 			q_stats->q_avg 	   = rp->red_avg >> rp->red_wshift;
    384 			q_stats->xmit_cnt  = rp->red_stats.xmit_cnt;
    385 			q_stats->drop_cnt  = rp->red_stats.drop_cnt;
    386 			q_stats->drop_forced   = rp->red_stats.drop_forced;
    387 			q_stats->drop_unforced = rp->red_stats.drop_unforced;
    388 			q_stats->marked_packets = rp->red_stats.marked_packets;
    389 
    390 			q_stats->weight		= rp->red_weight;
    391 			q_stats->inv_pmax	= rp->red_inv_pmax;
    392 			q_stats->th_min		= rp->red_thmin;
    393 			q_stats->th_max		= rp->red_thmax;
    394 
    395 #ifdef ALTQ_FLOWVALVE
    396 			if (rp->red_flowvalve != NULL) {
    397 				struct flowvalve *fv = rp->red_flowvalve;
    398 				q_stats->fv_flows    = fv->fv_flows;
    399 				q_stats->fv_pass     = fv->fv_stats.pass;
    400 				q_stats->fv_predrop  = fv->fv_stats.predrop;
    401 				q_stats->fv_alloc    = fv->fv_stats.alloc;
    402 				q_stats->fv_escape   = fv->fv_stats.escape;
    403 			} else {
    404 #endif /* ALTQ_FLOWVALVE */
    405 				q_stats->fv_flows    = 0;
    406 				q_stats->fv_pass     = 0;
    407 				q_stats->fv_predrop  = 0;
    408 				q_stats->fv_alloc    = 0;
    409 				q_stats->fv_escape   = 0;
    410 #ifdef ALTQ_FLOWVALVE
    411 			}
    412 #endif /* ALTQ_FLOWVALVE */
    413 		} while (0);
    414 		break;
    415 
    416 	case RED_CONFIG:
    417 		do {
    418 			struct red_conf *fc;
    419 			red_t *new;
    420 			int s, limit;
    421 
    422 			fc = (struct red_conf *)addr;
    423 			if ((rqp = altq_lookup(fc->iface.red_ifname,
    424 					       ALTQT_RED)) == NULL) {
    425 				error = EBADF;
    426 				break;
    427 			}
    428 			new = red_alloc(fc->red_weight,
    429 					fc->red_inv_pmax,
    430 					fc->red_thmin,
    431 					fc->red_thmax,
    432 					fc->red_flags,
    433 					fc->red_pkttime);
    434 			if (new == NULL) {
    435 				error = ENOMEM;
    436 				break;
    437 			}
    438 
    439 			s = splnet();
    440 			red_purgeq(rqp);
    441 			limit = fc->red_limit;
    442 			if (limit < fc->red_thmax)
    443 				limit = fc->red_thmax;
    444 			qlimit(rqp->rq_q) = limit;
    445 			fc->red_limit = limit;	/* write back the new value */
    446 
    447 			red_destroy(rqp->rq_red);
    448 			rqp->rq_red = new;
    449 
    450 			splx(s);
    451 
    452 			/* write back new values */
    453 			fc->red_limit = limit;
    454 			fc->red_inv_pmax = rqp->rq_red->red_inv_pmax;
    455 			fc->red_thmin = rqp->rq_red->red_thmin;
    456 			fc->red_thmax = rqp->rq_red->red_thmax;
    457 
    458 		} while (0);
    459 		break;
    460 
    461 	case RED_SETDEFAULTS:
    462 		do {
    463 			struct redparams *rp;
    464 
    465 			rp = (struct redparams *)addr;
    466 
    467 			default_th_min = rp->th_min;
    468 			default_th_max = rp->th_max;
    469 			default_inv_pmax = rp->inv_pmax;
    470 		} while (0);
    471 		break;
    472 
    473 	default:
    474 		error = EINVAL;
    475 		break;
    476 	}
    477 	return error;
    478 }
    479 
    480 static int
    481 red_detach(rqp)
    482 	red_queue_t *rqp;
    483 {
    484 	red_queue_t *tmp;
    485 	int error = 0;
    486 
    487 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
    488 		altq_disable(rqp->rq_ifq);
    489 
    490 	if ((error = altq_detach(rqp->rq_ifq)))
    491 		return (error);
    492 
    493 	if (red_list == rqp)
    494 		red_list = rqp->rq_next;
    495 	else {
    496 		for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next)
    497 			if (tmp->rq_next == rqp) {
    498 				tmp->rq_next = rqp->rq_next;
    499 				break;
    500 			}
    501 		if (tmp == NULL)
    502 			printf("red_detach: no state found in red_list!\n");
    503 	}
    504 
    505 	red_destroy(rqp->rq_red);
    506 	FREE(rqp->rq_q, M_DEVBUF);
    507 	FREE(rqp, M_DEVBUF);
    508 	return (error);
    509 }
    510 
    511 /*
    512  * red support routines
    513  */
    514 
    515 red_t *
    516 red_alloc(weight, inv_pmax, th_min, th_max, flags, pkttime)
    517 	int	weight, inv_pmax, th_min, th_max;
    518 	int	flags, pkttime;
    519 {
    520 	red_t 	*rp;
    521 	int	w, i;
    522 	int	npkts_per_sec;
    523 
    524 	MALLOC(rp, red_t *, sizeof(red_t), M_DEVBUF, M_WAITOK);
    525 	if (rp == NULL)
    526 		return (NULL);
    527 	(void)memset(rp, 0, sizeof(red_t));
    528 
    529 	rp->red_avg = 0;
    530 	rp->red_idle = 1;
    531 
    532 	if (weight == 0)
    533 		rp->red_weight = W_WEIGHT;
    534 	else
    535 		rp->red_weight = weight;
    536 	if (inv_pmax == 0)
    537 		rp->red_inv_pmax = default_inv_pmax;
    538 	else
    539 		rp->red_inv_pmax = inv_pmax;
    540 	if (th_min == 0)
    541 		rp->red_thmin = default_th_min;
    542 	else
    543 		rp->red_thmin = th_min;
    544 	if (th_max == 0)
    545 		rp->red_thmax = default_th_max;
    546 	else
    547 		rp->red_thmax = th_max;
    548 
    549 	rp->red_flags = flags;
    550 
    551 	if (pkttime == 0)
    552 		/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
    553 		rp->red_pkttime = 800;
    554 	else
    555 		rp->red_pkttime = pkttime;
    556 
    557 	if (weight == 0) {
    558 		/* when the link is very slow, adjust red parameters */
    559 		npkts_per_sec = 1000000 / rp->red_pkttime;
    560 		if (npkts_per_sec < 50) {
    561 			/* up to about 400Kbps */
    562 			rp->red_weight = W_WEIGHT_2;
    563 		} else if (npkts_per_sec < 300) {
    564 			/* up to about 2.4Mbps */
    565 			rp->red_weight = W_WEIGHT_1;
    566 		}
    567 	}
    568 
    569 	/* calculate wshift.  weight must be power of 2 */
    570 	w = rp->red_weight;
    571 	for (i = 0; w > 1; i++)
    572 		w = w >> 1;
    573 	rp->red_wshift = i;
    574 	w = 1 << rp->red_wshift;
    575 	if (w != rp->red_weight) {
    576 		printf("invalid weight value %d for red! use %d\n",
    577 		       rp->red_weight, w);
    578 		rp->red_weight = w;
    579 	}
    580 
    581 	/*
    582 	 * thmin_s and thmax_s are scaled versions of th_min and th_max
    583 	 * to be compared with avg.
    584 	 */
    585 	rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
    586 	rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
    587 
    588 	/*
    589 	 * precompute probability denominator
    590 	 *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
    591 	 */
    592 	rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
    593 			 * rp->red_inv_pmax) << FP_SHIFT;
    594 
    595 	/* allocate weight table */
    596 	rp->red_wtab = wtab_alloc(rp->red_weight);
    597 
    598 	microtime(&rp->red_last);
    599 #ifdef ALTQ_FLOWVALVE
    600 	if (flags & REDF_FLOWVALVE)
    601 		rp->red_flowvalve = fv_alloc(rp);
    602 	/* if fv_alloc failes, flowvalve is just disabled */
    603 #endif
    604 	return (rp);
    605 }
    606 
    607 void
    608 red_destroy(rp)
    609 	red_t *rp;
    610 {
    611 #ifdef ALTQ_FLOWVALVE
    612 	if (rp->red_flowvalve != NULL)
    613 		fv_destroy(rp->red_flowvalve);
    614 #endif
    615 	wtab_destroy(rp->red_wtab);
    616 	FREE(rp, M_DEVBUF);
    617 }
    618 
    619 void
    620 red_getstats(rp, sp)
    621 	red_t *rp;
    622 	struct redstats *sp;
    623 {
    624 	sp->q_avg 		= rp->red_avg >> rp->red_wshift;
    625 	sp->xmit_cnt		= rp->red_stats.xmit_cnt;
    626 	sp->drop_cnt		= rp->red_stats.drop_cnt;
    627 	sp->drop_forced		= rp->red_stats.drop_forced;
    628 	sp->drop_unforced	= rp->red_stats.drop_unforced;
    629 	sp->marked_packets	= rp->red_stats.marked_packets;
    630 }
    631 
    632 /*
    633  * enqueue routine:
    634  *
    635  *	returns: 0 when successfully queued.
    636  *		 ENOBUFS when drop occurs.
    637  */
    638 static int
    639 red_enqueue(ifq, m, pktattr)
    640 	struct ifaltq *ifq;
    641 	struct mbuf *m;
    642 	struct altq_pktattr *pktattr;
    643 {
    644 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
    645 
    646 	if (red_addq(rqp->rq_red, rqp->rq_q, m, pktattr) < 0)
    647 		return ENOBUFS;
    648 	ifq->ifq_len++;
    649 	return 0;
    650 }
    651 
    652 int
    653 red_addq(rp, q, m, pktattr)
    654 	red_t *rp;
    655 	class_queue_t *q;
    656 	struct mbuf *m;
    657 	struct altq_pktattr *pktattr;
    658 {
    659 	int avg, droptype;
    660 	int n;
    661 #ifdef ALTQ_FLOWVALVE
    662 	struct fve *fve = NULL;
    663 
    664 	if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0)
    665 		if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) {
    666 			m_freem(m);
    667 			return (-1);
    668 		}
    669 #endif
    670 
    671 	avg = rp->red_avg;
    672 
    673 	/*
    674 	 * if we were idle, we pretend that n packets arrived during
    675 	 * the idle period.
    676 	 */
    677 	if (rp->red_idle) {
    678 		struct timeval now;
    679 		int t;
    680 
    681 		rp->red_idle = 0;
    682 		microtime(&now);
    683 		t = (now.tv_sec - rp->red_last.tv_sec);
    684 		if (t > 60) {
    685 			/*
    686 			 * being idle for more than 1 minute, set avg to zero.
    687 			 * this prevents t from overflow.
    688 			 */
    689 			avg = 0;
    690 		} else {
    691 			t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
    692 			n = t / rp->red_pkttime - 1;
    693 
    694 			/* the following line does (avg = (1 - Wq)^n * avg) */
    695 			if (n > 0)
    696 				avg = (avg >> FP_SHIFT) *
    697 				    pow_w(rp->red_wtab, n);
    698 		}
    699 	}
    700 
    701 	/* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
    702 	avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
    703 	rp->red_avg = avg;		/* save the new value */
    704 
    705 	/*
    706 	 * red_count keeps a tally of arriving traffic that has not
    707 	 * been dropped.
    708 	 */
    709 	rp->red_count++;
    710 
    711 	/* see if we drop early */
    712 	droptype = DTYPE_NODROP;
    713 	if (avg >= rp->red_thmin_s && qlen(q) > 1) {
    714 		if (avg >= rp->red_thmax_s) {
    715 			/* avg >= th_max: forced drop */
    716 			droptype = DTYPE_FORCED;
    717 		} else if (rp->red_old == 0) {
    718 			/* first exceeds th_min */
    719 			rp->red_count = 1;
    720 			rp->red_old = 1;
    721 		} else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
    722 				      rp->red_probd, rp->red_count)) {
    723 			/* mark or drop by red */
    724 			if ((rp->red_flags & REDF_ECN) &&
    725 			    mark_ecn(m, pktattr, rp->red_flags)) {
    726 				/* successfully marked.  do not drop. */
    727 				rp->red_count = 0;
    728 #ifdef RED_STATS
    729 				rp->red_stats.marked_packets++;
    730 #endif
    731 			} else {
    732 				/* unforced drop by red */
    733 				droptype = DTYPE_EARLY;
    734 			}
    735 		}
    736 	} else {
    737 		/* avg < th_min */
    738 		rp->red_old = 0;
    739 	}
    740 
    741 	/*
    742 	 * if the queue length hits the hard limit, it's a forced drop.
    743 	 */
    744 	if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
    745 		droptype = DTYPE_FORCED;
    746 
    747 #ifdef RED_RANDOM_DROP
    748 	/* if successful or forced drop, enqueue this packet. */
    749 	if (droptype != DTYPE_EARLY)
    750 		_addq(q, m);
    751 #else
    752 	/* if successful, enqueue this packet. */
    753 	if (droptype == DTYPE_NODROP)
    754 		_addq(q, m);
    755 #endif
    756 	if (droptype != DTYPE_NODROP) {
    757 		if (droptype == DTYPE_EARLY) {
    758 			/* drop the incoming packet */
    759 #ifdef RED_STATS
    760 			rp->red_stats.drop_unforced++;
    761 #endif
    762 		} else {
    763 			/* forced drop, select a victim packet in the queue. */
    764 #ifdef RED_RANDOM_DROP
    765 			m = _getq_random(q);
    766 #endif
    767 #ifdef RED_STATS
    768 			rp->red_stats.drop_forced++;
    769 #endif
    770 		}
    771 #ifdef RED_STATS
    772 		PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
    773 #endif
    774 		rp->red_count = 0;
    775 #ifdef ALTQ_FLOWVALVE
    776 		if (rp->red_flowvalve != NULL)
    777 			fv_dropbyred(rp->red_flowvalve, pktattr, fve);
    778 #endif
    779 		m_freem(m);
    780 		return (-1);
    781 	}
    782 	/* successfully queued */
    783 #ifdef RED_STATS
    784 	PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
    785 #endif
    786 	return (0);
    787 }
    788 
    789 /*
    790  * early-drop probability is calculated as follows:
    791  *   prob = p_max * (avg - th_min) / (th_max - th_min)
    792  *   prob_a = prob / (2 - count*prob)
    793  *	    = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
    794  * here prob_a increases as successive undrop count increases.
    795  * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
    796  * becomes 1 when (count >= (2 / prob))).
    797  */
    798 int
    799 drop_early(fp_len, fp_probd, count)
    800 	int fp_len;	/* (avg - TH_MIN) in fixed-point */
    801 	int fp_probd;	/* (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point */
    802 	int count;	/* how many successive undropped packets */
    803 {
    804 	int d;		/* denominator of drop-probability */
    805 
    806 	d = fp_probd - count * fp_len;
    807 	if (d <= 0)
    808 		/* count exceeds the hard limit: drop or mark */
    809 		return (1);
    810 
    811 	/*
    812 	 * now the range of d is [1..600] in fixed-point. (when
    813 	 * th_max-th_min=10 and p_max=1/30)
    814 	 * drop probability = (avg - TH_MIN) / d
    815 	 */
    816 
    817 	if ((random() % d) < fp_len) {
    818 		/* drop or mark */
    819 		return (1);
    820 	}
    821 	/* no drop/mark */
    822 	return (0);
    823 }
    824 
    825 /*
    826  * try to mark CE bit to the packet.
    827  *    returns 1 if successfully marked, 0 otherwise.
    828  */
    829 int
    830 mark_ecn(m, pktattr, flags)
    831 	struct mbuf *m;
    832 	struct altq_pktattr *pktattr;
    833 	int flags;
    834 {
    835 	struct mbuf *m0;
    836 
    837 	if (pktattr == NULL ||
    838 	    (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6))
    839 		return (0);
    840 
    841 	/* verify that pattr_hdr is within the mbuf data */
    842 	for (m0 = m; m0 != NULL; m0 = m0->m_next)
    843 		if ((pktattr->pattr_hdr >= m0->m_data) &&
    844 		    (pktattr->pattr_hdr < m0->m_data + m0->m_len))
    845 			break;
    846 	if (m0 == NULL) {
    847 		/* ick, pattr_hdr is stale */
    848 		pktattr->pattr_af = AF_UNSPEC;
    849 		return (0);
    850 	}
    851 
    852 	switch (pktattr->pattr_af) {
    853 	case AF_INET:
    854 		if (flags & REDF_ECN4) {
    855 			struct ip *ip = (struct ip *)pktattr->pattr_hdr;
    856 			u_int8_t otos;
    857 			int sum;
    858 
    859 			if (ip->ip_v != 4)
    860 				return (0);	/* version mismatch! */
    861 
    862 			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
    863 				return (0);	/* not-ECT */
    864 			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
    865 				return (1);	/* already marked */
    866 
    867 			/*
    868 			 * ecn-capable but not marked,
    869 			 * mark CE and update checksum
    870 			 */
    871 			otos = ip->ip_tos;
    872 			ip->ip_tos |= IPTOS_ECN_CE;
    873 			/*
    874 			 * update checksum (from RFC1624)
    875 			 *	   HC' = ~(~HC + ~m + m')
    876 			 */
    877 			sum = ~ntohs(ip->ip_sum) & 0xffff;
    878 			sum += (~otos & 0xffff) + ip->ip_tos;
    879 			sum = (sum >> 16) + (sum & 0xffff);
    880 			sum += (sum >> 16);  /* add carry */
    881 			ip->ip_sum = htons(~sum & 0xffff);
    882 			return (1);
    883 		}
    884 		break;
    885 #ifdef INET6
    886 	case AF_INET6:
    887 		if (flags & REDF_ECN6) {
    888 			struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
    889 			u_int32_t flowlabel;
    890 
    891 			flowlabel = ntohl(ip6->ip6_flow);
    892 			if ((flowlabel >> 28) != 6)
    893 				return (0);	/* version mismatch! */
    894 			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
    895 			    (IPTOS_ECN_NOTECT << 20))
    896 				return (0);	/* not-ECT */
    897 			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
    898 			    (IPTOS_ECN_CE << 20))
    899 				return (1);	/* already marked */
    900 			/*
    901 			 * ecn-capable but not marked,  mark CE
    902 			 */
    903 			flowlabel |= (IPTOS_ECN_CE << 20);
    904 			ip6->ip6_flow = htonl(flowlabel);
    905 			return (1);
    906 		}
    907 		break;
    908 #endif  /* INET6 */
    909 	}
    910 
    911 	/* not marked */
    912 	return (0);
    913 }
    914 
    915 /*
    916  * dequeue routine:
    917  *	must be called in splnet.
    918  *
    919  *	returns: mbuf dequeued.
    920  *		 NULL when no packet is available in the queue.
    921  */
    922 
    923 static struct mbuf *
    924 red_dequeue(ifq, op)
    925 	struct ifaltq *ifq;
    926 	int op;
    927 {
    928 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
    929 	struct mbuf *m;
    930 
    931 	if (op == ALTDQ_POLL)
    932 		return qhead(rqp->rq_q);
    933 
    934 	/* op == ALTDQ_REMOVE */
    935 	m =  red_getq(rqp->rq_red, rqp->rq_q);
    936 	if (m != NULL)
    937 		ifq->ifq_len--;
    938 	return (m);
    939 }
    940 
    941 struct mbuf *
    942 red_getq(rp, q)
    943 	red_t *rp;
    944 	class_queue_t *q;
    945 {
    946 	struct mbuf *m;
    947 
    948 	if ((m = _getq(q)) == NULL) {
    949 		if (rp->red_idle == 0) {
    950 			rp->red_idle = 1;
    951 			microtime(&rp->red_last);
    952 		}
    953 		return NULL;
    954 	}
    955 
    956 	rp->red_idle = 0;
    957 	return (m);
    958 }
    959 
    960 static int
    961 red_request(ifq, req, arg)
    962 	struct ifaltq *ifq;
    963 	int req;
    964 	void *arg;
    965 {
    966 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
    967 
    968 	switch (req) {
    969 	case ALTRQ_PURGE:
    970 		red_purgeq(rqp);
    971 		break;
    972 	}
    973 	return (0);
    974 }
    975 
    976 static void
    977 red_purgeq(rqp)
    978 	red_queue_t *rqp;
    979 {
    980 	_flushq(rqp->rq_q);
    981 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
    982 		rqp->rq_ifq->ifq_len = 0;
    983 }
    984 
    985 
    986 /*
    987  * helper routine to calibrate avg during idle.
    988  * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
    989  * here Wq = 1/weight and the code assumes Wq is close to zero.
    990  *
    991  * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
    992  */
    993 static struct wtab *wtab_list = NULL;	/* pointer to wtab list */
    994 
    995 struct wtab *
    996 wtab_alloc(weight)
    997 	int weight;
    998 {
    999 	struct wtab *w;
   1000 	int i;
   1001 
   1002 	for (w = wtab_list; w != NULL; w = w->w_next)
   1003 		if (w->w_weight == weight) {
   1004 			w->w_refcount++;
   1005 			return (w);
   1006 		}
   1007 
   1008 	MALLOC(w, struct wtab *, sizeof(struct wtab), M_DEVBUF, M_WAITOK);
   1009 	if (w == NULL)
   1010 		panic("wtab_alloc: malloc failed!");
   1011 	(void)memset(w, 0, sizeof(struct wtab));
   1012 	w->w_weight = weight;
   1013 	w->w_refcount = 1;
   1014 	w->w_next = wtab_list;
   1015 	wtab_list = w;
   1016 
   1017 	/* initialize the weight table */
   1018 	w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
   1019 	for (i = 1; i < 32; i++) {
   1020 		w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
   1021 		if (w->w_tab[i] == 0 && w->w_param_max == 0)
   1022 			w->w_param_max = 1 << i;
   1023 	}
   1024 
   1025 	return (w);
   1026 }
   1027 
   1028 int
   1029 wtab_destroy(w)
   1030 	struct wtab *w;
   1031 {
   1032 	struct wtab *prev;
   1033 
   1034 	if (--w->w_refcount > 0)
   1035 		return (0);
   1036 
   1037 	if (wtab_list == w)
   1038 		wtab_list = w->w_next;
   1039 	else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
   1040 		if (prev->w_next == w) {
   1041 			prev->w_next = w->w_next;
   1042 			break;
   1043 		}
   1044 
   1045 	FREE(w, M_DEVBUF);
   1046 	return (0);
   1047 }
   1048 
   1049 int32_t
   1050 pow_w(w, n)
   1051 	struct wtab *w;
   1052 	int n;
   1053 {
   1054 	int	i, bit;
   1055 	int32_t	val;
   1056 
   1057 	if (n >= w->w_param_max)
   1058 		return (0);
   1059 
   1060 	val = 1 << FP_SHIFT;
   1061 	if (n <= 0)
   1062 		return (val);
   1063 
   1064 	bit = 1;
   1065 	i = 0;
   1066 	while (n) {
   1067 		if (n & bit) {
   1068 			val = (val * w->w_tab[i]) >> FP_SHIFT;
   1069 			n &= ~bit;
   1070 		}
   1071 		i++;
   1072 		bit <<=  1;
   1073 	}
   1074 	return (val);
   1075 }
   1076 
   1077 #ifdef ALTQ_FLOWVALVE
   1078 
   1079 #define	FV_PSHIFT	7	/* weight of average drop rate -- 1/128 */
   1080 #define	FV_PSCALE(x)	((x) << FV_PSHIFT)
   1081 #define	FV_PUNSCALE(x)	((x) >> FV_PSHIFT)
   1082 #define	FV_FSHIFT	5	/* weight of average fraction -- 1/32 */
   1083 #define	FV_FSCALE(x)	((x) << FV_FSHIFT)
   1084 #define	FV_FUNSCALE(x)	((x) >> FV_FSHIFT)
   1085 
   1086 #define	FV_TIMER	(3 * hz)	/* timer value for garbage collector */
   1087 #define	FV_FLOWLISTSIZE		64	/* how many flows in flowlist */
   1088 
   1089 #define	FV_N			10	/* update fve_f every FV_N packets */
   1090 
   1091 #define	FV_BACKOFFTHRESH	1  /* backoff threshold interval in second */
   1092 #define	FV_TTHRESH		3  /* time threshold to delete fve */
   1093 #define	FV_ALPHA		5  /* extra packet count */
   1094 
   1095 #if (__FreeBSD_version > 300000)
   1096 #define	FV_TIMESTAMP(tp)	getmicrotime(tp)
   1097 #else
   1098 #define	FV_TIMESTAMP(tp)	{ (*(tp)) = time; }
   1099 #endif
   1100 
   1101 /*
   1102  * Brtt table: 127 entry table to convert drop rate (p) to
   1103  * the corresponding bandwidth fraction (f)
   1104  * the following equation is implemented to use scaled values,
   1105  * fve_p and fve_f, in the fixed point format.
   1106  *
   1107  *   Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p))
   1108  *   f = Brtt(p) / (max_th + alpha)
   1109  */
   1110 #define	BRTT_SIZE	128
   1111 #define	BRTT_SHIFT	12
   1112 #define	BRTT_MASK	0x0007f000
   1113 #define	BRTT_PMAX	(1 << (FV_PSHIFT + FP_SHIFT))
   1114 
   1115 const int brtt_tab[BRTT_SIZE] = {
   1116 	0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728,
   1117 	392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361,
   1118 	225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333,
   1119 	145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612,
   1120 	98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957,
   1121 	67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440,
   1122 	47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184,
   1123 	33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611,
   1124 	24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062,
   1125 	18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487,
   1126 	14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222,
   1127 	10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844,
   1128 	8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079,
   1129 	6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746,
   1130 	5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722,
   1131 	4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924
   1132 };
   1133 
   1134 static inline struct fve *
   1135 flowlist_lookup(fv, pktattr, now)
   1136 	struct flowvalve *fv;
   1137 	struct altq_pktattr *pktattr;
   1138 	struct timeval *now;
   1139 {
   1140 	struct fve *fve;
   1141 	int flows;
   1142 	struct ip *ip;
   1143 #ifdef INET6
   1144 	struct ip6_hdr *ip6;
   1145 #endif
   1146 	struct timeval tthresh;
   1147 
   1148 	if (pktattr == NULL)
   1149 		return (NULL);
   1150 
   1151 	tthresh.tv_sec = now->tv_sec - FV_TTHRESH;
   1152 	flows = 0;
   1153 	/*
   1154 	 * search the flow list
   1155 	 */
   1156 	switch (pktattr->pattr_af) {
   1157 	case AF_INET:
   1158 		ip = (struct ip *)pktattr->pattr_hdr;
   1159 		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
   1160 			if (fve->fve_lastdrop.tv_sec == 0)
   1161 				break;
   1162 			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
   1163 				fve->fve_lastdrop.tv_sec = 0;
   1164 				break;
   1165 			}
   1166 			if (fve->fve_flow.flow_af == AF_INET &&
   1167 			    fve->fve_flow.flow_ip.ip_src.s_addr ==
   1168 			    ip->ip_src.s_addr &&
   1169 			    fve->fve_flow.flow_ip.ip_dst.s_addr ==
   1170 			    ip->ip_dst.s_addr)
   1171 				return (fve);
   1172 			flows++;
   1173 		}
   1174 		break;
   1175 #ifdef INET6
   1176 	case AF_INET6:
   1177 		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
   1178 		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
   1179 			if (fve->fve_lastdrop.tv_sec == 0)
   1180 				break;
   1181 			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
   1182 				fve->fve_lastdrop.tv_sec = 0;
   1183 				break;
   1184 			}
   1185 			if (fve->fve_flow.flow_af == AF_INET6 &&
   1186 			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src,
   1187 					       &ip6->ip6_src) &&
   1188 			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst,
   1189 					       &ip6->ip6_dst))
   1190 				return (fve);
   1191 			flows++;
   1192 		}
   1193 		break;
   1194 #endif /* INET6 */
   1195 
   1196 	default:
   1197 		/* unknown protocol.  no drop. */
   1198 		return (NULL);
   1199 	}
   1200 	fv->fv_flows = flows;	/* save the number of active fve's */
   1201 	return (NULL);
   1202 }
   1203 
   1204 static inline struct fve *
   1205 flowlist_reclaim(fv, pktattr)
   1206 	struct flowvalve *fv;
   1207 	struct altq_pktattr *pktattr;
   1208 {
   1209 	struct fve *fve;
   1210 	struct ip *ip;
   1211 #ifdef INET6
   1212 	struct ip6_hdr *ip6;
   1213 #endif
   1214 
   1215 	/*
   1216 	 * get an entry from the tail of the LRU list.
   1217 	 */
   1218 	fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead);
   1219 
   1220 	switch (pktattr->pattr_af) {
   1221 	case AF_INET:
   1222 		ip = (struct ip *)pktattr->pattr_hdr;
   1223 		fve->fve_flow.flow_af = AF_INET;
   1224 		fve->fve_flow.flow_ip.ip_src = ip->ip_src;
   1225 		fve->fve_flow.flow_ip.ip_dst = ip->ip_dst;
   1226 		break;
   1227 #ifdef INET6
   1228 	case AF_INET6:
   1229 		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
   1230 		fve->fve_flow.flow_af = AF_INET6;
   1231 		fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src;
   1232 		fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst;
   1233 		break;
   1234 #endif
   1235 	}
   1236 
   1237 	fve->fve_state = Green;
   1238 	fve->fve_p = 0.0;
   1239 	fve->fve_f = 0.0;
   1240 	fve->fve_ifseq = fv->fv_ifseq - 1;
   1241 	fve->fve_count = 0;
   1242 
   1243 	fv->fv_flows++;
   1244 #ifdef FV_STATS
   1245 	fv->fv_stats.alloc++;
   1246 #endif
   1247 	return (fve);
   1248 }
   1249 
   1250 static inline void
   1251 flowlist_move_to_head(fv, fve)
   1252 	struct flowvalve *fv;
   1253 	struct fve *fve;
   1254 {
   1255 	if (TAILQ_FIRST(&fv->fv_flowlist) != fve) {
   1256 		TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru);
   1257 		TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru);
   1258 	}
   1259 }
   1260 
   1261 /*
   1262  * allocate flowvalve structure
   1263  */
   1264 static struct flowvalve *
   1265 fv_alloc(rp)
   1266 	struct red *rp;
   1267 {
   1268 	struct flowvalve *fv;
   1269 	struct fve *fve;
   1270 	int i, num;
   1271 
   1272 	num = FV_FLOWLISTSIZE;
   1273 	MALLOC(fv, struct flowvalve *, sizeof(struct flowvalve),
   1274 	       M_DEVBUF, M_WAITOK);
   1275 	if (fv == NULL)
   1276 		return (NULL);
   1277 	(void)memset(fv, 0, sizeof(struct flowvalve));
   1278 
   1279 	MALLOC(fv->fv_fves, struct fve *, sizeof(struct fve) * num,
   1280 	       M_DEVBUF, M_WAITOK);
   1281 	if (fv->fv_fves == NULL) {
   1282 		FREE(fv, M_DEVBUF);
   1283 		return (NULL);
   1284 	}
   1285 	(void)memset(fv->fv_fves, 0, sizeof(struct fve) * num);
   1286 
   1287 	fv->fv_flows = 0;
   1288 	TAILQ_INIT(&fv->fv_flowlist);
   1289 	for (i = 0; i < num; i++) {
   1290 		fve = &fv->fv_fves[i];
   1291 		fve->fve_lastdrop.tv_sec = 0;
   1292 		TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru);
   1293 	}
   1294 
   1295 	/* initialize drop rate threshold in scaled fixed-point */
   1296 	fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax;
   1297 
   1298 	/* initialize drop rate to fraction table */
   1299 	MALLOC(fv->fv_p2ftab, int *, sizeof(int) * BRTT_SIZE,
   1300 	       M_DEVBUF, M_WAITOK);
   1301 	if (fv->fv_p2ftab == NULL) {
   1302 		FREE(fv->fv_fves, M_DEVBUF);
   1303 		FREE(fv, M_DEVBUF);
   1304 		return (NULL);
   1305 	}
   1306 	/*
   1307 	 * create the p2f table.
   1308 	 * (shift is used to keep the precision)
   1309 	 */
   1310 	for (i = 1; i < BRTT_SIZE; i++) {
   1311 		int f;
   1312 
   1313 		f = brtt_tab[i] << 8;
   1314 		fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8;
   1315 	}
   1316 
   1317 	return (fv);
   1318 }
   1319 
   1320 static void fv_destroy(fv)
   1321 	struct flowvalve *fv;
   1322 {
   1323 	FREE(fv->fv_p2ftab, M_DEVBUF);
   1324 	FREE(fv->fv_fves, M_DEVBUF);
   1325 	FREE(fv, M_DEVBUF);
   1326 }
   1327 
   1328 static inline int
   1329 fv_p2f(fv, p)
   1330 	struct flowvalve	*fv;
   1331 	int	p;
   1332 {
   1333 	int val, f;
   1334 
   1335 	if (p >= BRTT_PMAX)
   1336 		f = fv->fv_p2ftab[BRTT_SIZE-1];
   1337 	else if ((val = (p & BRTT_MASK)))
   1338 		f = fv->fv_p2ftab[(val >> BRTT_SHIFT)];
   1339 	else
   1340 		f = fv->fv_p2ftab[1];
   1341 	return (f);
   1342 }
   1343 
   1344 /*
   1345  * check if an arriving packet should be pre-dropped.
   1346  * called from red_addq() when a packet arrives.
   1347  * returns 1 when the packet should be pre-dropped.
   1348  * should be called in splnet.
   1349  */
   1350 static int
   1351 fv_checkflow(fv, pktattr, fcache)
   1352 	struct flowvalve *fv;
   1353 	struct altq_pktattr *pktattr;
   1354 	struct fve **fcache;
   1355 {
   1356 	struct fve *fve;
   1357 	struct timeval now;
   1358 
   1359 	fv->fv_ifseq++;
   1360 	FV_TIMESTAMP(&now);
   1361 
   1362 	if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
   1363 		/* no matching entry in the flowlist */
   1364 		return (0);
   1365 
   1366 	*fcache = fve;
   1367 
   1368 	/* update fraction f for every FV_N packets */
   1369 	if (++fve->fve_count == FV_N) {
   1370 		/*
   1371 		 * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f
   1372 		 */
   1373 		fve->fve_f =
   1374 			(FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq)
   1375 			+ fve->fve_f - FV_FUNSCALE(fve->fve_f);
   1376 		fve->fve_ifseq = fv->fv_ifseq;
   1377 		fve->fve_count = 0;
   1378 	}
   1379 
   1380 	/*
   1381 	 * overpumping test
   1382 	 */
   1383 	if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) {
   1384 		int fthresh;
   1385 
   1386 		/* calculate a threshold */
   1387 		fthresh = fv_p2f(fv, fve->fve_p);
   1388 		if (fve->fve_f > fthresh)
   1389 			fve->fve_state = Red;
   1390 	}
   1391 
   1392 	if (fve->fve_state == Red) {
   1393 		/*
   1394 		 * backoff test
   1395 		 */
   1396 		if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) {
   1397 			/* no drop for at least FV_BACKOFFTHRESH sec */
   1398 			fve->fve_p = 0;
   1399 			fve->fve_state = Green;
   1400 #ifdef FV_STATS
   1401 			fv->fv_stats.escape++;
   1402 #endif
   1403 		} else {
   1404 			/* block this flow */
   1405 			flowlist_move_to_head(fv, fve);
   1406 			fve->fve_lastdrop = now;
   1407 #ifdef FV_STATS
   1408 			fv->fv_stats.predrop++;
   1409 #endif
   1410 			return (1);
   1411 		}
   1412 	}
   1413 
   1414 	/*
   1415 	 * p = (1 - Wp) * p
   1416 	 */
   1417 	fve->fve_p -= FV_PUNSCALE(fve->fve_p);
   1418 	if (fve->fve_p < 0)
   1419 		fve->fve_p = 0;
   1420 #ifdef FV_STATS
   1421 	fv->fv_stats.pass++;
   1422 #endif
   1423 	return (0);
   1424 }
   1425 
   1426 /*
   1427  * called from red_addq when a packet is dropped by red.
   1428  * should be called in splnet.
   1429  */
   1430 static void fv_dropbyred(fv, pktattr, fcache)
   1431 	struct flowvalve *fv;
   1432 	struct altq_pktattr *pktattr;
   1433 	struct fve *fcache;
   1434 {
   1435 	struct fve *fve;
   1436 	struct timeval now;
   1437 
   1438 	if (pktattr == NULL)
   1439 		return;
   1440 	FV_TIMESTAMP(&now);
   1441 
   1442 	if (fcache != NULL)
   1443 		/* the fve of this packet is already cached */
   1444 		fve = fcache;
   1445 	else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
   1446 		fve = flowlist_reclaim(fv, pktattr);
   1447 
   1448 	flowlist_move_to_head(fv, fve);
   1449 
   1450 	/*
   1451 	 * update p:  the following line cancels the update
   1452 	 *	      in fv_checkflow() and calculate
   1453 	 *	p = Wp + (1 - Wp) * p
   1454 	 */
   1455 	fve->fve_p = (1 << FP_SHIFT) + fve->fve_p;
   1456 
   1457 	fve->fve_lastdrop = now;
   1458 }
   1459 
   1460 #endif /* ALTQ_FLOWVALVE */
   1461 
   1462 #ifdef KLD_MODULE
   1463 
   1464 static struct altqsw red_sw =
   1465 	{"red", redopen, redclose, redioctl};
   1466 
   1467 ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw);
   1468 
   1469 #endif /* KLD_MODULE */
   1470 
   1471 #endif /* ALTQ_RED */
   1472