Home | History | Annotate | Line # | Download | only in altq
altq_red.c revision 1.2.2.3
      1 /*	$NetBSD: altq_red.c,v 1.2.2.3 2001/04/21 17:46:12 bouyer Exp $	*/
      2 /*	$KAME: altq_red.c,v 1.8 2000/12/14 08:12:46 thorpej Exp $	*/
      3 
      4 /*
      5  * Copyright (C) 1997-2000
      6  *	Sony Computer Science Laboratories Inc.  All rights reserved.
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  * 1. Redistributions of source code must retain the above copyright
     12  *    notice, this list of conditions and the following disclaimer.
     13  * 2. Redistributions in binary form must reproduce the above copyright
     14  *    notice, this list of conditions and the following disclaimer in the
     15  *    documentation and/or other materials provided with the distribution.
     16  *
     17  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
     18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     20  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
     21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     27  * SUCH DAMAGE.
     28  *
     29  */
     30 /*
     31  * Copyright (c) 1990-1994 Regents of the University of California.
     32  * All rights reserved.
     33  *
     34  * Redistribution and use in source and binary forms, with or without
     35  * modification, are permitted provided that the following conditions
     36  * are met:
     37  * 1. Redistributions of source code must retain the above copyright
     38  *    notice, this list of conditions and the following disclaimer.
     39  * 2. Redistributions in binary form must reproduce the above copyright
     40  *    notice, this list of conditions and the following disclaimer in the
     41  *    documentation and/or other materials provided with the distribution.
     42  * 3. All advertising materials mentioning features or use of this software
     43  *    must display the following acknowledgement:
     44  *	This product includes software developed by the Computer Systems
     45  *	Engineering Group at Lawrence Berkeley Laboratory.
     46  * 4. Neither the name of the University nor of the Laboratory may be used
     47  *    to endorse or promote products derived from this software without
     48  *    specific prior written permission.
     49  *
     50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     60  * SUCH DAMAGE.
     61  */
     62 
     63 #if defined(__FreeBSD__) || defined(__NetBSD__)
     64 #include "opt_altq.h"
     65 #if (__FreeBSD__ != 2)
     66 #include "opt_inet.h"
     67 #ifdef __FreeBSD__
     68 #include "opt_inet6.h"
     69 #endif
     70 #endif
     71 #endif /* __FreeBSD__ || __NetBSD__ */
     72 #ifdef ALTQ_RED	/* red is enabled by ALTQ_RED option in opt_altq.h */
     73 
     74 #include <sys/param.h>
     75 #include <sys/malloc.h>
     76 #include <sys/mbuf.h>
     77 #include <sys/socket.h>
     78 #include <sys/sockio.h>
     79 #include <sys/systm.h>
     80 #include <sys/proc.h>
     81 #include <sys/errno.h>
     82 #include <sys/kernel.h>
     83 #ifdef ALTQ_FLOWVALVE
     84 #include <sys/queue.h>
     85 #include <sys/time.h>
     86 #endif
     87 
     88 #include <net/if.h>
     89 #include <net/if_types.h>
     90 
     91 #include <netinet/in.h>
     92 #include <netinet/in_systm.h>
     93 #include <netinet/ip.h>
     94 #ifdef INET6
     95 #include <netinet/ip6.h>
     96 #endif
     97 
     98 #include <altq/altq.h>
     99 #include <altq/altq_conf.h>
    100 #include <altq/altq_red.h>
    101 #ifdef ALTQ_FLOWVALVE
    102 #include <altq/altq_flowvalve.h>
    103 #endif
    104 
    105 /*
    106  * ALTQ/RED (Random Early Detection) implementation using 32-bit
    107  * fixed-point calculation.
    108  *
    109  * written by kjc using the ns code as a reference.
    110  * you can learn more about red and ns from Sally's home page at
    111  * http://www-nrg.ee.lbl.gov/floyd/
    112  *
    113  * most of the red parameter values are fixed in this implementation
    114  * to prevent fixed-point overflow/underflow.
    115  * if you change the parameters, watch out for overflow/underflow!
    116  *
    117  * the parameters used are recommended values by Sally.
    118  * the corresponding ns config looks:
    119  *	q_weight=0.00195
    120  *	minthresh=5 maxthresh=15 queue-size=60
    121  *	linterm=30
    122  *	dropmech=drop-tail
    123  *	bytes=false (can't be handled by 32-bit fixed-point)
    124  *	doubleq=false dqthresh=false
    125  *	wait=true
    126  */
    127 /*
    128  * alternative red parameters for a slow link.
    129  *
    130  * assume the queue length becomes from zero to L and keeps L, it takes
    131  * N packets for q_avg to reach 63% of L.
    132  * when q_weight is 0.002, N is about 500 packets.
    133  * for a slow link like dial-up, 500 packets takes more than 1 minute!
    134  * when q_weight is 0.008, N is about 127 packets.
    135  * when q_weight is 0.016, N is about 63 packets.
 * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
    137  * are allowed for 0.016.
    138  * see Sally's paper for more details.
    139  */
    140 /* normal red parameters */
    141 #define	W_WEIGHT	512	/* inverse of weight of EWMA (511/512) */
    142 				/* q_weight = 0.00195 */
    143 
    144 /* red parameters for a slow link */
    145 #define	W_WEIGHT_1	128	/* inverse of weight of EWMA (127/128) */
    146 				/* q_weight = 0.0078125 */
    147 
    148 /* red parameters for a very slow link (e.g., dialup) */
    149 #define	W_WEIGHT_2	64	/* inverse of weight of EWMA (63/64) */
    150 				/* q_weight = 0.015625 */
    151 
    152 /* fixed-point uses 12-bit decimal places */
    153 #define	FP_SHIFT	12	/* fixed-point shift */
    154 
    155 /* red parameters for drop probability */
    156 #define	INV_P_MAX	10	/* inverse of max drop probability */
    157 #define	TH_MIN		5	/* min threshold */
    158 #define	TH_MAX		15	/* max threshold */
    159 
#define	RED_LIMIT	60	/* default max queue length */
    161 #define	RED_STATS		/* collect statistics */
    162 
    163 /*
    164  * our default policy for forced-drop is drop-tail.
    165  * (in altq-1.1.2 or earlier, the default was random-drop.
    166  * but it makes more sense to punish the cause of the surge.)
    167  * to switch to the random-drop policy, define "RED_RANDOM_DROP".
    168  */
    169 
    170 #ifdef ALTQ_FLOWVALVE
    171 /*
 * flow-valve is an extension to protect red from unresponsive flows
    173  * and to promote end-to-end congestion control.
    174  * flow-valve observes the average drop rates of the flows that have
    175  * experienced packet drops in the recent past.
    176  * when the average drop rate exceeds the threshold, the flow is
    177  * blocked by the flow-valve.  the trapped flow should back off
    178  * exponentially to escape from the flow-valve.
    179  */
    180 #ifdef RED_RANDOM_DROP
    181 #error "random-drop can't be used with flow-valve!"
    182 #endif
    183 #endif /* ALTQ_FLOWVALVE */
    184 
    185 /* red_list keeps all red_queue_t's allocated. */
    186 static red_queue_t *red_list = NULL;
    187 
    188 /* default red parameter values */
    189 static int default_th_min = TH_MIN;
    190 static int default_th_max = TH_MAX;
    191 static int default_inv_pmax = INV_P_MAX;
    192 
    193 /* internal function prototypes */
    194 static int red_enqueue __P((struct ifaltq *, struct mbuf *,
    195 			    struct altq_pktattr *));
    196 static struct mbuf *red_dequeue __P((struct ifaltq *, int));
    197 static int red_request __P((struct ifaltq *, int, void *));
    198 static void red_purgeq __P((red_queue_t *));
    199 static int red_detach __P((red_queue_t *));
    200 #ifdef ALTQ_FLOWVALVE
    201 static __inline struct fve *flowlist_lookup __P((struct flowvalve *,
    202 			 struct altq_pktattr *, struct timeval *));
    203 static __inline struct fve *flowlist_reclaim __P((struct flowvalve *,
    204 						  struct altq_pktattr *));
    205 static __inline void flowlist_move_to_head __P((struct flowvalve *,
    206 						struct fve *));
    207 static __inline int fv_p2f __P((struct flowvalve *, int));
    208 static struct flowvalve *fv_alloc __P((struct red *));
    209 static void fv_destroy __P((struct flowvalve *));
    210 static int fv_checkflow __P((struct flowvalve *, struct altq_pktattr *,
    211 			     struct fve **));
    212 static void fv_dropbyred __P((struct flowvalve *fv, struct altq_pktattr *,
    213 			      struct fve *));
    214 #endif
    215 
    216 /*
    217  * red device interface
    218  */
    219 altqdev_decl(red);
    220 
    221 int
    222 redopen(dev, flag, fmt, p)
    223 	dev_t dev;
    224 	int flag, fmt;
    225 	struct proc *p;
    226 {
    227 	/* everything will be done when the queueing scheme is attached. */
    228 	return 0;
    229 }
    230 
    231 int
    232 redclose(dev, flag, fmt, p)
    233 	dev_t dev;
    234 	int flag, fmt;
    235 	struct proc *p;
    236 {
    237 	red_queue_t *rqp;
    238 	int err, error = 0;
    239 
    240 	while ((rqp = red_list) != NULL) {
    241 		/* destroy all */
    242 		err = red_detach(rqp);
    243 		if (err != 0 && error == 0)
    244 			error = err;
    245 	}
    246 
    247 	return error;
    248 }
    249 
    250 int
    251 redioctl(dev, cmd, addr, flag, p)
    252 	dev_t dev;
    253 	ioctlcmd_t cmd;
    254 	caddr_t addr;
    255 	int flag;
    256 	struct proc *p;
    257 {
    258 	red_queue_t *rqp;
    259 	struct red_interface *ifacep;
    260 	struct ifnet *ifp;
    261 	int	error = 0;
    262 
    263 	/* check super-user privilege */
    264 	switch (cmd) {
    265 	case RED_GETSTATS:
    266 		break;
    267 	default:
    268 #if (__FreeBSD_version > 400000)
    269 		if ((error = suser(p)) != 0)
    270 #else
    271 		if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
    272 #endif
    273 			return (error);
    274 		break;
    275 	}
    276 
    277 	switch (cmd) {
    278 
    279 	case RED_ENABLE:
    280 		ifacep = (struct red_interface *)addr;
    281 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
    282 			error = EBADF;
    283 			break;
    284 		}
    285 		error = altq_enable(rqp->rq_ifq);
    286 		break;
    287 
    288 	case RED_DISABLE:
    289 		ifacep = (struct red_interface *)addr;
    290 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
    291 			error = EBADF;
    292 			break;
    293 		}
    294 		error = altq_disable(rqp->rq_ifq);
    295 		break;
    296 
    297 	case RED_IF_ATTACH:
    298 		ifp = ifunit(((struct red_interface *)addr)->red_ifname);
    299 		if (ifp == NULL) {
    300 			error = ENXIO;
    301 			break;
    302 		}
    303 
    304 		/* allocate and initialize red_queue_t */
    305 		MALLOC(rqp, red_queue_t *, sizeof(red_queue_t), M_DEVBUF, M_WAITOK);
    306 		if (rqp == NULL) {
    307 			error = ENOMEM;
    308 			break;
    309 		}
    310 		bzero(rqp, sizeof(red_queue_t));
    311 
    312 		MALLOC(rqp->rq_q, class_queue_t *, sizeof(class_queue_t),
    313 		       M_DEVBUF, M_WAITOK);
    314 		if (rqp->rq_q == NULL) {
    315 			FREE(rqp, M_DEVBUF);
    316 			error = ENOMEM;
    317 			break;
    318 		}
    319 		bzero(rqp->rq_q, sizeof(class_queue_t));
    320 
    321 		rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0);
    322 		if (rqp->rq_red == NULL) {
    323 			FREE(rqp->rq_q, M_DEVBUF);
    324 			FREE(rqp, M_DEVBUF);
    325 			error = ENOMEM;
    326 			break;
    327 		}
    328 
    329 		rqp->rq_ifq = &ifp->if_snd;
    330 		qtail(rqp->rq_q) = NULL;
    331 		qlen(rqp->rq_q) = 0;
    332 		qlimit(rqp->rq_q) = RED_LIMIT;
    333 		qtype(rqp->rq_q) = Q_RED;
    334 
    335 		/*
    336 		 * set RED to this ifnet structure.
    337 		 */
    338 		error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp,
    339 				    red_enqueue, red_dequeue, red_request,
    340 				    NULL, NULL);
    341 		if (error) {
    342 			red_destroy(rqp->rq_red);
    343 			FREE(rqp->rq_q, M_DEVBUF);
    344 			FREE(rqp, M_DEVBUF);
    345 			break;
    346 		}
    347 
    348 		/* add this state to the red list */
    349 		rqp->rq_next = red_list;
    350 		red_list = rqp;
    351 		break;
    352 
    353 	case RED_IF_DETACH:
    354 		ifacep = (struct red_interface *)addr;
    355 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
    356 			error = EBADF;
    357 			break;
    358 		}
    359 		error = red_detach(rqp);
    360 		break;
    361 
    362 	case RED_GETSTATS:
    363 		do {
    364 			struct red_stats *q_stats;
    365 			red_t *rp;
    366 
    367 			q_stats = (struct red_stats *)addr;
    368 			if ((rqp = altq_lookup(q_stats->iface.red_ifname,
    369 					     ALTQT_RED)) == NULL) {
    370 				error = EBADF;
    371 				break;
    372 			}
    373 
    374 			q_stats->q_len 	   = qlen(rqp->rq_q);
    375 			q_stats->q_limit   = qlimit(rqp->rq_q);
    376 
    377 			rp = rqp->rq_red;
    378 			q_stats->q_avg 	   = rp->red_avg >> rp->red_wshift;
    379 			q_stats->xmit_cnt  = rp->red_stats.xmit_cnt;
    380 			q_stats->drop_cnt  = rp->red_stats.drop_cnt;
    381 			q_stats->drop_forced   = rp->red_stats.drop_forced;
    382 			q_stats->drop_unforced = rp->red_stats.drop_unforced;
    383 			q_stats->marked_packets = rp->red_stats.marked_packets;
    384 
    385 			q_stats->weight		= rp->red_weight;
    386 			q_stats->inv_pmax	= rp->red_inv_pmax;
    387 			q_stats->th_min		= rp->red_thmin;
    388 			q_stats->th_max		= rp->red_thmax;
    389 
    390 #ifdef ALTQ_FLOWVALVE
    391 			if (rp->red_flowvalve != NULL) {
    392 				struct flowvalve *fv = rp->red_flowvalve;
    393 				q_stats->fv_flows    = fv->fv_flows;
    394 				q_stats->fv_pass     = fv->fv_stats.pass;
    395 				q_stats->fv_predrop  = fv->fv_stats.predrop;
    396 				q_stats->fv_alloc    = fv->fv_stats.alloc;
    397 				q_stats->fv_escape   = fv->fv_stats.escape;
    398 			} else {
    399 #endif /* ALTQ_FLOWVALVE */
    400 				q_stats->fv_flows    = 0;
    401 				q_stats->fv_pass     = 0;
    402 				q_stats->fv_predrop  = 0;
    403 				q_stats->fv_alloc    = 0;
    404 				q_stats->fv_escape   = 0;
    405 #ifdef ALTQ_FLOWVALVE
    406 			}
    407 #endif /* ALTQ_FLOWVALVE */
    408 		} while (0);
    409 		break;
    410 
    411 	case RED_CONFIG:
    412 		do {
    413 			struct red_conf *fc;
    414 			red_t *new;
    415 			int s, limit;
    416 
    417 			fc = (struct red_conf *)addr;
    418 			if ((rqp = altq_lookup(fc->iface.red_ifname,
    419 					       ALTQT_RED)) == NULL) {
    420 				error = EBADF;
    421 				break;
    422 			}
    423 			new = red_alloc(fc->red_weight,
    424 					fc->red_inv_pmax,
    425 					fc->red_thmin,
    426 					fc->red_thmax,
    427 					fc->red_flags,
    428 					fc->red_pkttime);
    429 			if (new == NULL) {
    430 				error = ENOMEM;
    431 				break;
    432 			}
    433 
    434 			s = splnet();
    435 			red_purgeq(rqp);
    436 			limit = fc->red_limit;
    437 			if (limit < fc->red_thmax)
    438 				limit = fc->red_thmax;
    439 			qlimit(rqp->rq_q) = limit;
    440 			fc->red_limit = limit;	/* write back the new value */
    441 
    442 			red_destroy(rqp->rq_red);
    443 			rqp->rq_red = new;
    444 
    445 			splx(s);
    446 
    447 			/* write back new values */
    448 			fc->red_limit = limit;
    449 			fc->red_inv_pmax = rqp->rq_red->red_inv_pmax;
    450 			fc->red_thmin = rqp->rq_red->red_thmin;
    451 			fc->red_thmax = rqp->rq_red->red_thmax;
    452 
    453 		} while (0);
    454 		break;
    455 
    456 	case RED_SETDEFAULTS:
    457 		do {
    458 			struct redparams *rp;
    459 
    460 			rp = (struct redparams *)addr;
    461 
    462 			default_th_min = rp->th_min;
    463 			default_th_max = rp->th_max;
    464 			default_inv_pmax = rp->inv_pmax;
    465 		} while (0);
    466 		break;
    467 
    468 	default:
    469 		error = EINVAL;
    470 		break;
    471 	}
    472 	return error;
    473 }
    474 
    475 static int
    476 red_detach(rqp)
    477 	red_queue_t *rqp;
    478 {
    479 	red_queue_t *tmp;
    480 	int error = 0;
    481 
    482 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
    483 		altq_disable(rqp->rq_ifq);
    484 
    485 	if ((error = altq_detach(rqp->rq_ifq)))
    486 		return (error);
    487 
    488 	if (red_list == rqp)
    489 		red_list = rqp->rq_next;
    490 	else {
    491 		for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next)
    492 			if (tmp->rq_next == rqp) {
    493 				tmp->rq_next = rqp->rq_next;
    494 				break;
    495 			}
    496 		if (tmp == NULL)
    497 			printf("red_detach: no state found in red_list!\n");
    498 	}
    499 
    500 	red_destroy(rqp->rq_red);
    501 	FREE(rqp->rq_q, M_DEVBUF);
    502 	FREE(rqp, M_DEVBUF);
    503 	return (error);
    504 }
    505 
    506 /*
    507  * red support routines
    508  */
    509 
    510 red_t *
    511 red_alloc(weight, inv_pmax, th_min, th_max, flags, pkttime)
    512 	int	weight, inv_pmax, th_min, th_max;
    513 	int	flags, pkttime;
    514 {
    515 	red_t 	*rp;
    516 	int	w, i;
    517 	int	npkts_per_sec;
    518 
    519 	MALLOC(rp, red_t *, sizeof(red_t), M_DEVBUF, M_WAITOK);
    520 	if (rp == NULL)
    521 		return (NULL);
    522 	bzero(rp, sizeof(red_t));
    523 
    524 	rp->red_avg = 0;
    525 	rp->red_idle = 1;
    526 
    527 	if (weight == 0)
    528 		rp->red_weight = W_WEIGHT;
    529 	else
    530 		rp->red_weight = weight;
    531 	if (inv_pmax == 0)
    532 		rp->red_inv_pmax = default_inv_pmax;
    533 	else
    534 		rp->red_inv_pmax = inv_pmax;
    535 	if (th_min == 0)
    536 		rp->red_thmin = default_th_min;
    537 	else
    538 		rp->red_thmin = th_min;
    539 	if (th_max == 0)
    540 		rp->red_thmax = default_th_max;
    541 	else
    542 		rp->red_thmax = th_max;
    543 
    544 	rp->red_flags = flags;
    545 
    546 	if (pkttime == 0)
    547 		/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
    548 		rp->red_pkttime = 800;
    549 	else
    550 		rp->red_pkttime = pkttime;
    551 
    552 	if (weight == 0) {
    553 		/* when the link is very slow, adjust red parameters */
    554 		npkts_per_sec = 1000000 / rp->red_pkttime;
    555 		if (npkts_per_sec < 50) {
    556 			/* up to about 400Kbps */
    557 			rp->red_weight = W_WEIGHT_2;
    558 		} else if (npkts_per_sec < 300) {
    559 			/* up to about 2.4Mbps */
    560 			rp->red_weight = W_WEIGHT_1;
    561 		}
    562 	}
    563 
    564 	/* calculate wshift.  weight must be power of 2 */
    565 	w = rp->red_weight;
    566 	for (i = 0; w > 1; i++)
    567 		w = w >> 1;
    568 	rp->red_wshift = i;
    569 	w = 1 << rp->red_wshift;
    570 	if (w != rp->red_weight) {
    571 		printf("invalid weight value %d for red! use %d\n",
    572 		       rp->red_weight, w);
    573 		rp->red_weight = w;
    574 	}
    575 
    576 	/*
    577 	 * thmin_s and thmax_s are scaled versions of th_min and th_max
    578 	 * to be compared with avg.
    579 	 */
    580 	rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
    581 	rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
    582 
    583 	/*
    584 	 * precompute probability denominator
    585 	 *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
    586 	 */
    587 	rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
    588 			 * rp->red_inv_pmax) << FP_SHIFT;
    589 
    590 	/* allocate weight table */
    591 	rp->red_wtab = wtab_alloc(rp->red_weight);
    592 
    593 	microtime(&rp->red_last);
    594 #ifdef ALTQ_FLOWVALVE
    595 	if (flags & REDF_FLOWVALVE)
    596 		rp->red_flowvalve = fv_alloc(rp);
    597 	/* if fv_alloc failes, flowvalve is just disabled */
    598 #endif
    599 	return (rp);
    600 }
    601 
/*
 * red_destroy: release a red state allocated by red_alloc —
 * the optional flowvalve, the weight table reference, and the
 * red_t itself.
 */
void
red_destroy(rp)
	red_t *rp;
{
#ifdef ALTQ_FLOWVALVE
	if (rp->red_flowvalve != NULL)
		fv_destroy(rp->red_flowvalve);
#endif
	/* wtab entries are shared and refcounted by wtab_alloc */
	wtab_destroy(rp->red_wtab);
	FREE(rp, M_DEVBUF);
}
    613 
/*
 * red_getstats: copy a snapshot of the red counters into *sp.
 * red_avg carries an extra factor of the EWMA weight, so shift it
 * out here; the result is still in FP_SHIFT fixed-point.
 */
void
red_getstats(rp, sp)
	red_t *rp;
	struct redstats *sp;
{
	sp->q_avg		= rp->red_avg >> rp->red_wshift;
	sp->xmit_cnt		= rp->red_stats.xmit_cnt;
	sp->drop_cnt		= rp->red_stats.drop_cnt;
	sp->drop_forced		= rp->red_stats.drop_forced;
	sp->drop_unforced	= rp->red_stats.drop_unforced;
	sp->marked_packets	= rp->red_stats.marked_packets;
}
    626 
    627 /*
    628  * enqueue routine:
    629  *
    630  *	returns: 0 when successfully queued.
    631  *		 ENOBUFS when drop occurs.
    632  */
    633 static int
    634 red_enqueue(ifq, m, pktattr)
    635 	struct ifaltq *ifq;
    636 	struct mbuf *m;
    637 	struct altq_pktattr *pktattr;
    638 {
    639 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
    640 
    641 	if (red_addq(rqp->rq_red, rqp->rq_q, m, pktattr) < 0)
    642 		return ENOBUFS;
    643 	ifq->ifq_len++;
    644 	return 0;
    645 }
    646 
/*
 * red_addq: run the RED algorithm for one arriving packet and enqueue
 * it on q.  returns 0 when the packet was queued, -1 when it was
 * dropped (the mbuf is freed here in that case).
 */
int
red_addq(rp, q, m, pktattr)
	red_t *rp;
	class_queue_t *q;
	struct mbuf *m;
	struct altq_pktattr *pktattr;
{
	int avg, droptype;
	int n;
#ifdef ALTQ_FLOWVALVE
	struct fve *fve = NULL;

	/* a flow trapped by the flow-valve is dropped unconditionally */
	if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0)
		if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) {
			m_freem(m);
			return (-1);
		}
#endif

	avg = rp->red_avg;

	/*
	 * if we were idle, we pretend that n packets arrived during
	 * the idle period.
	 */
	if (rp->red_idle) {
		struct timeval now;
		int t;

		rp->red_idle = 0;
		microtime(&now);
		t = (now.tv_sec - rp->red_last.tv_sec);
		if (t > 60) {
			/*
			 * being idle for more than 1 minute, set avg to zero.
			 * this prevents t from overflow.
			 */
			avg = 0;
		} else {
			t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
			n = t / rp->red_pkttime - 1;

			/* the following line does (avg = (1 - Wq)^n * avg) */
			if (n > 0)
				avg = (avg >> FP_SHIFT) *
				    pow_w(rp->red_wtab, n);
		}
	}

	/* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
	avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
	rp->red_avg = avg;		/* save the new value */

	/*
	 * red_count keeps a tally of arriving traffic that has not
	 * been dropped.
	 */
	rp->red_count++;

	/* see if we drop early */
	droptype = DTYPE_NODROP;
	if (avg >= rp->red_thmin_s && qlen(q) > 1) {
		if (avg >= rp->red_thmax_s) {
			/* avg >= th_max: forced drop */
			droptype = DTYPE_FORCED;
		} else if (rp->red_old == 0) {
			/* first exceeds th_min */
			rp->red_count = 1;
			rp->red_old = 1;
		} else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
				      rp->red_probd, rp->red_count)) {
			/* mark or drop by red */
			if ((rp->red_flags & REDF_ECN) &&
			    mark_ecn(m, pktattr, rp->red_flags)) {
				/* successfully marked.  do not drop. */
				rp->red_count = 0;
#ifdef RED_STATS
				rp->red_stats.marked_packets++;
#endif
			} else {
				/* unforced drop by red */
				droptype = DTYPE_EARLY;
			}
		}
	} else {
		/* avg < th_min */
		rp->red_old = 0;
	}

	/*
	 * if the queue length hits the hard limit, it's a forced drop.
	 */
	if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
		droptype = DTYPE_FORCED;

#ifdef RED_RANDOM_DROP
	/* if successful or forced drop, enqueue this packet. */
	if (droptype != DTYPE_EARLY)
		_addq(q, m);
#else
	/* if successful, enqueue this packet. */
	if (droptype == DTYPE_NODROP)
		_addq(q, m);
#endif
	if (droptype != DTYPE_NODROP) {
		if (droptype == DTYPE_EARLY) {
			/* drop the incoming packet */
#ifdef RED_STATS
			rp->red_stats.drop_unforced++;
#endif
		} else {
			/* forced drop, select a victim packet in the queue. */
#ifdef RED_RANDOM_DROP
			m = _getq_random(q);
#endif
#ifdef RED_STATS
			rp->red_stats.drop_forced++;
#endif
		}
#ifdef RED_STATS
		PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
#endif
		/* any drop resets the run of undropped arrivals */
		rp->red_count = 0;
#ifdef ALTQ_FLOWVALVE
		if (rp->red_flowvalve != NULL)
			fv_dropbyred(rp->red_flowvalve, pktattr, fve);
#endif
		m_freem(m);
		return (-1);
	}
	/* successfully queued */
#ifdef RED_STATS
	PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
#endif
	return (0);
}
    783 
    784 /*
    785  * early-drop probability is calculated as follows:
    786  *   prob = p_max * (avg - th_min) / (th_max - th_min)
    787  *   prob_a = prob / (2 - count*prob)
    788  *	    = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
    789  * here prob_a increases as successive undrop count increases.
    790  * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
    791  * becomes 1 when (count >= (2 / prob))).
    792  */
int
drop_early(fp_len, fp_probd, count)
	int fp_len;	/* (avg - TH_MIN) in fixed-point */
	int fp_probd;	/* (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point */
	int count;	/* how many successive undropped packets */
{
	int denom;	/* denominator of drop-probability */

	denom = fp_probd - count * fp_len;

	/* count exceeds the hard limit: drop or mark unconditionally */
	if (denom <= 0)
		return (1);

	/*
	 * now the range of denom is [1..600] in fixed-point. (when
	 * th_max-th_min=10 and p_max=1/30)
	 * drop probability = (avg - TH_MIN) / denom
	 */
	return (((random() % denom) < fp_len) ? 1 : 0);
}
    819 
    820 /*
    821  * try to mark CE bit to the packet.
    822  *    returns 1 if successfully marked, 0 otherwise.
    823  */
int
mark_ecn(m, pktattr, flags)
	struct mbuf *m;
	struct altq_pktattr *pktattr;
	int flags;
{
	struct mbuf *m0;

	/* need a classifier-supplied header pointer of a known family */
	if (pktattr == NULL ||
	    (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6))
		return (0);

	/* verify that pattr_hdr is within the mbuf data */
	for (m0 = m; m0 != NULL; m0 = m0->m_next)
		if ((pktattr->pattr_hdr >= m0->m_data) &&
		    (pktattr->pattr_hdr < m0->m_data + m0->m_len))
			break;
	if (m0 == NULL) {
		/* ick, pattr_hdr is stale: invalidate it for later callers */
		pktattr->pattr_af = AF_UNSPEC;
		return (0);
	}

	switch (pktattr->pattr_af) {
	case AF_INET:
		if (flags & REDF_ECN4) {
			struct ip *ip = (struct ip *)pktattr->pattr_hdr;

			if (ip->ip_v != 4)
				return (0);	/* version mismatch! */
			if (ip->ip_tos & IPTOS_ECT) {
				/* ECN-capable, mark ECN bit. */
				/* the header checksum must be patched to
				 * account for the TOS change */
				if ((ip->ip_tos & IPTOS_CE) == 0) {
#if (IPTOS_CE == 0x01)
					u_short sum;

					ip->ip_tos |= IPTOS_CE;
					/*
					 * optimized version when IPTOS_CE
					 * is 0x01.
					 *   HC' = HC -1   when HC > 0
					 *       = 0xfffe  when HC = 0
					 */
					sum = ntohs(ip->ip_sum);
					if (sum == 0)
						sum = 0xfffe;
					else
						sum -= 1;
					ip->ip_sum = htons(sum);
#else /* IPTOS_CE != 0x01 */
					long sum;

					ip->ip_tos |= IPTOS_CE;
					/*
					 * update checksum (from RFC1624)
					 *	   HC' = ~(~HC + ~m + m')
					 */
					sum = ~ntohs(ip->ip_sum) & 0xffff;
					sum += 0xffff + IPTOS_CE;
					sum = (sum >> 16) + (sum & 0xffff);
					sum += (sum >> 16);  /* add carry */

					ip->ip_sum = htons(~sum & 0xffff);
#endif /* IPTOS_CE != 0x01 */
				}
				/* already-marked packets also report success */
				return (1);
			}
		}
		break;
#ifdef INET6
	case AF_INET6:
		if (flags & REDF_ECN6) {
			struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
			u_int32_t flowlabel;

			/* v6 carries the traffic class inside the flow word;
			 * no checksum update is needed here */
			flowlabel = ntohl(ip6->ip6_flow);
			if ((flowlabel >> 28) != 6)
				return (0);	/* version mismatch! */
			if (flowlabel & (IPTOS_ECT << 20)) {
				/* ECN-capable, mark ECN bit. */
				flowlabel |= (IPTOS_CE << 20);
				ip6->ip6_flow = htonl(flowlabel);
				return (1);
			}
		}
		break;
#endif  /* INET6 */
	}

	/* not marked */
	return (0);
}
    916 
    917 /*
    918  * dequeue routine:
    919  *	must be called in splnet.
    920  *
    921  *	returns: mbuf dequeued.
    922  *		 NULL when no packet is available in the queue.
    923  */
    924 
    925 static struct mbuf *
    926 red_dequeue(ifq, op)
    927 	struct ifaltq *ifq;
    928 	int op;
    929 {
    930 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
    931 	struct mbuf *m;
    932 
    933 	if (op == ALTDQ_POLL)
    934 		return qhead(rqp->rq_q);
    935 
    936 	/* op == ALTDQ_REMOVE */
    937 	m =  red_getq(rqp->rq_red, rqp->rq_q);
    938 	if (m != NULL)
    939 		ifq->ifq_len--;
    940 	return (m);
    941 }
    942 
    943 struct mbuf *
    944 red_getq(rp, q)
    945 	red_t *rp;
    946 	class_queue_t *q;
    947 {
    948 	struct mbuf *m;
    949 
    950 	if ((m = _getq(q)) == NULL) {
    951 		if (rp->red_idle == 0) {
    952 			rp->red_idle = 1;
    953 			microtime(&rp->red_last);
    954 		}
    955 		return NULL;
    956 	}
    957 
    958 	rp->red_idle = 0;
    959 	return (m);
    960 }
    961 
    962 static int
    963 red_request(ifq, req, arg)
    964 	struct ifaltq *ifq;
    965 	int req;
    966 	void *arg;
    967 {
    968 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
    969 
    970 	switch (req) {
    971 	case ALTRQ_PURGE:
    972 		red_purgeq(rqp);
    973 		break;
    974 	}
    975 	return (0);
    976 }
    977 
    978 static void
    979 red_purgeq(rqp)
    980 	red_queue_t *rqp;
    981 {
    982 	_flushq(rqp->rq_q);
    983 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
    984 		rqp->rq_ifq->ifq_len = 0;
    985 }
    986 
    987 
    988 /*
    989  * helper routine to calibrate avg during idle.
    990  * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
    991  * here Wq = 1/weight and the code assumes Wq is close to zero.
    992  *
    993  * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
    994  */
    995 static struct wtab *wtab_list = NULL;	/* pointer to wtab list */
    996 
    997 struct wtab *
    998 wtab_alloc(weight)
    999 	int weight;
   1000 {
   1001 	struct wtab *w;
   1002 	int i;
   1003 
   1004 	for (w = wtab_list; w != NULL; w = w->w_next)
   1005 		if (w->w_weight == weight) {
   1006 			w->w_refcount++;
   1007 			return (w);
   1008 		}
   1009 
   1010 	MALLOC(w, struct wtab *, sizeof(struct wtab), M_DEVBUF, M_WAITOK);
   1011 	if (w == NULL)
   1012 		panic("wtab_alloc: malloc failed!");
   1013 	bzero(w, sizeof(struct wtab));
   1014 	w->w_weight = weight;
   1015 	w->w_refcount = 1;
   1016 	w->w_next = wtab_list;
   1017 	wtab_list = w;
   1018 
   1019 	/* initialize the weight table */
   1020 	w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
   1021 	for (i = 1; i < 32; i++) {
   1022 		w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
   1023 		if (w->w_tab[i] == 0 && w->w_param_max == 0)
   1024 			w->w_param_max = 1 << i;
   1025 	}
   1026 
   1027 	return (w);
   1028 }
   1029 
   1030 int
   1031 wtab_destroy(w)
   1032 	struct wtab *w;
   1033 {
   1034 	struct wtab *prev;
   1035 
   1036 	if (--w->w_refcount > 0)
   1037 		return (0);
   1038 
   1039 	if (wtab_list == w)
   1040 		wtab_list = w->w_next;
   1041 	else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
   1042 		if (prev->w_next == w) {
   1043 			prev->w_next = w->w_next;
   1044 			break;
   1045 		}
   1046 
   1047 	FREE(w, M_DEVBUF);
   1048 	return (0);
   1049 }
   1050 
   1051 int32_t
   1052 pow_w(w, n)
   1053 	struct wtab *w;
   1054 	int n;
   1055 {
   1056 	int	i, bit;
   1057 	int32_t	val;
   1058 
   1059 	if (n >= w->w_param_max)
   1060 		return (0);
   1061 
   1062 	val = 1 << FP_SHIFT;
   1063 	if (n <= 0)
   1064 		return (val);
   1065 
   1066 	bit = 1;
   1067 	i = 0;
   1068 	while (n) {
   1069 		if (n & bit) {
   1070 			val = (val * w->w_tab[i]) >> FP_SHIFT;
   1071 			n &= ~bit;
   1072 		}
   1073 		i++;
   1074 		bit <<=  1;
   1075 	}
   1076 	return (val);
   1077 }
   1078 
   1079 #ifdef ALTQ_FLOWVALVE
   1080 
/*
 * flowvalve tuning parameters: fixed-point scale factors for the
 * per-flow drop rate (p) and bandwidth fraction (f) averages, plus
 * the list size, timers and thresholds used below.
 */
#define	FV_PSHIFT	7	/* weight of average drop rate -- 1/128 */
#define	FV_PSCALE(x)	((x) << FV_PSHIFT)
#define	FV_PUNSCALE(x)	((x) >> FV_PSHIFT)
#define	FV_FSHIFT	5	/* weight of average fraction -- 1/32 */
#define	FV_FSCALE(x)	((x) << FV_FSHIFT)
#define	FV_FUNSCALE(x)	((x) >> FV_FSHIFT)

#define	FV_TIMER	(3 * hz)	/* timer value for garbage collector */
#define	FV_FLOWLISTSIZE		64	/* how many flows in flowlist */

#define	FV_N			10	/* update fve_f every FV_N packets */

#define	FV_BACKOFFTHRESH	1  /* backoff threshold interval in second */
#define	FV_TTHRESH		3  /* time threshold to delete fve */
#define	FV_ALPHA		5  /* extra packet count */

#define	FV_STATS

#if (__FreeBSD_version > 300000)
#define	FV_TIMESTAMP(tp)	getmicrotime(tp)
#else
#define	FV_TIMESTAMP(tp)	{ (*(tp)) = time; }
#endif
   1104 
   1105 /*
   1106  * Brtt table: 127 entry table to convert drop rate (p) to
   1107  * the corresponding bandwidth fraction (f)
   1108  * the following equation is implemented to use scaled values,
   1109  * fve_p and fve_f, in the fixed point format.
   1110  *
   1111  *   Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p))
   1112  *   f = Brtt(p) / (max_th + alpha)
   1113  */
#define	BRTT_SIZE	128
#define	BRTT_SHIFT	12
#define	BRTT_MASK	0x0007f000
#define	BRTT_PMAX	(1 << (FV_PSHIFT + FP_SHIFT))

/*
 * precomputed Brtt(p) values; indexed by (p & BRTT_MASK) >> BRTT_SHIFT
 * in fv_p2f() (entry 0 is never used there -- index 1 is the minimum).
 */
const int brtt_tab[BRTT_SIZE] = {
	0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728,
	392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361,
	225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333,
	145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612,
	98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957,
	67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440,
	47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184,
	33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611,
	24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062,
	18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487,
	14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222,
	10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844,
	8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079,
	6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746,
	5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722,
	4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924
};
   1137 
/*
 * find the flow entry (fve) whose src/dst addresses match this packet's.
 * the flow list is kept in LRU order (most recent drop first), and
 * entries are lazily expired during the scan:
 *   - an entry with fve_lastdrop.tv_sec == 0 is unused; entries after
 *     it can only be older still, so the scan stops there.
 *   - an entry whose last drop is older than FV_TTHRESH is invalidated
 *     and the scan stops for the same reason.
 * returns the matching entry, or NULL if none; after a full scan the
 * number of live entries seen is saved in fv_flows.
 */
static __inline struct fve *
flowlist_lookup(fv, pktattr, now)
	struct flowvalve *fv;
	struct altq_pktattr *pktattr;
	struct timeval *now;
{
	struct fve *fve;
	int flows;
	struct ip *ip;
#ifdef INET6
	struct ip6_hdr *ip6;
#endif
	struct timeval tthresh;

	if (pktattr == NULL)
		return (NULL);

	/* entries whose last drop predates this time are considered dead */
	tthresh.tv_sec = now->tv_sec - FV_TTHRESH;
	flows = 0;
	/*
	 * search the flow list
	 */
	switch (pktattr->pattr_af) {
	case AF_INET:
		ip = (struct ip *)pktattr->pattr_hdr;
		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
			if (fve->fve_lastdrop.tv_sec == 0)
				break;	/* unused entry; rest are older */
			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
				fve->fve_lastdrop.tv_sec = 0;	/* expire */
				break;
			}
			if (fve->fve_flow.flow_af == AF_INET &&
			    fve->fve_flow.flow_ip.ip_src.s_addr ==
			    ip->ip_src.s_addr &&
			    fve->fve_flow.flow_ip.ip_dst.s_addr ==
			    ip->ip_dst.s_addr)
				return (fve);
			flows++;
		}
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
			if (fve->fve_lastdrop.tv_sec == 0)
				break;	/* unused entry; rest are older */
			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
				fve->fve_lastdrop.tv_sec = 0;	/* expire */
				break;
			}
			if (fve->fve_flow.flow_af == AF_INET6 &&
			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src,
					       &ip6->ip6_src) &&
			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst,
					       &ip6->ip6_dst))
				return (fve);
			flows++;
		}
		break;
#endif /* INET6 */

	default:
		/* unknown protocol.  no drop. */
		return (NULL);
	}
	fv->fv_flows = flows;	/* save the number of active fve's */
	return (NULL);
}
   1207 
   1208 static __inline struct fve *
   1209 flowlist_reclaim(fv, pktattr)
   1210 	struct flowvalve *fv;
   1211 	struct altq_pktattr *pktattr;
   1212 {
   1213 	struct fve *fve;
   1214 	struct ip *ip;
   1215 #ifdef INET6
   1216 	struct ip6_hdr *ip6;
   1217 #endif
   1218 
   1219 	/*
   1220 	 * get an entry from the tail of the LRU list.
   1221 	 */
   1222 	fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead);
   1223 
   1224 	switch (pktattr->pattr_af) {
   1225 	case AF_INET:
   1226 		ip = (struct ip *)pktattr->pattr_hdr;
   1227 		fve->fve_flow.flow_af = AF_INET;
   1228 		fve->fve_flow.flow_ip.ip_src = ip->ip_src;
   1229 		fve->fve_flow.flow_ip.ip_dst = ip->ip_dst;
   1230 		break;
   1231 #ifdef INET6
   1232 	case AF_INET6:
   1233 		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
   1234 		fve->fve_flow.flow_af = AF_INET6;
   1235 		fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src;
   1236 		fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst;
   1237 		break;
   1238 #endif
   1239 	}
   1240 
   1241 	fve->fve_state = Green;
   1242 	fve->fve_p = 0.0;
   1243 	fve->fve_f = 0.0;
   1244 	fve->fve_ifseq = fv->fv_ifseq - 1;
   1245 	fve->fve_count = 0;
   1246 
   1247 	fv->fv_flows++;
   1248 #ifdef FV_STATS
   1249 	fv->fv_stats.alloc++;
   1250 #endif
   1251 	return (fve);
   1252 }
   1253 
   1254 static __inline void
   1255 flowlist_move_to_head(fv, fve)
   1256 	struct flowvalve *fv;
   1257 	struct fve *fve;
   1258 {
   1259 	if (TAILQ_FIRST(&fv->fv_flowlist) != fve) {
   1260 		TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru);
   1261 		TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru);
   1262 	}
   1263 }
   1264 
   1265 /*
   1266  * allocate flowvalve structure
   1267  */
   1268 static struct flowvalve *
   1269 fv_alloc(rp)
   1270 	struct red *rp;
   1271 {
   1272 	struct flowvalve *fv;
   1273 	struct fve *fve;
   1274 	int i, num;
   1275 
   1276 	num = FV_FLOWLISTSIZE;
   1277 	MALLOC(fv, struct flowvalve *, sizeof(struct flowvalve),
   1278 	       M_DEVBUF, M_WAITOK);
   1279 	if (fv == NULL)
   1280 		return (NULL);
   1281 	bzero(fv, sizeof(struct flowvalve));
   1282 
   1283 	MALLOC(fv->fv_fves, struct fve *, sizeof(struct fve) * num,
   1284 	       M_DEVBUF, M_WAITOK);
   1285 	if (fv->fv_fves == NULL) {
   1286 		FREE(fv, M_DEVBUF);
   1287 		return (NULL);
   1288 	}
   1289 	bzero(fv->fv_fves, sizeof(struct fve) * num);
   1290 
   1291 	fv->fv_flows = 0;
   1292 	TAILQ_INIT(&fv->fv_flowlist);
   1293 	for (i = 0; i < num; i++) {
   1294 		fve = &fv->fv_fves[i];
   1295 		fve->fve_lastdrop.tv_sec = 0;
   1296 		TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru);
   1297 	}
   1298 
   1299 	/* initialize drop rate threshold in scaled fixed-point */
   1300 	fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax;
   1301 
   1302 	/* initialize drop rate to fraction table */
   1303 	MALLOC(fv->fv_p2ftab, int *, sizeof(int) * BRTT_SIZE,
   1304 	       M_DEVBUF, M_WAITOK);
   1305 	if (fv->fv_p2ftab == NULL) {
   1306 		FREE(fv->fv_fves, M_DEVBUF);
   1307 		FREE(fv, M_DEVBUF);
   1308 		return (NULL);
   1309 	}
   1310 	/*
   1311 	 * create the p2f table.
   1312 	 * (shift is used to keep the precision)
   1313 	 */
   1314 	for (i = 1; i < BRTT_SIZE; i++) {
   1315 		int f;
   1316 
   1317 		f = brtt_tab[i] << 8;
   1318 		fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8;
   1319 	}
   1320 
   1321 	return (fv);
   1322 }
   1323 
   1324 static void fv_destroy(fv)
   1325 	struct flowvalve *fv;
   1326 {
   1327 	FREE(fv->fv_p2ftab, M_DEVBUF);
   1328 	FREE(fv->fv_fves, M_DEVBUF);
   1329 	FREE(fv, M_DEVBUF);
   1330 }
   1331 
   1332 static __inline int
   1333 fv_p2f(fv, p)
   1334 	struct flowvalve	*fv;
   1335 	int	p;
   1336 {
   1337 	int val, f;
   1338 
   1339 	if (p >= BRTT_PMAX)
   1340 		f = fv->fv_p2ftab[BRTT_SIZE-1];
   1341 	else if ((val = (p & BRTT_MASK)))
   1342 		f = fv->fv_p2ftab[(val >> BRTT_SHIFT)];
   1343 	else
   1344 		f = fv->fv_p2ftab[1];
   1345 	return (f);
   1346 }
   1347 
   1348 /*
   1349  * check if an arriving packet should be pre-dropped.
   1350  * called from red_addq() when a packet arrives.
   1351  * returns 1 when the packet should be pre-dropped.
   1352  * should be called in splnet.
   1353  */
   1354 static int
   1355 fv_checkflow(fv, pktattr, fcache)
   1356 	struct flowvalve *fv;
   1357 	struct altq_pktattr *pktattr;
   1358 	struct fve **fcache;
   1359 {
   1360 	struct fve *fve;
   1361 	struct timeval now;
   1362 
   1363 	fv->fv_ifseq++;
   1364 	FV_TIMESTAMP(&now);
   1365 
   1366 	if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
   1367 		/* no matching entry in the flowlist */
   1368 		return (0);
   1369 
   1370 	*fcache = fve;
   1371 
   1372 	/* update fraction f for every FV_N packets */
   1373 	if (++fve->fve_count == FV_N) {
   1374 		/*
   1375 		 * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f
   1376 		 */
   1377 		fve->fve_f =
   1378 			(FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq)
   1379 			+ fve->fve_f - FV_FUNSCALE(fve->fve_f);
   1380 		fve->fve_ifseq = fv->fv_ifseq;
   1381 		fve->fve_count = 0;
   1382 	}
   1383 
   1384 	/*
   1385 	 * overpumping test
   1386 	 */
   1387 	if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) {
   1388 		int fthresh;
   1389 
   1390 		/* calculate a threshold */
   1391 		fthresh = fv_p2f(fv, fve->fve_p);
   1392 		if (fve->fve_f > fthresh)
   1393 			fve->fve_state = Red;
   1394 	}
   1395 
   1396 	if (fve->fve_state == Red) {
   1397 		/*
   1398 		 * backoff test
   1399 		 */
   1400 		if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) {
   1401 			/* no drop for at least FV_BACKOFFTHRESH sec */
   1402 			fve->fve_p = 0;
   1403 			fve->fve_state = Green;
   1404 #ifdef FV_STATS
   1405 			fv->fv_stats.escape++;
   1406 #endif
   1407 		} else {
   1408 			/* block this flow */
   1409 			flowlist_move_to_head(fv, fve);
   1410 			fve->fve_lastdrop = now;
   1411 #ifdef FV_STATS
   1412 			fv->fv_stats.predrop++;
   1413 #endif
   1414 			return (1);
   1415 		}
   1416 	}
   1417 
   1418 	/*
   1419 	 * p = (1 - Wp) * p
   1420 	 */
   1421 	fve->fve_p -= FV_PUNSCALE(fve->fve_p);
   1422 	if (fve->fve_p < 0)
   1423 		fve->fve_p = 0;
   1424 #ifdef FV_STATS
   1425 	fv->fv_stats.pass++;
   1426 #endif
   1427 	return (0);
   1428 }
   1429 
   1430 /*
   1431  * called from red_addq when a packet is dropped by red.
   1432  * should be called in splnet.
   1433  */
   1434 static void fv_dropbyred(fv, pktattr, fcache)
   1435 	struct flowvalve *fv;
   1436 	struct altq_pktattr *pktattr;
   1437 	struct fve *fcache;
   1438 {
   1439 	struct fve *fve;
   1440 	struct timeval now;
   1441 
   1442 	if (pktattr == NULL)
   1443 		return;
   1444 	FV_TIMESTAMP(&now);
   1445 
   1446 	if (fcache != NULL)
   1447 		/* the fve of this packet is already cached */
   1448 		fve = fcache;
   1449 	else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
   1450 		fve = flowlist_reclaim(fv, pktattr);
   1451 
   1452 	flowlist_move_to_head(fv, fve);
   1453 
   1454 	/*
   1455 	 * update p:  the following line cancels the update
   1456 	 *	      in fv_checkflow() and calculate
   1457 	 *	p = Wp + (1 - Wp) * p
   1458 	 */
   1459 	fve->fve_p = (1 << FP_SHIFT) + fve->fve_p;
   1460 
   1461 	fve->fve_lastdrop = now;
   1462 }
   1463 
   1464 #endif /* ALTQ_FLOWVALVE */
   1465 
   1466 #ifdef KLD_MODULE
   1467 
/* loadable-module glue: device switch entries and module registration */
static struct altqsw red_sw =
	{"red", redopen, redclose, redioctl};

ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw);
   1472 
   1473 #endif /* KLD_MODULE */
   1474 
   1475 #endif /* ALTQ_RED */
   1476