altq_red.c revision 1.13.6.1       1 /*	$NetBSD: altq_red.c,v 1.13.6.1 2006/06/01 22:34:09 kardel Exp $	*/
      2 /*	$KAME: altq_red.c,v 1.9 2002/01/07 11:25:40 kjc Exp $	*/
      3 
      4 /*
      5  * Copyright (C) 1997-2000
      6  *	Sony Computer Science Laboratories Inc.  All rights reserved.
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  * 1. Redistributions of source code must retain the above copyright
     12  *    notice, this list of conditions and the following disclaimer.
     13  * 2. Redistributions in binary form must reproduce the above copyright
     14  *    notice, this list of conditions and the following disclaimer in the
     15  *    documentation and/or other materials provided with the distribution.
     16  *
     17  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
     18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     20  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
     21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     27  * SUCH DAMAGE.
     28  *
     29  */
     30 /*
     31  * Copyright (c) 1990-1994 Regents of the University of California.
     32  * All rights reserved.
     33  *
     34  * Redistribution and use in source and binary forms, with or without
     35  * modification, are permitted provided that the following conditions
     36  * are met:
     37  * 1. Redistributions of source code must retain the above copyright
     38  *    notice, this list of conditions and the following disclaimer.
     39  * 2. Redistributions in binary form must reproduce the above copyright
     40  *    notice, this list of conditions and the following disclaimer in the
     41  *    documentation and/or other materials provided with the distribution.
     42  * 3. All advertising materials mentioning features or use of this software
     43  *    must display the following acknowledgement:
     44  *	This product includes software developed by the Computer Systems
     45  *	Engineering Group at Lawrence Berkeley Laboratory.
     46  * 4. Neither the name of the University nor of the Laboratory may be used
     47  *    to endorse or promote products derived from this software without
     48  *    specific prior written permission.
     49  *
     50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     60  * SUCH DAMAGE.
     61  */
     62 
     63 #include <sys/cdefs.h>
     64 __KERNEL_RCSID(0, "$NetBSD: altq_red.c,v 1.13.6.1 2006/06/01 22:34:09 kardel Exp $");
     65 
     66 #if defined(__FreeBSD__) || defined(__NetBSD__)
     67 #include "opt_altq.h"
     68 #if (__FreeBSD__ != 2)
     69 #include "opt_inet.h"
     70 #ifdef __FreeBSD__
     71 #include "opt_inet6.h"
     72 #endif
     73 #endif
     74 #endif /* __FreeBSD__ || __NetBSD__ */
     75 #ifdef ALTQ_RED	/* red is enabled by ALTQ_RED option in opt_altq.h */
     76 
     77 #include <sys/param.h>
     78 #include <sys/malloc.h>
     79 #include <sys/mbuf.h>
     80 #include <sys/socket.h>
     81 #include <sys/sockio.h>
     82 #include <sys/systm.h>
     83 #include <sys/proc.h>
     84 #include <sys/errno.h>
     85 #include <sys/kernel.h>
     86 #include <sys/kauth.h>
     87 #ifdef ALTQ_FLOWVALVE
     88 #include <sys/queue.h>
     89 #include <sys/time.h>
     90 #endif
     91 
     92 #include <net/if.h>
     93 #include <net/if_types.h>
     94 
     95 #include <netinet/in.h>
     96 #include <netinet/in_systm.h>
     97 #include <netinet/ip.h>
     98 #ifdef INET6
     99 #include <netinet/ip6.h>
    100 #endif
    101 
    102 #include <altq/altq.h>
    103 #include <altq/altq_conf.h>
    104 #include <altq/altq_red.h>
    105 #ifdef ALTQ_FLOWVALVE
    106 #include <altq/altq_flowvalve.h>
    107 #endif
    108 
    109 /*
    110  * ALTQ/RED (Random Early Detection) implementation using 32-bit
    111  * fixed-point calculation.
    112  *
    113  * written by kjc using the ns code as a reference.
    114  * you can learn more about red and ns from Sally's home page at
    115  * http://www-nrg.ee.lbl.gov/floyd/
    116  *
    117  * most of the red parameter values are fixed in this implementation
    118  * to prevent fixed-point overflow/underflow.
    119  * if you change the parameters, watch out for overflow/underflow!
    120  *
    121  * the parameters used are recommended values by Sally.
    122  * the corresponding ns config looks:
    123  *	q_weight=0.00195
    124  *	minthresh=5 maxthresh=15 queue-size=60
    125  *	linterm=30
    126  *	dropmech=drop-tail
    127  *	bytes=false (can't be handled by 32-bit fixed-point)
    128  *	doubleq=false dqthresh=false
    129  *	wait=true
    130  */
    131 /*
    132  * alternative red parameters for a slow link.
    133  *
    134  * assume the queue length becomes from zero to L and keeps L, it takes
    135  * N packets for q_avg to reach 63% of L.
    136  * when q_weight is 0.002, N is about 500 packets.
    137  * for a slow link like dial-up, 500 packets takes more than 1 minute!
    138  * when q_weight is 0.008, N is about 127 packets.
    139  * when q_weight is 0.016, N is about 63 packets.
    140  * bursts of 50 packets are allowd for 0.002, bursts of 25 packets
    141  * are allowed for 0.016.
    142  * see Sally's paper for more details.
    143  */
    144 /* normal red parameters */
    145 #define	W_WEIGHT	512	/* inverse of weight of EWMA (511/512) */
    146 				/* q_weight = 0.00195 */
    147 
    148 /* red parameters for a slow link */
    149 #define	W_WEIGHT_1	128	/* inverse of weight of EWMA (127/128) */
    150 				/* q_weight = 0.0078125 */
    151 
    152 /* red parameters for a very slow link (e.g., dialup) */
    153 #define	W_WEIGHT_2	64	/* inverse of weight of EWMA (63/64) */
    154 				/* q_weight = 0.015625 */
    155 
    156 /* fixed-point uses 12-bit decimal places */
    157 #define	FP_SHIFT	12	/* fixed-point shift */
    158 
    159 /* red parameters for drop probability */
    160 #define	INV_P_MAX	10	/* inverse of max drop probability */
    161 #define	TH_MIN		5	/* min threshold */
    162 #define	TH_MAX		15	/* max threshold */
    163 
    164 #define	RED_LIMIT	60	/* default max queue length */
    165 
    166 /*
    167  * our default policy for forced-drop is drop-tail.
    168  * (in altq-1.1.2 or earlier, the default was random-drop.
    169  * but it makes more sense to punish the cause of the surge.)
    170  * to switch to the random-drop policy, define "RED_RANDOM_DROP".
    171  */
    172 
    173 #ifdef ALTQ_FLOWVALVE
    174 /*
    175  * flow-valve is an extension to protect red from unresponsive flows
    176  * and to promote end-to-end congestion control.
    177  * flow-valve observes the average drop rates of the flows that have
    178  * experienced packet drops in the recent past.
    179  * when the average drop rate exceeds the threshold, the flow is
    180  * blocked by the flow-valve.  the trapped flow should back off
    181  * exponentially to escape from the flow-valve.
    182  */
    183 #ifdef RED_RANDOM_DROP
    184 #error "random-drop can't be used with flow-valve!"
    185 #endif
    186 #endif /* ALTQ_FLOWVALVE */
    187 
    188 /* red_list keeps all red_queue_t's allocated. */
    189 static red_queue_t *red_list = NULL;
    190 
    191 /* default red parameter values */
    192 static int default_th_min = TH_MIN;
    193 static int default_th_max = TH_MAX;
    194 static int default_inv_pmax = INV_P_MAX;
    195 
    196 /* internal function prototypes */
    197 static int red_enqueue __P((struct ifaltq *, struct mbuf *,
    198 			    struct altq_pktattr *));
    199 static struct mbuf *red_dequeue __P((struct ifaltq *, int));
    200 static int red_request __P((struct ifaltq *, int, void *));
    201 static void red_purgeq __P((red_queue_t *));
    202 static int red_detach __P((red_queue_t *));
    203 #ifdef ALTQ_FLOWVALVE
    204 static inline struct fve *flowlist_lookup __P((struct flowvalve *,
    205 			 struct altq_pktattr *, struct timeval *));
    206 static inline struct fve *flowlist_reclaim __P((struct flowvalve *,
    207 						  struct altq_pktattr *));
    208 static inline void flowlist_move_to_head __P((struct flowvalve *,
    209 						struct fve *));
    210 static inline int fv_p2f __P((struct flowvalve *, int));
    211 static struct flowvalve *fv_alloc __P((struct red *));
    212 static void fv_destroy __P((struct flowvalve *));
    213 static int fv_checkflow __P((struct flowvalve *, struct altq_pktattr *,
    214 			     struct fve **));
    215 static void fv_dropbyred __P((struct flowvalve *fv, struct altq_pktattr *,
    216 			      struct fve *));
    217 #endif
    218 
    219 /*
    220  * red device interface
    221  */
    222 altqdev_decl(red);
    223 
    224 int
    225 redopen(dev, flag, fmt, l)
    226 	dev_t dev;
    227 	int flag, fmt;
    228 	struct lwp *l;
    229 {
    230 	/* everything will be done when the queueing scheme is attached. */
    231 	return 0;
    232 }
    233 
    234 int
    235 redclose(dev, flag, fmt, l)
    236 	dev_t dev;
    237 	int flag, fmt;
    238 	struct lwp *l;
    239 {
    240 	red_queue_t *rqp;
    241 	int err, error = 0;
    242 
    243 	while ((rqp = red_list) != NULL) {
    244 		/* destroy all */
    245 		err = red_detach(rqp);
    246 		if (err != 0 && error == 0)
    247 			error = err;
    248 	}
    249 
    250 	return error;
    251 }
    252 
    253 int
    254 redioctl(dev, cmd, addr, flag, l)
    255 	dev_t dev;
    256 	ioctlcmd_t cmd;
    257 	caddr_t addr;
    258 	int flag;
    259 	struct lwp *l;
    260 {
    261 	red_queue_t *rqp;
    262 	struct red_interface *ifacep;
    263 	struct ifnet *ifp;
    264 	struct proc *p = l->l_proc;
    265 	int	error = 0;
    266 
    267 	/* check super-user privilege */
    268 	switch (cmd) {
    269 	case RED_GETSTATS:
    270 		break;
    271 	default:
    272 #if (__FreeBSD_version > 400000)
    273 		if ((error = suser(p)) != 0)
    274 #else
    275 		if ((error = kauth_authorize_generic(p->p_cred,
    276 					       KAUTH_GENERIC_ISSUSER,
    277 					       &p->p_acflag)) != 0)
    278 #endif
    279 			return (error);
    280 		break;
    281 	}
    282 
    283 	switch (cmd) {
    284 
    285 	case RED_ENABLE:
    286 		ifacep = (struct red_interface *)addr;
    287 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
    288 			error = EBADF;
    289 			break;
    290 		}
    291 		error = altq_enable(rqp->rq_ifq);
    292 		break;
    293 
    294 	case RED_DISABLE:
    295 		ifacep = (struct red_interface *)addr;
    296 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
    297 			error = EBADF;
    298 			break;
    299 		}
    300 		error = altq_disable(rqp->rq_ifq);
    301 		break;
    302 
    303 	case RED_IF_ATTACH:
    304 		ifp = ifunit(((struct red_interface *)addr)->red_ifname);
    305 		if (ifp == NULL) {
    306 			error = ENXIO;
    307 			break;
    308 		}
    309 
    310 		/* allocate and initialize red_queue_t */
    311 		rqp = malloc(sizeof(red_queue_t), M_DEVBUF, M_WAITOK|M_ZERO);
    312 		if (rqp == NULL) {
    313 			error = ENOMEM;
    314 			break;
    315 		}
    316 
    317 		rqp->rq_q = malloc(sizeof(class_queue_t), M_DEVBUF,
    318 		    M_WAITOK|M_ZERO);
    319 		if (rqp->rq_q == NULL) {
    320 			free(rqp, M_DEVBUF);
    321 			error = ENOMEM;
    322 			break;
    323 		}
    324 
    325 		rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0);
    326 		if (rqp->rq_red == NULL) {
    327 			free(rqp->rq_q, M_DEVBUF);
    328 			free(rqp, M_DEVBUF);
    329 			error = ENOMEM;
    330 			break;
    331 		}
    332 
    333 		rqp->rq_ifq = &ifp->if_snd;
    334 		qtail(rqp->rq_q) = NULL;
    335 		qlen(rqp->rq_q) = 0;
    336 		qlimit(rqp->rq_q) = RED_LIMIT;
    337 		qtype(rqp->rq_q) = Q_RED;
    338 
    339 		/*
    340 		 * set RED to this ifnet structure.
    341 		 */
    342 		error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp,
    343 				    red_enqueue, red_dequeue, red_request,
    344 				    NULL, NULL);
    345 		if (error) {
    346 			red_destroy(rqp->rq_red);
    347 			free(rqp->rq_q, M_DEVBUF);
    348 			free(rqp, M_DEVBUF);
    349 			break;
    350 		}
    351 
    352 		/* add this state to the red list */
    353 		rqp->rq_next = red_list;
    354 		red_list = rqp;
    355 		break;
    356 
    357 	case RED_IF_DETACH:
    358 		ifacep = (struct red_interface *)addr;
    359 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
    360 			error = EBADF;
    361 			break;
    362 		}
    363 		error = red_detach(rqp);
    364 		break;
    365 
    366 	case RED_GETSTATS:
    367 		do {
    368 			struct red_stats *q_stats;
    369 			red_t *rp;
    370 
    371 			q_stats = (struct red_stats *)addr;
    372 			if ((rqp = altq_lookup(q_stats->iface.red_ifname,
    373 					     ALTQT_RED)) == NULL) {
    374 				error = EBADF;
    375 				break;
    376 			}
    377 
    378 			q_stats->q_len 	   = qlen(rqp->rq_q);
    379 			q_stats->q_limit   = qlimit(rqp->rq_q);
    380 
    381 			rp = rqp->rq_red;
    382 			q_stats->q_avg 	   = rp->red_avg >> rp->red_wshift;
    383 			q_stats->xmit_cnt  = rp->red_stats.xmit_cnt;
    384 			q_stats->drop_cnt  = rp->red_stats.drop_cnt;
    385 			q_stats->drop_forced   = rp->red_stats.drop_forced;
    386 			q_stats->drop_unforced = rp->red_stats.drop_unforced;
    387 			q_stats->marked_packets = rp->red_stats.marked_packets;
    388 
    389 			q_stats->weight		= rp->red_weight;
    390 			q_stats->inv_pmax	= rp->red_inv_pmax;
    391 			q_stats->th_min		= rp->red_thmin;
    392 			q_stats->th_max		= rp->red_thmax;
    393 
    394 #ifdef ALTQ_FLOWVALVE
    395 			if (rp->red_flowvalve != NULL) {
    396 				struct flowvalve *fv = rp->red_flowvalve;
    397 				q_stats->fv_flows    = fv->fv_flows;
    398 				q_stats->fv_pass     = fv->fv_stats.pass;
    399 				q_stats->fv_predrop  = fv->fv_stats.predrop;
    400 				q_stats->fv_alloc    = fv->fv_stats.alloc;
    401 				q_stats->fv_escape   = fv->fv_stats.escape;
    402 			} else {
    403 #endif /* ALTQ_FLOWVALVE */
    404 				q_stats->fv_flows    = 0;
    405 				q_stats->fv_pass     = 0;
    406 				q_stats->fv_predrop  = 0;
    407 				q_stats->fv_alloc    = 0;
    408 				q_stats->fv_escape   = 0;
    409 #ifdef ALTQ_FLOWVALVE
    410 			}
    411 #endif /* ALTQ_FLOWVALVE */
    412 		} while (0);
    413 		break;
    414 
    415 	case RED_CONFIG:
    416 		do {
    417 			struct red_conf *fc;
    418 			red_t *new;
    419 			int s, limit;
    420 
    421 			fc = (struct red_conf *)addr;
    422 			if ((rqp = altq_lookup(fc->iface.red_ifname,
    423 					       ALTQT_RED)) == NULL) {
    424 				error = EBADF;
    425 				break;
    426 			}
    427 			new = red_alloc(fc->red_weight,
    428 					fc->red_inv_pmax,
    429 					fc->red_thmin,
    430 					fc->red_thmax,
    431 					fc->red_flags,
    432 					fc->red_pkttime);
    433 			if (new == NULL) {
    434 				error = ENOMEM;
    435 				break;
    436 			}
    437 
    438 			s = splnet();
    439 			red_purgeq(rqp);
    440 			limit = fc->red_limit;
    441 			if (limit < fc->red_thmax)
    442 				limit = fc->red_thmax;
    443 			qlimit(rqp->rq_q) = limit;
    444 			fc->red_limit = limit;	/* write back the new value */
    445 
    446 			red_destroy(rqp->rq_red);
    447 			rqp->rq_red = new;
    448 
    449 			splx(s);
    450 
    451 			/* write back new values */
    452 			fc->red_limit = limit;
    453 			fc->red_inv_pmax = rqp->rq_red->red_inv_pmax;
    454 			fc->red_thmin = rqp->rq_red->red_thmin;
    455 			fc->red_thmax = rqp->rq_red->red_thmax;
    456 
    457 		} while (0);
    458 		break;
    459 
    460 	case RED_SETDEFAULTS:
    461 		do {
    462 			struct redparams *rp;
    463 
    464 			rp = (struct redparams *)addr;
    465 
    466 			default_th_min = rp->th_min;
    467 			default_th_max = rp->th_max;
    468 			default_inv_pmax = rp->inv_pmax;
    469 		} while (0);
    470 		break;
    471 
    472 	default:
    473 		error = EINVAL;
    474 		break;
    475 	}
    476 	return error;
    477 }
    478 
    479 static int
    480 red_detach(rqp)
    481 	red_queue_t *rqp;
    482 {
    483 	red_queue_t *tmp;
    484 	int error = 0;
    485 
    486 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
    487 		altq_disable(rqp->rq_ifq);
    488 
    489 	if ((error = altq_detach(rqp->rq_ifq)))
    490 		return (error);
    491 
    492 	if (red_list == rqp)
    493 		red_list = rqp->rq_next;
    494 	else {
    495 		for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next)
    496 			if (tmp->rq_next == rqp) {
    497 				tmp->rq_next = rqp->rq_next;
    498 				break;
    499 			}
    500 		if (tmp == NULL)
    501 			printf("red_detach: no state found in red_list!\n");
    502 	}
    503 
    504 	red_destroy(rqp->rq_red);
    505 	free(rqp->rq_q, M_DEVBUF);
    506 	free(rqp, M_DEVBUF);
    507 	return (error);
    508 }
    509 
    510 /*
    511  * red support routines
    512  */
    513 
    514 red_t *
    515 red_alloc(weight, inv_pmax, th_min, th_max, flags, pkttime)
    516 	int	weight, inv_pmax, th_min, th_max;
    517 	int	flags, pkttime;
    518 {
    519 	red_t 	*rp;
    520 	int	w, i;
    521 	int	npkts_per_sec;
    522 
    523 	rp = malloc(sizeof(red_t), M_DEVBUF, M_WAITOK|M_ZERO);
    524 	if (rp == NULL)
    525 		return (NULL);
    526 
    527 	rp->red_avg = 0;
    528 	rp->red_idle = 1;
    529 
    530 	if (weight == 0)
    531 		rp->red_weight = W_WEIGHT;
    532 	else
    533 		rp->red_weight = weight;
    534 	if (inv_pmax == 0)
    535 		rp->red_inv_pmax = default_inv_pmax;
    536 	else
    537 		rp->red_inv_pmax = inv_pmax;
    538 	if (th_min == 0)
    539 		rp->red_thmin = default_th_min;
    540 	else
    541 		rp->red_thmin = th_min;
    542 	if (th_max == 0)
    543 		rp->red_thmax = default_th_max;
    544 	else
    545 		rp->red_thmax = th_max;
    546 
    547 	rp->red_flags = flags;
    548 
    549 	if (pkttime == 0)
    550 		/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
    551 		rp->red_pkttime = 800;
    552 	else
    553 		rp->red_pkttime = pkttime;
    554 
    555 	if (weight == 0) {
    556 		/* when the link is very slow, adjust red parameters */
    557 		npkts_per_sec = 1000000 / rp->red_pkttime;
    558 		if (npkts_per_sec < 50) {
    559 			/* up to about 400Kbps */
    560 			rp->red_weight = W_WEIGHT_2;
    561 		} else if (npkts_per_sec < 300) {
    562 			/* up to about 2.4Mbps */
    563 			rp->red_weight = W_WEIGHT_1;
    564 		}
    565 	}
    566 
    567 	/* calculate wshift.  weight must be power of 2 */
    568 	w = rp->red_weight;
    569 	for (i = 0; w > 1; i++)
    570 		w = w >> 1;
    571 	rp->red_wshift = i;
    572 	w = 1 << rp->red_wshift;
    573 	if (w != rp->red_weight) {
    574 		printf("invalid weight value %d for red! use %d\n",
    575 		       rp->red_weight, w);
    576 		rp->red_weight = w;
    577 	}
    578 
    579 	/*
    580 	 * thmin_s and thmax_s are scaled versions of th_min and th_max
    581 	 * to be compared with avg.
    582 	 */
    583 	rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
    584 	rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
    585 
    586 	/*
    587 	 * precompute probability denominator
    588 	 *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
    589 	 */
    590 	rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
    591 			 * rp->red_inv_pmax) << FP_SHIFT;
    592 
    593 	/* allocate weight table */
    594 	rp->red_wtab = wtab_alloc(rp->red_weight);
    595 
    596 	microtime(&rp->red_last);
    597 #ifdef ALTQ_FLOWVALVE
    598 	if (flags & REDF_FLOWVALVE)
    599 		rp->red_flowvalve = fv_alloc(rp);
    600 	/* if fv_alloc failes, flowvalve is just disabled */
    601 #endif
    602 	return (rp);
    603 }
    604 
    605 void
    606 red_destroy(rp)
    607 	red_t *rp;
    608 {
    609 #ifdef ALTQ_FLOWVALVE
    610 	if (rp->red_flowvalve != NULL)
    611 		fv_destroy(rp->red_flowvalve);
    612 #endif
    613 	wtab_destroy(rp->red_wtab);
    614 	free(rp, M_DEVBUF);
    615 }
    616 
    617 void
    618 red_getstats(rp, sp)
    619 	red_t *rp;
    620 	struct redstats *sp;
    621 {
    622 	sp->q_avg 		= rp->red_avg >> rp->red_wshift;
    623 	sp->xmit_cnt		= rp->red_stats.xmit_cnt;
    624 	sp->drop_cnt		= rp->red_stats.drop_cnt;
    625 	sp->drop_forced		= rp->red_stats.drop_forced;
    626 	sp->drop_unforced	= rp->red_stats.drop_unforced;
    627 	sp->marked_packets	= rp->red_stats.marked_packets;
    628 }
    629 
    630 /*
    631  * enqueue routine:
    632  *
    633  *	returns: 0 when successfully queued.
    634  *		 ENOBUFS when drop occurs.
    635  */
    636 static int
    637 red_enqueue(ifq, m, pktattr)
    638 	struct ifaltq *ifq;
    639 	struct mbuf *m;
    640 	struct altq_pktattr *pktattr;
    641 {
    642 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
    643 
    644 	if (red_addq(rqp->rq_red, rqp->rq_q, m, pktattr) < 0)
    645 		return ENOBUFS;
    646 	ifq->ifq_len++;
    647 	return 0;
    648 }
    649 
    650 int
    651 red_addq(rp, q, m, pktattr)
    652 	red_t *rp;
    653 	class_queue_t *q;
    654 	struct mbuf *m;
    655 	struct altq_pktattr *pktattr;
    656 {
    657 	int avg, droptype;
    658 	int n;
    659 #ifdef ALTQ_FLOWVALVE
    660 	struct fve *fve = NULL;
    661 
    662 	if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0)
    663 		if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) {
    664 			m_freem(m);
    665 			return (-1);
    666 		}
    667 #endif
    668 
    669 	avg = rp->red_avg;
    670 
    671 	/*
    672 	 * if we were idle, we pretend that n packets arrived during
    673 	 * the idle period.
    674 	 */
    675 	if (rp->red_idle) {
    676 		struct timeval now;
    677 		int t;
    678 
    679 		rp->red_idle = 0;
    680 		microtime(&now);
    681 		t = (now.tv_sec - rp->red_last.tv_sec);
    682 		if (t > 60) {
    683 			/*
    684 			 * being idle for more than 1 minute, set avg to zero.
    685 			 * this prevents t from overflow.
    686 			 */
    687 			avg = 0;
    688 		} else {
    689 			t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
    690 			n = t / rp->red_pkttime - 1;
    691 
    692 			/* the following line does (avg = (1 - Wq)^n * avg) */
    693 			if (n > 0)
    694 				avg = (avg >> FP_SHIFT) *
    695 				    pow_w(rp->red_wtab, n);
    696 		}
    697 	}
    698 
    699 	/* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
    700 	avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
    701 	rp->red_avg = avg;		/* save the new value */
    702 
    703 	/*
    704 	 * red_count keeps a tally of arriving traffic that has not
    705 	 * been dropped.
    706 	 */
    707 	rp->red_count++;
    708 
    709 	/* see if we drop early */
    710 	droptype = DTYPE_NODROP;
    711 	if (avg >= rp->red_thmin_s && qlen(q) > 1) {
    712 		if (avg >= rp->red_thmax_s) {
    713 			/* avg >= th_max: forced drop */
    714 			droptype = DTYPE_FORCED;
    715 		} else if (rp->red_old == 0) {
    716 			/* first exceeds th_min */
    717 			rp->red_count = 1;
    718 			rp->red_old = 1;
    719 		} else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
    720 				      rp->red_probd, rp->red_count)) {
    721 			/* mark or drop by red */
    722 			if ((rp->red_flags & REDF_ECN) &&
    723 			    mark_ecn(m, pktattr, rp->red_flags)) {
    724 				/* successfully marked.  do not drop. */
    725 				rp->red_count = 0;
    726 #ifdef RED_STATS
    727 				rp->red_stats.marked_packets++;
    728 #endif
    729 			} else {
    730 				/* unforced drop by red */
    731 				droptype = DTYPE_EARLY;
    732 			}
    733 		}
    734 	} else {
    735 		/* avg < th_min */
    736 		rp->red_old = 0;
    737 	}
    738 
    739 	/*
    740 	 * if the queue length hits the hard limit, it's a forced drop.
    741 	 */
    742 	if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
    743 		droptype = DTYPE_FORCED;
    744 
    745 #ifdef RED_RANDOM_DROP
    746 	/* if successful or forced drop, enqueue this packet. */
    747 	if (droptype != DTYPE_EARLY)
    748 		_addq(q, m);
    749 #else
    750 	/* if successful, enqueue this packet. */
    751 	if (droptype == DTYPE_NODROP)
    752 		_addq(q, m);
    753 #endif
    754 	if (droptype != DTYPE_NODROP) {
    755 		if (droptype == DTYPE_EARLY) {
    756 			/* drop the incoming packet */
    757 #ifdef RED_STATS
    758 			rp->red_stats.drop_unforced++;
    759 #endif
    760 		} else {
    761 			/* forced drop, select a victim packet in the queue. */
    762 #ifdef RED_RANDOM_DROP
    763 			m = _getq_random(q);
    764 #endif
    765 #ifdef RED_STATS
    766 			rp->red_stats.drop_forced++;
    767 #endif
    768 		}
    769 #ifdef RED_STATS
    770 		PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
    771 #endif
    772 		rp->red_count = 0;
    773 #ifdef ALTQ_FLOWVALVE
    774 		if (rp->red_flowvalve != NULL)
    775 			fv_dropbyred(rp->red_flowvalve, pktattr, fve);
    776 #endif
    777 		m_freem(m);
    778 		return (-1);
    779 	}
    780 	/* successfully queued */
    781 #ifdef RED_STATS
    782 	PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
    783 #endif
    784 	return (0);
    785 }
    786 
    787 /*
    788  * early-drop probability is calculated as follows:
    789  *   prob = p_max * (avg - th_min) / (th_max - th_min)
    790  *   prob_a = prob / (2 - count*prob)
    791  *	    = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
    792  * here prob_a increases as successive undrop count increases.
    793  * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
    794  * becomes 1 when (count >= (2 / prob))).
    795  */
    796 int
    797 drop_early(fp_len, fp_probd, count)
    798 	int fp_len;	/* (avg - TH_MIN) in fixed-point */
    799 	int fp_probd;	/* (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point */
    800 	int count;	/* how many successive undropped packets */
    801 {
    802 	int d;		/* denominator of drop-probability */
    803 
    804 	d = fp_probd - count * fp_len;
    805 	if (d <= 0)
    806 		/* count exceeds the hard limit: drop or mark */
    807 		return (1);
    808 
    809 	/*
    810 	 * now the range of d is [1..600] in fixed-point. (when
    811 	 * th_max-th_min=10 and p_max=1/30)
    812 	 * drop probability = (avg - TH_MIN) / d
    813 	 */
    814 
    815 	if ((random() % d) < fp_len) {
    816 		/* drop or mark */
    817 		return (1);
    818 	}
    819 	/* no drop/mark */
    820 	return (0);
    821 }
    822 
    823 /*
    824  * try to mark CE bit to the packet.
    825  *    returns 1 if successfully marked, 0 otherwise.
    826  */
    827 int
    828 mark_ecn(m, pktattr, flags)
    829 	struct mbuf *m;
    830 	struct altq_pktattr *pktattr;
    831 	int flags;
    832 {
    833 	struct mbuf *m0;
    834 
    835 	if (pktattr == NULL ||
    836 	    (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6))
    837 		return (0);
    838 
    839 	/* verify that pattr_hdr is within the mbuf data */
    840 	for (m0 = m; m0 != NULL; m0 = m0->m_next)
    841 		if ((pktattr->pattr_hdr >= m0->m_data) &&
    842 		    (pktattr->pattr_hdr < m0->m_data + m0->m_len))
    843 			break;
    844 	if (m0 == NULL) {
    845 		/* ick, pattr_hdr is stale */
    846 		pktattr->pattr_af = AF_UNSPEC;
    847 		return (0);
    848 	}
    849 
    850 	switch (pktattr->pattr_af) {
    851 	case AF_INET:
    852 		if (flags & REDF_ECN4) {
    853 			struct ip *ip = (struct ip *)pktattr->pattr_hdr;
    854 			u_int8_t otos;
    855 			int sum;
    856 
    857 			if (ip->ip_v != 4)
    858 				return (0);	/* version mismatch! */
    859 
    860 			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
    861 				return (0);	/* not-ECT */
    862 			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
    863 				return (1);	/* already marked */
    864 
    865 			/*
    866 			 * ecn-capable but not marked,
    867 			 * mark CE and update checksum
    868 			 */
    869 			otos = ip->ip_tos;
    870 			ip->ip_tos |= IPTOS_ECN_CE;
    871 			/*
    872 			 * update checksum (from RFC1624)
    873 			 *	   HC' = ~(~HC + ~m + m')
    874 			 */
    875 			sum = ~ntohs(ip->ip_sum) & 0xffff;
    876 			sum += (~otos & 0xffff) + ip->ip_tos;
    877 			sum = (sum >> 16) + (sum & 0xffff);
    878 			sum += (sum >> 16);  /* add carry */
    879 			ip->ip_sum = htons(~sum & 0xffff);
    880 			return (1);
    881 		}
    882 		break;
    883 #ifdef INET6
    884 	case AF_INET6:
    885 		if (flags & REDF_ECN6) {
    886 			struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
    887 			u_int32_t flowlabel;
    888 
    889 			flowlabel = ntohl(ip6->ip6_flow);
    890 			if ((flowlabel >> 28) != 6)
    891 				return (0);	/* version mismatch! */
    892 			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
    893 			    (IPTOS_ECN_NOTECT << 20))
    894 				return (0);	/* not-ECT */
    895 			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
    896 			    (IPTOS_ECN_CE << 20))
    897 				return (1);	/* already marked */
    898 			/*
    899 			 * ecn-capable but not marked,  mark CE
    900 			 */
    901 			flowlabel |= (IPTOS_ECN_CE << 20);
    902 			ip6->ip6_flow = htonl(flowlabel);
    903 			return (1);
    904 		}
    905 		break;
    906 #endif  /* INET6 */
    907 	}
    908 
    909 	/* not marked */
    910 	return (0);
    911 }
    912 
    913 /*
    914  * dequeue routine:
    915  *	must be called in splnet.
    916  *
    917  *	returns: mbuf dequeued.
    918  *		 NULL when no packet is available in the queue.
    919  */
    920 
    921 static struct mbuf *
    922 red_dequeue(ifq, op)
    923 	struct ifaltq *ifq;
    924 	int op;
    925 {
    926 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
    927 	struct mbuf *m;
    928 
    929 	if (op == ALTDQ_POLL)
    930 		return qhead(rqp->rq_q);
    931 
    932 	/* op == ALTDQ_REMOVE */
    933 	m =  red_getq(rqp->rq_red, rqp->rq_q);
    934 	if (m != NULL)
    935 		ifq->ifq_len--;
    936 	return (m);
    937 }
    938 
    939 struct mbuf *
    940 red_getq(rp, q)
    941 	red_t *rp;
    942 	class_queue_t *q;
    943 {
    944 	struct mbuf *m;
    945 
    946 	if ((m = _getq(q)) == NULL) {
    947 		if (rp->red_idle == 0) {
    948 			rp->red_idle = 1;
    949 			microtime(&rp->red_last);
    950 		}
    951 		return NULL;
    952 	}
    953 
    954 	rp->red_idle = 0;
    955 	return (m);
    956 }
    957 
    958 static int
    959 red_request(ifq, req, arg)
    960 	struct ifaltq *ifq;
    961 	int req;
    962 	void *arg;
    963 {
    964 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
    965 
    966 	switch (req) {
    967 	case ALTRQ_PURGE:
    968 		red_purgeq(rqp);
    969 		break;
    970 	}
    971 	return (0);
    972 }
    973 
    974 static void
    975 red_purgeq(rqp)
    976 	red_queue_t *rqp;
    977 {
    978 	_flushq(rqp->rq_q);
    979 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
    980 		rqp->rq_ifq->ifq_len = 0;
    981 }
    982 
    983 
    984 /*
    985  * helper routine to calibrate avg during idle.
    986  * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
    987  * here Wq = 1/weight and the code assumes Wq is close to zero.
    988  *
    989  * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
    990  */
    991 static struct wtab *wtab_list = NULL;	/* pointer to wtab list */
    992 
    993 struct wtab *
    994 wtab_alloc(weight)
    995 	int weight;
    996 {
    997 	struct wtab *w;
    998 	int i;
    999 
   1000 	for (w = wtab_list; w != NULL; w = w->w_next)
   1001 		if (w->w_weight == weight) {
   1002 			w->w_refcount++;
   1003 			return (w);
   1004 		}
   1005 
   1006 	w = malloc(sizeof(struct wtab), M_DEVBUF, M_WAITOK|M_ZERO);
   1007 	if (w == NULL)
   1008 		panic("wtab_alloc: malloc failed!");
   1009 	w->w_weight = weight;
   1010 	w->w_refcount = 1;
   1011 	w->w_next = wtab_list;
   1012 	wtab_list = w;
   1013 
   1014 	/* initialize the weight table */
   1015 	w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
   1016 	for (i = 1; i < 32; i++) {
   1017 		w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
   1018 		if (w->w_tab[i] == 0 && w->w_param_max == 0)
   1019 			w->w_param_max = 1 << i;
   1020 	}
   1021 
   1022 	return (w);
   1023 }
   1024 
   1025 int
   1026 wtab_destroy(w)
   1027 	struct wtab *w;
   1028 {
   1029 	struct wtab *prev;
   1030 
   1031 	if (--w->w_refcount > 0)
   1032 		return (0);
   1033 
   1034 	if (wtab_list == w)
   1035 		wtab_list = w->w_next;
   1036 	else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
   1037 		if (prev->w_next == w) {
   1038 			prev->w_next = w->w_next;
   1039 			break;
   1040 		}
   1041 
   1042 	free(w, M_DEVBUF);
   1043 	return (0);
   1044 }
   1045 
   1046 int32_t
   1047 pow_w(w, n)
   1048 	struct wtab *w;
   1049 	int n;
   1050 {
   1051 	int	i, bit;
   1052 	int32_t	val;
   1053 
   1054 	if (n >= w->w_param_max)
   1055 		return (0);
   1056 
   1057 	val = 1 << FP_SHIFT;
   1058 	if (n <= 0)
   1059 		return (val);
   1060 
   1061 	bit = 1;
   1062 	i = 0;
   1063 	while (n) {
   1064 		if (n & bit) {
   1065 			val = (val * w->w_tab[i]) >> FP_SHIFT;
   1066 			n &= ~bit;
   1067 		}
   1068 		i++;
   1069 		bit <<=  1;
   1070 	}
   1071 	return (val);
   1072 }
   1073 
   1074 #ifdef ALTQ_FLOWVALVE
   1075 
   1076 #define	FV_PSHIFT	7	/* weight of average drop rate -- 1/128 */
   1077 #define	FV_PSCALE(x)	((x) << FV_PSHIFT)
   1078 #define	FV_PUNSCALE(x)	((x) >> FV_PSHIFT)
   1079 #define	FV_FSHIFT	5	/* weight of average fraction -- 1/32 */
   1080 #define	FV_FSCALE(x)	((x) << FV_FSHIFT)
   1081 #define	FV_FUNSCALE(x)	((x) >> FV_FSHIFT)
   1082 
   1083 #define	FV_TIMER	(3 * hz)	/* timer value for garbage collector */
   1084 #define	FV_FLOWLISTSIZE		64	/* how many flows in flowlist */
   1085 
   1086 #define	FV_N			10	/* update fve_f every FV_N packets */
   1087 
   1088 #define	FV_BACKOFFTHRESH	1  /* backoff threshold interval in second */
   1089 #define	FV_TTHRESH		3  /* time threshold to delete fve */
   1090 #define	FV_ALPHA		5  /* extra packet count */
   1091 
   1092 #if (__FreeBSD_version > 300000)
   1093 #define	FV_TIMESTAMP(tp)	getmicrotime(tp)
   1094 #else
   1095 #define	FV_TIMESTAMP(tp)	{ (*(tp)) = time; }
   1096 #endif
   1097 
   1098 /*
   1099  * Brtt table: 127 entry table to convert drop rate (p) to
   1100  * the corresponding bandwidth fraction (f)
   1101  * the following equation is implemented to use scaled values,
   1102  * fve_p and fve_f, in the fixed point format.
   1103  *
   1104  *   Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p))
   1105  *   f = Brtt(p) / (max_th + alpha)
   1106  */
   1107 #define	BRTT_SIZE	128
   1108 #define	BRTT_SHIFT	12
   1109 #define	BRTT_MASK	0x0007f000
   1110 #define	BRTT_PMAX	(1 << (FV_PSHIFT + FP_SHIFT))
   1111 
   1112 const int brtt_tab[BRTT_SIZE] = {
   1113 	0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728,
   1114 	392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361,
   1115 	225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333,
   1116 	145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612,
   1117 	98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957,
   1118 	67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440,
   1119 	47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184,
   1120 	33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611,
   1121 	24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062,
   1122 	18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487,
   1123 	14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222,
   1124 	10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844,
   1125 	8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079,
   1126 	6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746,
   1127 	5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722,
   1128 	4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924
   1129 };
   1130 
   1131 static inline struct fve *
   1132 flowlist_lookup(fv, pktattr, now)
   1133 	struct flowvalve *fv;
   1134 	struct altq_pktattr *pktattr;
   1135 	struct timeval *now;
   1136 {
   1137 	struct fve *fve;
   1138 	int flows;
   1139 	struct ip *ip;
   1140 #ifdef INET6
   1141 	struct ip6_hdr *ip6;
   1142 #endif
   1143 	struct timeval tthresh;
   1144 
   1145 	if (pktattr == NULL)
   1146 		return (NULL);
   1147 
   1148 	tthresh.tv_sec = now->tv_sec - FV_TTHRESH;
   1149 	flows = 0;
   1150 	/*
   1151 	 * search the flow list
   1152 	 */
   1153 	switch (pktattr->pattr_af) {
   1154 	case AF_INET:
   1155 		ip = (struct ip *)pktattr->pattr_hdr;
   1156 		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
   1157 			if (fve->fve_lastdrop.tv_sec == 0)
   1158 				break;
   1159 			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
   1160 				fve->fve_lastdrop.tv_sec = 0;
   1161 				break;
   1162 			}
   1163 			if (fve->fve_flow.flow_af == AF_INET &&
   1164 			    fve->fve_flow.flow_ip.ip_src.s_addr ==
   1165 			    ip->ip_src.s_addr &&
   1166 			    fve->fve_flow.flow_ip.ip_dst.s_addr ==
   1167 			    ip->ip_dst.s_addr)
   1168 				return (fve);
   1169 			flows++;
   1170 		}
   1171 		break;
   1172 #ifdef INET6
   1173 	case AF_INET6:
   1174 		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
   1175 		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
   1176 			if (fve->fve_lastdrop.tv_sec == 0)
   1177 				break;
   1178 			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
   1179 				fve->fve_lastdrop.tv_sec = 0;
   1180 				break;
   1181 			}
   1182 			if (fve->fve_flow.flow_af == AF_INET6 &&
   1183 			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src,
   1184 					       &ip6->ip6_src) &&
   1185 			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst,
   1186 					       &ip6->ip6_dst))
   1187 				return (fve);
   1188 			flows++;
   1189 		}
   1190 		break;
   1191 #endif /* INET6 */
   1192 
   1193 	default:
   1194 		/* unknown protocol.  no drop. */
   1195 		return (NULL);
   1196 	}
   1197 	fv->fv_flows = flows;	/* save the number of active fve's */
   1198 	return (NULL);
   1199 }
   1200 
   1201 static inline struct fve *
   1202 flowlist_reclaim(fv, pktattr)
   1203 	struct flowvalve *fv;
   1204 	struct altq_pktattr *pktattr;
   1205 {
   1206 	struct fve *fve;
   1207 	struct ip *ip;
   1208 #ifdef INET6
   1209 	struct ip6_hdr *ip6;
   1210 #endif
   1211 
   1212 	/*
   1213 	 * get an entry from the tail of the LRU list.
   1214 	 */
   1215 	fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead);
   1216 
   1217 	switch (pktattr->pattr_af) {
   1218 	case AF_INET:
   1219 		ip = (struct ip *)pktattr->pattr_hdr;
   1220 		fve->fve_flow.flow_af = AF_INET;
   1221 		fve->fve_flow.flow_ip.ip_src = ip->ip_src;
   1222 		fve->fve_flow.flow_ip.ip_dst = ip->ip_dst;
   1223 		break;
   1224 #ifdef INET6
   1225 	case AF_INET6:
   1226 		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
   1227 		fve->fve_flow.flow_af = AF_INET6;
   1228 		fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src;
   1229 		fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst;
   1230 		break;
   1231 #endif
   1232 	}
   1233 
   1234 	fve->fve_state = Green;
   1235 	fve->fve_p = 0.0;
   1236 	fve->fve_f = 0.0;
   1237 	fve->fve_ifseq = fv->fv_ifseq - 1;
   1238 	fve->fve_count = 0;
   1239 
   1240 	fv->fv_flows++;
   1241 #ifdef FV_STATS
   1242 	fv->fv_stats.alloc++;
   1243 #endif
   1244 	return (fve);
   1245 }
   1246 
   1247 static inline void
   1248 flowlist_move_to_head(fv, fve)
   1249 	struct flowvalve *fv;
   1250 	struct fve *fve;
   1251 {
   1252 	if (TAILQ_FIRST(&fv->fv_flowlist) != fve) {
   1253 		TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru);
   1254 		TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru);
   1255 	}
   1256 }
   1257 
   1258 /*
   1259  * allocate flowvalve structure
   1260  */
   1261 static struct flowvalve *
   1262 fv_alloc(rp)
   1263 	struct red *rp;
   1264 {
   1265 	struct flowvalve *fv;
   1266 	struct fve *fve;
   1267 	int i, num;
   1268 
   1269 	num = FV_FLOWLISTSIZE;
   1270 	fv = malloc(sizeof(struct flowvalve), M_DEVBUF, M_WAITOK|M_ZERO);
   1271 	if (fv == NULL)
   1272 		return (NULL);
   1273 
   1274 	fv->fv_fves = malloc(sizeof(struct fve) * num, M_DEVBUF,
   1275 	    M_WAITOK|M_ZERO);
   1276 	if (fv->fv_fves == NULL) {
   1277 		free(fv, M_DEVBUF);
   1278 		return (NULL);
   1279 	}
   1280 
   1281 	fv->fv_flows = 0;
   1282 	TAILQ_INIT(&fv->fv_flowlist);
   1283 	for (i = 0; i < num; i++) {
   1284 		fve = &fv->fv_fves[i];
   1285 		fve->fve_lastdrop.tv_sec = 0;
   1286 		TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru);
   1287 	}
   1288 
   1289 	/* initialize drop rate threshold in scaled fixed-point */
   1290 	fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax;
   1291 
   1292 	/* initialize drop rate to fraction table */
   1293 	fv->fv_p2ftab = malloc(sizeof(int) * BRTT_SIZE, M_DEVBUF, M_WAITOK);
   1294 	if (fv->fv_p2ftab == NULL) {
   1295 		free(fv->fv_fves, M_DEVBUF);
   1296 		free(fv, M_DEVBUF);
   1297 		return (NULL);
   1298 	}
   1299 	/*
   1300 	 * create the p2f table.
   1301 	 * (shift is used to keep the precision)
   1302 	 */
   1303 	for (i = 1; i < BRTT_SIZE; i++) {
   1304 		int f;
   1305 
   1306 		f = brtt_tab[i] << 8;
   1307 		fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8;
   1308 	}
   1309 
   1310 	return (fv);
   1311 }
   1312 
   1313 static void fv_destroy(fv)
   1314 	struct flowvalve *fv;
   1315 {
   1316 	free(fv->fv_p2ftab, M_DEVBUF);
   1317 	free(fv->fv_fves, M_DEVBUF);
   1318 	free(fv, M_DEVBUF);
   1319 }
   1320 
   1321 static inline int
   1322 fv_p2f(fv, p)
   1323 	struct flowvalve	*fv;
   1324 	int	p;
   1325 {
   1326 	int val, f;
   1327 
   1328 	if (p >= BRTT_PMAX)
   1329 		f = fv->fv_p2ftab[BRTT_SIZE-1];
   1330 	else if ((val = (p & BRTT_MASK)))
   1331 		f = fv->fv_p2ftab[(val >> BRTT_SHIFT)];
   1332 	else
   1333 		f = fv->fv_p2ftab[1];
   1334 	return (f);
   1335 }
   1336 
   1337 /*
   1338  * check if an arriving packet should be pre-dropped.
   1339  * called from red_addq() when a packet arrives.
   1340  * returns 1 when the packet should be pre-dropped.
   1341  * should be called in splnet.
   1342  */
   1343 static int
   1344 fv_checkflow(fv, pktattr, fcache)
   1345 	struct flowvalve *fv;
   1346 	struct altq_pktattr *pktattr;
   1347 	struct fve **fcache;
   1348 {
   1349 	struct fve *fve;
   1350 	struct timeval now;
   1351 
   1352 	fv->fv_ifseq++;
   1353 	FV_TIMESTAMP(&now);
   1354 
   1355 	if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
   1356 		/* no matching entry in the flowlist */
   1357 		return (0);
   1358 
   1359 	*fcache = fve;
   1360 
   1361 	/* update fraction f for every FV_N packets */
   1362 	if (++fve->fve_count == FV_N) {
   1363 		/*
   1364 		 * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f
   1365 		 */
   1366 		fve->fve_f =
   1367 			(FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq)
   1368 			+ fve->fve_f - FV_FUNSCALE(fve->fve_f);
   1369 		fve->fve_ifseq = fv->fv_ifseq;
   1370 		fve->fve_count = 0;
   1371 	}
   1372 
   1373 	/*
   1374 	 * overpumping test
   1375 	 */
   1376 	if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) {
   1377 		int fthresh;
   1378 
   1379 		/* calculate a threshold */
   1380 		fthresh = fv_p2f(fv, fve->fve_p);
   1381 		if (fve->fve_f > fthresh)
   1382 			fve->fve_state = Red;
   1383 	}
   1384 
   1385 	if (fve->fve_state == Red) {
   1386 		/*
   1387 		 * backoff test
   1388 		 */
   1389 		if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) {
   1390 			/* no drop for at least FV_BACKOFFTHRESH sec */
   1391 			fve->fve_p = 0;
   1392 			fve->fve_state = Green;
   1393 #ifdef FV_STATS
   1394 			fv->fv_stats.escape++;
   1395 #endif
   1396 		} else {
   1397 			/* block this flow */
   1398 			flowlist_move_to_head(fv, fve);
   1399 			fve->fve_lastdrop = now;
   1400 #ifdef FV_STATS
   1401 			fv->fv_stats.predrop++;
   1402 #endif
   1403 			return (1);
   1404 		}
   1405 	}
   1406 
   1407 	/*
   1408 	 * p = (1 - Wp) * p
   1409 	 */
   1410 	fve->fve_p -= FV_PUNSCALE(fve->fve_p);
   1411 	if (fve->fve_p < 0)
   1412 		fve->fve_p = 0;
   1413 #ifdef FV_STATS
   1414 	fv->fv_stats.pass++;
   1415 #endif
   1416 	return (0);
   1417 }
   1418 
   1419 /*
   1420  * called from red_addq when a packet is dropped by red.
   1421  * should be called in splnet.
   1422  */
   1423 static void fv_dropbyred(fv, pktattr, fcache)
   1424 	struct flowvalve *fv;
   1425 	struct altq_pktattr *pktattr;
   1426 	struct fve *fcache;
   1427 {
   1428 	struct fve *fve;
   1429 	struct timeval now;
   1430 
   1431 	if (pktattr == NULL)
   1432 		return;
   1433 	FV_TIMESTAMP(&now);
   1434 
   1435 	if (fcache != NULL)
   1436 		/* the fve of this packet is already cached */
   1437 		fve = fcache;
   1438 	else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
   1439 		fve = flowlist_reclaim(fv, pktattr);
   1440 
   1441 	flowlist_move_to_head(fv, fve);
   1442 
   1443 	/*
   1444 	 * update p:  the following line cancels the update
   1445 	 *	      in fv_checkflow() and calculate
   1446 	 *	p = Wp + (1 - Wp) * p
   1447 	 */
   1448 	fve->fve_p = (1 << FP_SHIFT) + fve->fve_p;
   1449 
   1450 	fve->fve_lastdrop = now;
   1451 }
   1452 
   1453 #endif /* ALTQ_FLOWVALVE */
   1454 
   1455 #ifdef KLD_MODULE
   1456 
   1457 static struct altqsw red_sw =
   1458 	{"red", redopen, redclose, redioctl};
   1459 
   1460 ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw);
   1461 
   1462 #endif /* KLD_MODULE */
   1463 
   1464 #endif /* ALTQ_RED */
   1465