Home | History | Annotate | Line # | Download | only in altq
altq_red.c revision 1.13
      1 /*	$NetBSD: altq_red.c,v 1.13 2005/12/24 20:10:16 perry Exp $	*/
      2 /*	$KAME: altq_red.c,v 1.9 2002/01/07 11:25:40 kjc Exp $	*/
      3 
      4 /*
      5  * Copyright (C) 1997-2000
      6  *	Sony Computer Science Laboratories Inc.  All rights reserved.
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  * 1. Redistributions of source code must retain the above copyright
     12  *    notice, this list of conditions and the following disclaimer.
     13  * 2. Redistributions in binary form must reproduce the above copyright
     14  *    notice, this list of conditions and the following disclaimer in the
     15  *    documentation and/or other materials provided with the distribution.
     16  *
     17  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
     18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     20  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
     21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     27  * SUCH DAMAGE.
     28  *
     29  */
     30 /*
     31  * Copyright (c) 1990-1994 Regents of the University of California.
     32  * All rights reserved.
     33  *
     34  * Redistribution and use in source and binary forms, with or without
     35  * modification, are permitted provided that the following conditions
     36  * are met:
     37  * 1. Redistributions of source code must retain the above copyright
     38  *    notice, this list of conditions and the following disclaimer.
     39  * 2. Redistributions in binary form must reproduce the above copyright
     40  *    notice, this list of conditions and the following disclaimer in the
     41  *    documentation and/or other materials provided with the distribution.
     42  * 3. All advertising materials mentioning features or use of this software
     43  *    must display the following acknowledgement:
     44  *	This product includes software developed by the Computer Systems
     45  *	Engineering Group at Lawrence Berkeley Laboratory.
     46  * 4. Neither the name of the University nor of the Laboratory may be used
     47  *    to endorse or promote products derived from this software without
     48  *    specific prior written permission.
     49  *
     50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     60  * SUCH DAMAGE.
     61  */
     62 
     63 #include <sys/cdefs.h>
     64 __KERNEL_RCSID(0, "$NetBSD: altq_red.c,v 1.13 2005/12/24 20:10:16 perry Exp $");
     65 
     66 #if defined(__FreeBSD__) || defined(__NetBSD__)
     67 #include "opt_altq.h"
     68 #if (__FreeBSD__ != 2)
     69 #include "opt_inet.h"
     70 #ifdef __FreeBSD__
     71 #include "opt_inet6.h"
     72 #endif
     73 #endif
     74 #endif /* __FreeBSD__ || __NetBSD__ */
     75 #ifdef ALTQ_RED	/* red is enabled by ALTQ_RED option in opt_altq.h */
     76 
     77 #include <sys/param.h>
     78 #include <sys/malloc.h>
     79 #include <sys/mbuf.h>
     80 #include <sys/socket.h>
     81 #include <sys/sockio.h>
     82 #include <sys/systm.h>
     83 #include <sys/proc.h>
     84 #include <sys/errno.h>
     85 #include <sys/kernel.h>
     86 #ifdef ALTQ_FLOWVALVE
     87 #include <sys/queue.h>
     88 #include <sys/time.h>
     89 #endif
     90 
     91 #include <net/if.h>
     92 #include <net/if_types.h>
     93 
     94 #include <netinet/in.h>
     95 #include <netinet/in_systm.h>
     96 #include <netinet/ip.h>
     97 #ifdef INET6
     98 #include <netinet/ip6.h>
     99 #endif
    100 
    101 #include <altq/altq.h>
    102 #include <altq/altq_conf.h>
    103 #include <altq/altq_red.h>
    104 #ifdef ALTQ_FLOWVALVE
    105 #include <altq/altq_flowvalve.h>
    106 #endif
    107 
    108 /*
    109  * ALTQ/RED (Random Early Detection) implementation using 32-bit
    110  * fixed-point calculation.
    111  *
    112  * written by kjc using the ns code as a reference.
    113  * you can learn more about red and ns from Sally's home page at
    114  * http://www-nrg.ee.lbl.gov/floyd/
    115  *
    116  * most of the red parameter values are fixed in this implementation
    117  * to prevent fixed-point overflow/underflow.
    118  * if you change the parameters, watch out for overflow/underflow!
    119  *
    120  * the parameters used are recommended values by Sally.
    121  * the corresponding ns config looks:
    122  *	q_weight=0.00195
    123  *	minthresh=5 maxthresh=15 queue-size=60
    124  *	linterm=30
    125  *	dropmech=drop-tail
    126  *	bytes=false (can't be handled by 32-bit fixed-point)
    127  *	doubleq=false dqthresh=false
    128  *	wait=true
    129  */
    130 /*
    131  * alternative red parameters for a slow link.
    132  *
    133  * assume the queue length becomes from zero to L and keeps L, it takes
    134  * N packets for q_avg to reach 63% of L.
    135  * when q_weight is 0.002, N is about 500 packets.
    136  * for a slow link like dial-up, 500 packets takes more than 1 minute!
    137  * when q_weight is 0.008, N is about 127 packets.
    138  * when q_weight is 0.016, N is about 63 packets.
    139  * bursts of 50 packets are allowd for 0.002, bursts of 25 packets
    140  * are allowed for 0.016.
    141  * see Sally's paper for more details.
    142  */
    143 /* normal red parameters */
    144 #define	W_WEIGHT	512	/* inverse of weight of EWMA (511/512) */
    145 				/* q_weight = 0.00195 */
    146 
    147 /* red parameters for a slow link */
    148 #define	W_WEIGHT_1	128	/* inverse of weight of EWMA (127/128) */
    149 				/* q_weight = 0.0078125 */
    150 
    151 /* red parameters for a very slow link (e.g., dialup) */
    152 #define	W_WEIGHT_2	64	/* inverse of weight of EWMA (63/64) */
    153 				/* q_weight = 0.015625 */
    154 
    155 /* fixed-point uses 12-bit decimal places */
    156 #define	FP_SHIFT	12	/* fixed-point shift */
    157 
    158 /* red parameters for drop probability */
    159 #define	INV_P_MAX	10	/* inverse of max drop probability */
    160 #define	TH_MIN		5	/* min threshold */
    161 #define	TH_MAX		15	/* max threshold */
    162 
    163 #define	RED_LIMIT	60	/* default max queue length */
    164 
    165 /*
    166  * our default policy for forced-drop is drop-tail.
    167  * (in altq-1.1.2 or earlier, the default was random-drop.
    168  * but it makes more sense to punish the cause of the surge.)
    169  * to switch to the random-drop policy, define "RED_RANDOM_DROP".
    170  */
    171 
    172 #ifdef ALTQ_FLOWVALVE
    173 /*
    174  * flow-valve is an extension to protect red from unresponsive flows
    175  * and to promote end-to-end congestion control.
    176  * flow-valve observes the average drop rates of the flows that have
    177  * experienced packet drops in the recent past.
    178  * when the average drop rate exceeds the threshold, the flow is
    179  * blocked by the flow-valve.  the trapped flow should back off
    180  * exponentially to escape from the flow-valve.
    181  */
    182 #ifdef RED_RANDOM_DROP
    183 #error "random-drop can't be used with flow-valve!"
    184 #endif
    185 #endif /* ALTQ_FLOWVALVE */
    186 
    187 /* red_list keeps all red_queue_t's allocated. */
    188 static red_queue_t *red_list = NULL;
    189 
    190 /* default red parameter values */
    191 static int default_th_min = TH_MIN;
    192 static int default_th_max = TH_MAX;
    193 static int default_inv_pmax = INV_P_MAX;
    194 
    195 /* internal function prototypes */
    196 static int red_enqueue __P((struct ifaltq *, struct mbuf *,
    197 			    struct altq_pktattr *));
    198 static struct mbuf *red_dequeue __P((struct ifaltq *, int));
    199 static int red_request __P((struct ifaltq *, int, void *));
    200 static void red_purgeq __P((red_queue_t *));
    201 static int red_detach __P((red_queue_t *));
    202 #ifdef ALTQ_FLOWVALVE
    203 static inline struct fve *flowlist_lookup __P((struct flowvalve *,
    204 			 struct altq_pktattr *, struct timeval *));
    205 static inline struct fve *flowlist_reclaim __P((struct flowvalve *,
    206 						  struct altq_pktattr *));
    207 static inline void flowlist_move_to_head __P((struct flowvalve *,
    208 						struct fve *));
    209 static inline int fv_p2f __P((struct flowvalve *, int));
    210 static struct flowvalve *fv_alloc __P((struct red *));
    211 static void fv_destroy __P((struct flowvalve *));
    212 static int fv_checkflow __P((struct flowvalve *, struct altq_pktattr *,
    213 			     struct fve **));
    214 static void fv_dropbyred __P((struct flowvalve *fv, struct altq_pktattr *,
    215 			      struct fve *));
    216 #endif
    217 
    218 /*
    219  * red device interface
    220  */
    221 altqdev_decl(red);
    222 
    223 int
    224 redopen(dev, flag, fmt, l)
    225 	dev_t dev;
    226 	int flag, fmt;
    227 	struct lwp *l;
    228 {
    229 	/* everything will be done when the queueing scheme is attached. */
    230 	return 0;
    231 }
    232 
    233 int
    234 redclose(dev, flag, fmt, l)
    235 	dev_t dev;
    236 	int flag, fmt;
    237 	struct lwp *l;
    238 {
    239 	red_queue_t *rqp;
    240 	int err, error = 0;
    241 
    242 	while ((rqp = red_list) != NULL) {
    243 		/* destroy all */
    244 		err = red_detach(rqp);
    245 		if (err != 0 && error == 0)
    246 			error = err;
    247 	}
    248 
    249 	return error;
    250 }
    251 
    252 int
    253 redioctl(dev, cmd, addr, flag, l)
    254 	dev_t dev;
    255 	ioctlcmd_t cmd;
    256 	caddr_t addr;
    257 	int flag;
    258 	struct lwp *l;
    259 {
    260 	red_queue_t *rqp;
    261 	struct red_interface *ifacep;
    262 	struct ifnet *ifp;
    263 	struct proc *p = l->l_proc;
    264 	int	error = 0;
    265 
    266 	/* check super-user privilege */
    267 	switch (cmd) {
    268 	case RED_GETSTATS:
    269 		break;
    270 	default:
    271 #if (__FreeBSD_version > 400000)
    272 		if ((error = suser(p)) != 0)
    273 #else
    274 		if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
    275 #endif
    276 			return (error);
    277 		break;
    278 	}
    279 
    280 	switch (cmd) {
    281 
    282 	case RED_ENABLE:
    283 		ifacep = (struct red_interface *)addr;
    284 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
    285 			error = EBADF;
    286 			break;
    287 		}
    288 		error = altq_enable(rqp->rq_ifq);
    289 		break;
    290 
    291 	case RED_DISABLE:
    292 		ifacep = (struct red_interface *)addr;
    293 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
    294 			error = EBADF;
    295 			break;
    296 		}
    297 		error = altq_disable(rqp->rq_ifq);
    298 		break;
    299 
    300 	case RED_IF_ATTACH:
    301 		ifp = ifunit(((struct red_interface *)addr)->red_ifname);
    302 		if (ifp == NULL) {
    303 			error = ENXIO;
    304 			break;
    305 		}
    306 
    307 		/* allocate and initialize red_queue_t */
    308 		MALLOC(rqp, red_queue_t *, sizeof(red_queue_t), M_DEVBUF, M_WAITOK);
    309 		if (rqp == NULL) {
    310 			error = ENOMEM;
    311 			break;
    312 		}
    313 		(void)memset(rqp, 0, sizeof(red_queue_t));
    314 
    315 		MALLOC(rqp->rq_q, class_queue_t *, sizeof(class_queue_t),
    316 		       M_DEVBUF, M_WAITOK);
    317 		if (rqp->rq_q == NULL) {
    318 			FREE(rqp, M_DEVBUF);
    319 			error = ENOMEM;
    320 			break;
    321 		}
    322 		(void)memset(rqp->rq_q, 0, sizeof(class_queue_t));
    323 
    324 		rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0);
    325 		if (rqp->rq_red == NULL) {
    326 			FREE(rqp->rq_q, M_DEVBUF);
    327 			FREE(rqp, M_DEVBUF);
    328 			error = ENOMEM;
    329 			break;
    330 		}
    331 
    332 		rqp->rq_ifq = &ifp->if_snd;
    333 		qtail(rqp->rq_q) = NULL;
    334 		qlen(rqp->rq_q) = 0;
    335 		qlimit(rqp->rq_q) = RED_LIMIT;
    336 		qtype(rqp->rq_q) = Q_RED;
    337 
    338 		/*
    339 		 * set RED to this ifnet structure.
    340 		 */
    341 		error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp,
    342 				    red_enqueue, red_dequeue, red_request,
    343 				    NULL, NULL);
    344 		if (error) {
    345 			red_destroy(rqp->rq_red);
    346 			FREE(rqp->rq_q, M_DEVBUF);
    347 			FREE(rqp, M_DEVBUF);
    348 			break;
    349 		}
    350 
    351 		/* add this state to the red list */
    352 		rqp->rq_next = red_list;
    353 		red_list = rqp;
    354 		break;
    355 
    356 	case RED_IF_DETACH:
    357 		ifacep = (struct red_interface *)addr;
    358 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
    359 			error = EBADF;
    360 			break;
    361 		}
    362 		error = red_detach(rqp);
    363 		break;
    364 
    365 	case RED_GETSTATS:
    366 		do {
    367 			struct red_stats *q_stats;
    368 			red_t *rp;
    369 
    370 			q_stats = (struct red_stats *)addr;
    371 			if ((rqp = altq_lookup(q_stats->iface.red_ifname,
    372 					     ALTQT_RED)) == NULL) {
    373 				error = EBADF;
    374 				break;
    375 			}
    376 
    377 			q_stats->q_len 	   = qlen(rqp->rq_q);
    378 			q_stats->q_limit   = qlimit(rqp->rq_q);
    379 
    380 			rp = rqp->rq_red;
    381 			q_stats->q_avg 	   = rp->red_avg >> rp->red_wshift;
    382 			q_stats->xmit_cnt  = rp->red_stats.xmit_cnt;
    383 			q_stats->drop_cnt  = rp->red_stats.drop_cnt;
    384 			q_stats->drop_forced   = rp->red_stats.drop_forced;
    385 			q_stats->drop_unforced = rp->red_stats.drop_unforced;
    386 			q_stats->marked_packets = rp->red_stats.marked_packets;
    387 
    388 			q_stats->weight		= rp->red_weight;
    389 			q_stats->inv_pmax	= rp->red_inv_pmax;
    390 			q_stats->th_min		= rp->red_thmin;
    391 			q_stats->th_max		= rp->red_thmax;
    392 
    393 #ifdef ALTQ_FLOWVALVE
    394 			if (rp->red_flowvalve != NULL) {
    395 				struct flowvalve *fv = rp->red_flowvalve;
    396 				q_stats->fv_flows    = fv->fv_flows;
    397 				q_stats->fv_pass     = fv->fv_stats.pass;
    398 				q_stats->fv_predrop  = fv->fv_stats.predrop;
    399 				q_stats->fv_alloc    = fv->fv_stats.alloc;
    400 				q_stats->fv_escape   = fv->fv_stats.escape;
    401 			} else {
    402 #endif /* ALTQ_FLOWVALVE */
    403 				q_stats->fv_flows    = 0;
    404 				q_stats->fv_pass     = 0;
    405 				q_stats->fv_predrop  = 0;
    406 				q_stats->fv_alloc    = 0;
    407 				q_stats->fv_escape   = 0;
    408 #ifdef ALTQ_FLOWVALVE
    409 			}
    410 #endif /* ALTQ_FLOWVALVE */
    411 		} while (0);
    412 		break;
    413 
    414 	case RED_CONFIG:
    415 		do {
    416 			struct red_conf *fc;
    417 			red_t *new;
    418 			int s, limit;
    419 
    420 			fc = (struct red_conf *)addr;
    421 			if ((rqp = altq_lookup(fc->iface.red_ifname,
    422 					       ALTQT_RED)) == NULL) {
    423 				error = EBADF;
    424 				break;
    425 			}
    426 			new = red_alloc(fc->red_weight,
    427 					fc->red_inv_pmax,
    428 					fc->red_thmin,
    429 					fc->red_thmax,
    430 					fc->red_flags,
    431 					fc->red_pkttime);
    432 			if (new == NULL) {
    433 				error = ENOMEM;
    434 				break;
    435 			}
    436 
    437 			s = splnet();
    438 			red_purgeq(rqp);
    439 			limit = fc->red_limit;
    440 			if (limit < fc->red_thmax)
    441 				limit = fc->red_thmax;
    442 			qlimit(rqp->rq_q) = limit;
    443 			fc->red_limit = limit;	/* write back the new value */
    444 
    445 			red_destroy(rqp->rq_red);
    446 			rqp->rq_red = new;
    447 
    448 			splx(s);
    449 
    450 			/* write back new values */
    451 			fc->red_limit = limit;
    452 			fc->red_inv_pmax = rqp->rq_red->red_inv_pmax;
    453 			fc->red_thmin = rqp->rq_red->red_thmin;
    454 			fc->red_thmax = rqp->rq_red->red_thmax;
    455 
    456 		} while (0);
    457 		break;
    458 
    459 	case RED_SETDEFAULTS:
    460 		do {
    461 			struct redparams *rp;
    462 
    463 			rp = (struct redparams *)addr;
    464 
    465 			default_th_min = rp->th_min;
    466 			default_th_max = rp->th_max;
    467 			default_inv_pmax = rp->inv_pmax;
    468 		} while (0);
    469 		break;
    470 
    471 	default:
    472 		error = EINVAL;
    473 		break;
    474 	}
    475 	return error;
    476 }
    477 
    478 static int
    479 red_detach(rqp)
    480 	red_queue_t *rqp;
    481 {
    482 	red_queue_t *tmp;
    483 	int error = 0;
    484 
    485 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
    486 		altq_disable(rqp->rq_ifq);
    487 
    488 	if ((error = altq_detach(rqp->rq_ifq)))
    489 		return (error);
    490 
    491 	if (red_list == rqp)
    492 		red_list = rqp->rq_next;
    493 	else {
    494 		for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next)
    495 			if (tmp->rq_next == rqp) {
    496 				tmp->rq_next = rqp->rq_next;
    497 				break;
    498 			}
    499 		if (tmp == NULL)
    500 			printf("red_detach: no state found in red_list!\n");
    501 	}
    502 
    503 	red_destroy(rqp->rq_red);
    504 	FREE(rqp->rq_q, M_DEVBUF);
    505 	FREE(rqp, M_DEVBUF);
    506 	return (error);
    507 }
    508 
    509 /*
    510  * red support routines
    511  */
    512 
    513 red_t *
    514 red_alloc(weight, inv_pmax, th_min, th_max, flags, pkttime)
    515 	int	weight, inv_pmax, th_min, th_max;
    516 	int	flags, pkttime;
    517 {
    518 	red_t 	*rp;
    519 	int	w, i;
    520 	int	npkts_per_sec;
    521 
    522 	MALLOC(rp, red_t *, sizeof(red_t), M_DEVBUF, M_WAITOK);
    523 	if (rp == NULL)
    524 		return (NULL);
    525 	(void)memset(rp, 0, sizeof(red_t));
    526 
    527 	rp->red_avg = 0;
    528 	rp->red_idle = 1;
    529 
    530 	if (weight == 0)
    531 		rp->red_weight = W_WEIGHT;
    532 	else
    533 		rp->red_weight = weight;
    534 	if (inv_pmax == 0)
    535 		rp->red_inv_pmax = default_inv_pmax;
    536 	else
    537 		rp->red_inv_pmax = inv_pmax;
    538 	if (th_min == 0)
    539 		rp->red_thmin = default_th_min;
    540 	else
    541 		rp->red_thmin = th_min;
    542 	if (th_max == 0)
    543 		rp->red_thmax = default_th_max;
    544 	else
    545 		rp->red_thmax = th_max;
    546 
    547 	rp->red_flags = flags;
    548 
    549 	if (pkttime == 0)
    550 		/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
    551 		rp->red_pkttime = 800;
    552 	else
    553 		rp->red_pkttime = pkttime;
    554 
    555 	if (weight == 0) {
    556 		/* when the link is very slow, adjust red parameters */
    557 		npkts_per_sec = 1000000 / rp->red_pkttime;
    558 		if (npkts_per_sec < 50) {
    559 			/* up to about 400Kbps */
    560 			rp->red_weight = W_WEIGHT_2;
    561 		} else if (npkts_per_sec < 300) {
    562 			/* up to about 2.4Mbps */
    563 			rp->red_weight = W_WEIGHT_1;
    564 		}
    565 	}
    566 
    567 	/* calculate wshift.  weight must be power of 2 */
    568 	w = rp->red_weight;
    569 	for (i = 0; w > 1; i++)
    570 		w = w >> 1;
    571 	rp->red_wshift = i;
    572 	w = 1 << rp->red_wshift;
    573 	if (w != rp->red_weight) {
    574 		printf("invalid weight value %d for red! use %d\n",
    575 		       rp->red_weight, w);
    576 		rp->red_weight = w;
    577 	}
    578 
    579 	/*
    580 	 * thmin_s and thmax_s are scaled versions of th_min and th_max
    581 	 * to be compared with avg.
    582 	 */
    583 	rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
    584 	rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
    585 
    586 	/*
    587 	 * precompute probability denominator
    588 	 *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
    589 	 */
    590 	rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
    591 			 * rp->red_inv_pmax) << FP_SHIFT;
    592 
    593 	/* allocate weight table */
    594 	rp->red_wtab = wtab_alloc(rp->red_weight);
    595 
    596 	microtime(&rp->red_last);
    597 #ifdef ALTQ_FLOWVALVE
    598 	if (flags & REDF_FLOWVALVE)
    599 		rp->red_flowvalve = fv_alloc(rp);
    600 	/* if fv_alloc failes, flowvalve is just disabled */
    601 #endif
    602 	return (rp);
    603 }
    604 
    605 void
    606 red_destroy(rp)
    607 	red_t *rp;
    608 {
    609 #ifdef ALTQ_FLOWVALVE
    610 	if (rp->red_flowvalve != NULL)
    611 		fv_destroy(rp->red_flowvalve);
    612 #endif
    613 	wtab_destroy(rp->red_wtab);
    614 	FREE(rp, M_DEVBUF);
    615 }
    616 
    617 void
    618 red_getstats(rp, sp)
    619 	red_t *rp;
    620 	struct redstats *sp;
    621 {
    622 	sp->q_avg 		= rp->red_avg >> rp->red_wshift;
    623 	sp->xmit_cnt		= rp->red_stats.xmit_cnt;
    624 	sp->drop_cnt		= rp->red_stats.drop_cnt;
    625 	sp->drop_forced		= rp->red_stats.drop_forced;
    626 	sp->drop_unforced	= rp->red_stats.drop_unforced;
    627 	sp->marked_packets	= rp->red_stats.marked_packets;
    628 }
    629 
    630 /*
    631  * enqueue routine:
    632  *
    633  *	returns: 0 when successfully queued.
    634  *		 ENOBUFS when drop occurs.
    635  */
    636 static int
    637 red_enqueue(ifq, m, pktattr)
    638 	struct ifaltq *ifq;
    639 	struct mbuf *m;
    640 	struct altq_pktattr *pktattr;
    641 {
    642 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
    643 
    644 	if (red_addq(rqp->rq_red, rqp->rq_q, m, pktattr) < 0)
    645 		return ENOBUFS;
    646 	ifq->ifq_len++;
    647 	return 0;
    648 }
    649 
    650 int
    651 red_addq(rp, q, m, pktattr)
    652 	red_t *rp;
    653 	class_queue_t *q;
    654 	struct mbuf *m;
    655 	struct altq_pktattr *pktattr;
    656 {
    657 	int avg, droptype;
    658 	int n;
    659 #ifdef ALTQ_FLOWVALVE
    660 	struct fve *fve = NULL;
    661 
    662 	if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0)
    663 		if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) {
    664 			m_freem(m);
    665 			return (-1);
    666 		}
    667 #endif
    668 
    669 	avg = rp->red_avg;
    670 
    671 	/*
    672 	 * if we were idle, we pretend that n packets arrived during
    673 	 * the idle period.
    674 	 */
    675 	if (rp->red_idle) {
    676 		struct timeval now;
    677 		int t;
    678 
    679 		rp->red_idle = 0;
    680 		microtime(&now);
    681 		t = (now.tv_sec - rp->red_last.tv_sec);
    682 		if (t > 60) {
    683 			/*
    684 			 * being idle for more than 1 minute, set avg to zero.
    685 			 * this prevents t from overflow.
    686 			 */
    687 			avg = 0;
    688 		} else {
    689 			t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
    690 			n = t / rp->red_pkttime - 1;
    691 
    692 			/* the following line does (avg = (1 - Wq)^n * avg) */
    693 			if (n > 0)
    694 				avg = (avg >> FP_SHIFT) *
    695 				    pow_w(rp->red_wtab, n);
    696 		}
    697 	}
    698 
    699 	/* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
    700 	avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
    701 	rp->red_avg = avg;		/* save the new value */
    702 
    703 	/*
    704 	 * red_count keeps a tally of arriving traffic that has not
    705 	 * been dropped.
    706 	 */
    707 	rp->red_count++;
    708 
    709 	/* see if we drop early */
    710 	droptype = DTYPE_NODROP;
    711 	if (avg >= rp->red_thmin_s && qlen(q) > 1) {
    712 		if (avg >= rp->red_thmax_s) {
    713 			/* avg >= th_max: forced drop */
    714 			droptype = DTYPE_FORCED;
    715 		} else if (rp->red_old == 0) {
    716 			/* first exceeds th_min */
    717 			rp->red_count = 1;
    718 			rp->red_old = 1;
    719 		} else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
    720 				      rp->red_probd, rp->red_count)) {
    721 			/* mark or drop by red */
    722 			if ((rp->red_flags & REDF_ECN) &&
    723 			    mark_ecn(m, pktattr, rp->red_flags)) {
    724 				/* successfully marked.  do not drop. */
    725 				rp->red_count = 0;
    726 #ifdef RED_STATS
    727 				rp->red_stats.marked_packets++;
    728 #endif
    729 			} else {
    730 				/* unforced drop by red */
    731 				droptype = DTYPE_EARLY;
    732 			}
    733 		}
    734 	} else {
    735 		/* avg < th_min */
    736 		rp->red_old = 0;
    737 	}
    738 
    739 	/*
    740 	 * if the queue length hits the hard limit, it's a forced drop.
    741 	 */
    742 	if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
    743 		droptype = DTYPE_FORCED;
    744 
    745 #ifdef RED_RANDOM_DROP
    746 	/* if successful or forced drop, enqueue this packet. */
    747 	if (droptype != DTYPE_EARLY)
    748 		_addq(q, m);
    749 #else
    750 	/* if successful, enqueue this packet. */
    751 	if (droptype == DTYPE_NODROP)
    752 		_addq(q, m);
    753 #endif
    754 	if (droptype != DTYPE_NODROP) {
    755 		if (droptype == DTYPE_EARLY) {
    756 			/* drop the incoming packet */
    757 #ifdef RED_STATS
    758 			rp->red_stats.drop_unforced++;
    759 #endif
    760 		} else {
    761 			/* forced drop, select a victim packet in the queue. */
    762 #ifdef RED_RANDOM_DROP
    763 			m = _getq_random(q);
    764 #endif
    765 #ifdef RED_STATS
    766 			rp->red_stats.drop_forced++;
    767 #endif
    768 		}
    769 #ifdef RED_STATS
    770 		PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
    771 #endif
    772 		rp->red_count = 0;
    773 #ifdef ALTQ_FLOWVALVE
    774 		if (rp->red_flowvalve != NULL)
    775 			fv_dropbyred(rp->red_flowvalve, pktattr, fve);
    776 #endif
    777 		m_freem(m);
    778 		return (-1);
    779 	}
    780 	/* successfully queued */
    781 #ifdef RED_STATS
    782 	PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
    783 #endif
    784 	return (0);
    785 }
    786 
    787 /*
    788  * early-drop probability is calculated as follows:
    789  *   prob = p_max * (avg - th_min) / (th_max - th_min)
    790  *   prob_a = prob / (2 - count*prob)
    791  *	    = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
    792  * here prob_a increases as successive undrop count increases.
    793  * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
    794  * becomes 1 when (count >= (2 / prob))).
    795  */
    796 int
    797 drop_early(fp_len, fp_probd, count)
    798 	int fp_len;	/* (avg - TH_MIN) in fixed-point */
    799 	int fp_probd;	/* (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point */
    800 	int count;	/* how many successive undropped packets */
    801 {
    802 	int d;		/* denominator of drop-probability */
    803 
    804 	d = fp_probd - count * fp_len;
    805 	if (d <= 0)
    806 		/* count exceeds the hard limit: drop or mark */
    807 		return (1);
    808 
    809 	/*
    810 	 * now the range of d is [1..600] in fixed-point. (when
    811 	 * th_max-th_min=10 and p_max=1/30)
    812 	 * drop probability = (avg - TH_MIN) / d
    813 	 */
    814 
    815 	if ((random() % d) < fp_len) {
    816 		/* drop or mark */
    817 		return (1);
    818 	}
    819 	/* no drop/mark */
    820 	return (0);
    821 }
    822 
    823 /*
    824  * try to mark CE bit to the packet.
    825  *    returns 1 if successfully marked, 0 otherwise.
    826  */
    827 int
    828 mark_ecn(m, pktattr, flags)
    829 	struct mbuf *m;
    830 	struct altq_pktattr *pktattr;
    831 	int flags;
    832 {
    833 	struct mbuf *m0;
    834 
    835 	if (pktattr == NULL ||
    836 	    (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6))
    837 		return (0);
    838 
    839 	/* verify that pattr_hdr is within the mbuf data */
    840 	for (m0 = m; m0 != NULL; m0 = m0->m_next)
    841 		if ((pktattr->pattr_hdr >= m0->m_data) &&
    842 		    (pktattr->pattr_hdr < m0->m_data + m0->m_len))
    843 			break;
    844 	if (m0 == NULL) {
    845 		/* ick, pattr_hdr is stale */
    846 		pktattr->pattr_af = AF_UNSPEC;
    847 		return (0);
    848 	}
    849 
    850 	switch (pktattr->pattr_af) {
    851 	case AF_INET:
    852 		if (flags & REDF_ECN4) {
    853 			struct ip *ip = (struct ip *)pktattr->pattr_hdr;
    854 			u_int8_t otos;
    855 			int sum;
    856 
    857 			if (ip->ip_v != 4)
    858 				return (0);	/* version mismatch! */
    859 
    860 			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
    861 				return (0);	/* not-ECT */
    862 			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
    863 				return (1);	/* already marked */
    864 
    865 			/*
    866 			 * ecn-capable but not marked,
    867 			 * mark CE and update checksum
    868 			 */
    869 			otos = ip->ip_tos;
    870 			ip->ip_tos |= IPTOS_ECN_CE;
    871 			/*
    872 			 * update checksum (from RFC1624)
    873 			 *	   HC' = ~(~HC + ~m + m')
    874 			 */
    875 			sum = ~ntohs(ip->ip_sum) & 0xffff;
    876 			sum += (~otos & 0xffff) + ip->ip_tos;
    877 			sum = (sum >> 16) + (sum & 0xffff);
    878 			sum += (sum >> 16);  /* add carry */
    879 			ip->ip_sum = htons(~sum & 0xffff);
    880 			return (1);
    881 		}
    882 		break;
    883 #ifdef INET6
    884 	case AF_INET6:
    885 		if (flags & REDF_ECN6) {
    886 			struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
    887 			u_int32_t flowlabel;
    888 
    889 			flowlabel = ntohl(ip6->ip6_flow);
    890 			if ((flowlabel >> 28) != 6)
    891 				return (0);	/* version mismatch! */
    892 			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
    893 			    (IPTOS_ECN_NOTECT << 20))
    894 				return (0);	/* not-ECT */
    895 			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
    896 			    (IPTOS_ECN_CE << 20))
    897 				return (1);	/* already marked */
    898 			/*
    899 			 * ecn-capable but not marked,  mark CE
    900 			 */
    901 			flowlabel |= (IPTOS_ECN_CE << 20);
    902 			ip6->ip6_flow = htonl(flowlabel);
    903 			return (1);
    904 		}
    905 		break;
    906 #endif  /* INET6 */
    907 	}
    908 
    909 	/* not marked */
    910 	return (0);
    911 }
    912 
    913 /*
    914  * dequeue routine:
    915  *	must be called in splnet.
    916  *
    917  *	returns: mbuf dequeued.
    918  *		 NULL when no packet is available in the queue.
    919  */
    920 
    921 static struct mbuf *
    922 red_dequeue(ifq, op)
    923 	struct ifaltq *ifq;
    924 	int op;
    925 {
    926 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
    927 	struct mbuf *m;
    928 
    929 	if (op == ALTDQ_POLL)
    930 		return qhead(rqp->rq_q);
    931 
    932 	/* op == ALTDQ_REMOVE */
    933 	m =  red_getq(rqp->rq_red, rqp->rq_q);
    934 	if (m != NULL)
    935 		ifq->ifq_len--;
    936 	return (m);
    937 }
    938 
    939 struct mbuf *
    940 red_getq(rp, q)
    941 	red_t *rp;
    942 	class_queue_t *q;
    943 {
    944 	struct mbuf *m;
    945 
    946 	if ((m = _getq(q)) == NULL) {
    947 		if (rp->red_idle == 0) {
    948 			rp->red_idle = 1;
    949 			microtime(&rp->red_last);
    950 		}
    951 		return NULL;
    952 	}
    953 
    954 	rp->red_idle = 0;
    955 	return (m);
    956 }
    957 
    958 static int
    959 red_request(ifq, req, arg)
    960 	struct ifaltq *ifq;
    961 	int req;
    962 	void *arg;
    963 {
    964 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
    965 
    966 	switch (req) {
    967 	case ALTRQ_PURGE:
    968 		red_purgeq(rqp);
    969 		break;
    970 	}
    971 	return (0);
    972 }
    973 
    974 static void
    975 red_purgeq(rqp)
    976 	red_queue_t *rqp;
    977 {
    978 	_flushq(rqp->rq_q);
    979 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
    980 		rqp->rq_ifq->ifq_len = 0;
    981 }
    982 
    983 
    984 /*
    985  * helper routine to calibrate avg during idle.
    986  * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
    987  * here Wq = 1/weight and the code assumes Wq is close to zero.
    988  *
    989  * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
    990  */
    991 static struct wtab *wtab_list = NULL;	/* pointer to wtab list */
    992 
    993 struct wtab *
    994 wtab_alloc(weight)
    995 	int weight;
    996 {
    997 	struct wtab *w;
    998 	int i;
    999 
   1000 	for (w = wtab_list; w != NULL; w = w->w_next)
   1001 		if (w->w_weight == weight) {
   1002 			w->w_refcount++;
   1003 			return (w);
   1004 		}
   1005 
   1006 	MALLOC(w, struct wtab *, sizeof(struct wtab), M_DEVBUF, M_WAITOK);
   1007 	if (w == NULL)
   1008 		panic("wtab_alloc: malloc failed!");
   1009 	(void)memset(w, 0, sizeof(struct wtab));
   1010 	w->w_weight = weight;
   1011 	w->w_refcount = 1;
   1012 	w->w_next = wtab_list;
   1013 	wtab_list = w;
   1014 
   1015 	/* initialize the weight table */
   1016 	w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
   1017 	for (i = 1; i < 32; i++) {
   1018 		w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
   1019 		if (w->w_tab[i] == 0 && w->w_param_max == 0)
   1020 			w->w_param_max = 1 << i;
   1021 	}
   1022 
   1023 	return (w);
   1024 }
   1025 
   1026 int
   1027 wtab_destroy(w)
   1028 	struct wtab *w;
   1029 {
   1030 	struct wtab *prev;
   1031 
   1032 	if (--w->w_refcount > 0)
   1033 		return (0);
   1034 
   1035 	if (wtab_list == w)
   1036 		wtab_list = w->w_next;
   1037 	else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
   1038 		if (prev->w_next == w) {
   1039 			prev->w_next = w->w_next;
   1040 			break;
   1041 		}
   1042 
   1043 	FREE(w, M_DEVBUF);
   1044 	return (0);
   1045 }
   1046 
   1047 int32_t
   1048 pow_w(w, n)
   1049 	struct wtab *w;
   1050 	int n;
   1051 {
   1052 	int	i, bit;
   1053 	int32_t	val;
   1054 
   1055 	if (n >= w->w_param_max)
   1056 		return (0);
   1057 
   1058 	val = 1 << FP_SHIFT;
   1059 	if (n <= 0)
   1060 		return (val);
   1061 
   1062 	bit = 1;
   1063 	i = 0;
   1064 	while (n) {
   1065 		if (n & bit) {
   1066 			val = (val * w->w_tab[i]) >> FP_SHIFT;
   1067 			n &= ~bit;
   1068 		}
   1069 		i++;
   1070 		bit <<=  1;
   1071 	}
   1072 	return (val);
   1073 }
   1074 
   1075 #ifdef ALTQ_FLOWVALVE
   1076 
   1077 #define	FV_PSHIFT	7	/* weight of average drop rate -- 1/128 */
   1078 #define	FV_PSCALE(x)	((x) << FV_PSHIFT)
   1079 #define	FV_PUNSCALE(x)	((x) >> FV_PSHIFT)
   1080 #define	FV_FSHIFT	5	/* weight of average fraction -- 1/32 */
   1081 #define	FV_FSCALE(x)	((x) << FV_FSHIFT)
   1082 #define	FV_FUNSCALE(x)	((x) >> FV_FSHIFT)
   1083 
   1084 #define	FV_TIMER	(3 * hz)	/* timer value for garbage collector */
   1085 #define	FV_FLOWLISTSIZE		64	/* how many flows in flowlist */
   1086 
   1087 #define	FV_N			10	/* update fve_f every FV_N packets */
   1088 
   1089 #define	FV_BACKOFFTHRESH	1  /* backoff threshold interval in second */
   1090 #define	FV_TTHRESH		3  /* time threshold to delete fve */
   1091 #define	FV_ALPHA		5  /* extra packet count */
   1092 
   1093 #if (__FreeBSD_version > 300000)
   1094 #define	FV_TIMESTAMP(tp)	getmicrotime(tp)
   1095 #else
   1096 #define	FV_TIMESTAMP(tp)	{ (*(tp)) = time; }
   1097 #endif
   1098 
   1099 /*
   1100  * Brtt table: 127 entry table to convert drop rate (p) to
   1101  * the corresponding bandwidth fraction (f)
   1102  * the following equation is implemented to use scaled values,
   1103  * fve_p and fve_f, in the fixed point format.
   1104  *
   1105  *   Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p))
   1106  *   f = Brtt(p) / (max_th + alpha)
   1107  */
   1108 #define	BRTT_SIZE	128
   1109 #define	BRTT_SHIFT	12
   1110 #define	BRTT_MASK	0x0007f000
   1111 #define	BRTT_PMAX	(1 << (FV_PSHIFT + FP_SHIFT))
   1112 
   1113 const int brtt_tab[BRTT_SIZE] = {
   1114 	0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728,
   1115 	392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361,
   1116 	225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333,
   1117 	145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612,
   1118 	98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957,
   1119 	67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440,
   1120 	47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184,
   1121 	33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611,
   1122 	24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062,
   1123 	18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487,
   1124 	14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222,
   1125 	10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844,
   1126 	8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079,
   1127 	6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746,
   1128 	5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722,
   1129 	4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924
   1130 };
   1131 
   1132 static inline struct fve *
   1133 flowlist_lookup(fv, pktattr, now)
   1134 	struct flowvalve *fv;
   1135 	struct altq_pktattr *pktattr;
   1136 	struct timeval *now;
   1137 {
   1138 	struct fve *fve;
   1139 	int flows;
   1140 	struct ip *ip;
   1141 #ifdef INET6
   1142 	struct ip6_hdr *ip6;
   1143 #endif
   1144 	struct timeval tthresh;
   1145 
   1146 	if (pktattr == NULL)
   1147 		return (NULL);
   1148 
   1149 	tthresh.tv_sec = now->tv_sec - FV_TTHRESH;
   1150 	flows = 0;
   1151 	/*
   1152 	 * search the flow list
   1153 	 */
   1154 	switch (pktattr->pattr_af) {
   1155 	case AF_INET:
   1156 		ip = (struct ip *)pktattr->pattr_hdr;
   1157 		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
   1158 			if (fve->fve_lastdrop.tv_sec == 0)
   1159 				break;
   1160 			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
   1161 				fve->fve_lastdrop.tv_sec = 0;
   1162 				break;
   1163 			}
   1164 			if (fve->fve_flow.flow_af == AF_INET &&
   1165 			    fve->fve_flow.flow_ip.ip_src.s_addr ==
   1166 			    ip->ip_src.s_addr &&
   1167 			    fve->fve_flow.flow_ip.ip_dst.s_addr ==
   1168 			    ip->ip_dst.s_addr)
   1169 				return (fve);
   1170 			flows++;
   1171 		}
   1172 		break;
   1173 #ifdef INET6
   1174 	case AF_INET6:
   1175 		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
   1176 		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
   1177 			if (fve->fve_lastdrop.tv_sec == 0)
   1178 				break;
   1179 			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
   1180 				fve->fve_lastdrop.tv_sec = 0;
   1181 				break;
   1182 			}
   1183 			if (fve->fve_flow.flow_af == AF_INET6 &&
   1184 			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src,
   1185 					       &ip6->ip6_src) &&
   1186 			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst,
   1187 					       &ip6->ip6_dst))
   1188 				return (fve);
   1189 			flows++;
   1190 		}
   1191 		break;
   1192 #endif /* INET6 */
   1193 
   1194 	default:
   1195 		/* unknown protocol.  no drop. */
   1196 		return (NULL);
   1197 	}
   1198 	fv->fv_flows = flows;	/* save the number of active fve's */
   1199 	return (NULL);
   1200 }
   1201 
   1202 static inline struct fve *
   1203 flowlist_reclaim(fv, pktattr)
   1204 	struct flowvalve *fv;
   1205 	struct altq_pktattr *pktattr;
   1206 {
   1207 	struct fve *fve;
   1208 	struct ip *ip;
   1209 #ifdef INET6
   1210 	struct ip6_hdr *ip6;
   1211 #endif
   1212 
   1213 	/*
   1214 	 * get an entry from the tail of the LRU list.
   1215 	 */
   1216 	fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead);
   1217 
   1218 	switch (pktattr->pattr_af) {
   1219 	case AF_INET:
   1220 		ip = (struct ip *)pktattr->pattr_hdr;
   1221 		fve->fve_flow.flow_af = AF_INET;
   1222 		fve->fve_flow.flow_ip.ip_src = ip->ip_src;
   1223 		fve->fve_flow.flow_ip.ip_dst = ip->ip_dst;
   1224 		break;
   1225 #ifdef INET6
   1226 	case AF_INET6:
   1227 		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
   1228 		fve->fve_flow.flow_af = AF_INET6;
   1229 		fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src;
   1230 		fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst;
   1231 		break;
   1232 #endif
   1233 	}
   1234 
   1235 	fve->fve_state = Green;
   1236 	fve->fve_p = 0.0;
   1237 	fve->fve_f = 0.0;
   1238 	fve->fve_ifseq = fv->fv_ifseq - 1;
   1239 	fve->fve_count = 0;
   1240 
   1241 	fv->fv_flows++;
   1242 #ifdef FV_STATS
   1243 	fv->fv_stats.alloc++;
   1244 #endif
   1245 	return (fve);
   1246 }
   1247 
   1248 static inline void
   1249 flowlist_move_to_head(fv, fve)
   1250 	struct flowvalve *fv;
   1251 	struct fve *fve;
   1252 {
   1253 	if (TAILQ_FIRST(&fv->fv_flowlist) != fve) {
   1254 		TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru);
   1255 		TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru);
   1256 	}
   1257 }
   1258 
   1259 /*
   1260  * allocate flowvalve structure
   1261  */
   1262 static struct flowvalve *
   1263 fv_alloc(rp)
   1264 	struct red *rp;
   1265 {
   1266 	struct flowvalve *fv;
   1267 	struct fve *fve;
   1268 	int i, num;
   1269 
   1270 	num = FV_FLOWLISTSIZE;
   1271 	MALLOC(fv, struct flowvalve *, sizeof(struct flowvalve),
   1272 	       M_DEVBUF, M_WAITOK);
   1273 	if (fv == NULL)
   1274 		return (NULL);
   1275 	(void)memset(fv, 0, sizeof(struct flowvalve));
   1276 
   1277 	MALLOC(fv->fv_fves, struct fve *, sizeof(struct fve) * num,
   1278 	       M_DEVBUF, M_WAITOK);
   1279 	if (fv->fv_fves == NULL) {
   1280 		FREE(fv, M_DEVBUF);
   1281 		return (NULL);
   1282 	}
   1283 	(void)memset(fv->fv_fves, 0, sizeof(struct fve) * num);
   1284 
   1285 	fv->fv_flows = 0;
   1286 	TAILQ_INIT(&fv->fv_flowlist);
   1287 	for (i = 0; i < num; i++) {
   1288 		fve = &fv->fv_fves[i];
   1289 		fve->fve_lastdrop.tv_sec = 0;
   1290 		TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru);
   1291 	}
   1292 
   1293 	/* initialize drop rate threshold in scaled fixed-point */
   1294 	fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax;
   1295 
   1296 	/* initialize drop rate to fraction table */
   1297 	MALLOC(fv->fv_p2ftab, int *, sizeof(int) * BRTT_SIZE,
   1298 	       M_DEVBUF, M_WAITOK);
   1299 	if (fv->fv_p2ftab == NULL) {
   1300 		FREE(fv->fv_fves, M_DEVBUF);
   1301 		FREE(fv, M_DEVBUF);
   1302 		return (NULL);
   1303 	}
   1304 	/*
   1305 	 * create the p2f table.
   1306 	 * (shift is used to keep the precision)
   1307 	 */
   1308 	for (i = 1; i < BRTT_SIZE; i++) {
   1309 		int f;
   1310 
   1311 		f = brtt_tab[i] << 8;
   1312 		fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8;
   1313 	}
   1314 
   1315 	return (fv);
   1316 }
   1317 
   1318 static void fv_destroy(fv)
   1319 	struct flowvalve *fv;
   1320 {
   1321 	FREE(fv->fv_p2ftab, M_DEVBUF);
   1322 	FREE(fv->fv_fves, M_DEVBUF);
   1323 	FREE(fv, M_DEVBUF);
   1324 }
   1325 
   1326 static inline int
   1327 fv_p2f(fv, p)
   1328 	struct flowvalve	*fv;
   1329 	int	p;
   1330 {
   1331 	int val, f;
   1332 
   1333 	if (p >= BRTT_PMAX)
   1334 		f = fv->fv_p2ftab[BRTT_SIZE-1];
   1335 	else if ((val = (p & BRTT_MASK)))
   1336 		f = fv->fv_p2ftab[(val >> BRTT_SHIFT)];
   1337 	else
   1338 		f = fv->fv_p2ftab[1];
   1339 	return (f);
   1340 }
   1341 
   1342 /*
   1343  * check if an arriving packet should be pre-dropped.
   1344  * called from red_addq() when a packet arrives.
   1345  * returns 1 when the packet should be pre-dropped.
   1346  * should be called in splnet.
   1347  */
   1348 static int
   1349 fv_checkflow(fv, pktattr, fcache)
   1350 	struct flowvalve *fv;
   1351 	struct altq_pktattr *pktattr;
   1352 	struct fve **fcache;
   1353 {
   1354 	struct fve *fve;
   1355 	struct timeval now;
   1356 
   1357 	fv->fv_ifseq++;
   1358 	FV_TIMESTAMP(&now);
   1359 
   1360 	if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
   1361 		/* no matching entry in the flowlist */
   1362 		return (0);
   1363 
   1364 	*fcache = fve;
   1365 
   1366 	/* update fraction f for every FV_N packets */
   1367 	if (++fve->fve_count == FV_N) {
   1368 		/*
   1369 		 * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f
   1370 		 */
   1371 		fve->fve_f =
   1372 			(FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq)
   1373 			+ fve->fve_f - FV_FUNSCALE(fve->fve_f);
   1374 		fve->fve_ifseq = fv->fv_ifseq;
   1375 		fve->fve_count = 0;
   1376 	}
   1377 
   1378 	/*
   1379 	 * overpumping test
   1380 	 */
   1381 	if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) {
   1382 		int fthresh;
   1383 
   1384 		/* calculate a threshold */
   1385 		fthresh = fv_p2f(fv, fve->fve_p);
   1386 		if (fve->fve_f > fthresh)
   1387 			fve->fve_state = Red;
   1388 	}
   1389 
   1390 	if (fve->fve_state == Red) {
   1391 		/*
   1392 		 * backoff test
   1393 		 */
   1394 		if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) {
   1395 			/* no drop for at least FV_BACKOFFTHRESH sec */
   1396 			fve->fve_p = 0;
   1397 			fve->fve_state = Green;
   1398 #ifdef FV_STATS
   1399 			fv->fv_stats.escape++;
   1400 #endif
   1401 		} else {
   1402 			/* block this flow */
   1403 			flowlist_move_to_head(fv, fve);
   1404 			fve->fve_lastdrop = now;
   1405 #ifdef FV_STATS
   1406 			fv->fv_stats.predrop++;
   1407 #endif
   1408 			return (1);
   1409 		}
   1410 	}
   1411 
   1412 	/*
   1413 	 * p = (1 - Wp) * p
   1414 	 */
   1415 	fve->fve_p -= FV_PUNSCALE(fve->fve_p);
   1416 	if (fve->fve_p < 0)
   1417 		fve->fve_p = 0;
   1418 #ifdef FV_STATS
   1419 	fv->fv_stats.pass++;
   1420 #endif
   1421 	return (0);
   1422 }
   1423 
   1424 /*
   1425  * called from red_addq when a packet is dropped by red.
   1426  * should be called in splnet.
   1427  */
   1428 static void fv_dropbyred(fv, pktattr, fcache)
   1429 	struct flowvalve *fv;
   1430 	struct altq_pktattr *pktattr;
   1431 	struct fve *fcache;
   1432 {
   1433 	struct fve *fve;
   1434 	struct timeval now;
   1435 
   1436 	if (pktattr == NULL)
   1437 		return;
   1438 	FV_TIMESTAMP(&now);
   1439 
   1440 	if (fcache != NULL)
   1441 		/* the fve of this packet is already cached */
   1442 		fve = fcache;
   1443 	else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
   1444 		fve = flowlist_reclaim(fv, pktattr);
   1445 
   1446 	flowlist_move_to_head(fv, fve);
   1447 
   1448 	/*
   1449 	 * update p:  the following line cancels the update
   1450 	 *	      in fv_checkflow() and calculate
   1451 	 *	p = Wp + (1 - Wp) * p
   1452 	 */
   1453 	fve->fve_p = (1 << FP_SHIFT) + fve->fve_p;
   1454 
   1455 	fve->fve_lastdrop = now;
   1456 }
   1457 
   1458 #endif /* ALTQ_FLOWVALVE */
   1459 
   1460 #ifdef KLD_MODULE
   1461 
   1462 static struct altqsw red_sw =
   1463 	{"red", redopen, redclose, redioctl};
   1464 
   1465 ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw);
   1466 
   1467 #endif /* KLD_MODULE */
   1468 
   1469 #endif /* ALTQ_RED */
   1470