Home | History | Annotate | Line # | Download | only in altq
altq_red.c revision 1.14
      1 /*	$NetBSD: altq_red.c,v 1.14 2006/04/23 06:46:40 christos Exp $	*/
      2 /*	$KAME: altq_red.c,v 1.9 2002/01/07 11:25:40 kjc Exp $	*/
      3 
      4 /*
      5  * Copyright (C) 1997-2000
      6  *	Sony Computer Science Laboratories Inc.  All rights reserved.
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  * 1. Redistributions of source code must retain the above copyright
     12  *    notice, this list of conditions and the following disclaimer.
     13  * 2. Redistributions in binary form must reproduce the above copyright
     14  *    notice, this list of conditions and the following disclaimer in the
     15  *    documentation and/or other materials provided with the distribution.
     16  *
     17  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
     18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     20  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
     21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     27  * SUCH DAMAGE.
     28  *
     29  */
     30 /*
     31  * Copyright (c) 1990-1994 Regents of the University of California.
     32  * All rights reserved.
     33  *
     34  * Redistribution and use in source and binary forms, with or without
     35  * modification, are permitted provided that the following conditions
     36  * are met:
     37  * 1. Redistributions of source code must retain the above copyright
     38  *    notice, this list of conditions and the following disclaimer.
     39  * 2. Redistributions in binary form must reproduce the above copyright
     40  *    notice, this list of conditions and the following disclaimer in the
     41  *    documentation and/or other materials provided with the distribution.
     42  * 3. All advertising materials mentioning features or use of this software
     43  *    must display the following acknowledgement:
     44  *	This product includes software developed by the Computer Systems
     45  *	Engineering Group at Lawrence Berkeley Laboratory.
     46  * 4. Neither the name of the University nor of the Laboratory may be used
     47  *    to endorse or promote products derived from this software without
     48  *    specific prior written permission.
     49  *
     50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     60  * SUCH DAMAGE.
     61  */
     62 
     63 #include <sys/cdefs.h>
     64 __KERNEL_RCSID(0, "$NetBSD: altq_red.c,v 1.14 2006/04/23 06:46:40 christos Exp $");
     65 
     66 #if defined(__FreeBSD__) || defined(__NetBSD__)
     67 #include "opt_altq.h"
     68 #if (__FreeBSD__ != 2)
     69 #include "opt_inet.h"
     70 #ifdef __FreeBSD__
     71 #include "opt_inet6.h"
     72 #endif
     73 #endif
     74 #endif /* __FreeBSD__ || __NetBSD__ */
     75 #ifdef ALTQ_RED	/* red is enabled by ALTQ_RED option in opt_altq.h */
     76 
     77 #include <sys/param.h>
     78 #include <sys/malloc.h>
     79 #include <sys/mbuf.h>
     80 #include <sys/socket.h>
     81 #include <sys/sockio.h>
     82 #include <sys/systm.h>
     83 #include <sys/proc.h>
     84 #include <sys/errno.h>
     85 #include <sys/kernel.h>
     86 #ifdef ALTQ_FLOWVALVE
     87 #include <sys/queue.h>
     88 #include <sys/time.h>
     89 #endif
     90 
     91 #include <net/if.h>
     92 #include <net/if_types.h>
     93 
     94 #include <netinet/in.h>
     95 #include <netinet/in_systm.h>
     96 #include <netinet/ip.h>
     97 #ifdef INET6
     98 #include <netinet/ip6.h>
     99 #endif
    100 
    101 #include <altq/altq.h>
    102 #include <altq/altq_conf.h>
    103 #include <altq/altq_red.h>
    104 #ifdef ALTQ_FLOWVALVE
    105 #include <altq/altq_flowvalve.h>
    106 #endif
    107 
    108 /*
    109  * ALTQ/RED (Random Early Detection) implementation using 32-bit
    110  * fixed-point calculation.
    111  *
    112  * written by kjc using the ns code as a reference.
    113  * you can learn more about red and ns from Sally's home page at
    114  * http://www-nrg.ee.lbl.gov/floyd/
    115  *
    116  * most of the red parameter values are fixed in this implementation
    117  * to prevent fixed-point overflow/underflow.
    118  * if you change the parameters, watch out for overflow/underflow!
    119  *
    120  * the parameters used are recommended values by Sally.
    121  * the corresponding ns config looks:
    122  *	q_weight=0.00195
    123  *	minthresh=5 maxthresh=15 queue-size=60
    124  *	linterm=30
    125  *	dropmech=drop-tail
    126  *	bytes=false (can't be handled by 32-bit fixed-point)
    127  *	doubleq=false dqthresh=false
    128  *	wait=true
    129  */
    130 /*
    131  * alternative red parameters for a slow link.
    132  *
    133  * assume the queue length becomes from zero to L and keeps L, it takes
    134  * N packets for q_avg to reach 63% of L.
    135  * when q_weight is 0.002, N is about 500 packets.
    136  * for a slow link like dial-up, 500 packets takes more than 1 minute!
    137  * when q_weight is 0.008, N is about 127 packets.
    138  * when q_weight is 0.016, N is about 63 packets.
    139  * bursts of 50 packets are allowd for 0.002, bursts of 25 packets
    140  * are allowed for 0.016.
    141  * see Sally's paper for more details.
    142  */
    143 /* normal red parameters */
    144 #define	W_WEIGHT	512	/* inverse of weight of EWMA (511/512) */
    145 				/* q_weight = 0.00195 */
    146 
    147 /* red parameters for a slow link */
    148 #define	W_WEIGHT_1	128	/* inverse of weight of EWMA (127/128) */
    149 				/* q_weight = 0.0078125 */
    150 
    151 /* red parameters for a very slow link (e.g., dialup) */
    152 #define	W_WEIGHT_2	64	/* inverse of weight of EWMA (63/64) */
    153 				/* q_weight = 0.015625 */
    154 
    155 /* fixed-point uses 12-bit decimal places */
    156 #define	FP_SHIFT	12	/* fixed-point shift */
    157 
    158 /* red parameters for drop probability */
    159 #define	INV_P_MAX	10	/* inverse of max drop probability */
    160 #define	TH_MIN		5	/* min threshold */
    161 #define	TH_MAX		15	/* max threshold */
    162 
    163 #define	RED_LIMIT	60	/* default max queue length */
    164 
    165 /*
    166  * our default policy for forced-drop is drop-tail.
    167  * (in altq-1.1.2 or earlier, the default was random-drop.
    168  * but it makes more sense to punish the cause of the surge.)
    169  * to switch to the random-drop policy, define "RED_RANDOM_DROP".
    170  */
    171 
    172 #ifdef ALTQ_FLOWVALVE
    173 /*
    174  * flow-valve is an extension to protect red from unresponsive flows
    175  * and to promote end-to-end congestion control.
    176  * flow-valve observes the average drop rates of the flows that have
    177  * experienced packet drops in the recent past.
    178  * when the average drop rate exceeds the threshold, the flow is
    179  * blocked by the flow-valve.  the trapped flow should back off
    180  * exponentially to escape from the flow-valve.
    181  */
    182 #ifdef RED_RANDOM_DROP
    183 #error "random-drop can't be used with flow-valve!"
    184 #endif
    185 #endif /* ALTQ_FLOWVALVE */
    186 
    187 /* red_list keeps all red_queue_t's allocated. */
    188 static red_queue_t *red_list = NULL;
    189 
    190 /* default red parameter values */
    191 static int default_th_min = TH_MIN;
    192 static int default_th_max = TH_MAX;
    193 static int default_inv_pmax = INV_P_MAX;
    194 
    195 /* internal function prototypes */
    196 static int red_enqueue __P((struct ifaltq *, struct mbuf *,
    197 			    struct altq_pktattr *));
    198 static struct mbuf *red_dequeue __P((struct ifaltq *, int));
    199 static int red_request __P((struct ifaltq *, int, void *));
    200 static void red_purgeq __P((red_queue_t *));
    201 static int red_detach __P((red_queue_t *));
    202 #ifdef ALTQ_FLOWVALVE
    203 static inline struct fve *flowlist_lookup __P((struct flowvalve *,
    204 			 struct altq_pktattr *, struct timeval *));
    205 static inline struct fve *flowlist_reclaim __P((struct flowvalve *,
    206 						  struct altq_pktattr *));
    207 static inline void flowlist_move_to_head __P((struct flowvalve *,
    208 						struct fve *));
    209 static inline int fv_p2f __P((struct flowvalve *, int));
    210 static struct flowvalve *fv_alloc __P((struct red *));
    211 static void fv_destroy __P((struct flowvalve *));
    212 static int fv_checkflow __P((struct flowvalve *, struct altq_pktattr *,
    213 			     struct fve **));
    214 static void fv_dropbyred __P((struct flowvalve *fv, struct altq_pktattr *,
    215 			      struct fve *));
    216 #endif
    217 
    218 /*
    219  * red device interface
    220  */
    221 altqdev_decl(red);
    222 
/*
 * redopen: character-device open entry point for the red device.
 * nothing to set up here; all per-interface state is created later,
 * when the queueing scheme is attached via the RED_IF_ATTACH ioctl.
 * always succeeds.
 */
int
redopen(dev, flag, fmt, l)
	dev_t dev;
	int flag, fmt;
	struct lwp *l;
{
	/* everything will be done when the queueing scheme is attached. */
	return 0;
}
    232 
/*
 * redclose: character-device close entry point.
 * tears down every RED instance still on red_list (red_detach unlinks
 * the entry it destroys, so the list head advances each iteration).
 * if several detaches fail, the first error encountered is returned.
 */
int
redclose(dev, flag, fmt, l)
	dev_t dev;
	int flag, fmt;
	struct lwp *l;
{
	red_queue_t *rqp;
	int err, error = 0;

	while ((rqp = red_list) != NULL) {
		/* destroy all */
		err = red_detach(rqp);
		if (err != 0 && error == 0)
			error = err;	/* remember only the first failure */
	}

	return error;
}
    251 
    252 int
    253 redioctl(dev, cmd, addr, flag, l)
    254 	dev_t dev;
    255 	ioctlcmd_t cmd;
    256 	caddr_t addr;
    257 	int flag;
    258 	struct lwp *l;
    259 {
    260 	red_queue_t *rqp;
    261 	struct red_interface *ifacep;
    262 	struct ifnet *ifp;
    263 	struct proc *p = l->l_proc;
    264 	int	error = 0;
    265 
    266 	/* check super-user privilege */
    267 	switch (cmd) {
    268 	case RED_GETSTATS:
    269 		break;
    270 	default:
    271 #if (__FreeBSD_version > 400000)
    272 		if ((error = suser(p)) != 0)
    273 #else
    274 		if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
    275 #endif
    276 			return (error);
    277 		break;
    278 	}
    279 
    280 	switch (cmd) {
    281 
    282 	case RED_ENABLE:
    283 		ifacep = (struct red_interface *)addr;
    284 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
    285 			error = EBADF;
    286 			break;
    287 		}
    288 		error = altq_enable(rqp->rq_ifq);
    289 		break;
    290 
    291 	case RED_DISABLE:
    292 		ifacep = (struct red_interface *)addr;
    293 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
    294 			error = EBADF;
    295 			break;
    296 		}
    297 		error = altq_disable(rqp->rq_ifq);
    298 		break;
    299 
    300 	case RED_IF_ATTACH:
    301 		ifp = ifunit(((struct red_interface *)addr)->red_ifname);
    302 		if (ifp == NULL) {
    303 			error = ENXIO;
    304 			break;
    305 		}
    306 
    307 		/* allocate and initialize red_queue_t */
    308 		rqp = malloc(sizeof(red_queue_t), M_DEVBUF, M_WAITOK|M_ZERO);
    309 		if (rqp == NULL) {
    310 			error = ENOMEM;
    311 			break;
    312 		}
    313 
    314 		rqp->rq_q = malloc(sizeof(class_queue_t), M_DEVBUF,
    315 		    M_WAITOK|M_ZERO);
    316 		if (rqp->rq_q == NULL) {
    317 			free(rqp, M_DEVBUF);
    318 			error = ENOMEM;
    319 			break;
    320 		}
    321 
    322 		rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0);
    323 		if (rqp->rq_red == NULL) {
    324 			FREE(rqp->rq_q, M_DEVBUF);
    325 			FREE(rqp, M_DEVBUF);
    326 			error = ENOMEM;
    327 			break;
    328 		}
    329 
    330 		rqp->rq_ifq = &ifp->if_snd;
    331 		qtail(rqp->rq_q) = NULL;
    332 		qlen(rqp->rq_q) = 0;
    333 		qlimit(rqp->rq_q) = RED_LIMIT;
    334 		qtype(rqp->rq_q) = Q_RED;
    335 
    336 		/*
    337 		 * set RED to this ifnet structure.
    338 		 */
    339 		error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp,
    340 				    red_enqueue, red_dequeue, red_request,
    341 				    NULL, NULL);
    342 		if (error) {
    343 			red_destroy(rqp->rq_red);
    344 			FREE(rqp->rq_q, M_DEVBUF);
    345 			FREE(rqp, M_DEVBUF);
    346 			break;
    347 		}
    348 
    349 		/* add this state to the red list */
    350 		rqp->rq_next = red_list;
    351 		red_list = rqp;
    352 		break;
    353 
    354 	case RED_IF_DETACH:
    355 		ifacep = (struct red_interface *)addr;
    356 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
    357 			error = EBADF;
    358 			break;
    359 		}
    360 		error = red_detach(rqp);
    361 		break;
    362 
    363 	case RED_GETSTATS:
    364 		do {
    365 			struct red_stats *q_stats;
    366 			red_t *rp;
    367 
    368 			q_stats = (struct red_stats *)addr;
    369 			if ((rqp = altq_lookup(q_stats->iface.red_ifname,
    370 					     ALTQT_RED)) == NULL) {
    371 				error = EBADF;
    372 				break;
    373 			}
    374 
    375 			q_stats->q_len 	   = qlen(rqp->rq_q);
    376 			q_stats->q_limit   = qlimit(rqp->rq_q);
    377 
    378 			rp = rqp->rq_red;
    379 			q_stats->q_avg 	   = rp->red_avg >> rp->red_wshift;
    380 			q_stats->xmit_cnt  = rp->red_stats.xmit_cnt;
    381 			q_stats->drop_cnt  = rp->red_stats.drop_cnt;
    382 			q_stats->drop_forced   = rp->red_stats.drop_forced;
    383 			q_stats->drop_unforced = rp->red_stats.drop_unforced;
    384 			q_stats->marked_packets = rp->red_stats.marked_packets;
    385 
    386 			q_stats->weight		= rp->red_weight;
    387 			q_stats->inv_pmax	= rp->red_inv_pmax;
    388 			q_stats->th_min		= rp->red_thmin;
    389 			q_stats->th_max		= rp->red_thmax;
    390 
    391 #ifdef ALTQ_FLOWVALVE
    392 			if (rp->red_flowvalve != NULL) {
    393 				struct flowvalve *fv = rp->red_flowvalve;
    394 				q_stats->fv_flows    = fv->fv_flows;
    395 				q_stats->fv_pass     = fv->fv_stats.pass;
    396 				q_stats->fv_predrop  = fv->fv_stats.predrop;
    397 				q_stats->fv_alloc    = fv->fv_stats.alloc;
    398 				q_stats->fv_escape   = fv->fv_stats.escape;
    399 			} else {
    400 #endif /* ALTQ_FLOWVALVE */
    401 				q_stats->fv_flows    = 0;
    402 				q_stats->fv_pass     = 0;
    403 				q_stats->fv_predrop  = 0;
    404 				q_stats->fv_alloc    = 0;
    405 				q_stats->fv_escape   = 0;
    406 #ifdef ALTQ_FLOWVALVE
    407 			}
    408 #endif /* ALTQ_FLOWVALVE */
    409 		} while (0);
    410 		break;
    411 
    412 	case RED_CONFIG:
    413 		do {
    414 			struct red_conf *fc;
    415 			red_t *new;
    416 			int s, limit;
    417 
    418 			fc = (struct red_conf *)addr;
    419 			if ((rqp = altq_lookup(fc->iface.red_ifname,
    420 					       ALTQT_RED)) == NULL) {
    421 				error = EBADF;
    422 				break;
    423 			}
    424 			new = red_alloc(fc->red_weight,
    425 					fc->red_inv_pmax,
    426 					fc->red_thmin,
    427 					fc->red_thmax,
    428 					fc->red_flags,
    429 					fc->red_pkttime);
    430 			if (new == NULL) {
    431 				error = ENOMEM;
    432 				break;
    433 			}
    434 
    435 			s = splnet();
    436 			red_purgeq(rqp);
    437 			limit = fc->red_limit;
    438 			if (limit < fc->red_thmax)
    439 				limit = fc->red_thmax;
    440 			qlimit(rqp->rq_q) = limit;
    441 			fc->red_limit = limit;	/* write back the new value */
    442 
    443 			red_destroy(rqp->rq_red);
    444 			rqp->rq_red = new;
    445 
    446 			splx(s);
    447 
    448 			/* write back new values */
    449 			fc->red_limit = limit;
    450 			fc->red_inv_pmax = rqp->rq_red->red_inv_pmax;
    451 			fc->red_thmin = rqp->rq_red->red_thmin;
    452 			fc->red_thmax = rqp->rq_red->red_thmax;
    453 
    454 		} while (0);
    455 		break;
    456 
    457 	case RED_SETDEFAULTS:
    458 		do {
    459 			struct redparams *rp;
    460 
    461 			rp = (struct redparams *)addr;
    462 
    463 			default_th_min = rp->th_min;
    464 			default_th_max = rp->th_max;
    465 			default_inv_pmax = rp->inv_pmax;
    466 		} while (0);
    467 		break;
    468 
    469 	default:
    470 		error = EINVAL;
    471 		break;
    472 	}
    473 	return error;
    474 }
    475 
    476 static int
    477 red_detach(rqp)
    478 	red_queue_t *rqp;
    479 {
    480 	red_queue_t *tmp;
    481 	int error = 0;
    482 
    483 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
    484 		altq_disable(rqp->rq_ifq);
    485 
    486 	if ((error = altq_detach(rqp->rq_ifq)))
    487 		return (error);
    488 
    489 	if (red_list == rqp)
    490 		red_list = rqp->rq_next;
    491 	else {
    492 		for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next)
    493 			if (tmp->rq_next == rqp) {
    494 				tmp->rq_next = rqp->rq_next;
    495 				break;
    496 			}
    497 		if (tmp == NULL)
    498 			printf("red_detach: no state found in red_list!\n");
    499 	}
    500 
    501 	red_destroy(rqp->rq_red);
    502 	FREE(rqp->rq_q, M_DEVBUF);
    503 	FREE(rqp, M_DEVBUF);
    504 	return (error);
    505 }
    506 
    507 /*
    508  * red support routines
    509  */
    510 
/*
 * red_alloc: allocate and initialize a red state structure.
 *
 * any argument given as 0 selects a default: weight defaults to
 * W_WEIGHT (possibly adjusted down for slow links), inv_pmax/th_min/
 * th_max default to the module-global defaults, and pkttime defaults
 * to 800 usec (1000 bytes at 10Mbps).
 *
 * returns the new state, or NULL if allocation fails.
 */
red_t *
red_alloc(weight, inv_pmax, th_min, th_max, flags, pkttime)
	int	weight, inv_pmax, th_min, th_max;
	int	flags, pkttime;
{
	red_t 	*rp;
	int	w, i;
	int	npkts_per_sec;

	rp = malloc(sizeof(red_t), M_DEVBUF, M_WAITOK|M_ZERO);
	if (rp == NULL)
		return (NULL);

	/* start idle: avg decay begins from the first arrival */
	rp->red_avg = 0;
	rp->red_idle = 1;

	if (weight == 0)
		rp->red_weight = W_WEIGHT;
	else
		rp->red_weight = weight;
	if (inv_pmax == 0)
		rp->red_inv_pmax = default_inv_pmax;
	else
		rp->red_inv_pmax = inv_pmax;
	if (th_min == 0)
		rp->red_thmin = default_th_min;
	else
		rp->red_thmin = th_min;
	if (th_max == 0)
		rp->red_thmax = default_th_max;
	else
		rp->red_thmax = th_max;

	rp->red_flags = flags;

	if (pkttime == 0)
		/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
		rp->red_pkttime = 800;
	else
		rp->red_pkttime = pkttime;

	if (weight == 0) {
		/* when the link is very slow, adjust red parameters */
		npkts_per_sec = 1000000 / rp->red_pkttime;
		if (npkts_per_sec < 50) {
			/* up to about 400Kbps */
			rp->red_weight = W_WEIGHT_2;
		} else if (npkts_per_sec < 300) {
			/* up to about 2.4Mbps */
			rp->red_weight = W_WEIGHT_1;
		}
	}

	/* calculate wshift.  weight must be power of 2 */
	w = rp->red_weight;
	for (i = 0; w > 1; i++)
		w = w >> 1;
	rp->red_wshift = i;
	w = 1 << rp->red_wshift;
	if (w != rp->red_weight) {
		/* round a non-power-of-2 weight down and warn */
		printf("invalid weight value %d for red! use %d\n",
		       rp->red_weight, w);
		rp->red_weight = w;
	}

	/*
	 * thmin_s and thmax_s are scaled versions of th_min and th_max
	 * to be compared with avg.
	 */
	rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
	rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);

	/*
	 * precompute probability denominator
	 *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
	 */
	rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
			 * rp->red_inv_pmax) << FP_SHIFT;

	/* allocate weight table (shared/refcounted per weight value) */
	rp->red_wtab = wtab_alloc(rp->red_weight);

	/* record "now" as the start of the initial idle period */
	microtime(&rp->red_last);
#ifdef ALTQ_FLOWVALVE
	if (flags & REDF_FLOWVALVE)
		rp->red_flowvalve = fv_alloc(rp);
	/* if fv_alloc fails, flowvalve is just disabled */
#endif
	return (rp);
}
    601 
/*
 * red_destroy: release a red state allocated by red_alloc, including
 * the optional flow-valve state and our reference to the shared
 * weight table.
 */
void
red_destroy(rp)
	red_t *rp;
{
#ifdef ALTQ_FLOWVALVE
	if (rp->red_flowvalve != NULL)
		fv_destroy(rp->red_flowvalve);
#endif
	wtab_destroy(rp->red_wtab);
	FREE(rp, M_DEVBUF);
}
    613 
/*
 * red_getstats: copy the current counters out of the red state into
 * the caller-supplied redstats structure.  avg is unscaled (shifted
 * down by wshift) so the caller sees it in packet units (fixed-point).
 */
void
red_getstats(rp, sp)
	red_t *rp;
	struct redstats *sp;
{
	sp->q_avg 		= rp->red_avg >> rp->red_wshift;
	sp->xmit_cnt		= rp->red_stats.xmit_cnt;
	sp->drop_cnt		= rp->red_stats.drop_cnt;
	sp->drop_forced		= rp->red_stats.drop_forced;
	sp->drop_unforced	= rp->red_stats.drop_unforced;
	sp->marked_packets	= rp->red_stats.marked_packets;
}
    626 
    627 /*
    628  * enqueue routine:
    629  *
    630  *	returns: 0 when successfully queued.
    631  *		 ENOBUFS when drop occurs.
    632  */
/*
 * enqueue routine:
 *
 *	returns: 0 when successfully queued.
 *		 ENOBUFS when drop occurs.
 */
static int
red_enqueue(ifq, m, pktattr)
	struct ifaltq *ifq;
	struct mbuf *m;
	struct altq_pktattr *pktattr;
{
	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;

	/* red_addq frees the mbuf itself on drop */
	if (red_addq(rqp->rq_red, rqp->rq_q, m, pktattr) < 0)
		return ENOBUFS;
	/* keep the interface queue length in sync with the class queue */
	ifq->ifq_len++;
	return 0;
}
    646 
/*
 * red_addq: the core RED enqueue decision.
 *
 * updates the EWMA of the queue length (in 32-bit fixed point),
 * then decides among: enqueue, ECN-mark and enqueue, unforced
 * (early) drop, or forced drop.  on drop the mbuf is freed here.
 *
 * returns 0 when the packet was queued, -1 when it was dropped.
 */
int
red_addq(rp, q, m, pktattr)
	red_t *rp;
	class_queue_t *q;
	struct mbuf *m;
	struct altq_pktattr *pktattr;
{
	int avg, droptype;
	int n;
#ifdef ALTQ_FLOWVALVE
	struct fve *fve = NULL;

	/* a flow trapped by the flow-valve is dropped unconditionally */
	if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0)
		if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) {
			m_freem(m);
			return (-1);
		}
#endif

	avg = rp->red_avg;

	/*
	 * if we were idle, we pretend that n packets arrived during
	 * the idle period.
	 */
	if (rp->red_idle) {
		struct timeval now;
		int t;

		rp->red_idle = 0;
		microtime(&now);
		t = (now.tv_sec - rp->red_last.tv_sec);
		if (t > 60) {
			/*
			 * being idle for more than 1 minute, set avg to zero.
			 * this prevents t from overflow.
			 */
			avg = 0;
		} else {
			/* t becomes the idle time in microseconds */
			t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
			n = t / rp->red_pkttime - 1;

			/* the following line does (avg = (1 - Wq)^n * avg) */
			if (n > 0)
				avg = (avg >> FP_SHIFT) *
				    pow_w(rp->red_wtab, n);
		}
	}

	/* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
	avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
	rp->red_avg = avg;		/* save the new value */

	/*
	 * red_count keeps a tally of arriving traffic that has not
	 * been dropped.
	 */
	rp->red_count++;

	/* see if we drop early */
	droptype = DTYPE_NODROP;
	if (avg >= rp->red_thmin_s && qlen(q) > 1) {
		if (avg >= rp->red_thmax_s) {
			/* avg >= th_max: forced drop */
			droptype = DTYPE_FORCED;
		} else if (rp->red_old == 0) {
			/* first exceeds th_min */
			rp->red_count = 1;
			rp->red_old = 1;
		} else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
				      rp->red_probd, rp->red_count)) {
			/* mark or drop by red */
			if ((rp->red_flags & REDF_ECN) &&
			    mark_ecn(m, pktattr, rp->red_flags)) {
				/* successfully marked.  do not drop. */
				rp->red_count = 0;
#ifdef RED_STATS
				rp->red_stats.marked_packets++;
#endif
			} else {
				/* unforced drop by red */
				droptype = DTYPE_EARLY;
			}
		}
	} else {
		/* avg < th_min */
		rp->red_old = 0;
	}

	/*
	 * if the queue length hits the hard limit, it's a forced drop.
	 */
	if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
		droptype = DTYPE_FORCED;

#ifdef RED_RANDOM_DROP
	/* if successful or forced drop, enqueue this packet. */
	if (droptype != DTYPE_EARLY)
		_addq(q, m);
#else
	/* if successful, enqueue this packet. */
	if (droptype == DTYPE_NODROP)
		_addq(q, m);
#endif
	if (droptype != DTYPE_NODROP) {
		if (droptype == DTYPE_EARLY) {
			/* drop the incoming packet */
#ifdef RED_STATS
			rp->red_stats.drop_unforced++;
#endif
		} else {
			/* forced drop, select a victim packet in the queue. */
#ifdef RED_RANDOM_DROP
			/* random-drop policy: m was enqueued above; drop a
			 * randomly chosen packet instead (default is drop-tail,
			 * where m itself is the victim). */
			m = _getq_random(q);
#endif
#ifdef RED_STATS
			rp->red_stats.drop_forced++;
#endif
		}
#ifdef RED_STATS
		PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
#endif
		rp->red_count = 0;
#ifdef ALTQ_FLOWVALVE
		if (rp->red_flowvalve != NULL)
			fv_dropbyred(rp->red_flowvalve, pktattr, fve);
#endif
		m_freem(m);
		return (-1);
	}
	/* successfully queued */
#ifdef RED_STATS
	PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
#endif
	return (0);
}
    783 
    784 /*
    785  * early-drop probability is calculated as follows:
    786  *   prob = p_max * (avg - th_min) / (th_max - th_min)
    787  *   prob_a = prob / (2 - count*prob)
    788  *	    = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
    789  * here prob_a increases as successive undrop count increases.
    790  * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
    791  * becomes 1 when (count >= (2 / prob))).
    792  */
/*
 * drop_early: decide probabilistically whether to drop (or mark) an
 * arriving packet while avg is between th_min and th_max.
 *
 * the effective drop probability is
 *	prob_a = (avg-th_min) / (2*(th_max-th_min)*inv_p_max
 *				 - count*(avg-th_min))
 * which grows with the count of consecutive undropped packets, and
 * saturates at 1 once the denominator reaches zero.
 *
 * returns 1 to drop/mark, 0 to let the packet through.
 */
int
drop_early(fp_len, fp_probd, count)
	int fp_len;	/* (avg - TH_MIN) in fixed-point */
	int fp_probd;	/* (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point */
	int count;	/* how many successive undropped packets */
{
	int denom = fp_probd - count * fp_len;

	/*
	 * non-positive denominator means count passed the hard limit:
	 * the drop probability has saturated at 1.  otherwise draw a
	 * uniform value in [0, denom) and drop when it lands below
	 * fp_len, i.e. with probability fp_len/denom.
	 */
	if (denom <= 0 || (random() % denom) < fp_len) {
		/* drop or mark */
		return (1);
	}
	/* no drop/mark */
	return (0);
}
    819 
    820 /*
    821  * try to mark CE bit to the packet.
    822  *    returns 1 if successfully marked, 0 otherwise.
    823  */
/*
 * try to mark CE bit to the packet.
 *    returns 1 if successfully marked, 0 otherwise.
 *
 * only IPv4/IPv6 packets whose header pointer (pattr_hdr) still lies
 * inside the mbuf chain are eligible; marking is further gated by the
 * per-family REDF_ECN4/REDF_ECN6 flags and by the packet itself being
 * ECN-capable (not ECT(0)/ECT(1) cleared).
 */
int
mark_ecn(m, pktattr, flags)
	struct mbuf *m;
	struct altq_pktattr *pktattr;
	int flags;
{
	struct mbuf *m0;

	if (pktattr == NULL ||
	    (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6))
		return (0);

	/* verify that pattr_hdr is within the mbuf data */
	for (m0 = m; m0 != NULL; m0 = m0->m_next)
		if ((pktattr->pattr_hdr >= m0->m_data) &&
		    (pktattr->pattr_hdr < m0->m_data + m0->m_len))
			break;
	if (m0 == NULL) {
		/* ick, pattr_hdr is stale */
		pktattr->pattr_af = AF_UNSPEC;
		return (0);
	}

	switch (pktattr->pattr_af) {
	case AF_INET:
		if (flags & REDF_ECN4) {
			struct ip *ip = (struct ip *)pktattr->pattr_hdr;
			u_int8_t otos;
			int sum;

			if (ip->ip_v != 4)
				return (0);	/* version mismatch! */

			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
				return (0);	/* not-ECT */
			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
				return (1);	/* already marked */

			/*
			 * ecn-capable but not marked,
			 * mark CE and update checksum
			 */
			otos = ip->ip_tos;
			ip->ip_tos |= IPTOS_ECN_CE;
			/*
			 * update checksum (from RFC1624)
			 *	   HC' = ~(~HC + ~m + m')
			 * incremental update: only the tos byte changed.
			 */
			sum = ~ntohs(ip->ip_sum) & 0xffff;
			sum += (~otos & 0xffff) + ip->ip_tos;
			sum = (sum >> 16) + (sum & 0xffff);
			sum += (sum >> 16);  /* add carry */
			ip->ip_sum = htons(~sum & 0xffff);
			return (1);
		}
		break;
#ifdef INET6
	case AF_INET6:
		if (flags & REDF_ECN6) {
			struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
			u_int32_t flowlabel;

			/* in IPv6, the traffic class sits at bits 20-27
			 * of the flow word; no checksum to fix up. */
			flowlabel = ntohl(ip6->ip6_flow);
			if ((flowlabel >> 28) != 6)
				return (0);	/* version mismatch! */
			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
			    (IPTOS_ECN_NOTECT << 20))
				return (0);	/* not-ECT */
			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
			    (IPTOS_ECN_CE << 20))
				return (1);	/* already marked */
			/*
			 * ecn-capable but not marked,  mark CE
			 */
			flowlabel |= (IPTOS_ECN_CE << 20);
			ip6->ip6_flow = htonl(flowlabel);
			return (1);
		}
		break;
#endif  /* INET6 */
	}

	/* not marked */
	return (0);
}
    909 
    910 /*
    911  * dequeue routine:
    912  *	must be called in splnet.
    913  *
    914  *	returns: mbuf dequeued.
    915  *		 NULL when no packet is available in the queue.
    916  */
    917 
/*
 * dequeue routine:
 *	must be called in splnet.
 *
 *	returns: mbuf dequeued.
 *		 NULL when no packet is available in the queue.
 *
 * ALTDQ_POLL peeks at the head without removing it; ALTDQ_REMOVE
 * dequeues and decrements the interface queue length.
 */

static struct mbuf *
red_dequeue(ifq, op)
	struct ifaltq *ifq;
	int op;
{
	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
	struct mbuf *m;

	if (op == ALTDQ_POLL)
		return qhead(rqp->rq_q);

	/* op == ALTDQ_REMOVE */
	m =  red_getq(rqp->rq_red, rqp->rq_q);
	if (m != NULL)
		ifq->ifq_len--;
	return (m);
}
    935 
    936 struct mbuf *
    937 red_getq(rp, q)
    938 	red_t *rp;
    939 	class_queue_t *q;
    940 {
    941 	struct mbuf *m;
    942 
    943 	if ((m = _getq(q)) == NULL) {
    944 		if (rp->red_idle == 0) {
    945 			rp->red_idle = 1;
    946 			microtime(&rp->red_last);
    947 		}
    948 		return NULL;
    949 	}
    950 
    951 	rp->red_idle = 0;
    952 	return (m);
    953 }
    954 
    955 static int
    956 red_request(ifq, req, arg)
    957 	struct ifaltq *ifq;
    958 	int req;
    959 	void *arg;
    960 {
    961 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
    962 
    963 	switch (req) {
    964 	case ALTRQ_PURGE:
    965 		red_purgeq(rqp);
    966 		break;
    967 	}
    968 	return (0);
    969 }
    970 
    971 static void
    972 red_purgeq(rqp)
    973 	red_queue_t *rqp;
    974 {
    975 	_flushq(rqp->rq_q);
    976 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
    977 		rqp->rq_ifq->ifq_len = 0;
    978 }
    979 
    980 
    981 /*
    982  * helper routine to calibrate avg during idle.
    983  * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
    984  * here Wq = 1/weight and the code assumes Wq is close to zero.
    985  *
    986  * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
    987  */
static struct wtab *wtab_list = NULL;	/* head of the shared, refcounted wtab list */
    989 
/*
 * Look up or create the weight table for the given RED weight.
 * Tables are shared and reference-counted: if one for this weight
 * already exists on wtab_list its refcount is bumped and it is
 * returned; otherwise a new table is built and linked in.
 */
struct wtab *
wtab_alloc(weight)
	int weight;
{
	struct wtab *w;
	int i;

	/* reuse an existing table for the same weight, if any */
	for (w = wtab_list; w != NULL; w = w->w_next)
		if (w->w_weight == weight) {
			w->w_refcount++;
			return (w);
		}

	/* M_WAITOK allocation; NULL check kept as a belt-and-braces guard */
	w = malloc(sizeof(struct wtab), M_DEVBUF, M_WAITOK|M_ZERO);
	if (w == NULL)
		panic("wtab_alloc: malloc failed!");
	w->w_weight = weight;
	w->w_refcount = 1;
	w->w_next = wtab_list;
	wtab_list = w;

	/*
	 * Initialize the weight table in fixed-point:
	 *   w_tab[0] = 1 - 1/weight = (1 - Wq)
	 *   w_tab[i] = w_tab[i-1]^2 = (1 - Wq)^(2^i)
	 * w_param_max records the first power of two at which the entry
	 * underflows to 0, i.e. the n beyond which pow_w() returns 0.
	 */
	w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
	for (i = 1; i < 32; i++) {
		w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
		if (w->w_tab[i] == 0 && w->w_param_max == 0)
			w->w_param_max = 1 << i;
	}

	return (w);
}
   1021 
   1022 int
   1023 wtab_destroy(w)
   1024 	struct wtab *w;
   1025 {
   1026 	struct wtab *prev;
   1027 
   1028 	if (--w->w_refcount > 0)
   1029 		return (0);
   1030 
   1031 	if (wtab_list == w)
   1032 		wtab_list = w->w_next;
   1033 	else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
   1034 		if (prev->w_next == w) {
   1035 			prev->w_next = w->w_next;
   1036 			break;
   1037 		}
   1038 
   1039 	FREE(w, M_DEVBUF);
   1040 	return (0);
   1041 }
   1042 
   1043 int32_t
   1044 pow_w(w, n)
   1045 	struct wtab *w;
   1046 	int n;
   1047 {
   1048 	int	i, bit;
   1049 	int32_t	val;
   1050 
   1051 	if (n >= w->w_param_max)
   1052 		return (0);
   1053 
   1054 	val = 1 << FP_SHIFT;
   1055 	if (n <= 0)
   1056 		return (val);
   1057 
   1058 	bit = 1;
   1059 	i = 0;
   1060 	while (n) {
   1061 		if (n & bit) {
   1062 			val = (val * w->w_tab[i]) >> FP_SHIFT;
   1063 			n &= ~bit;
   1064 		}
   1065 		i++;
   1066 		bit <<=  1;
   1067 	}
   1068 	return (val);
   1069 }
   1070 
   1071 #ifdef ALTQ_FLOWVALVE
   1072 
   1073 #define	FV_PSHIFT	7	/* weight of average drop rate -- 1/128 */
   1074 #define	FV_PSCALE(x)	((x) << FV_PSHIFT)
   1075 #define	FV_PUNSCALE(x)	((x) >> FV_PSHIFT)
   1076 #define	FV_FSHIFT	5	/* weight of average fraction -- 1/32 */
   1077 #define	FV_FSCALE(x)	((x) << FV_FSHIFT)
   1078 #define	FV_FUNSCALE(x)	((x) >> FV_FSHIFT)
   1079 
   1080 #define	FV_TIMER	(3 * hz)	/* timer value for garbage collector */
   1081 #define	FV_FLOWLISTSIZE		64	/* how many flows in flowlist */
   1082 
   1083 #define	FV_N			10	/* update fve_f every FV_N packets */
   1084 
   1085 #define	FV_BACKOFFTHRESH	1  /* backoff threshold interval in second */
   1086 #define	FV_TTHRESH		3  /* time threshold to delete fve */
   1087 #define	FV_ALPHA		5  /* extra packet count */
   1088 
   1089 #if (__FreeBSD_version > 300000)
   1090 #define	FV_TIMESTAMP(tp)	getmicrotime(tp)
   1091 #else
   1092 #define	FV_TIMESTAMP(tp)	{ (*(tp)) = time; }
   1093 #endif
   1094 
   1095 /*
   1096  * Brtt table: 127 entry table to convert drop rate (p) to
   1097  * the corresponding bandwidth fraction (f)
   1098  * the following equation is implemented to use scaled values,
   1099  * fve_p and fve_f, in the fixed point format.
   1100  *
   1101  *   Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p))
   1102  *   f = Brtt(p) / (max_th + alpha)
   1103  */
   1104 #define	BRTT_SIZE	128
   1105 #define	BRTT_SHIFT	12
   1106 #define	BRTT_MASK	0x0007f000
   1107 #define	BRTT_PMAX	(1 << (FV_PSHIFT + FP_SHIFT))
   1108 
   1109 const int brtt_tab[BRTT_SIZE] = {
   1110 	0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728,
   1111 	392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361,
   1112 	225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333,
   1113 	145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612,
   1114 	98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957,
   1115 	67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440,
   1116 	47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184,
   1117 	33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611,
   1118 	24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062,
   1119 	18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487,
   1120 	14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222,
   1121 	10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844,
   1122 	8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079,
   1123 	6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746,
   1124 	5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722,
   1125 	4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924
   1126 };
   1127 
/*
 * Look up the flow entry (fve) for the flow this packet belongs to,
 * keyed on the IPv4/IPv6 source and destination addresses.
 * Returns NULL when the packet has no header info, the address family
 * is unknown, or no matching active entry exists.
 *
 * The flow list is kept in LRU order by last-drop time, so the scan
 * stops at the first unused entry (lastdrop.tv_sec == 0) or at the
 * first entry older than FV_TTHRESH seconds (which is expired by
 * zeroing its timestamp) -- everything after it is older still.
 * On a miss, fv_flows is refreshed with the number of active entries
 * scanned.
 */
static inline struct fve *
flowlist_lookup(fv, pktattr, now)
	struct flowvalve *fv;
	struct altq_pktattr *pktattr;
	struct timeval *now;
{
	struct fve *fve;
	int flows;
	struct ip *ip;
#ifdef INET6
	struct ip6_hdr *ip6;
#endif
	struct timeval tthresh;

	if (pktattr == NULL)
		return (NULL);

	/* entries with a last drop older than this are stale */
	tthresh.tv_sec = now->tv_sec - FV_TTHRESH;
	flows = 0;
	/*
	 * search the flow list
	 */
	switch (pktattr->pattr_af) {
	case AF_INET:
		ip = (struct ip *)pktattr->pattr_hdr;
		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
			if (fve->fve_lastdrop.tv_sec == 0)
				break;	/* rest of the list is unused */
			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
				/* expire this entry; rest is older */
				fve->fve_lastdrop.tv_sec = 0;
				break;
			}
			if (fve->fve_flow.flow_af == AF_INET &&
			    fve->fve_flow.flow_ip.ip_src.s_addr ==
			    ip->ip_src.s_addr &&
			    fve->fve_flow.flow_ip.ip_dst.s_addr ==
			    ip->ip_dst.s_addr)
				return (fve);
			flows++;
		}
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
			if (fve->fve_lastdrop.tv_sec == 0)
				break;	/* rest of the list is unused */
			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
				/* expire this entry; rest is older */
				fve->fve_lastdrop.tv_sec = 0;
				break;
			}
			if (fve->fve_flow.flow_af == AF_INET6 &&
			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src,
					       &ip6->ip6_src) &&
			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst,
					       &ip6->ip6_dst))
				return (fve);
			flows++;
		}
		break;
#endif /* INET6 */

	default:
		/* unknown protocol.  no drop. */
		return (NULL);
	}
	fv->fv_flows = flows;	/* save the number of active fve's */
	return (NULL);
}
   1197 
   1198 static inline struct fve *
   1199 flowlist_reclaim(fv, pktattr)
   1200 	struct flowvalve *fv;
   1201 	struct altq_pktattr *pktattr;
   1202 {
   1203 	struct fve *fve;
   1204 	struct ip *ip;
   1205 #ifdef INET6
   1206 	struct ip6_hdr *ip6;
   1207 #endif
   1208 
   1209 	/*
   1210 	 * get an entry from the tail of the LRU list.
   1211 	 */
   1212 	fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead);
   1213 
   1214 	switch (pktattr->pattr_af) {
   1215 	case AF_INET:
   1216 		ip = (struct ip *)pktattr->pattr_hdr;
   1217 		fve->fve_flow.flow_af = AF_INET;
   1218 		fve->fve_flow.flow_ip.ip_src = ip->ip_src;
   1219 		fve->fve_flow.flow_ip.ip_dst = ip->ip_dst;
   1220 		break;
   1221 #ifdef INET6
   1222 	case AF_INET6:
   1223 		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
   1224 		fve->fve_flow.flow_af = AF_INET6;
   1225 		fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src;
   1226 		fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst;
   1227 		break;
   1228 #endif
   1229 	}
   1230 
   1231 	fve->fve_state = Green;
   1232 	fve->fve_p = 0.0;
   1233 	fve->fve_f = 0.0;
   1234 	fve->fve_ifseq = fv->fv_ifseq - 1;
   1235 	fve->fve_count = 0;
   1236 
   1237 	fv->fv_flows++;
   1238 #ifdef FV_STATS
   1239 	fv->fv_stats.alloc++;
   1240 #endif
   1241 	return (fve);
   1242 }
   1243 
   1244 static inline void
   1245 flowlist_move_to_head(fv, fve)
   1246 	struct flowvalve *fv;
   1247 	struct fve *fve;
   1248 {
   1249 	if (TAILQ_FIRST(&fv->fv_flowlist) != fve) {
   1250 		TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru);
   1251 		TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru);
   1252 	}
   1253 }
   1254 
   1255 /*
   1256  * allocate flowvalve structure
   1257  */
   1258 static struct flowvalve *
   1259 fv_alloc(rp)
   1260 	struct red *rp;
   1261 {
   1262 	struct flowvalve *fv;
   1263 	struct fve *fve;
   1264 	int i, num;
   1265 
   1266 	num = FV_FLOWLISTSIZE;
   1267 	fv = malloc(sizeof(struct flowvalve), M_DEVBUF, M_WAITOK|M_ZERO);
   1268 	if (fv == NULL)
   1269 		return (NULL);
   1270 
   1271 	fv->fv_fves = malloc(sizeof(struct fve) * num, M_DEVBUF,
   1272 	    M_WAITOK|M_ZERO);
   1273 	if (fv->fv_fves == NULL) {
   1274 		free(fv, M_DEVBUF);
   1275 		return (NULL);
   1276 	}
   1277 
   1278 	fv->fv_flows = 0;
   1279 	TAILQ_INIT(&fv->fv_flowlist);
   1280 	for (i = 0; i < num; i++) {
   1281 		fve = &fv->fv_fves[i];
   1282 		fve->fve_lastdrop.tv_sec = 0;
   1283 		TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru);
   1284 	}
   1285 
   1286 	/* initialize drop rate threshold in scaled fixed-point */
   1287 	fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax;
   1288 
   1289 	/* initialize drop rate to fraction table */
   1290 	fv->fv_p2ftab = malloc(sizeof(int) * BRTT_SIZE, M_DEVBUF, M_WAITOK);
   1291 	if (fv->fv_p2ftab == NULL) {
   1292 		free(fv->fv_fves, M_DEVBUF);
   1293 		free(fv, M_DEVBUF);
   1294 		return (NULL);
   1295 	}
   1296 	/*
   1297 	 * create the p2f table.
   1298 	 * (shift is used to keep the precision)
   1299 	 */
   1300 	for (i = 1; i < BRTT_SIZE; i++) {
   1301 		int f;
   1302 
   1303 		f = brtt_tab[i] << 8;
   1304 		fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8;
   1305 	}
   1306 
   1307 	return (fv);
   1308 }
   1309 
   1310 static void fv_destroy(fv)
   1311 	struct flowvalve *fv;
   1312 {
   1313 	FREE(fv->fv_p2ftab, M_DEVBUF);
   1314 	FREE(fv->fv_fves, M_DEVBUF);
   1315 	FREE(fv, M_DEVBUF);
   1316 }
   1317 
   1318 static inline int
   1319 fv_p2f(fv, p)
   1320 	struct flowvalve	*fv;
   1321 	int	p;
   1322 {
   1323 	int val, f;
   1324 
   1325 	if (p >= BRTT_PMAX)
   1326 		f = fv->fv_p2ftab[BRTT_SIZE-1];
   1327 	else if ((val = (p & BRTT_MASK)))
   1328 		f = fv->fv_p2ftab[(val >> BRTT_SHIFT)];
   1329 	else
   1330 		f = fv->fv_p2ftab[1];
   1331 	return (f);
   1332 }
   1333 
   1334 /*
   1335  * check if an arriving packet should be pre-dropped.
   1336  * called from red_addq() when a packet arrives.
   1337  * returns 1 when the packet should be pre-dropped.
   1338  * should be called in splnet.
   1339  */
   1340 static int
   1341 fv_checkflow(fv, pktattr, fcache)
   1342 	struct flowvalve *fv;
   1343 	struct altq_pktattr *pktattr;
   1344 	struct fve **fcache;
   1345 {
   1346 	struct fve *fve;
   1347 	struct timeval now;
   1348 
   1349 	fv->fv_ifseq++;
   1350 	FV_TIMESTAMP(&now);
   1351 
   1352 	if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
   1353 		/* no matching entry in the flowlist */
   1354 		return (0);
   1355 
   1356 	*fcache = fve;
   1357 
   1358 	/* update fraction f for every FV_N packets */
   1359 	if (++fve->fve_count == FV_N) {
   1360 		/*
   1361 		 * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f
   1362 		 */
   1363 		fve->fve_f =
   1364 			(FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq)
   1365 			+ fve->fve_f - FV_FUNSCALE(fve->fve_f);
   1366 		fve->fve_ifseq = fv->fv_ifseq;
   1367 		fve->fve_count = 0;
   1368 	}
   1369 
   1370 	/*
   1371 	 * overpumping test
   1372 	 */
   1373 	if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) {
   1374 		int fthresh;
   1375 
   1376 		/* calculate a threshold */
   1377 		fthresh = fv_p2f(fv, fve->fve_p);
   1378 		if (fve->fve_f > fthresh)
   1379 			fve->fve_state = Red;
   1380 	}
   1381 
   1382 	if (fve->fve_state == Red) {
   1383 		/*
   1384 		 * backoff test
   1385 		 */
   1386 		if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) {
   1387 			/* no drop for at least FV_BACKOFFTHRESH sec */
   1388 			fve->fve_p = 0;
   1389 			fve->fve_state = Green;
   1390 #ifdef FV_STATS
   1391 			fv->fv_stats.escape++;
   1392 #endif
   1393 		} else {
   1394 			/* block this flow */
   1395 			flowlist_move_to_head(fv, fve);
   1396 			fve->fve_lastdrop = now;
   1397 #ifdef FV_STATS
   1398 			fv->fv_stats.predrop++;
   1399 #endif
   1400 			return (1);
   1401 		}
   1402 	}
   1403 
   1404 	/*
   1405 	 * p = (1 - Wp) * p
   1406 	 */
   1407 	fve->fve_p -= FV_PUNSCALE(fve->fve_p);
   1408 	if (fve->fve_p < 0)
   1409 		fve->fve_p = 0;
   1410 #ifdef FV_STATS
   1411 	fv->fv_stats.pass++;
   1412 #endif
   1413 	return (0);
   1414 }
   1415 
   1416 /*
   1417  * called from red_addq when a packet is dropped by red.
   1418  * should be called in splnet.
   1419  */
   1420 static void fv_dropbyred(fv, pktattr, fcache)
   1421 	struct flowvalve *fv;
   1422 	struct altq_pktattr *pktattr;
   1423 	struct fve *fcache;
   1424 {
   1425 	struct fve *fve;
   1426 	struct timeval now;
   1427 
   1428 	if (pktattr == NULL)
   1429 		return;
   1430 	FV_TIMESTAMP(&now);
   1431 
   1432 	if (fcache != NULL)
   1433 		/* the fve of this packet is already cached */
   1434 		fve = fcache;
   1435 	else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
   1436 		fve = flowlist_reclaim(fv, pktattr);
   1437 
   1438 	flowlist_move_to_head(fv, fve);
   1439 
   1440 	/*
   1441 	 * update p:  the following line cancels the update
   1442 	 *	      in fv_checkflow() and calculate
   1443 	 *	p = Wp + (1 - Wp) * p
   1444 	 */
   1445 	fve->fve_p = (1 << FP_SHIFT) + fve->fve_p;
   1446 
   1447 	fve->fve_lastdrop = now;
   1448 }
   1449 
   1450 #endif /* ALTQ_FLOWVALVE */
   1451 
   1452 #ifdef KLD_MODULE
   1453 
/* discipline switch entry: hooks RED's open/close/ioctl into altq */
static struct altqsw red_sw =
	{"red", redopen, redclose, redioctl};

/* register this discipline as a loadable kernel module */
ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw);
   1458 
   1459 #endif /* KLD_MODULE */
   1460 
   1461 #endif /* ALTQ_RED */
   1462