Home | History | Annotate | Line # | Download | only in altq
altq_red.c revision 1.19
      1 /*	$NetBSD: altq_red.c,v 1.19 2006/07/21 16:48:45 ad Exp $	*/
      2 /*	$KAME: altq_red.c,v 1.9 2002/01/07 11:25:40 kjc Exp $	*/
      3 
      4 /*
      5  * Copyright (C) 1997-2000
      6  *	Sony Computer Science Laboratories Inc.  All rights reserved.
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  * 1. Redistributions of source code must retain the above copyright
     12  *    notice, this list of conditions and the following disclaimer.
     13  * 2. Redistributions in binary form must reproduce the above copyright
     14  *    notice, this list of conditions and the following disclaimer in the
     15  *    documentation and/or other materials provided with the distribution.
     16  *
     17  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
     18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     20  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
     21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     27  * SUCH DAMAGE.
     28  *
     29  */
     30 /*
     31  * Copyright (c) 1990-1994 Regents of the University of California.
     32  * All rights reserved.
     33  *
     34  * Redistribution and use in source and binary forms, with or without
     35  * modification, are permitted provided that the following conditions
     36  * are met:
     37  * 1. Redistributions of source code must retain the above copyright
     38  *    notice, this list of conditions and the following disclaimer.
     39  * 2. Redistributions in binary form must reproduce the above copyright
     40  *    notice, this list of conditions and the following disclaimer in the
     41  *    documentation and/or other materials provided with the distribution.
     42  * 3. All advertising materials mentioning features or use of this software
     43  *    must display the following acknowledgement:
     44  *	This product includes software developed by the Computer Systems
     45  *	Engineering Group at Lawrence Berkeley Laboratory.
     46  * 4. Neither the name of the University nor of the Laboratory may be used
     47  *    to endorse or promote products derived from this software without
     48  *    specific prior written permission.
     49  *
     50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     60  * SUCH DAMAGE.
     61  */
     62 
     63 #include <sys/cdefs.h>
     64 __KERNEL_RCSID(0, "$NetBSD: altq_red.c,v 1.19 2006/07/21 16:48:45 ad Exp $");
     65 
     66 #if defined(__FreeBSD__) || defined(__NetBSD__)
     67 #include "opt_altq.h"
     68 #if (__FreeBSD__ != 2)
     69 #include "opt_inet.h"
     70 #ifdef __FreeBSD__
     71 #include "opt_inet6.h"
     72 #endif
     73 #endif
     74 #endif /* __FreeBSD__ || __NetBSD__ */
     75 #ifdef ALTQ_RED	/* red is enabled by ALTQ_RED option in opt_altq.h */
     76 
     77 #include <sys/param.h>
     78 #include <sys/malloc.h>
     79 #include <sys/mbuf.h>
     80 #include <sys/socket.h>
     81 #include <sys/sockio.h>
     82 #include <sys/systm.h>
     83 #include <sys/proc.h>
     84 #include <sys/errno.h>
     85 #include <sys/kernel.h>
     86 #include <sys/kauth.h>
     87 #ifdef ALTQ_FLOWVALVE
     88 #include <sys/queue.h>
     89 #include <sys/time.h>
     90 #endif
     91 
     92 #include <net/if.h>
     93 #include <net/if_types.h>
     94 
     95 #include <netinet/in.h>
     96 #include <netinet/in_systm.h>
     97 #include <netinet/ip.h>
     98 #ifdef INET6
     99 #include <netinet/ip6.h>
    100 #endif
    101 
    102 #include <altq/altq.h>
    103 #include <altq/altq_conf.h>
    104 #include <altq/altq_red.h>
    105 #ifdef ALTQ_FLOWVALVE
    106 #include <altq/altq_flowvalve.h>
    107 #endif
    108 
    109 /*
    110  * ALTQ/RED (Random Early Detection) implementation using 32-bit
    111  * fixed-point calculation.
    112  *
    113  * written by kjc using the ns code as a reference.
    114  * you can learn more about red and ns from Sally's home page at
    115  * http://www-nrg.ee.lbl.gov/floyd/
    116  *
    117  * most of the red parameter values are fixed in this implementation
    118  * to prevent fixed-point overflow/underflow.
    119  * if you change the parameters, watch out for overflow/underflow!
    120  *
    121  * the parameters used are recommended values by Sally.
    122  * the corresponding ns config looks:
    123  *	q_weight=0.00195
    124  *	minthresh=5 maxthresh=15 queue-size=60
    125  *	linterm=30
    126  *	dropmech=drop-tail
    127  *	bytes=false (can't be handled by 32-bit fixed-point)
    128  *	doubleq=false dqthresh=false
    129  *	wait=true
    130  */
    131 /*
    132  * alternative red parameters for a slow link.
    133  *
    134  * assume the queue length becomes from zero to L and keeps L, it takes
    135  * N packets for q_avg to reach 63% of L.
    136  * when q_weight is 0.002, N is about 500 packets.
    137  * for a slow link like dial-up, 500 packets takes more than 1 minute!
    138  * when q_weight is 0.008, N is about 127 packets.
    139  * when q_weight is 0.016, N is about 63 packets.
    140  * bursts of 50 packets are allowd for 0.002, bursts of 25 packets
    141  * are allowed for 0.016.
    142  * see Sally's paper for more details.
    143  */
    144 /* normal red parameters */
    145 #define	W_WEIGHT	512	/* inverse of weight of EWMA (511/512) */
    146 				/* q_weight = 0.00195 */
    147 
    148 /* red parameters for a slow link */
    149 #define	W_WEIGHT_1	128	/* inverse of weight of EWMA (127/128) */
    150 				/* q_weight = 0.0078125 */
    151 
    152 /* red parameters for a very slow link (e.g., dialup) */
    153 #define	W_WEIGHT_2	64	/* inverse of weight of EWMA (63/64) */
    154 				/* q_weight = 0.015625 */
    155 
    156 /* fixed-point uses 12-bit decimal places */
    157 #define	FP_SHIFT	12	/* fixed-point shift */
    158 
    159 /* red parameters for drop probability */
    160 #define	INV_P_MAX	10	/* inverse of max drop probability */
    161 #define	TH_MIN		5	/* min threshold */
    162 #define	TH_MAX		15	/* max threshold */
    163 
    164 #define	RED_LIMIT	60	/* default max queue length */
    165 
    166 /*
    167  * our default policy for forced-drop is drop-tail.
    168  * (in altq-1.1.2 or earlier, the default was random-drop.
    169  * but it makes more sense to punish the cause of the surge.)
    170  * to switch to the random-drop policy, define "RED_RANDOM_DROP".
    171  */
    172 
    173 #ifdef ALTQ_FLOWVALVE
    174 /*
    175  * flow-valve is an extension to protect red from unresponsive flows
    176  * and to promote end-to-end congestion control.
    177  * flow-valve observes the average drop rates of the flows that have
    178  * experienced packet drops in the recent past.
    179  * when the average drop rate exceeds the threshold, the flow is
    180  * blocked by the flow-valve.  the trapped flow should back off
    181  * exponentially to escape from the flow-valve.
    182  */
    183 #ifdef RED_RANDOM_DROP
    184 #error "random-drop can't be used with flow-valve!"
    185 #endif
    186 #endif /* ALTQ_FLOWVALVE */
    187 
    188 /* red_list keeps all red_queue_t's allocated. */
    189 static red_queue_t *red_list = NULL;
    190 
    191 /* default red parameter values */
    192 static int default_th_min = TH_MIN;
    193 static int default_th_max = TH_MAX;
    194 static int default_inv_pmax = INV_P_MAX;
    195 
    196 /* internal function prototypes */
    197 static int red_enqueue __P((struct ifaltq *, struct mbuf *,
    198 			    struct altq_pktattr *));
    199 static struct mbuf *red_dequeue __P((struct ifaltq *, int));
    200 static int red_request __P((struct ifaltq *, int, void *));
    201 static void red_purgeq __P((red_queue_t *));
    202 static int red_detach __P((red_queue_t *));
    203 #ifdef ALTQ_FLOWVALVE
    204 static inline struct fve *flowlist_lookup __P((struct flowvalve *,
    205 			 struct altq_pktattr *, struct timeval *));
    206 static inline struct fve *flowlist_reclaim __P((struct flowvalve *,
    207 						  struct altq_pktattr *));
    208 static inline void flowlist_move_to_head __P((struct flowvalve *,
    209 						struct fve *));
    210 static inline int fv_p2f __P((struct flowvalve *, int));
    211 static struct flowvalve *fv_alloc __P((struct red *));
    212 static void fv_destroy __P((struct flowvalve *));
    213 static int fv_checkflow __P((struct flowvalve *, struct altq_pktattr *,
    214 			     struct fve **));
    215 static void fv_dropbyred __P((struct flowvalve *fv, struct altq_pktattr *,
    216 			      struct fve *));
    217 #endif
    218 
    219 /*
    220  * red device interface
    221  */
    222 altqdev_decl(red);
    223 
/*
 * red device open.  there is nothing to set up here; all real
 * initialization is deferred until the queueing scheme is attached
 * to an interface via the RED_IF_ATTACH ioctl.
 */
int
redopen(dev_t dev, int flag, int fmt, struct lwp *l)
{

	return 0;
}
    233 
    234 int
    235 redclose(dev, flag, fmt, l)
    236 	dev_t dev;
    237 	int flag, fmt;
    238 	struct lwp *l;
    239 {
    240 	red_queue_t *rqp;
    241 	int err, error = 0;
    242 
    243 	while ((rqp = red_list) != NULL) {
    244 		/* destroy all */
    245 		err = red_detach(rqp);
    246 		if (err != 0 && error == 0)
    247 			error = err;
    248 	}
    249 
    250 	return error;
    251 }
    252 
/*
 * red device ioctl handler.
 *
 * supported commands: attach/detach red to an interface, enable/
 * disable the discipline, read statistics, set parameters, and set
 * the global defaults.  `addr' carries the per-command argument
 * structure.  returns 0 on success or an errno value.
 */
int
redioctl(dev, cmd, addr, flag, l)
	dev_t dev;
	ioctlcmd_t cmd;
	caddr_t addr;
	int flag;
	struct lwp *l;
{
	red_queue_t *rqp;
	struct red_interface *ifacep;
	struct ifnet *ifp;
	int	error = 0;

	/* check super-user privilege; reading stats needs no privilege */
	switch (cmd) {
	case RED_GETSTATS:
		break;
	default:
#if (__FreeBSD_version > 400000)
		/*
		 * NOTE(review): `p' is not declared in this function (the
		 * parameter is the lwp `l'), so this FreeBSD-only branch
		 * would not compile if selected -- confirm before enabling.
		 */
		if ((error = suser(p)) != 0)
#else
		if ((error = kauth_authorize_generic(l->l_cred,
		    KAUTH_GENERIC_ISSUSER, &l->l_acflag)) != 0)
#endif
			return (error);
		break;
	}

	switch (cmd) {

	case RED_ENABLE:
		/* start red on an interface that was previously attached */
		ifacep = (struct red_interface *)addr;
		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
			error = EBADF;
			break;
		}
		error = altq_enable(rqp->rq_ifq);
		break;

	case RED_DISABLE:
		/* stop red but keep the state attached */
		ifacep = (struct red_interface *)addr;
		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
			error = EBADF;
			break;
		}
		error = altq_disable(rqp->rq_ifq);
		break;

	case RED_IF_ATTACH:
		ifp = ifunit(((struct red_interface *)addr)->red_ifname);
		if (ifp == NULL) {
			error = ENXIO;
			break;
		}

		/* allocate and initialize red_queue_t */
		/* NOTE(review): with M_WAITOK these mallocs do not return
		 * NULL on NetBSD; the NULL checks below are defensive. */
		rqp = malloc(sizeof(red_queue_t), M_DEVBUF, M_WAITOK|M_ZERO);
		if (rqp == NULL) {
			error = ENOMEM;
			break;
		}

		rqp->rq_q = malloc(sizeof(class_queue_t), M_DEVBUF,
		    M_WAITOK|M_ZERO);
		if (rqp->rq_q == NULL) {
			free(rqp, M_DEVBUF);
			error = ENOMEM;
			break;
		}

		/* all zeros selects the built-in default red parameters */
		rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0);
		if (rqp->rq_red == NULL) {
			free(rqp->rq_q, M_DEVBUF);
			free(rqp, M_DEVBUF);
			error = ENOMEM;
			break;
		}

		/* set up the underlying drop-tail queue */
		rqp->rq_ifq = &ifp->if_snd;
		qtail(rqp->rq_q) = NULL;
		qlen(rqp->rq_q) = 0;
		qlimit(rqp->rq_q) = RED_LIMIT;
		qtype(rqp->rq_q) = Q_RED;

		/*
		 * set RED to this ifnet structure.
		 */
		error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp,
				    red_enqueue, red_dequeue, red_request,
				    NULL, NULL);
		if (error) {
			/* undo the allocations done above */
			red_destroy(rqp->rq_red);
			free(rqp->rq_q, M_DEVBUF);
			free(rqp, M_DEVBUF);
			break;
		}

		/* add this state to the red list */
		rqp->rq_next = red_list;
		red_list = rqp;
		break;

	case RED_IF_DETACH:
		ifacep = (struct red_interface *)addr;
		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
			error = EBADF;
			break;
		}
		error = red_detach(rqp);
		break;

	case RED_GETSTATS:
		/* copy the current counters and parameters out to userland */
		do {
			struct red_stats *q_stats;
			red_t *rp;

			q_stats = (struct red_stats *)addr;
			if ((rqp = altq_lookup(q_stats->iface.red_ifname,
					     ALTQT_RED)) == NULL) {
				error = EBADF;
				break;
			}

			q_stats->q_len 	   = qlen(rqp->rq_q);
			q_stats->q_limit   = qlimit(rqp->rq_q);

			rp = rqp->rq_red;
			/* descale the EWMA average (scaled by 2^wshift) */
			q_stats->q_avg 	   = rp->red_avg >> rp->red_wshift;
			q_stats->xmit_cnt  = rp->red_stats.xmit_cnt;
			q_stats->drop_cnt  = rp->red_stats.drop_cnt;
			q_stats->drop_forced   = rp->red_stats.drop_forced;
			q_stats->drop_unforced = rp->red_stats.drop_unforced;
			q_stats->marked_packets = rp->red_stats.marked_packets;

			q_stats->weight		= rp->red_weight;
			q_stats->inv_pmax	= rp->red_inv_pmax;
			q_stats->th_min		= rp->red_thmin;
			q_stats->th_max		= rp->red_thmax;

#ifdef ALTQ_FLOWVALVE
			if (rp->red_flowvalve != NULL) {
				struct flowvalve *fv = rp->red_flowvalve;
				q_stats->fv_flows    = fv->fv_flows;
				q_stats->fv_pass     = fv->fv_stats.pass;
				q_stats->fv_predrop  = fv->fv_stats.predrop;
				q_stats->fv_alloc    = fv->fv_stats.alloc;
				q_stats->fv_escape   = fv->fv_stats.escape;
			} else {
#endif /* ALTQ_FLOWVALVE */
				q_stats->fv_flows    = 0;
				q_stats->fv_pass     = 0;
				q_stats->fv_predrop  = 0;
				q_stats->fv_alloc    = 0;
				q_stats->fv_escape   = 0;
#ifdef ALTQ_FLOWVALVE
			}
#endif /* ALTQ_FLOWVALVE */
		} while (0);
		break;

	case RED_CONFIG:
		/* replace the red state with one built from new parameters */
		do {
			struct red_conf *fc;
			red_t *new;
			int s, limit;

			fc = (struct red_conf *)addr;
			if ((rqp = altq_lookup(fc->iface.red_ifname,
					       ALTQT_RED)) == NULL) {
				error = EBADF;
				break;
			}
			new = red_alloc(fc->red_weight,
					fc->red_inv_pmax,
					fc->red_thmin,
					fc->red_thmax,
					fc->red_flags,
					fc->red_pkttime);
			if (new == NULL) {
				error = ENOMEM;
				break;
			}

			/* swap in the new state with interrupts blocked */
			s = splnet();
			red_purgeq(rqp);
			/* clamp the hard limit so it is never below th_max */
			limit = fc->red_limit;
			if (limit < fc->red_thmax)
				limit = fc->red_thmax;
			qlimit(rqp->rq_q) = limit;
			/* NOTE(review): this write-back is repeated below
			 * outside the splnet section; harmless duplication. */
			fc->red_limit = limit;	/* write back the new value */

			red_destroy(rqp->rq_red);
			rqp->rq_red = new;

			splx(s);

			/* write back new values */
			fc->red_limit = limit;
			fc->red_inv_pmax = rqp->rq_red->red_inv_pmax;
			fc->red_thmin = rqp->rq_red->red_thmin;
			fc->red_thmax = rqp->rq_red->red_thmax;

		} while (0);
		break;

	case RED_SETDEFAULTS:
		/* change the defaults used when red_alloc() args are zero */
		do {
			struct redparams *rp;

			rp = (struct redparams *)addr;

			default_th_min = rp->th_min;
			default_th_max = rp->th_max;
			default_inv_pmax = rp->inv_pmax;
		} while (0);
		break;

	default:
		error = EINVAL;
		break;
	}
	return error;
}
    476 
    477 static int
    478 red_detach(rqp)
    479 	red_queue_t *rqp;
    480 {
    481 	red_queue_t *tmp;
    482 	int error = 0;
    483 
    484 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
    485 		altq_disable(rqp->rq_ifq);
    486 
    487 	if ((error = altq_detach(rqp->rq_ifq)))
    488 		return (error);
    489 
    490 	if (red_list == rqp)
    491 		red_list = rqp->rq_next;
    492 	else {
    493 		for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next)
    494 			if (tmp->rq_next == rqp) {
    495 				tmp->rq_next = rqp->rq_next;
    496 				break;
    497 			}
    498 		if (tmp == NULL)
    499 			printf("red_detach: no state found in red_list!\n");
    500 	}
    501 
    502 	red_destroy(rqp->rq_red);
    503 	free(rqp->rq_q, M_DEVBUF);
    504 	free(rqp, M_DEVBUF);
    505 	return (error);
    506 }
    507 
    508 /*
    509  * red support routines
    510  */
    511 
/*
 * allocate and initialize a red state.
 *
 * any argument passed as zero is replaced by its default:
 *	weight:   inverse of the EWMA weight; must be a power of 2
 *	inv_pmax: inverse of the max drop probability
 *	th_min:   min threshold in packets
 *	th_max:   max threshold in packets
 *	flags:    REDF_* flags (ECN marking, flow-valve, ...)
 *	pkttime:  average packet transmission time in usec
 * returns the new state, or NULL if allocation fails.
 */
red_t *
red_alloc(weight, inv_pmax, th_min, th_max, flags, pkttime)
	int	weight, inv_pmax, th_min, th_max;
	int	flags, pkttime;
{
	red_t 	*rp;
	int	w, i;
	int	npkts_per_sec;

	rp = malloc(sizeof(red_t), M_DEVBUF, M_WAITOK|M_ZERO);
	if (rp == NULL)
		return (NULL);

	/* start idle with an empty average */
	rp->red_avg = 0;
	rp->red_idle = 1;

	/* substitute defaults for any zero parameter */
	if (weight == 0)
		rp->red_weight = W_WEIGHT;
	else
		rp->red_weight = weight;
	if (inv_pmax == 0)
		rp->red_inv_pmax = default_inv_pmax;
	else
		rp->red_inv_pmax = inv_pmax;
	if (th_min == 0)
		rp->red_thmin = default_th_min;
	else
		rp->red_thmin = th_min;
	if (th_max == 0)
		rp->red_thmax = default_th_max;
	else
		rp->red_thmax = th_max;

	rp->red_flags = flags;

	if (pkttime == 0)
		/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
		rp->red_pkttime = 800;
	else
		rp->red_pkttime = pkttime;

	if (weight == 0) {
		/* when the link is very slow, adjust red parameters */
		npkts_per_sec = 1000000 / rp->red_pkttime;
		if (npkts_per_sec < 50) {
			/* up to about 400Kbps */
			rp->red_weight = W_WEIGHT_2;
		} else if (npkts_per_sec < 300) {
			/* up to about 2.4Mbps */
			rp->red_weight = W_WEIGHT_1;
		}
	}

	/* calculate wshift.  weight must be power of 2 */
	w = rp->red_weight;
	for (i = 0; w > 1; i++)
		w = w >> 1;
	rp->red_wshift = i;
	w = 1 << rp->red_wshift;
	/* if weight was not a power of 2, round it down and warn */
	if (w != rp->red_weight) {
		printf("invalid weight value %d for red! use %d\n",
		       rp->red_weight, w);
		rp->red_weight = w;
	}

	/*
	 * thmin_s and thmax_s are scaled versions of th_min and th_max
	 * to be compared with avg.
	 */
	rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
	rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);

	/*
	 * precompute probability denominator
	 *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
	 */
	rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
			 * rp->red_inv_pmax) << FP_SHIFT;

	/* allocate weight table (shared/refcounted per weight value) */
	rp->red_wtab = wtab_alloc(rp->red_weight);

	/* record the current time as the start of the idle period */
	microtime(&rp->red_last);
#ifdef ALTQ_FLOWVALVE
	if (flags & REDF_FLOWVALVE)
		rp->red_flowvalve = fv_alloc(rp);
	/* if fv_alloc failes, flowvalve is just disabled */
#endif
	return (rp);
}
    602 
    603 void
    604 red_destroy(rp)
    605 	red_t *rp;
    606 {
    607 #ifdef ALTQ_FLOWVALVE
    608 	if (rp->red_flowvalve != NULL)
    609 		fv_destroy(rp->red_flowvalve);
    610 #endif
    611 	wtab_destroy(rp->red_wtab);
    612 	free(rp, M_DEVBUF);
    613 }
    614 
    615 void
    616 red_getstats(rp, sp)
    617 	red_t *rp;
    618 	struct redstats *sp;
    619 {
    620 	sp->q_avg 		= rp->red_avg >> rp->red_wshift;
    621 	sp->xmit_cnt		= rp->red_stats.xmit_cnt;
    622 	sp->drop_cnt		= rp->red_stats.drop_cnt;
    623 	sp->drop_forced		= rp->red_stats.drop_forced;
    624 	sp->drop_unforced	= rp->red_stats.drop_unforced;
    625 	sp->marked_packets	= rp->red_stats.marked_packets;
    626 }
    627 
    628 /*
    629  * enqueue routine:
    630  *
    631  *	returns: 0 when successfully queued.
    632  *		 ENOBUFS when drop occurs.
    633  */
    634 static int
    635 red_enqueue(ifq, m, pktattr)
    636 	struct ifaltq *ifq;
    637 	struct mbuf *m;
    638 	struct altq_pktattr *pktattr;
    639 {
    640 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
    641 
    642 	if (red_addq(rqp->rq_red, rqp->rq_q, m, pktattr) < 0)
    643 		return ENOBUFS;
    644 	ifq->ifq_len++;
    645 	return 0;
    646 }
    647 
/*
 * red admission routine: update the EWMA queue-length average and
 * decide whether to enqueue, ECN-mark, or drop the arriving packet.
 *
 *	rp:      red state
 *	q:       underlying packet queue
 *	m:       arriving packet
 *	pktattr: packet header info used for ECN marking / flow-valve
 *
 *	returns: 0 when the packet was queued (possibly ECN-marked),
 *		 -1 when it was dropped (m has been freed here).
 */
int
red_addq(rp, q, m, pktattr)
	red_t *rp;
	class_queue_t *q;
	struct mbuf *m;
	struct altq_pktattr *pktattr;
{
	int avg, droptype;
	int n;
#ifdef ALTQ_FLOWVALVE
	struct fve *fve = NULL;

	/* a flow trapped by the flow-valve is dropped unconditionally */
	if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0)
		if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) {
			m_freem(m);
			return (-1);
		}
#endif

	avg = rp->red_avg;

	/*
	 * if we were idle, we pretend that n packets arrived during
	 * the idle period.
	 */
	if (rp->red_idle) {
		struct timeval now;
		int t;

		rp->red_idle = 0;
		microtime(&now);
		t = (now.tv_sec - rp->red_last.tv_sec);
		if (t > 60) {
			/*
			 * being idle for more than 1 minute, set avg to zero.
			 * this prevents t from overflow.
			 */
			avg = 0;
		} else {
			/* idle time in usec, then in packet-times */
			t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
			n = t / rp->red_pkttime - 1;

			/* the following line does (avg = (1 - Wq)^n * avg) */
			if (n > 0)
				avg = (avg >> FP_SHIFT) *
				    pow_w(rp->red_wtab, n);
		}
	}

	/* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
	avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
	rp->red_avg = avg;		/* save the new value */

	/*
	 * red_count keeps a tally of arriving traffic that has not
	 * been dropped.
	 */
	rp->red_count++;

	/* see if we drop early */
	droptype = DTYPE_NODROP;
	if (avg >= rp->red_thmin_s && qlen(q) > 1) {
		if (avg >= rp->red_thmax_s) {
			/* avg >= th_max: forced drop */
			droptype = DTYPE_FORCED;
		} else if (rp->red_old == 0) {
			/* first exceeds th_min */
			rp->red_count = 1;
			rp->red_old = 1;
		} else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
				      rp->red_probd, rp->red_count)) {
			/* mark or drop by red */
			if ((rp->red_flags & REDF_ECN) &&
			    mark_ecn(m, pktattr, rp->red_flags)) {
				/* successfully marked.  do not drop. */
				rp->red_count = 0;
#ifdef RED_STATS
				rp->red_stats.marked_packets++;
#endif
			} else {
				/* unforced drop by red */
				droptype = DTYPE_EARLY;
			}
		}
	} else {
		/* avg < th_min */
		rp->red_old = 0;
	}

	/*
	 * if the queue length hits the hard limit, it's a forced drop.
	 */
	if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
		droptype = DTYPE_FORCED;

#ifdef RED_RANDOM_DROP
	/* if successful or forced drop, enqueue this packet. */
	if (droptype != DTYPE_EARLY)
		_addq(q, m);
#else
	/* if successful, enqueue this packet. */
	if (droptype == DTYPE_NODROP)
		_addq(q, m);
#endif
	if (droptype != DTYPE_NODROP) {
		if (droptype == DTYPE_EARLY) {
			/* drop the incoming packet */
#ifdef RED_STATS
			rp->red_stats.drop_unforced++;
#endif
		} else {
			/* forced drop, select a victim packet in the queue. */
#ifdef RED_RANDOM_DROP
			m = _getq_random(q);
#endif
#ifdef RED_STATS
			rp->red_stats.drop_forced++;
#endif
		}
#ifdef RED_STATS
		PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
#endif
		rp->red_count = 0;
#ifdef ALTQ_FLOWVALVE
		/* charge the drop to the flow for flow-valve accounting */
		if (rp->red_flowvalve != NULL)
			fv_dropbyred(rp->red_flowvalve, pktattr, fve);
#endif
		m_freem(m);
		return (-1);
	}
	/* successfully queued */
#ifdef RED_STATS
	PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
#endif
	return (0);
}
    784 
/*
 * early-drop probability is calculated as follows:
 *   prob = p_max * (avg - th_min) / (th_max - th_min)
 *   prob_a = prob / (2 - count*prob)
 *	    = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
 * here prob_a increases as successive undrop count increases.
 * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
 * becomes 1 when (count >= (2 / prob))).
 *
 *	fp_len:   (avg - TH_MIN) in fixed-point
 *	fp_probd: (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
 *	count:    number of successive undropped packets
 *	returns:  1 to drop or mark, 0 to pass the packet through.
 */
int
drop_early(int fp_len, int fp_probd, int count)
{
	int denom;	/* denominator of drop-probability */

	denom = fp_probd - count * fp_len;
	if (denom <= 0) {
		/* count exceeds the hard limit: drop or mark */
		return (1);
	}

	/*
	 * now the range of denom is [1..600] in fixed-point. (when
	 * th_max-th_min=10 and p_max=1/30)
	 * drop probability = (avg - TH_MIN) / denom
	 */
	if ((random() % denom) < fp_len)
		return (1);	/* drop or mark */

	return (0);		/* no drop/mark */
}
    820 
/*
 * try to mark CE bit to the packet.
 *    returns 1 if successfully marked, 0 otherwise.
 *
 * the packet must be IPv4 (with REDF_ECN4 set) or IPv6 (with
 * REDF_ECN6 set) and its ECN field must indicate an ECN-capable
 * transport; otherwise the packet cannot be marked and 0 is
 * returned so the caller falls back to dropping.
 */
int
mark_ecn(m, pktattr, flags)
	struct mbuf *m;
	struct altq_pktattr *pktattr;
	int flags;
{
	struct mbuf *m0;

	if (pktattr == NULL ||
	    (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6))
		return (0);

	/* verify that pattr_hdr is within the mbuf data */
	for (m0 = m; m0 != NULL; m0 = m0->m_next)
		if ((pktattr->pattr_hdr >= m0->m_data) &&
		    (pktattr->pattr_hdr < m0->m_data + m0->m_len))
			break;
	if (m0 == NULL) {
		/* ick, pattr_hdr is stale */
		pktattr->pattr_af = AF_UNSPEC;
		return (0);
	}

	switch (pktattr->pattr_af) {
	case AF_INET:
		if (flags & REDF_ECN4) {
			struct ip *ip = (struct ip *)pktattr->pattr_hdr;
			u_int8_t otos;
			int sum;

			if (ip->ip_v != 4)
				return (0);	/* version mismatch! */

			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
				return (0);	/* not-ECT */
			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
				return (1);	/* already marked */

			/*
			 * ecn-capable but not marked,
			 * mark CE and update checksum
			 */
			otos = ip->ip_tos;
			ip->ip_tos |= IPTOS_ECN_CE;
			/*
			 * update checksum (from RFC1624)
			 *	   HC' = ~(~HC + ~m + m')
			 * where m is the old tos and m' is the new tos.
			 */
			sum = ~ntohs(ip->ip_sum) & 0xffff;
			sum += (~otos & 0xffff) + ip->ip_tos;
			/* fold the carries back into the low 16 bits */
			sum = (sum >> 16) + (sum & 0xffff);
			sum += (sum >> 16);  /* add carry */
			ip->ip_sum = htons(~sum & 0xffff);
			return (1);
		}
		break;
#ifdef INET6
	case AF_INET6:
		if (flags & REDF_ECN6) {
			struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
			u_int32_t flowlabel;

			/* the ECN field lives at bits 20-21 of the first
			 * 32-bit word of the IPv6 header (traffic class) */
			flowlabel = ntohl(ip6->ip6_flow);
			if ((flowlabel >> 28) != 6)
				return (0);	/* version mismatch! */
			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
			    (IPTOS_ECN_NOTECT << 20))
				return (0);	/* not-ECT */
			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
			    (IPTOS_ECN_CE << 20))
				return (1);	/* already marked */
			/*
			 * ecn-capable but not marked,  mark CE
			 * (no checksum to fix: IPv6 has no header checksum)
			 */
			flowlabel |= (IPTOS_ECN_CE << 20);
			ip6->ip6_flow = htonl(flowlabel);
			return (1);
		}
		break;
#endif  /* INET6 */
	}

	/* not marked */
	return (0);
}
    910 
    911 /*
    912  * dequeue routine:
    913  *	must be called in splnet.
    914  *
    915  *	returns: mbuf dequeued.
    916  *		 NULL when no packet is available in the queue.
    917  */
    918 
    919 static struct mbuf *
    920 red_dequeue(ifq, op)
    921 	struct ifaltq *ifq;
    922 	int op;
    923 {
    924 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
    925 	struct mbuf *m;
    926 
    927 	if (op == ALTDQ_POLL)
    928 		return qhead(rqp->rq_q);
    929 
    930 	/* op == ALTDQ_REMOVE */
    931 	m =  red_getq(rqp->rq_red, rqp->rq_q);
    932 	if (m != NULL)
    933 		ifq->ifq_len--;
    934 	return (m);
    935 }
    936 
    937 struct mbuf *
    938 red_getq(rp, q)
    939 	red_t *rp;
    940 	class_queue_t *q;
    941 {
    942 	struct mbuf *m;
    943 
    944 	if ((m = _getq(q)) == NULL) {
    945 		if (rp->red_idle == 0) {
    946 			rp->red_idle = 1;
    947 			microtime(&rp->red_last);
    948 		}
    949 		return NULL;
    950 	}
    951 
    952 	rp->red_idle = 0;
    953 	return (m);
    954 }
    955 
    956 static int
    957 red_request(ifq, req, arg)
    958 	struct ifaltq *ifq;
    959 	int req;
    960 	void *arg;
    961 {
    962 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
    963 
    964 	switch (req) {
    965 	case ALTRQ_PURGE:
    966 		red_purgeq(rqp);
    967 		break;
    968 	}
    969 	return (0);
    970 }
    971 
    972 static void
    973 red_purgeq(rqp)
    974 	red_queue_t *rqp;
    975 {
    976 	_flushq(rqp->rq_q);
    977 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
    978 		rqp->rq_ifq->ifq_len = 0;
    979 }
    980 
    981 
    982 /*
    983  * helper routine to calibrate avg during idle.
    984  * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
    985  * here Wq = 1/weight and the code assumes Wq is close to zero.
    986  *
    987  * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
    988  */
    989 static struct wtab *wtab_list = NULL;	/* pointer to wtab list */
    990 
    991 struct wtab *
    992 wtab_alloc(weight)
    993 	int weight;
    994 {
    995 	struct wtab *w;
    996 	int i;
    997 
    998 	for (w = wtab_list; w != NULL; w = w->w_next)
    999 		if (w->w_weight == weight) {
   1000 			w->w_refcount++;
   1001 			return (w);
   1002 		}
   1003 
   1004 	w = malloc(sizeof(struct wtab), M_DEVBUF, M_WAITOK|M_ZERO);
   1005 	if (w == NULL)
   1006 		panic("wtab_alloc: malloc failed!");
   1007 	w->w_weight = weight;
   1008 	w->w_refcount = 1;
   1009 	w->w_next = wtab_list;
   1010 	wtab_list = w;
   1011 
   1012 	/* initialize the weight table */
   1013 	w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
   1014 	for (i = 1; i < 32; i++) {
   1015 		w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
   1016 		if (w->w_tab[i] == 0 && w->w_param_max == 0)
   1017 			w->w_param_max = 1 << i;
   1018 	}
   1019 
   1020 	return (w);
   1021 }
   1022 
   1023 int
   1024 wtab_destroy(w)
   1025 	struct wtab *w;
   1026 {
   1027 	struct wtab *prev;
   1028 
   1029 	if (--w->w_refcount > 0)
   1030 		return (0);
   1031 
   1032 	if (wtab_list == w)
   1033 		wtab_list = w->w_next;
   1034 	else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
   1035 		if (prev->w_next == w) {
   1036 			prev->w_next = w->w_next;
   1037 			break;
   1038 		}
   1039 
   1040 	free(w, M_DEVBUF);
   1041 	return (0);
   1042 }
   1043 
   1044 int32_t
   1045 pow_w(w, n)
   1046 	struct wtab *w;
   1047 	int n;
   1048 {
   1049 	int	i, bit;
   1050 	int32_t	val;
   1051 
   1052 	if (n >= w->w_param_max)
   1053 		return (0);
   1054 
   1055 	val = 1 << FP_SHIFT;
   1056 	if (n <= 0)
   1057 		return (val);
   1058 
   1059 	bit = 1;
   1060 	i = 0;
   1061 	while (n) {
   1062 		if (n & bit) {
   1063 			val = (val * w->w_tab[i]) >> FP_SHIFT;
   1064 			n &= ~bit;
   1065 		}
   1066 		i++;
   1067 		bit <<=  1;
   1068 	}
   1069 	return (val);
   1070 }
   1071 
   1072 #ifdef ALTQ_FLOWVALVE
   1073 
   1074 #define	FV_PSHIFT	7	/* weight of average drop rate -- 1/128 */
   1075 #define	FV_PSCALE(x)	((x) << FV_PSHIFT)
   1076 #define	FV_PUNSCALE(x)	((x) >> FV_PSHIFT)
   1077 #define	FV_FSHIFT	5	/* weight of average fraction -- 1/32 */
   1078 #define	FV_FSCALE(x)	((x) << FV_FSHIFT)
   1079 #define	FV_FUNSCALE(x)	((x) >> FV_FSHIFT)
   1080 
   1081 #define	FV_TIMER	(3 * hz)	/* timer value for garbage collector */
   1082 #define	FV_FLOWLISTSIZE		64	/* how many flows in flowlist */
   1083 
   1084 #define	FV_N			10	/* update fve_f every FV_N packets */
   1085 
   1086 #define	FV_BACKOFFTHRESH	1  /* backoff threshold interval in second */
   1087 #define	FV_TTHRESH		3  /* time threshold to delete fve */
   1088 #define	FV_ALPHA		5  /* extra packet count */
   1089 
   1090 #if (__FreeBSD_version > 300000) || defined(__HAVE_TIMECOUNTER)
   1091 #define	FV_TIMESTAMP(tp)	getmicrotime(tp)
   1092 #else
   1093 #define	FV_TIMESTAMP(tp)	{ (*(tp)) = time; }
   1094 #endif
   1095 
   1096 /*
   1097  * Brtt table: 127 entry table to convert drop rate (p) to
   1098  * the corresponding bandwidth fraction (f)
   1099  * the following equation is implemented to use scaled values,
   1100  * fve_p and fve_f, in the fixed point format.
   1101  *
   1102  *   Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p))
   1103  *   f = Brtt(p) / (max_th + alpha)
   1104  */
   1105 #define	BRTT_SIZE	128
   1106 #define	BRTT_SHIFT	12
   1107 #define	BRTT_MASK	0x0007f000
   1108 #define	BRTT_PMAX	(1 << (FV_PSHIFT + FP_SHIFT))
   1109 
/*
 * precomputed Brtt(p) values in fixed point; fv_p2f() indexes this
 * table with bits BRTT_SHIFT..BRTT_SHIFT+6 of the scaled drop rate p
 * (entry 0 is never used as an index result).
 */
const int brtt_tab[BRTT_SIZE] = {
	0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728,
	392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361,
	225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333,
	145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612,
	98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957,
	67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440,
	47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184,
	33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611,
	24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062,
	18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487,
	14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222,
	10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844,
	8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079,
	6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746,
	5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722,
	4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924
};
   1128 
/*
 * flowlist_lookup: find the flow entry matching the packet's address
 * pair (IPv4 src/dst, or IPv6 src/dst under INET6) on the flowvalve's
 * LRU flow list.
 * returns the matching fve, or NULL when the packet header is missing,
 * the address family is unknown, or no entry matches; on a full-scan
 * miss, fv_flows is updated with the number of live entries seen.
 * the list is kept most-recently-dropped first, so the scan can stop
 * at the first stale entry: either one never used (tv_sec == 0) or one
 * whose last drop is older than FV_TTHRESH seconds, which is
 * invalidated in place by zeroing its timestamp.
 */
static inline struct fve *
flowlist_lookup(fv, pktattr, now)
	struct flowvalve *fv;
	struct altq_pktattr *pktattr;
	struct timeval *now;
{
	struct fve *fve;
	int flows;
	struct ip *ip;
#ifdef INET6
	struct ip6_hdr *ip6;
#endif
	struct timeval tthresh;

	if (pktattr == NULL)
		return (NULL);

	/* entries last dropped before tthresh are considered expired */
	tthresh.tv_sec = now->tv_sec - FV_TTHRESH;
	flows = 0;
	/*
	 * search the flow list
	 */
	switch (pktattr->pattr_af) {
	case AF_INET:
		ip = (struct ip *)pktattr->pattr_hdr;
		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
			/* unused entry: everything past it is unused too */
			if (fve->fve_lastdrop.tv_sec == 0)
				break;
			/* expired entry: invalidate it and stop the scan */
			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
				fve->fve_lastdrop.tv_sec = 0;
				break;
			}
			if (fve->fve_flow.flow_af == AF_INET &&
			    fve->fve_flow.flow_ip.ip_src.s_addr ==
			    ip->ip_src.s_addr &&
			    fve->fve_flow.flow_ip.ip_dst.s_addr ==
			    ip->ip_dst.s_addr)
				return (fve);
			flows++;
		}
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
			/* unused entry: everything past it is unused too */
			if (fve->fve_lastdrop.tv_sec == 0)
				break;
			/* expired entry: invalidate it and stop the scan */
			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
				fve->fve_lastdrop.tv_sec = 0;
				break;
			}
			if (fve->fve_flow.flow_af == AF_INET6 &&
			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src,
					       &ip6->ip6_src) &&
			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst,
					       &ip6->ip6_dst))
				return (fve);
			flows++;
		}
		break;
#endif /* INET6 */

	default:
		/* unknown protocol.  no drop. */
		return (NULL);
	}
	fv->fv_flows = flows;	/* save the number of active fve's */
	return (NULL);
}
   1198 
   1199 static inline struct fve *
   1200 flowlist_reclaim(fv, pktattr)
   1201 	struct flowvalve *fv;
   1202 	struct altq_pktattr *pktattr;
   1203 {
   1204 	struct fve *fve;
   1205 	struct ip *ip;
   1206 #ifdef INET6
   1207 	struct ip6_hdr *ip6;
   1208 #endif
   1209 
   1210 	/*
   1211 	 * get an entry from the tail of the LRU list.
   1212 	 */
   1213 	fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead);
   1214 
   1215 	switch (pktattr->pattr_af) {
   1216 	case AF_INET:
   1217 		ip = (struct ip *)pktattr->pattr_hdr;
   1218 		fve->fve_flow.flow_af = AF_INET;
   1219 		fve->fve_flow.flow_ip.ip_src = ip->ip_src;
   1220 		fve->fve_flow.flow_ip.ip_dst = ip->ip_dst;
   1221 		break;
   1222 #ifdef INET6
   1223 	case AF_INET6:
   1224 		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
   1225 		fve->fve_flow.flow_af = AF_INET6;
   1226 		fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src;
   1227 		fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst;
   1228 		break;
   1229 #endif
   1230 	}
   1231 
   1232 	fve->fve_state = Green;
   1233 	fve->fve_p = 0.0;
   1234 	fve->fve_f = 0.0;
   1235 	fve->fve_ifseq = fv->fv_ifseq - 1;
   1236 	fve->fve_count = 0;
   1237 
   1238 	fv->fv_flows++;
   1239 #ifdef FV_STATS
   1240 	fv->fv_stats.alloc++;
   1241 #endif
   1242 	return (fve);
   1243 }
   1244 
   1245 static inline void
   1246 flowlist_move_to_head(fv, fve)
   1247 	struct flowvalve *fv;
   1248 	struct fve *fve;
   1249 {
   1250 	if (TAILQ_FIRST(&fv->fv_flowlist) != fve) {
   1251 		TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru);
   1252 		TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru);
   1253 	}
   1254 }
   1255 
   1256 /*
   1257  * allocate flowvalve structure
   1258  */
   1259 static struct flowvalve *
   1260 fv_alloc(rp)
   1261 	struct red *rp;
   1262 {
   1263 	struct flowvalve *fv;
   1264 	struct fve *fve;
   1265 	int i, num;
   1266 
   1267 	num = FV_FLOWLISTSIZE;
   1268 	fv = malloc(sizeof(struct flowvalve), M_DEVBUF, M_WAITOK|M_ZERO);
   1269 	if (fv == NULL)
   1270 		return (NULL);
   1271 
   1272 	fv->fv_fves = malloc(sizeof(struct fve) * num, M_DEVBUF,
   1273 	    M_WAITOK|M_ZERO);
   1274 	if (fv->fv_fves == NULL) {
   1275 		free(fv, M_DEVBUF);
   1276 		return (NULL);
   1277 	}
   1278 
   1279 	fv->fv_flows = 0;
   1280 	TAILQ_INIT(&fv->fv_flowlist);
   1281 	for (i = 0; i < num; i++) {
   1282 		fve = &fv->fv_fves[i];
   1283 		fve->fve_lastdrop.tv_sec = 0;
   1284 		TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru);
   1285 	}
   1286 
   1287 	/* initialize drop rate threshold in scaled fixed-point */
   1288 	fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax;
   1289 
   1290 	/* initialize drop rate to fraction table */
   1291 	fv->fv_p2ftab = malloc(sizeof(int) * BRTT_SIZE, M_DEVBUF, M_WAITOK);
   1292 	if (fv->fv_p2ftab == NULL) {
   1293 		free(fv->fv_fves, M_DEVBUF);
   1294 		free(fv, M_DEVBUF);
   1295 		return (NULL);
   1296 	}
   1297 	/*
   1298 	 * create the p2f table.
   1299 	 * (shift is used to keep the precision)
   1300 	 */
   1301 	for (i = 1; i < BRTT_SIZE; i++) {
   1302 		int f;
   1303 
   1304 		f = brtt_tab[i] << 8;
   1305 		fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8;
   1306 	}
   1307 
   1308 	return (fv);
   1309 }
   1310 
   1311 static void fv_destroy(fv)
   1312 	struct flowvalve *fv;
   1313 {
   1314 	free(fv->fv_p2ftab, M_DEVBUF);
   1315 	free(fv->fv_fves, M_DEVBUF);
   1316 	free(fv, M_DEVBUF);
   1317 }
   1318 
   1319 static inline int
   1320 fv_p2f(fv, p)
   1321 	struct flowvalve	*fv;
   1322 	int	p;
   1323 {
   1324 	int val, f;
   1325 
   1326 	if (p >= BRTT_PMAX)
   1327 		f = fv->fv_p2ftab[BRTT_SIZE-1];
   1328 	else if ((val = (p & BRTT_MASK)))
   1329 		f = fv->fv_p2ftab[(val >> BRTT_SHIFT)];
   1330 	else
   1331 		f = fv->fv_p2ftab[1];
   1332 	return (f);
   1333 }
   1334 
   1335 /*
   1336  * check if an arriving packet should be pre-dropped.
   1337  * called from red_addq() when a packet arrives.
   1338  * returns 1 when the packet should be pre-dropped.
   1339  * should be called in splnet.
   1340  */
   1341 static int
   1342 fv_checkflow(fv, pktattr, fcache)
   1343 	struct flowvalve *fv;
   1344 	struct altq_pktattr *pktattr;
   1345 	struct fve **fcache;
   1346 {
   1347 	struct fve *fve;
   1348 	struct timeval now;
   1349 
   1350 	fv->fv_ifseq++;
   1351 	FV_TIMESTAMP(&now);
   1352 
   1353 	if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
   1354 		/* no matching entry in the flowlist */
   1355 		return (0);
   1356 
   1357 	*fcache = fve;
   1358 
   1359 	/* update fraction f for every FV_N packets */
   1360 	if (++fve->fve_count == FV_N) {
   1361 		/*
   1362 		 * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f
   1363 		 */
   1364 		fve->fve_f =
   1365 			(FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq)
   1366 			+ fve->fve_f - FV_FUNSCALE(fve->fve_f);
   1367 		fve->fve_ifseq = fv->fv_ifseq;
   1368 		fve->fve_count = 0;
   1369 	}
   1370 
   1371 	/*
   1372 	 * overpumping test
   1373 	 */
   1374 	if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) {
   1375 		int fthresh;
   1376 
   1377 		/* calculate a threshold */
   1378 		fthresh = fv_p2f(fv, fve->fve_p);
   1379 		if (fve->fve_f > fthresh)
   1380 			fve->fve_state = Red;
   1381 	}
   1382 
   1383 	if (fve->fve_state == Red) {
   1384 		/*
   1385 		 * backoff test
   1386 		 */
   1387 		if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) {
   1388 			/* no drop for at least FV_BACKOFFTHRESH sec */
   1389 			fve->fve_p = 0;
   1390 			fve->fve_state = Green;
   1391 #ifdef FV_STATS
   1392 			fv->fv_stats.escape++;
   1393 #endif
   1394 		} else {
   1395 			/* block this flow */
   1396 			flowlist_move_to_head(fv, fve);
   1397 			fve->fve_lastdrop = now;
   1398 #ifdef FV_STATS
   1399 			fv->fv_stats.predrop++;
   1400 #endif
   1401 			return (1);
   1402 		}
   1403 	}
   1404 
   1405 	/*
   1406 	 * p = (1 - Wp) * p
   1407 	 */
   1408 	fve->fve_p -= FV_PUNSCALE(fve->fve_p);
   1409 	if (fve->fve_p < 0)
   1410 		fve->fve_p = 0;
   1411 #ifdef FV_STATS
   1412 	fv->fv_stats.pass++;
   1413 #endif
   1414 	return (0);
   1415 }
   1416 
   1417 /*
   1418  * called from red_addq when a packet is dropped by red.
   1419  * should be called in splnet.
   1420  */
   1421 static void fv_dropbyred(fv, pktattr, fcache)
   1422 	struct flowvalve *fv;
   1423 	struct altq_pktattr *pktattr;
   1424 	struct fve *fcache;
   1425 {
   1426 	struct fve *fve;
   1427 	struct timeval now;
   1428 
   1429 	if (pktattr == NULL)
   1430 		return;
   1431 	FV_TIMESTAMP(&now);
   1432 
   1433 	if (fcache != NULL)
   1434 		/* the fve of this packet is already cached */
   1435 		fve = fcache;
   1436 	else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
   1437 		fve = flowlist_reclaim(fv, pktattr);
   1438 
   1439 	flowlist_move_to_head(fv, fve);
   1440 
   1441 	/*
   1442 	 * update p:  the following line cancels the update
   1443 	 *	      in fv_checkflow() and calculate
   1444 	 *	p = Wp + (1 - Wp) * p
   1445 	 */
   1446 	fve->fve_p = (1 << FP_SHIFT) + fve->fve_p;
   1447 
   1448 	fve->fve_lastdrop = now;
   1449 }
   1450 
   1451 #endif /* ALTQ_FLOWVALVE */
   1452 
   1453 #ifdef KLD_MODULE
   1454 
   1455 static struct altqsw red_sw =
   1456 	{"red", redopen, redclose, redioctl};
   1457 
   1458 ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw);
   1459 
   1460 #endif /* KLD_MODULE */
   1461 
   1462 #endif /* ALTQ_RED */
   1463