Home | History | Annotate | Line # | Download | only in altq
altq_red.c revision 1.13.10.3
      1 /*	$NetBSD: altq_red.c,v 1.13.10.3 2006/05/11 23:26:17 elad Exp $	*/
      2 /*	$KAME: altq_red.c,v 1.9 2002/01/07 11:25:40 kjc Exp $	*/
      3 
      4 /*
      5  * Copyright (C) 1997-2000
      6  *	Sony Computer Science Laboratories Inc.  All rights reserved.
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  * 1. Redistributions of source code must retain the above copyright
     12  *    notice, this list of conditions and the following disclaimer.
     13  * 2. Redistributions in binary form must reproduce the above copyright
     14  *    notice, this list of conditions and the following disclaimer in the
     15  *    documentation and/or other materials provided with the distribution.
     16  *
     17  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
     18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     20  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
     21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     27  * SUCH DAMAGE.
     28  *
     29  */
     30 /*
     31  * Copyright (c) 1990-1994 Regents of the University of California.
     32  * All rights reserved.
     33  *
     34  * Redistribution and use in source and binary forms, with or without
     35  * modification, are permitted provided that the following conditions
     36  * are met:
     37  * 1. Redistributions of source code must retain the above copyright
     38  *    notice, this list of conditions and the following disclaimer.
     39  * 2. Redistributions in binary form must reproduce the above copyright
     40  *    notice, this list of conditions and the following disclaimer in the
     41  *    documentation and/or other materials provided with the distribution.
     42  * 3. All advertising materials mentioning features or use of this software
     43  *    must display the following acknowledgement:
     44  *	This product includes software developed by the Computer Systems
     45  *	Engineering Group at Lawrence Berkeley Laboratory.
     46  * 4. Neither the name of the University nor of the Laboratory may be used
     47  *    to endorse or promote products derived from this software without
     48  *    specific prior written permission.
     49  *
     50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     60  * SUCH DAMAGE.
     61  */
     62 
     63 #include <sys/cdefs.h>
     64 __KERNEL_RCSID(0, "$NetBSD: altq_red.c,v 1.13.10.3 2006/05/11 23:26:17 elad Exp $");
     65 
     66 #if defined(__FreeBSD__) || defined(__NetBSD__)
     67 #include "opt_altq.h"
     68 #if (__FreeBSD__ != 2)
     69 #include "opt_inet.h"
     70 #ifdef __FreeBSD__
     71 #include "opt_inet6.h"
     72 #endif
     73 #endif
     74 #endif /* __FreeBSD__ || __NetBSD__ */
     75 #ifdef ALTQ_RED	/* red is enabled by ALTQ_RED option in opt_altq.h */
     76 
     77 #include <sys/param.h>
     78 #include <sys/malloc.h>
     79 #include <sys/mbuf.h>
     80 #include <sys/socket.h>
     81 #include <sys/sockio.h>
     82 #include <sys/systm.h>
     83 #include <sys/proc.h>
     84 #include <sys/errno.h>
     85 #include <sys/kernel.h>
     86 #ifdef ALTQ_FLOWVALVE
     87 #include <sys/queue.h>
     88 #include <sys/time.h>
     89 #endif
     90 
     91 #include <net/if.h>
     92 #include <net/if_types.h>
     93 
     94 #include <netinet/in.h>
     95 #include <netinet/in_systm.h>
     96 #include <netinet/ip.h>
     97 #ifdef INET6
     98 #include <netinet/ip6.h>
     99 #endif
    100 
    101 #include <altq/altq.h>
    102 #include <altq/altq_conf.h>
    103 #include <altq/altq_red.h>
    104 #ifdef ALTQ_FLOWVALVE
    105 #include <altq/altq_flowvalve.h>
    106 #endif
    107 
    108 /*
    109  * ALTQ/RED (Random Early Detection) implementation using 32-bit
    110  * fixed-point calculation.
    111  *
    112  * written by kjc using the ns code as a reference.
    113  * you can learn more about red and ns from Sally's home page at
    114  * http://www-nrg.ee.lbl.gov/floyd/
    115  *
    116  * most of the red parameter values are fixed in this implementation
    117  * to prevent fixed-point overflow/underflow.
    118  * if you change the parameters, watch out for overflow/underflow!
    119  *
    120  * the parameters used are recommended values by Sally.
    121  * the corresponding ns config looks:
    122  *	q_weight=0.00195
    123  *	minthresh=5 maxthresh=15 queue-size=60
    124  *	linterm=30
    125  *	dropmech=drop-tail
    126  *	bytes=false (can't be handled by 32-bit fixed-point)
    127  *	doubleq=false dqthresh=false
    128  *	wait=true
    129  */
    130 /*
    131  * alternative red parameters for a slow link.
    132  *
    133  * assume the queue length becomes from zero to L and keeps L, it takes
    134  * N packets for q_avg to reach 63% of L.
    135  * when q_weight is 0.002, N is about 500 packets.
    136  * for a slow link like dial-up, 500 packets takes more than 1 minute!
    137  * when q_weight is 0.008, N is about 127 packets.
    138  * when q_weight is 0.016, N is about 63 packets.
    139  * bursts of 50 packets are allowd for 0.002, bursts of 25 packets
    140  * are allowed for 0.016.
    141  * see Sally's paper for more details.
    142  */
    143 /* normal red parameters */
    144 #define	W_WEIGHT	512	/* inverse of weight of EWMA (511/512) */
    145 				/* q_weight = 0.00195 */
    146 
    147 /* red parameters for a slow link */
    148 #define	W_WEIGHT_1	128	/* inverse of weight of EWMA (127/128) */
    149 				/* q_weight = 0.0078125 */
    150 
    151 /* red parameters for a very slow link (e.g., dialup) */
    152 #define	W_WEIGHT_2	64	/* inverse of weight of EWMA (63/64) */
    153 				/* q_weight = 0.015625 */
    154 
    155 /* fixed-point uses 12-bit decimal places */
    156 #define	FP_SHIFT	12	/* fixed-point shift */
    157 
    158 /* red parameters for drop probability */
    159 #define	INV_P_MAX	10	/* inverse of max drop probability */
    160 #define	TH_MIN		5	/* min threshold */
    161 #define	TH_MAX		15	/* max threshold */
    162 
    163 #define	RED_LIMIT	60	/* default max queue length */
    164 
    165 /*
    166  * our default policy for forced-drop is drop-tail.
    167  * (in altq-1.1.2 or earlier, the default was random-drop.
    168  * but it makes more sense to punish the cause of the surge.)
    169  * to switch to the random-drop policy, define "RED_RANDOM_DROP".
    170  */
    171 
    172 #ifdef ALTQ_FLOWVALVE
    173 /*
    174  * flow-valve is an extension to protect red from unresponsive flows
    175  * and to promote end-to-end congestion control.
    176  * flow-valve observes the average drop rates of the flows that have
    177  * experienced packet drops in the recent past.
    178  * when the average drop rate exceeds the threshold, the flow is
    179  * blocked by the flow-valve.  the trapped flow should back off
    180  * exponentially to escape from the flow-valve.
    181  */
    182 #ifdef RED_RANDOM_DROP
    183 #error "random-drop can't be used with flow-valve!"
    184 #endif
    185 #endif /* ALTQ_FLOWVALVE */
    186 
    187 /* red_list keeps all red_queue_t's allocated. */
    188 static red_queue_t *red_list = NULL;
    189 
    190 /* default red parameter values */
    191 static int default_th_min = TH_MIN;
    192 static int default_th_max = TH_MAX;
    193 static int default_inv_pmax = INV_P_MAX;
    194 
    195 /* internal function prototypes */
    196 static int red_enqueue __P((struct ifaltq *, struct mbuf *,
    197 			    struct altq_pktattr *));
    198 static struct mbuf *red_dequeue __P((struct ifaltq *, int));
    199 static int red_request __P((struct ifaltq *, int, void *));
    200 static void red_purgeq __P((red_queue_t *));
    201 static int red_detach __P((red_queue_t *));
    202 #ifdef ALTQ_FLOWVALVE
    203 static inline struct fve *flowlist_lookup __P((struct flowvalve *,
    204 			 struct altq_pktattr *, struct timeval *));
    205 static inline struct fve *flowlist_reclaim __P((struct flowvalve *,
    206 						  struct altq_pktattr *));
    207 static inline void flowlist_move_to_head __P((struct flowvalve *,
    208 						struct fve *));
    209 static inline int fv_p2f __P((struct flowvalve *, int));
    210 static struct flowvalve *fv_alloc __P((struct red *));
    211 static void fv_destroy __P((struct flowvalve *));
    212 static int fv_checkflow __P((struct flowvalve *, struct altq_pktattr *,
    213 			     struct fve **));
    214 static void fv_dropbyred __P((struct flowvalve *fv, struct altq_pktattr *,
    215 			      struct fve *));
    216 #endif
    217 
    218 /*
    219  * red device interface
    220  */
    221 altqdev_decl(red);
    222 
/*
 * open handler for the red control device.
 * no per-open state is kept; all real setup happens when the queueing
 * scheme is attached to an interface (RED_IF_ATTACH ioctl).
 */
int
redopen(dev, flag, fmt, l)
	dev_t dev;
	int flag, fmt;
	struct lwp *l;
{
	/* everything will be done when the queueing scheme is attached. */
	return 0;
}
    232 
    233 int
    234 redclose(dev, flag, fmt, l)
    235 	dev_t dev;
    236 	int flag, fmt;
    237 	struct lwp *l;
    238 {
    239 	red_queue_t *rqp;
    240 	int err, error = 0;
    241 
    242 	while ((rqp = red_list) != NULL) {
    243 		/* destroy all */
    244 		err = red_detach(rqp);
    245 		if (err != 0 && error == 0)
    246 			error = err;
    247 	}
    248 
    249 	return error;
    250 }
    251 
/*
 * ioctl handler for the red control device.
 * supports attaching/detaching RED to an interface, enabling and
 * disabling it, reading statistics, changing the configuration of an
 * attached state, and setting the global default parameters.
 * every command except RED_GETSTATS requires super-user privilege.
 */
int
redioctl(dev, cmd, addr, flag, l)
	dev_t dev;
	ioctlcmd_t cmd;
	caddr_t addr;
	int flag;
	struct lwp *l;
{
	red_queue_t *rqp;
	struct red_interface *ifacep;
	struct ifnet *ifp;
	struct proc *p = l->l_proc;
	int	error = 0;

	/* check super-user privilege */
	switch (cmd) {
	case RED_GETSTATS:
		/* statistics may be read without privilege */
		break;
	default:
#if (__FreeBSD_version > 400000)
		if ((error = suser(p)) != 0)
#else
		if ((error = kauth_authorize_generic(p->p_cred,
					       KAUTH_GENERIC_ISSUSER,
					       &p->p_acflag)) != 0)
#endif
			return (error);
		break;
	}

	switch (cmd) {

	case RED_ENABLE:
		/* start running RED on the named interface's send queue */
		ifacep = (struct red_interface *)addr;
		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
			error = EBADF;
			break;
		}
		error = altq_enable(rqp->rq_ifq);
		break;

	case RED_DISABLE:
		/* stop running RED; the state itself stays attached */
		ifacep = (struct red_interface *)addr;
		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
			error = EBADF;
			break;
		}
		error = altq_disable(rqp->rq_ifq);
		break;

	case RED_IF_ATTACH:
		ifp = ifunit(((struct red_interface *)addr)->red_ifname);
		if (ifp == NULL) {
			error = ENXIO;
			break;
		}

		/* allocate and initialize red_queue_t */
		rqp = malloc(sizeof(red_queue_t), M_DEVBUF, M_WAITOK|M_ZERO);
		if (rqp == NULL) {
			error = ENOMEM;
			break;
		}

		rqp->rq_q = malloc(sizeof(class_queue_t), M_DEVBUF,
		    M_WAITOK|M_ZERO);
		if (rqp->rq_q == NULL) {
			free(rqp, M_DEVBUF);
			error = ENOMEM;
			break;
		}

		/* all-zero arguments select the built-in defaults */
		rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0);
		if (rqp->rq_red == NULL) {
			free(rqp->rq_q, M_DEVBUF);
			free(rqp, M_DEVBUF);
			error = ENOMEM;
			break;
		}

		rqp->rq_ifq = &ifp->if_snd;
		qtail(rqp->rq_q) = NULL;
		qlen(rqp->rq_q) = 0;
		qlimit(rqp->rq_q) = RED_LIMIT;
		qtype(rqp->rq_q) = Q_RED;

		/*
		 * set RED to this ifnet structure.
		 */
		error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp,
				    red_enqueue, red_dequeue, red_request,
				    NULL, NULL);
		if (error) {
			/* undo the allocations above, in reverse order */
			red_destroy(rqp->rq_red);
			free(rqp->rq_q, M_DEVBUF);
			free(rqp, M_DEVBUF);
			break;
		}

		/* add this state to the red list */
		rqp->rq_next = red_list;
		red_list = rqp;
		break;

	case RED_IF_DETACH:
		ifacep = (struct red_interface *)addr;
		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
			error = EBADF;
			break;
		}
		error = red_detach(rqp);
		break;

	case RED_GETSTATS:
		/* copy current counters and parameters out to the caller */
		do {
			struct red_stats *q_stats;
			red_t *rp;

			q_stats = (struct red_stats *)addr;
			if ((rqp = altq_lookup(q_stats->iface.red_ifname,
					     ALTQT_RED)) == NULL) {
				error = EBADF;
				break;
			}

			q_stats->q_len 	   = qlen(rqp->rq_q);
			q_stats->q_limit   = qlimit(rqp->rq_q);

			rp = rqp->rq_red;
			/* unscale the fixed-point average for export */
			q_stats->q_avg 	   = rp->red_avg >> rp->red_wshift;
			q_stats->xmit_cnt  = rp->red_stats.xmit_cnt;
			q_stats->drop_cnt  = rp->red_stats.drop_cnt;
			q_stats->drop_forced   = rp->red_stats.drop_forced;
			q_stats->drop_unforced = rp->red_stats.drop_unforced;
			q_stats->marked_packets = rp->red_stats.marked_packets;

			q_stats->weight		= rp->red_weight;
			q_stats->inv_pmax	= rp->red_inv_pmax;
			q_stats->th_min		= rp->red_thmin;
			q_stats->th_max		= rp->red_thmax;

#ifdef ALTQ_FLOWVALVE
			if (rp->red_flowvalve != NULL) {
				struct flowvalve *fv = rp->red_flowvalve;
				q_stats->fv_flows    = fv->fv_flows;
				q_stats->fv_pass     = fv->fv_stats.pass;
				q_stats->fv_predrop  = fv->fv_stats.predrop;
				q_stats->fv_alloc    = fv->fv_stats.alloc;
				q_stats->fv_escape   = fv->fv_stats.escape;
			} else {
#endif /* ALTQ_FLOWVALVE */
				/* flow-valve disabled: report zeroes */
				q_stats->fv_flows    = 0;
				q_stats->fv_pass     = 0;
				q_stats->fv_predrop  = 0;
				q_stats->fv_alloc    = 0;
				q_stats->fv_escape   = 0;
#ifdef ALTQ_FLOWVALVE
			}
#endif /* ALTQ_FLOWVALVE */
		} while (0);
		break;

	case RED_CONFIG:
		/* replace the red state of an attached queue in place */
		do {
			struct red_conf *fc;
			red_t *new;
			int s, limit;

			fc = (struct red_conf *)addr;
			if ((rqp = altq_lookup(fc->iface.red_ifname,
					       ALTQT_RED)) == NULL) {
				error = EBADF;
				break;
			}
			new = red_alloc(fc->red_weight,
					fc->red_inv_pmax,
					fc->red_thmin,
					fc->red_thmax,
					fc->red_flags,
					fc->red_pkttime);
			if (new == NULL) {
				error = ENOMEM;
				break;
			}

			/* swap the state at splnet so enqueue/dequeue
			 * never see a half-configured queue */
			s = splnet();
			red_purgeq(rqp);
			limit = fc->red_limit;
			/* the hard limit must not be below th_max */
			if (limit < fc->red_thmax)
				limit = fc->red_thmax;
			qlimit(rqp->rq_q) = limit;
			fc->red_limit = limit;	/* write back the new value */

			red_destroy(rqp->rq_red);
			rqp->rq_red = new;

			splx(s);

			/* write back new values */
			/* NOTE(review): red_limit was already written back
			 * above inside the splnet section; this repeat is
			 * harmless but redundant. */
			fc->red_limit = limit;
			fc->red_inv_pmax = rqp->rq_red->red_inv_pmax;
			fc->red_thmin = rqp->rq_red->red_thmin;
			fc->red_thmax = rqp->rq_red->red_thmax;

		} while (0);
		break;

	case RED_SETDEFAULTS:
		/* change the defaults used by future red_alloc(0,...) calls */
		do {
			struct redparams *rp;

			rp = (struct redparams *)addr;

			default_th_min = rp->th_min;
			default_th_max = rp->th_max;
			default_inv_pmax = rp->inv_pmax;
		} while (0);
		break;

	default:
		error = EINVAL;
		break;
	}
	return error;
}
    477 
/*
 * detach a RED state from its interface, unlink it from the global
 * red_list, and free all of its resources.
 * returns 0 on success, or the error from altq_detach (in which case
 * the state is left intact).
 */
static int
red_detach(rqp)
	red_queue_t *rqp;
{
	red_queue_t *tmp;
	int error = 0;

	/* make sure the discipline is stopped before tearing it down */
	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
		altq_disable(rqp->rq_ifq);

	if ((error = altq_detach(rqp->rq_ifq)))
		return (error);

	/* unlink rqp from the singly-linked red_list */
	if (red_list == rqp)
		red_list = rqp->rq_next;
	else {
		for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next)
			if (tmp->rq_next == rqp) {
				tmp->rq_next = rqp->rq_next;
				break;
			}
		/* should not happen: rqp was attached but not listed */
		if (tmp == NULL)
			printf("red_detach: no state found in red_list!\n");
	}

	red_destroy(rqp->rq_red);
	free(rqp->rq_q, M_DEVBUF);
	free(rqp, M_DEVBUF);
	return (error);
}
    508 
    509 /*
    510  * red support routines
    511  */
    512 
/*
 * allocate and initialize a red state.
 * a zero value for any parameter selects the built-in default for it.
 * see the discussion at the top of this file: parameter choices are
 * constrained to keep the 32-bit fixed-point arithmetic from
 * overflowing or underflowing.
 *
 * returns the new state, or NULL on allocation failure.
 */
red_t *
red_alloc(weight, inv_pmax, th_min, th_max, flags, pkttime)
	int	weight, inv_pmax, th_min, th_max;
	int	flags, pkttime;
{
	red_t 	*rp;
	int	w, i;
	int	npkts_per_sec;

	rp = malloc(sizeof(red_t), M_DEVBUF, M_WAITOK|M_ZERO);
	if (rp == NULL)
		return (NULL);

	/* start idle with an empty average queue length */
	rp->red_avg = 0;
	rp->red_idle = 1;

	/* substitute defaults for any zero parameter */
	if (weight == 0)
		rp->red_weight = W_WEIGHT;
	else
		rp->red_weight = weight;
	if (inv_pmax == 0)
		rp->red_inv_pmax = default_inv_pmax;
	else
		rp->red_inv_pmax = inv_pmax;
	if (th_min == 0)
		rp->red_thmin = default_th_min;
	else
		rp->red_thmin = th_min;
	if (th_max == 0)
		rp->red_thmax = default_th_max;
	else
		rp->red_thmax = th_max;

	rp->red_flags = flags;

	if (pkttime == 0)
		/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
		rp->red_pkttime = 800;
	else
		rp->red_pkttime = pkttime;

	if (weight == 0) {
		/* when the link is very slow, adjust red parameters */
		npkts_per_sec = 1000000 / rp->red_pkttime;
		if (npkts_per_sec < 50) {
			/* up to about 400Kbps */
			rp->red_weight = W_WEIGHT_2;
		} else if (npkts_per_sec < 300) {
			/* up to about 2.4Mbps */
			rp->red_weight = W_WEIGHT_1;
		}
	}

	/* calculate wshift.  weight must be power of 2 */
	w = rp->red_weight;
	for (i = 0; w > 1; i++)
		w = w >> 1;
	rp->red_wshift = i;
	w = 1 << rp->red_wshift;
	/* a non-power-of-2 weight is rounded down and reported */
	if (w != rp->red_weight) {
		printf("invalid weight value %d for red! use %d\n",
		       rp->red_weight, w);
		rp->red_weight = w;
	}

	/*
	 * thmin_s and thmax_s are scaled versions of th_min and th_max
	 * to be compared with avg.
	 */
	rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
	rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);

	/*
	 * precompute probability denominator
	 *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
	 */
	rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
			 * rp->red_inv_pmax) << FP_SHIFT;

	/* allocate weight table (shared, reference-counted) */
	rp->red_wtab = wtab_alloc(rp->red_weight);

	/* timestamp used to age the average across idle periods */
	microtime(&rp->red_last);
#ifdef ALTQ_FLOWVALVE
	if (flags & REDF_FLOWVALVE)
		rp->red_flowvalve = fv_alloc(rp);
	/* if fv_alloc failes, flowvalve is just disabled */
#endif
	return (rp);
}
    603 
/*
 * free a red state allocated by red_alloc, dropping the reference on
 * the shared weight table and destroying any flow-valve state.
 */
void
red_destroy(rp)
	red_t *rp;
{
#ifdef ALTQ_FLOWVALVE
	if (rp->red_flowvalve != NULL)
		fv_destroy(rp->red_flowvalve);
#endif
	wtab_destroy(rp->red_wtab);
	free(rp, M_DEVBUF);
}
    615 
    616 void
    617 red_getstats(rp, sp)
    618 	red_t *rp;
    619 	struct redstats *sp;
    620 {
    621 	sp->q_avg 		= rp->red_avg >> rp->red_wshift;
    622 	sp->xmit_cnt		= rp->red_stats.xmit_cnt;
    623 	sp->drop_cnt		= rp->red_stats.drop_cnt;
    624 	sp->drop_forced		= rp->red_stats.drop_forced;
    625 	sp->drop_unforced	= rp->red_stats.drop_unforced;
    626 	sp->marked_packets	= rp->red_stats.marked_packets;
    627 }
    628 
    629 /*
    630  * enqueue routine:
    631  *
    632  *	returns: 0 when successfully queued.
    633  *		 ENOBUFS when drop occurs.
    634  */
    635 static int
    636 red_enqueue(ifq, m, pktattr)
    637 	struct ifaltq *ifq;
    638 	struct mbuf *m;
    639 	struct altq_pktattr *pktattr;
    640 {
    641 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
    642 
    643 	if (red_addq(rqp->rq_red, rqp->rq_q, m, pktattr) < 0)
    644 		return ENOBUFS;
    645 	ifq->ifq_len++;
    646 	return 0;
    647 }
    648 
/*
 * core RED enqueue: update the average queue length estimator, decide
 * whether to queue, ECN-mark, or drop the packet, and maintain the
 * statistics counters.
 *
 *	returns: 0 when the packet was queued.
 *		 -1 when a drop occurred (the dropped mbuf is freed here).
 */
int
red_addq(rp, q, m, pktattr)
	red_t *rp;
	class_queue_t *q;
	struct mbuf *m;
	struct altq_pktattr *pktattr;
{
	int avg, droptype;
	int n;
#ifdef ALTQ_FLOWVALVE
	struct fve *fve = NULL;

	/* let the flow-valve reject packets of flows it has trapped */
	if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0)
		if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) {
			m_freem(m);
			return (-1);
		}
#endif

	avg = rp->red_avg;

	/*
	 * if we were idle, we pretend that n packets arrived during
	 * the idle period.
	 */
	if (rp->red_idle) {
		struct timeval now;
		int t;

		rp->red_idle = 0;
		microtime(&now);
		t = (now.tv_sec - rp->red_last.tv_sec);
		if (t > 60) {
			/*
			 * being idle for more than 1 minute, set avg to zero.
			 * this prevents t from overflow.
			 */
			avg = 0;
		} else {
			/* idle time in usec, then in packet-times */
			t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
			n = t / rp->red_pkttime - 1;

			/* the following line does (avg = (1 - Wq)^n * avg) */
			if (n > 0)
				avg = (avg >> FP_SHIFT) *
				    pow_w(rp->red_wtab, n);
		}
	}

	/* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
	avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
	rp->red_avg = avg;		/* save the new value */

	/*
	 * red_count keeps a tally of arriving traffic that has not
	 * been dropped.
	 */
	rp->red_count++;

	/* see if we drop early */
	droptype = DTYPE_NODROP;
	if (avg >= rp->red_thmin_s && qlen(q) > 1) {
		if (avg >= rp->red_thmax_s) {
			/* avg >= th_max: forced drop */
			droptype = DTYPE_FORCED;
		} else if (rp->red_old == 0) {
			/* first exceeds th_min */
			rp->red_count = 1;
			rp->red_old = 1;
		} else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
				      rp->red_probd, rp->red_count)) {
			/* mark or drop by red */
			if ((rp->red_flags & REDF_ECN) &&
			    mark_ecn(m, pktattr, rp->red_flags)) {
				/* successfully marked.  do not drop. */
				rp->red_count = 0;
#ifdef RED_STATS
				rp->red_stats.marked_packets++;
#endif
			} else {
				/* unforced drop by red */
				droptype = DTYPE_EARLY;
			}
		}
	} else {
		/* avg < th_min */
		rp->red_old = 0;
	}

	/*
	 * if the queue length hits the hard limit, it's a forced drop.
	 */
	if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
		droptype = DTYPE_FORCED;

#ifdef RED_RANDOM_DROP
	/* if successful or forced drop, enqueue this packet. */
	if (droptype != DTYPE_EARLY)
		_addq(q, m);
#else
	/* if successful, enqueue this packet. */
	if (droptype == DTYPE_NODROP)
		_addq(q, m);
#endif
	if (droptype != DTYPE_NODROP) {
		if (droptype == DTYPE_EARLY) {
			/* drop the incoming packet */
#ifdef RED_STATS
			rp->red_stats.drop_unforced++;
#endif
		} else {
			/* forced drop, select a victim packet in the queue. */
#ifdef RED_RANDOM_DROP
			m = _getq_random(q);
#endif
#ifdef RED_STATS
			rp->red_stats.drop_forced++;
#endif
		}
#ifdef RED_STATS
		PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
#endif
		/* a drop resets the run of undropped packets */
		rp->red_count = 0;
#ifdef ALTQ_FLOWVALVE
		if (rp->red_flowvalve != NULL)
			fv_dropbyred(rp->red_flowvalve, pktattr, fve);
#endif
		m_freem(m);
		return (-1);
	}
	/* successfully queued */
#ifdef RED_STATS
	PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
#endif
	return (0);
}
    785 
    786 /*
    787  * early-drop probability is calculated as follows:
    788  *   prob = p_max * (avg - th_min) / (th_max - th_min)
    789  *   prob_a = prob / (2 - count*prob)
    790  *	    = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
    791  * here prob_a increases as successive undrop count increases.
    792  * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
    793  * becomes 1 when (count >= (2 / prob))).
    794  */
/*
 * early-drop probability is calculated as follows:
 *   prob = p_max * (avg - th_min) / (th_max - th_min)
 *   prob_a = prob / (2 - count*prob)
 *	    = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
 * here prob_a increases as successive undrop count increases.
 * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
 * becomes 1 when (count >= (2 / prob))).
 *
 * returns 1 to drop (or mark), 0 to pass the packet.
 */
int
drop_early(fp_len, fp_probd, count)
	int fp_len;	/* (avg - TH_MIN) in fixed-point */
	int fp_probd;	/* (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point */
	int count;	/* how many successive undropped packets */
{
	int denom;	/* denominator of drop-probability */

	/* the denominator shrinks as the run of undropped packets grows */
	denom = fp_probd - count * fp_len;
	if (denom <= 0) {
		/* count exceeds the hard limit: drop or mark */
		return (1);
	}

	/*
	 * now the range of denom is [1..600] in fixed-point. (when
	 * th_max-th_min=10 and p_max=1/30)
	 * drop probability = (avg - TH_MIN) / denom
	 */
	return ((random() % denom) < fp_len ? 1 : 0);
}
    821 
    822 /*
    823  * try to mark CE bit to the packet.
    824  *    returns 1 if successfully marked, 0 otherwise.
    825  */
/*
 * try to mark CE bit to the packet.
 *    returns 1 if successfully marked, 0 otherwise.
 * the IP header location comes from pktattr (set by the classifier);
 * it is validated against the mbuf chain before being dereferenced.
 */
int
mark_ecn(m, pktattr, flags)
	struct mbuf *m;
	struct altq_pktattr *pktattr;
	int flags;
{
	struct mbuf *m0;

	/* only IPv4/IPv6 packets with classifier info can be marked */
	if (pktattr == NULL ||
	    (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6))
		return (0);

	/* verify that pattr_hdr is within the mbuf data */
	for (m0 = m; m0 != NULL; m0 = m0->m_next)
		if ((pktattr->pattr_hdr >= m0->m_data) &&
		    (pktattr->pattr_hdr < m0->m_data + m0->m_len))
			break;
	if (m0 == NULL) {
		/* ick, pattr_hdr is stale */
		pktattr->pattr_af = AF_UNSPEC;
		return (0);
	}

	switch (pktattr->pattr_af) {
	case AF_INET:
		if (flags & REDF_ECN4) {
			struct ip *ip = (struct ip *)pktattr->pattr_hdr;
			u_int8_t otos;
			int sum;

			if (ip->ip_v != 4)
				return (0);	/* version mismatch! */

			/* ECN codepoints live in the low bits of TOS */
			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
				return (0);	/* not-ECT */
			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
				return (1);	/* already marked */

			/*
			 * ecn-capable but not marked,
			 * mark CE and update checksum
			 */
			otos = ip->ip_tos;
			ip->ip_tos |= IPTOS_ECN_CE;
			/*
			 * update checksum (from RFC1624)
			 *	   HC' = ~(~HC + ~m + m')
			 */
			sum = ~ntohs(ip->ip_sum) & 0xffff;
			sum += (~otos & 0xffff) + ip->ip_tos;
			sum = (sum >> 16) + (sum & 0xffff);
			sum += (sum >> 16);  /* add carry */
			ip->ip_sum = htons(~sum & 0xffff);
			return (1);
		}
		break;
#ifdef INET6
	case AF_INET6:
		if (flags & REDF_ECN6) {
			struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
			u_int32_t flowlabel;

			/*
			 * in IPv6 the ECN bits sit in the traffic class,
			 * i.e. bits 20-21 of the host-order flow word.
			 */
			flowlabel = ntohl(ip6->ip6_flow);
			if ((flowlabel >> 28) != 6)
				return (0);	/* version mismatch! */
			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
			    (IPTOS_ECN_NOTECT << 20))
				return (0);	/* not-ECT */
			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
			    (IPTOS_ECN_CE << 20))
				return (1);	/* already marked */
			/*
			 * ecn-capable but not marked,  mark CE
			 * (no checksum to fix: IPv6 has no header checksum)
			 */
			flowlabel |= (IPTOS_ECN_CE << 20);
			ip6->ip6_flow = htonl(flowlabel);
			return (1);
		}
		break;
#endif  /* INET6 */
	}

	/* not marked */
	return (0);
}
    911 
    912 /*
    913  * dequeue routine:
    914  *	must be called in splnet.
    915  *
    916  *	returns: mbuf dequeued.
    917  *		 NULL when no packet is available in the queue.
    918  */
    919 
    920 static struct mbuf *
    921 red_dequeue(ifq, op)
    922 	struct ifaltq *ifq;
    923 	int op;
    924 {
    925 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
    926 	struct mbuf *m;
    927 
    928 	if (op == ALTDQ_POLL)
    929 		return qhead(rqp->rq_q);
    930 
    931 	/* op == ALTDQ_REMOVE */
    932 	m =  red_getq(rqp->rq_red, rqp->rq_q);
    933 	if (m != NULL)
    934 		ifq->ifq_len--;
    935 	return (m);
    936 }
    937 
    938 struct mbuf *
    939 red_getq(rp, q)
    940 	red_t *rp;
    941 	class_queue_t *q;
    942 {
    943 	struct mbuf *m;
    944 
    945 	if ((m = _getq(q)) == NULL) {
    946 		if (rp->red_idle == 0) {
    947 			rp->red_idle = 1;
    948 			microtime(&rp->red_last);
    949 		}
    950 		return NULL;
    951 	}
    952 
    953 	rp->red_idle = 0;
    954 	return (m);
    955 }
    956 
    957 static int
    958 red_request(ifq, req, arg)
    959 	struct ifaltq *ifq;
    960 	int req;
    961 	void *arg;
    962 {
    963 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
    964 
    965 	switch (req) {
    966 	case ALTRQ_PURGE:
    967 		red_purgeq(rqp);
    968 		break;
    969 	}
    970 	return (0);
    971 }
    972 
    973 static void
    974 red_purgeq(rqp)
    975 	red_queue_t *rqp;
    976 {
    977 	_flushq(rqp->rq_q);
    978 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
    979 		rqp->rq_ifq->ifq_len = 0;
    980 }
    981 
    982 
    983 /*
    984  * helper routine to calibrate avg during idle.
    985  * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
    986  * here Wq = 1/weight and the code assumes Wq is close to zero.
    987  *
    988  * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
    989  */
    990 static struct wtab *wtab_list = NULL;	/* pointer to wtab list */
    991 
    992 struct wtab *
    993 wtab_alloc(weight)
    994 	int weight;
    995 {
    996 	struct wtab *w;
    997 	int i;
    998 
    999 	for (w = wtab_list; w != NULL; w = w->w_next)
   1000 		if (w->w_weight == weight) {
   1001 			w->w_refcount++;
   1002 			return (w);
   1003 		}
   1004 
   1005 	w = malloc(sizeof(struct wtab), M_DEVBUF, M_WAITOK|M_ZERO);
   1006 	if (w == NULL)
   1007 		panic("wtab_alloc: malloc failed!");
   1008 	w->w_weight = weight;
   1009 	w->w_refcount = 1;
   1010 	w->w_next = wtab_list;
   1011 	wtab_list = w;
   1012 
   1013 	/* initialize the weight table */
   1014 	w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
   1015 	for (i = 1; i < 32; i++) {
   1016 		w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
   1017 		if (w->w_tab[i] == 0 && w->w_param_max == 0)
   1018 			w->w_param_max = 1 << i;
   1019 	}
   1020 
   1021 	return (w);
   1022 }
   1023 
   1024 int
   1025 wtab_destroy(w)
   1026 	struct wtab *w;
   1027 {
   1028 	struct wtab *prev;
   1029 
   1030 	if (--w->w_refcount > 0)
   1031 		return (0);
   1032 
   1033 	if (wtab_list == w)
   1034 		wtab_list = w->w_next;
   1035 	else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
   1036 		if (prev->w_next == w) {
   1037 			prev->w_next = w->w_next;
   1038 			break;
   1039 		}
   1040 
   1041 	free(w, M_DEVBUF);
   1042 	return (0);
   1043 }
   1044 
   1045 int32_t
   1046 pow_w(w, n)
   1047 	struct wtab *w;
   1048 	int n;
   1049 {
   1050 	int	i, bit;
   1051 	int32_t	val;
   1052 
   1053 	if (n >= w->w_param_max)
   1054 		return (0);
   1055 
   1056 	val = 1 << FP_SHIFT;
   1057 	if (n <= 0)
   1058 		return (val);
   1059 
   1060 	bit = 1;
   1061 	i = 0;
   1062 	while (n) {
   1063 		if (n & bit) {
   1064 			val = (val * w->w_tab[i]) >> FP_SHIFT;
   1065 			n &= ~bit;
   1066 		}
   1067 		i++;
   1068 		bit <<=  1;
   1069 	}
   1070 	return (val);
   1071 }
   1072 
   1073 #ifdef ALTQ_FLOWVALVE
   1074 
   1075 #define	FV_PSHIFT	7	/* weight of average drop rate -- 1/128 */
   1076 #define	FV_PSCALE(x)	((x) << FV_PSHIFT)
   1077 #define	FV_PUNSCALE(x)	((x) >> FV_PSHIFT)
   1078 #define	FV_FSHIFT	5	/* weight of average fraction -- 1/32 */
   1079 #define	FV_FSCALE(x)	((x) << FV_FSHIFT)
   1080 #define	FV_FUNSCALE(x)	((x) >> FV_FSHIFT)
   1081 
   1082 #define	FV_TIMER	(3 * hz)	/* timer value for garbage collector */
   1083 #define	FV_FLOWLISTSIZE		64	/* how many flows in flowlist */
   1084 
   1085 #define	FV_N			10	/* update fve_f every FV_N packets */
   1086 
   1087 #define	FV_BACKOFFTHRESH	1  /* backoff threshold interval in second */
   1088 #define	FV_TTHRESH		3  /* time threshold to delete fve */
   1089 #define	FV_ALPHA		5  /* extra packet count */
   1090 
   1091 #if (__FreeBSD_version > 300000)
   1092 #define	FV_TIMESTAMP(tp)	getmicrotime(tp)
   1093 #else
   1094 #define	FV_TIMESTAMP(tp)	{ (*(tp)) = time; }
   1095 #endif
   1096 
   1097 /*
   1098  * Brtt table: 127 entry table to convert drop rate (p) to
   1099  * the corresponding bandwidth fraction (f)
   1100  * the following equation is implemented to use scaled values,
   1101  * fve_p and fve_f, in the fixed point format.
   1102  *
   1103  *   Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p))
   1104  *   f = Brtt(p) / (max_th + alpha)
   1105  */
   1106 #define	BRTT_SIZE	128
   1107 #define	BRTT_SHIFT	12
   1108 #define	BRTT_MASK	0x0007f000
   1109 #define	BRTT_PMAX	(1 << (FV_PSHIFT + FP_SHIFT))
   1110 
/*
 * precomputed Brtt(p) values (see the formula above), indexed by the
 * scaled drop rate extracted in fv_p2f().  entry 0 is a placeholder:
 * fv_alloc() builds its table from index 1 and fv_p2f() never
 * indexes entry 0.
 */
const int brtt_tab[BRTT_SIZE] = {
	0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728,
	392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361,
	225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333,
	145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612,
	98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957,
	67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440,
	47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184,
	33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611,
	24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062,
	18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487,
	14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222,
	10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844,
	8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079,
	6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746,
	5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722,
	4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924
};
   1129 
/*
 * look up the flow entry (fve) matching the packet in pktattr.
 * the flowlist is kept in LRU order with free entries (fve_lastdrop
 * zeroed) behind all active ones, so the scan stops at the first
 * free or expired entry.  on a miss, the number of active entries
 * scanned is saved in fv->fv_flows as a side effect.
 * returns NULL when there is no entry or the protocol is unknown.
 */
static inline struct fve *
flowlist_lookup(fv, pktattr, now)
	struct flowvalve *fv;
	struct altq_pktattr *pktattr;
	struct timeval *now;
{
	struct fve *fve;
	int flows;
	struct ip *ip;
#ifdef INET6
	struct ip6_hdr *ip6;
#endif
	struct timeval tthresh;

	if (pktattr == NULL)
		return (NULL);

	/* entries whose last drop is older than FV_TTHRESH sec are stale */
	tthresh.tv_sec = now->tv_sec - FV_TTHRESH;
	flows = 0;
	/*
	 * search the flow list
	 */
	switch (pktattr->pattr_af) {
	case AF_INET:
		ip = (struct ip *)pktattr->pattr_hdr;
		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
			if (fve->fve_lastdrop.tv_sec == 0)
				/* free entry: only free entries follow */
				break;
			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
				/* stale entry: mark it free and stop */
				fve->fve_lastdrop.tv_sec = 0;
				break;
			}
			if (fve->fve_flow.flow_af == AF_INET &&
			    fve->fve_flow.flow_ip.ip_src.s_addr ==
			    ip->ip_src.s_addr &&
			    fve->fve_flow.flow_ip.ip_dst.s_addr ==
			    ip->ip_dst.s_addr)
				return (fve);
			flows++;
		}
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
			if (fve->fve_lastdrop.tv_sec == 0)
				/* free entry: only free entries follow */
				break;
			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
				/* stale entry: mark it free and stop */
				fve->fve_lastdrop.tv_sec = 0;
				break;
			}
			if (fve->fve_flow.flow_af == AF_INET6 &&
			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src,
					       &ip6->ip6_src) &&
			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst,
					       &ip6->ip6_dst))
				return (fve);
			flows++;
		}
		break;
#endif /* INET6 */

	default:
		/* unknown protocol.  no drop. */
		return (NULL);
	}
	fv->fv_flows = flows;	/* save the number of active fve's */
	return (NULL);
}
   1199 
   1200 static inline struct fve *
   1201 flowlist_reclaim(fv, pktattr)
   1202 	struct flowvalve *fv;
   1203 	struct altq_pktattr *pktattr;
   1204 {
   1205 	struct fve *fve;
   1206 	struct ip *ip;
   1207 #ifdef INET6
   1208 	struct ip6_hdr *ip6;
   1209 #endif
   1210 
   1211 	/*
   1212 	 * get an entry from the tail of the LRU list.
   1213 	 */
   1214 	fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead);
   1215 
   1216 	switch (pktattr->pattr_af) {
   1217 	case AF_INET:
   1218 		ip = (struct ip *)pktattr->pattr_hdr;
   1219 		fve->fve_flow.flow_af = AF_INET;
   1220 		fve->fve_flow.flow_ip.ip_src = ip->ip_src;
   1221 		fve->fve_flow.flow_ip.ip_dst = ip->ip_dst;
   1222 		break;
   1223 #ifdef INET6
   1224 	case AF_INET6:
   1225 		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
   1226 		fve->fve_flow.flow_af = AF_INET6;
   1227 		fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src;
   1228 		fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst;
   1229 		break;
   1230 #endif
   1231 	}
   1232 
   1233 	fve->fve_state = Green;
   1234 	fve->fve_p = 0.0;
   1235 	fve->fve_f = 0.0;
   1236 	fve->fve_ifseq = fv->fv_ifseq - 1;
   1237 	fve->fve_count = 0;
   1238 
   1239 	fv->fv_flows++;
   1240 #ifdef FV_STATS
   1241 	fv->fv_stats.alloc++;
   1242 #endif
   1243 	return (fve);
   1244 }
   1245 
   1246 static inline void
   1247 flowlist_move_to_head(fv, fve)
   1248 	struct flowvalve *fv;
   1249 	struct fve *fve;
   1250 {
   1251 	if (TAILQ_FIRST(&fv->fv_flowlist) != fve) {
   1252 		TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru);
   1253 		TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru);
   1254 	}
   1255 }
   1256 
   1257 /*
   1258  * allocate flowvalve structure
   1259  */
   1260 static struct flowvalve *
   1261 fv_alloc(rp)
   1262 	struct red *rp;
   1263 {
   1264 	struct flowvalve *fv;
   1265 	struct fve *fve;
   1266 	int i, num;
   1267 
   1268 	num = FV_FLOWLISTSIZE;
   1269 	fv = malloc(sizeof(struct flowvalve), M_DEVBUF, M_WAITOK|M_ZERO);
   1270 	if (fv == NULL)
   1271 		return (NULL);
   1272 
   1273 	fv->fv_fves = malloc(sizeof(struct fve) * num, M_DEVBUF,
   1274 	    M_WAITOK|M_ZERO);
   1275 	if (fv->fv_fves == NULL) {
   1276 		free(fv, M_DEVBUF);
   1277 		return (NULL);
   1278 	}
   1279 
   1280 	fv->fv_flows = 0;
   1281 	TAILQ_INIT(&fv->fv_flowlist);
   1282 	for (i = 0; i < num; i++) {
   1283 		fve = &fv->fv_fves[i];
   1284 		fve->fve_lastdrop.tv_sec = 0;
   1285 		TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru);
   1286 	}
   1287 
   1288 	/* initialize drop rate threshold in scaled fixed-point */
   1289 	fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax;
   1290 
   1291 	/* initialize drop rate to fraction table */
   1292 	fv->fv_p2ftab = malloc(sizeof(int) * BRTT_SIZE, M_DEVBUF, M_WAITOK);
   1293 	if (fv->fv_p2ftab == NULL) {
   1294 		free(fv->fv_fves, M_DEVBUF);
   1295 		free(fv, M_DEVBUF);
   1296 		return (NULL);
   1297 	}
   1298 	/*
   1299 	 * create the p2f table.
   1300 	 * (shift is used to keep the precision)
   1301 	 */
   1302 	for (i = 1; i < BRTT_SIZE; i++) {
   1303 		int f;
   1304 
   1305 		f = brtt_tab[i] << 8;
   1306 		fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8;
   1307 	}
   1308 
   1309 	return (fv);
   1310 }
   1311 
   1312 static void fv_destroy(fv)
   1313 	struct flowvalve *fv;
   1314 {
   1315 	free(fv->fv_p2ftab, M_DEVBUF);
   1316 	free(fv->fv_fves, M_DEVBUF);
   1317 	free(fv, M_DEVBUF);
   1318 }
   1319 
   1320 static inline int
   1321 fv_p2f(fv, p)
   1322 	struct flowvalve	*fv;
   1323 	int	p;
   1324 {
   1325 	int val, f;
   1326 
   1327 	if (p >= BRTT_PMAX)
   1328 		f = fv->fv_p2ftab[BRTT_SIZE-1];
   1329 	else if ((val = (p & BRTT_MASK)))
   1330 		f = fv->fv_p2ftab[(val >> BRTT_SHIFT)];
   1331 	else
   1332 		f = fv->fv_p2ftab[1];
   1333 	return (f);
   1334 }
   1335 
   1336 /*
   1337  * check if an arriving packet should be pre-dropped.
   1338  * called from red_addq() when a packet arrives.
   1339  * returns 1 when the packet should be pre-dropped.
   1340  * should be called in splnet.
   1341  */
static int
fv_checkflow(fv, pktattr, fcache)
	struct flowvalve *fv;
	struct altq_pktattr *pktattr;
	struct fve **fcache;
{
	struct fve *fve;
	struct timeval now;

	/* count every packet seen by the interface */
	fv->fv_ifseq++;
	FV_TIMESTAMP(&now);

	if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
		/* no matching entry in the flowlist */
		return (0);

	/* cache the entry so fv_dropbyred() can skip the lookup */
	*fcache = fve;

	/* update fraction f for every FV_N packets */
	if (++fve->fve_count == FV_N) {
		/*
		 * EWMA of this flow's share of forwarded packets:
		 * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f
		 */
		fve->fve_f =
			(FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq)
			+ fve->fve_f - FV_FUNSCALE(fve->fve_f);
		fve->fve_ifseq = fv->fv_ifseq;
		fve->fve_count = 0;
	}

	/*
	 * overpumping test: a Green flow whose drop rate exceeds the
	 * threshold turns Red when its bandwidth fraction is also too
	 * large for that drop rate.
	 */
	if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) {
		int fthresh;

		/* calculate a threshold */
		fthresh = fv_p2f(fv, fve->fve_p);
		if (fve->fve_f > fthresh)
			fve->fve_state = Red;
	}

	if (fve->fve_state == Red) {
		/*
		 * backoff test
		 */
		if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) {
			/* no drop for at least FV_BACKOFFTHRESH sec */
			fve->fve_p = 0;
			fve->fve_state = Green;
#ifdef FV_STATS
			fv->fv_stats.escape++;
#endif
		} else {
			/* block this flow */
			flowlist_move_to_head(fv, fve);
			fve->fve_lastdrop = now;
#ifdef FV_STATS
			fv->fv_stats.predrop++;
#endif
			return (1);
		}
	}

	/*
	 * decay the drop rate estimate on every accepted packet:
	 *	p = (1 - Wp) * p
	 */
	fve->fve_p -= FV_PUNSCALE(fve->fve_p);
	if (fve->fve_p < 0)
		fve->fve_p = 0;
#ifdef FV_STATS
	fv->fv_stats.pass++;
#endif
	return (0);
}
   1417 
   1418 /*
   1419  * called from red_addq when a packet is dropped by red.
   1420  * should be called in splnet.
   1421  */
static void fv_dropbyred(fv, pktattr, fcache)
	struct flowvalve *fv;
	struct altq_pktattr *pktattr;
	struct fve *fcache;
{
	struct fve *fve;
	struct timeval now;

	if (pktattr == NULL)
		return;
	FV_TIMESTAMP(&now);

	if (fcache != NULL)
		/* the fve of this packet is already cached */
		fve = fcache;
	else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
		/* no entry for this flow: recycle the LRU tail for it */
		fve = flowlist_reclaim(fv, pktattr);

	/* the flow just saw a drop, so it is the most recently active */
	flowlist_move_to_head(fv, fve);

	/*
	 * update p:  the following line cancels the update
	 *	      in fv_checkflow() and calculate
	 *	p = Wp + (1 - Wp) * p
	 */
	fve->fve_p = (1 << FP_SHIFT) + fve->fve_p;

	fve->fve_lastdrop = now;
}
   1451 
   1452 #endif /* ALTQ_FLOWVALVE */
   1453 
   1454 #ifdef KLD_MODULE
   1455 
   1456 static struct altqsw red_sw =
   1457 	{"red", redopen, redclose, redioctl};
   1458 
   1459 ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw);
   1460 
   1461 #endif /* KLD_MODULE */
   1462 
   1463 #endif /* ALTQ_RED */
   1464