Home | History | Annotate | Line # | Download | only in altq
altq_red.c revision 1.21
      1 /*	$NetBSD: altq_red.c,v 1.21 2006/10/12 19:59:08 peter Exp $	*/
      2 /*	$KAME: altq_red.c,v 1.20 2005/04/13 03:44:25 suz Exp $	*/
      3 
      4 /*
      5  * Copyright (C) 1997-2003
      6  *	Sony Computer Science Laboratories Inc.  All rights reserved.
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  * 1. Redistributions of source code must retain the above copyright
     12  *    notice, this list of conditions and the following disclaimer.
     13  * 2. Redistributions in binary form must reproduce the above copyright
     14  *    notice, this list of conditions and the following disclaimer in the
     15  *    documentation and/or other materials provided with the distribution.
     16  *
     17  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
     18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     20  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
     21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     27  * SUCH DAMAGE.
     28  *
     29  */
     30 /*
     31  * Copyright (c) 1990-1994 Regents of the University of California.
     32  * All rights reserved.
     33  *
     34  * Redistribution and use in source and binary forms, with or without
     35  * modification, are permitted provided that the following conditions
     36  * are met:
     37  * 1. Redistributions of source code must retain the above copyright
     38  *    notice, this list of conditions and the following disclaimer.
     39  * 2. Redistributions in binary form must reproduce the above copyright
     40  *    notice, this list of conditions and the following disclaimer in the
     41  *    documentation and/or other materials provided with the distribution.
     42  * 3. All advertising materials mentioning features or use of this software
     43  *    must display the following acknowledgement:
     44  *	This product includes software developed by the Computer Systems
     45  *	Engineering Group at Lawrence Berkeley Laboratory.
     46  * 4. Neither the name of the University nor of the Laboratory may be used
     47  *    to endorse or promote products derived from this software without
     48  *    specific prior written permission.
     49  *
     50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     60  * SUCH DAMAGE.
     61  */
     62 
     63 #include <sys/cdefs.h>
     64 __KERNEL_RCSID(0, "$NetBSD: altq_red.c,v 1.21 2006/10/12 19:59:08 peter Exp $");
     65 
     66 #ifdef _KERNEL_OPT
     67 #include "opt_altq.h"
     68 #include "opt_inet.h"
     69 #endif
     70 
     71 #ifdef ALTQ_RED	/* red is enabled by ALTQ_RED option in opt_altq.h */
     72 
     73 #include <sys/param.h>
     74 #include <sys/malloc.h>
     75 #include <sys/mbuf.h>
     76 #include <sys/socket.h>
     77 #include <sys/systm.h>
     78 #include <sys/errno.h>
     79 #include <sys/kauth.h>
     80 #if 1 /* ALTQ3_COMPAT */
     81 #include <sys/sockio.h>
     82 #include <sys/proc.h>
     83 #include <sys/kernel.h>
     84 #include <sys/kauth.h>
     85 #ifdef ALTQ_FLOWVALVE
     86 #include <sys/queue.h>
     87 #include <sys/time.h>
     88 #endif
     89 #endif /* ALTQ3_COMPAT */
     90 
     91 #include <net/if.h>
     92 
     93 #include <netinet/in.h>
     94 #include <netinet/in_systm.h>
     95 #include <netinet/ip.h>
     96 #ifdef INET6
     97 #include <netinet/ip6.h>
     98 #endif
     99 
    100 #include <net/pfvar.h>
    101 #include <altq/altq.h>
    102 #include <altq/altq_red.h>
    103 #ifdef ALTQ3_COMPAT
    104 #include <altq/altq_conf.h>
    105 #ifdef ALTQ_FLOWVALVE
    106 #include <altq/altq_flowvalve.h>
    107 #endif
    108 #endif
    109 
    110 /*
    111  * ALTQ/RED (Random Early Detection) implementation using 32-bit
    112  * fixed-point calculation.
    113  *
    114  * written by kjc using the ns code as a reference.
    115  * you can learn more about red and ns from Sally's home page at
    116  * http://www-nrg.ee.lbl.gov/floyd/
    117  *
    118  * most of the red parameter values are fixed in this implementation
    119  * to prevent fixed-point overflow/underflow.
    120  * if you change the parameters, watch out for overflow/underflow!
    121  *
    122  * the parameters used are recommended values by Sally.
    123  * the corresponding ns config looks:
    124  *	q_weight=0.00195
    125  *	minthresh=5 maxthresh=15 queue-size=60
    126  *	linterm=30
    127  *	dropmech=drop-tail
    128  *	bytes=false (can't be handled by 32-bit fixed-point)
    129  *	doubleq=false dqthresh=false
    130  *	wait=true
    131  */
    132 /*
    133  * alternative red parameters for a slow link.
    134  *
    135  * assume the queue length becomes from zero to L and keeps L, it takes
    136  * N packets for q_avg to reach 63% of L.
    137  * when q_weight is 0.002, N is about 500 packets.
    138  * for a slow link like dial-up, 500 packets takes more than 1 minute!
    139  * when q_weight is 0.008, N is about 127 packets.
    140  * when q_weight is 0.016, N is about 63 packets.
    141  * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
    142  * are allowed for 0.016.
    143  * see Sally's paper for more details.
    144  */
    145 /* normal red parameters */
    146 #define	W_WEIGHT	512	/* inverse of weight of EWMA (511/512) */
    147 				/* q_weight = 0.00195 */
    148 
    149 /* red parameters for a slow link */
    150 #define	W_WEIGHT_1	128	/* inverse of weight of EWMA (127/128) */
    151 				/* q_weight = 0.0078125 */
    152 
    153 /* red parameters for a very slow link (e.g., dialup) */
    154 #define	W_WEIGHT_2	64	/* inverse of weight of EWMA (63/64) */
    155 				/* q_weight = 0.015625 */
    156 
    157 /* fixed-point uses 12-bit decimal places */
    158 #define	FP_SHIFT	12	/* fixed-point shift */
    159 
    160 /* red parameters for drop probability */
    161 #define	INV_P_MAX	10	/* inverse of max drop probability */
    162 #define	TH_MIN		5	/* min threshold */
    163 #define	TH_MAX		15	/* max threshold */
    164 
#define	RED_LIMIT	60	/* default max queue length */
    166 #define	RED_STATS		/* collect statistics */
    167 
    168 /*
    169  * our default policy for forced-drop is drop-tail.
    170  * (in altq-1.1.2 or earlier, the default was random-drop.
    171  * but it makes more sense to punish the cause of the surge.)
    172  * to switch to the random-drop policy, define "RED_RANDOM_DROP".
    173  */
    174 
    175 #ifdef ALTQ3_COMPAT
    176 #ifdef ALTQ_FLOWVALVE
    177 /*
 * flow-valve is an extension to protect red from unresponsive flows
    179  * and to promote end-to-end congestion control.
    180  * flow-valve observes the average drop rates of the flows that have
    181  * experienced packet drops in the recent past.
    182  * when the average drop rate exceeds the threshold, the flow is
    183  * blocked by the flow-valve.  the trapped flow should back off
    184  * exponentially to escape from the flow-valve.
    185  */
    186 #ifdef RED_RANDOM_DROP
    187 #error "random-drop can't be used with flow-valve!"
    188 #endif
    189 #endif /* ALTQ_FLOWVALVE */
    190 
    191 /* red_list keeps all red_queue_t's allocated. */
    192 static red_queue_t *red_list = NULL;
    193 
    194 #endif /* ALTQ3_COMPAT */
    195 
    196 /* default red parameter values */
    197 static int default_th_min = TH_MIN;
    198 static int default_th_max = TH_MAX;
    199 static int default_inv_pmax = INV_P_MAX;
    200 
    201 #ifdef ALTQ3_COMPAT
    202 /* internal function prototypes */
    203 static int red_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *);
    204 static struct mbuf *red_dequeue(struct ifaltq *, int);
    205 static int red_request(struct ifaltq *, int, void *);
    206 static void red_purgeq(red_queue_t *);
    207 static int red_detach(red_queue_t *);
    208 #ifdef ALTQ_FLOWVALVE
    209 static inline struct fve *flowlist_lookup(struct flowvalve *,
    210 			 struct altq_pktattr *, struct timeval *);
    211 static inline struct fve *flowlist_reclaim(struct flowvalve *,
    212 					     struct altq_pktattr *);
    213 static inline void flowlist_move_to_head(struct flowvalve *, struct fve *);
    214 static inline int fv_p2f(struct flowvalve *, int);
    215 static struct flowvalve *fv_alloc(struct red *);
    216 static void fv_destroy(struct flowvalve *);
    217 static int fv_checkflow(struct flowvalve *, struct altq_pktattr *,
    218 			struct fve **);
    219 static void fv_dropbyred(struct flowvalve *fv, struct altq_pktattr *,
    220 			 struct fve *);
    221 #endif
    222 #endif /* ALTQ3_COMPAT */
    223 
    224 /*
    225  * red support routines
    226  */
/*
 * Allocate and initialize a RED state block.
 *
 * Any parameter passed as 0 selects a built-in default.  weight is the
 * inverse of the EWMA weight and must be a power of 2 (it is rounded and
 * a diagnostic is printed if not).  pkttime is the average transmission
 * time of a packet in usec; it drives both the automatic weight selection
 * for slow links and the idle-time compensation in red_addq().
 *
 * Returns the new red_t, or NULL if allocation fails.
 */
red_t *
red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags,
   int pkttime)
{
	red_t	*rp;
	int	 w, i;
	int	 npkts_per_sec;

	rp = malloc(sizeof(red_t), M_DEVBUF, M_WAITOK|M_ZERO);
	if (rp == NULL)
		return (NULL);

	/* start in the idle state with an empty average */
	rp->red_avg = 0;
	rp->red_idle = 1;

	/* fill in defaults for any zero parameter */
	if (weight == 0)
		rp->red_weight = W_WEIGHT;
	else
		rp->red_weight = weight;
	if (inv_pmax == 0)
		rp->red_inv_pmax = default_inv_pmax;
	else
		rp->red_inv_pmax = inv_pmax;
	if (th_min == 0)
		rp->red_thmin = default_th_min;
	else
		rp->red_thmin = th_min;
	if (th_max == 0)
		rp->red_thmax = default_th_max;
	else
		rp->red_thmax = th_max;

	rp->red_flags = flags;

	if (pkttime == 0)
		/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
		rp->red_pkttime = 800;
	else
		rp->red_pkttime = pkttime;

	if (weight == 0) {
		/* when the link is very slow, adjust red parameters */
		npkts_per_sec = 1000000 / rp->red_pkttime;
		if (npkts_per_sec < 50) {
			/* up to about 400Kbps */
			rp->red_weight = W_WEIGHT_2;
		} else if (npkts_per_sec < 300) {
			/* up to about 2.4Mbps */
			rp->red_weight = W_WEIGHT_1;
		}
	}

	/* calculate wshift.  weight must be power of 2 */
	w = rp->red_weight;
	for (i = 0; w > 1; i++)
		w = w >> 1;
	rp->red_wshift = i;
	w = 1 << rp->red_wshift;
	if (w != rp->red_weight) {
		/* weight was not a power of 2: round down and warn */
		printf("invalid weight value %d for red! use %d\n",
		       rp->red_weight, w);
		rp->red_weight = w;
	}

	/*
	 * thmin_s and thmax_s are scaled versions of th_min and th_max
	 * to be compared with avg.
	 */
	rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
	rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);

	/*
	 * precompute probability denominator
	 *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
	 */
	rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
			 * rp->red_inv_pmax) << FP_SHIFT;

	/* allocate (or share) the (1-Wq)^n weight table used by pow_w() */
	rp->red_wtab = wtab_alloc(rp->red_weight);

	/* record the current time as the start of the idle period */
	microtime(&rp->red_last);
#ifdef ALTQ3_COMPAT
#ifdef ALTQ_FLOWVALVE
	if (flags & REDF_FLOWVALVE)
		rp->red_flowvalve = fv_alloc(rp);
	/* if fv_alloc fails, flowvalve is just disabled */
#endif
#endif /* ALTQ3_COMPAT */
	return (rp);
}
    318 
/*
 * Release a red_t allocated by red_alloc(): drop the reference on the
 * shared weight table (freeing it on last use), destroy the optional
 * flow-valve state, and free the state block itself.
 */
void
red_destroy(red_t *rp)
{
#ifdef ALTQ3_COMPAT
#ifdef ALTQ_FLOWVALVE
	if (rp->red_flowvalve != NULL)
		fv_destroy(rp->red_flowvalve);
#endif
#endif /* ALTQ3_COMPAT */
	wtab_destroy(rp->red_wtab);
	free(rp, M_DEVBUF);
}
    331 
    332 void
    333 red_getstats(red_t *rp, struct redstats *sp)
    334 {
    335 	sp->q_avg		= rp->red_avg >> rp->red_wshift;
    336 	sp->xmit_cnt		= rp->red_stats.xmit_cnt;
    337 	sp->drop_cnt		= rp->red_stats.drop_cnt;
    338 	sp->drop_forced		= rp->red_stats.drop_forced;
    339 	sp->drop_unforced	= rp->red_stats.drop_unforced;
    340 	sp->marked_packets	= rp->red_stats.marked_packets;
    341 }
    342 
/*
 * Enqueue a packet subject to RED.
 *
 * Updates the EWMA of the queue length (compensating for idle time),
 * then decides between no drop, an unforced (early/random) drop, and a
 * forced drop (avg above th_max or queue at its hard limit).  When ECN
 * is enabled, an early drop may instead be converted into a CE mark.
 *
 * Returns 0 when the packet was enqueued, -1 when it was dropped (the
 * mbuf is freed in that case).
 */
int
red_addq(red_t *rp, class_queue_t *q, struct mbuf *m,
    struct altq_pktattr *pktattr)
{
	int avg, droptype;
	int n;
#ifdef ALTQ3_COMPAT
#ifdef ALTQ_FLOWVALVE
	struct fve *fve = NULL;

	/* flows currently trapped by the flow-valve are dropped outright */
	if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0)
		if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) {
			m_freem(m);
			return (-1);
		}
#endif
#endif /* ALTQ3_COMPAT */

	avg = rp->red_avg;

	/*
	 * if we were idle, we pretend that n packets arrived during
	 * the idle period.
	 */
	if (rp->red_idle) {
		struct timeval now;
		int t;

		rp->red_idle = 0;
		microtime(&now);
		t = (now.tv_sec - rp->red_last.tv_sec);
		if (t > 60) {
			/*
			 * being idle for more than 1 minute, set avg to zero.
			 * this prevents t from overflow.
			 */
			avg = 0;
		} else {
			t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
			n = t / rp->red_pkttime - 1;

			/* the following line does (avg = (1 - Wq)^n * avg) */
			if (n > 0)
				avg = (avg >> FP_SHIFT) *
				    pow_w(rp->red_wtab, n);
		}
	}

	/* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
	avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
	rp->red_avg = avg;		/* save the new value */

	/*
	 * red_count keeps a tally of arriving traffic that has not
	 * been dropped.
	 */
	rp->red_count++;

	/* see if we drop early */
	droptype = DTYPE_NODROP;
	if (avg >= rp->red_thmin_s && qlen(q) > 1) {
		if (avg >= rp->red_thmax_s) {
			/* avg >= th_max: forced drop */
			droptype = DTYPE_FORCED;
		} else if (rp->red_old == 0) {
			/* first exceeds th_min: restart the count, no drop */
			rp->red_count = 1;
			rp->red_old = 1;
		} else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
				      rp->red_probd, rp->red_count)) {
			/* mark or drop by red */
			if ((rp->red_flags & REDF_ECN) &&
			    mark_ecn(m, pktattr, rp->red_flags)) {
				/* successfully marked.  do not drop. */
				rp->red_count = 0;
#ifdef RED_STATS
				rp->red_stats.marked_packets++;
#endif
			} else {
				/* unforced drop by red */
				droptype = DTYPE_EARLY;
			}
		}
	} else {
		/* avg < th_min */
		rp->red_old = 0;
	}

	/*
	 * if the queue length hits the hard limit, it's a forced drop.
	 */
	if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
		droptype = DTYPE_FORCED;

#ifdef RED_RANDOM_DROP
	/* if successful or forced drop, enqueue this packet. */
	if (droptype != DTYPE_EARLY)
		_addq(q, m);
#else
	/* if successful, enqueue this packet. */
	if (droptype == DTYPE_NODROP)
		_addq(q, m);
#endif
	if (droptype != DTYPE_NODROP) {
		if (droptype == DTYPE_EARLY) {
			/* drop the incoming packet */
#ifdef RED_STATS
			rp->red_stats.drop_unforced++;
#endif
		} else {
			/* forced drop, select a victim packet in the queue. */
#ifdef RED_RANDOM_DROP
			m = _getq_random(q);
#endif
#ifdef RED_STATS
			rp->red_stats.drop_forced++;
#endif
		}
#ifdef RED_STATS
		PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
#endif
		/* a drop resets the undropped-arrival tally */
		rp->red_count = 0;
#ifdef ALTQ3_COMPAT
#ifdef ALTQ_FLOWVALVE
		if (rp->red_flowvalve != NULL)
			fv_dropbyred(rp->red_flowvalve, pktattr, fve);
#endif
#endif /* ALTQ3_COMPAT */
		m_freem(m);
		return (-1);
	}
	/* successfully queued */
#ifdef RED_STATS
	PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
#endif
	return (0);
}
    480 
    481 /*
    482  * early-drop probability is calculated as follows:
    483  *   prob = p_max * (avg - th_min) / (th_max - th_min)
    484  *   prob_a = prob / (2 - count*prob)
    485  *	    = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
    486  * here prob_a increases as successive undrop count increases.
    487  * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
    488  * becomes 1 when (count >= (2 / prob))).
    489  */
/*
 * Decide whether to early-drop (or mark) a packet.
 *
 * fp_len is (avg - th_min) descaled to fixed-point, fp_probd the
 * precomputed probability denominator, and count the number of packets
 * accepted since the last drop.  The effective drop probability is
 * fp_len / (fp_probd - count * fp_len), so it grows as count grows.
 * Returns 1 to drop/mark, 0 to accept.
 */
int
drop_early(int fp_len, int fp_probd, int count)
{
	int denom = fp_probd - count * fp_len;

	if (denom <= 0) {
		/* count exceeds the hard limit: drop or mark */
		return (1);
	}

	/*
	 * now the range of denom is [1..600] in fixed-point. (when
	 * th_max-th_min=10 and p_max=1/30)
	 * drop probability = (avg - TH_MIN) / denom
	 */
	return ((arc4random() % denom) < fp_len) ? 1 : 0;
}
    513 
    514 /*
    515  * try to mark CE bit to the packet.
    516  *    returns 1 if successfully marked, 0 otherwise.
    517  */
/*
 * try to mark CE bit to the packet.
 *    returns 1 if successfully marked, 0 otherwise.
 *
 * The IP header location is taken from the pf mbuf tag when present,
 * otherwise (ALTQ3 compatibility) from the packet attributes.  Marking
 * is only attempted when the corresponding REDF_ECN4/REDF_ECN6 flag is
 * set and the packet is ECN-capable (ECT set, not already CE).
 */
int
mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags)
{
	struct mbuf	*m0;
	struct m_tag	*t;
	struct altq_tag	*at;
	void		*hdr;
	int		 af;

	t = m_tag_find(m, PACKET_TAG_PF_QID, NULL);
	if (t != NULL) {
		at = (struct altq_tag *)(t + 1);
		/*
		 * NOTE(review): `at' points just past the tag header, so it
		 * can never be NULL here; this check looks dead.
		 */
		if (at == NULL)
			return (0);
		af = at->af;
		hdr = at->hdr;
#ifdef ALTQ3_COMPAT
	} else if (pktattr != NULL) {
		af = pktattr->pattr_af;
		hdr = pktattr->pattr_hdr;
#endif /* ALTQ3_COMPAT */
	} else
		return (0);

	if (af != AF_INET && af != AF_INET6)
		return (0);

	/* verify that pattr_hdr is within the mbuf data */
	for (m0 = m; m0 != NULL; m0 = m0->m_next)
		if (((caddr_t)hdr >= m0->m_data) &&
		    ((caddr_t)hdr < m0->m_data + m0->m_len))
			break;
	if (m0 == NULL) {
		/* ick, tag info is stale */
		return (0);
	}

	switch (af) {
	case AF_INET:
		if (flags & REDF_ECN4) {
			struct ip *ip = hdr;
			u_int8_t otos;
			int sum;

			if (ip->ip_v != 4)
				return (0);	/* version mismatch! */

			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
				return (0);	/* not-ECT */
			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
				return (1);	/* already marked */

			/*
			 * ecn-capable but not marked,
			 * mark CE and update checksum
			 */
			otos = ip->ip_tos;
			ip->ip_tos |= IPTOS_ECN_CE;
			/*
			 * update checksum (from RFC1624)
			 *	   HC' = ~(~HC + ~m + m')
			 */
			sum = ~ntohs(ip->ip_sum) & 0xffff;
			sum += (~otos & 0xffff) + ip->ip_tos;
			sum = (sum >> 16) + (sum & 0xffff);
			sum += (sum >> 16);  /* add carry */
			ip->ip_sum = htons(~sum & 0xffff);
			return (1);
		}
		break;
#ifdef INET6
	case AF_INET6:
		if (flags & REDF_ECN6) {
			struct ip6_hdr *ip6 = hdr;
			u_int32_t flowlabel;

			/* ECN bits live in the traffic class field,
			 * i.e. bits 20-27 of the flow word */
			flowlabel = ntohl(ip6->ip6_flow);
			if ((flowlabel >> 28) != 6)
				return (0);	/* version mismatch! */
			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
			    (IPTOS_ECN_NOTECT << 20))
				return (0);	/* not-ECT */
			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
			    (IPTOS_ECN_CE << 20))
				return (1);	/* already marked */
			/*
			 * ecn-capable but not marked,  mark CE
			 */
			flowlabel |= (IPTOS_ECN_CE << 20);
			ip6->ip6_flow = htonl(flowlabel);
			return (1);
		}
		break;
#endif  /* INET6 */
	}

	/* not marked */
	return (0);
}
    617 
    618 struct mbuf *
    619 red_getq(red_t *rp, class_queue_t *q)
    620 {
    621 	struct mbuf *m;
    622 
    623 	if ((m = _getq(q)) == NULL) {
    624 		if (rp->red_idle == 0) {
    625 			rp->red_idle = 1;
    626 			microtime(&rp->red_last);
    627 		}
    628 		return NULL;
    629 	}
    630 
    631 	rp->red_idle = 0;
    632 	return (m);
    633 }
    634 
    635 /*
    636  * helper routine to calibrate avg during idle.
    637  * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
    638  * here Wq = 1/weight and the code assumes Wq is close to zero.
    639  *
    640  * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
    641  */
    642 static struct wtab *wtab_list = NULL;	/* pointer to wtab list */
    643 
/*
 * Return a weight table for the given (power-of-2) weight, sharing an
 * existing table via reference counting when one already exists.
 * w_tab[i] holds (1 - 1/weight)^(2^i) in fixed-point; w_param_max is the
 * smallest n for which the result underflows to zero.
 */
struct wtab *
wtab_alloc(int weight)
{
	struct wtab	*w;
	int		 i;

	/* reuse an existing table for this weight if possible */
	for (w = wtab_list; w != NULL; w = w->w_next)
		if (w->w_weight == weight) {
			w->w_refcount++;
			return (w);
		}

	w = malloc(sizeof(struct wtab), M_DEVBUF, M_WAITOK|M_ZERO);
	if (w == NULL)
		panic("wtab_alloc: malloc failed!");
	w->w_weight = weight;
	w->w_refcount = 1;
	/* link the new table at the head of the global list */
	w->w_next = wtab_list;
	wtab_list = w;

	/* initialize the weight table: repeated squaring of (1 - Wq) */
	w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
	for (i = 1; i < 32; i++) {
		w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
		if (w->w_tab[i] == 0 && w->w_param_max == 0)
			w->w_param_max = 1 << i;
	}

	return (w);
}
    674 
    675 int
    676 wtab_destroy(struct wtab *w)
    677 {
    678 	struct wtab	*prev;
    679 
    680 	if (--w->w_refcount > 0)
    681 		return (0);
    682 
    683 	if (wtab_list == w)
    684 		wtab_list = w->w_next;
    685 	else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
    686 		if (prev->w_next == w) {
    687 			prev->w_next = w->w_next;
    688 			break;
    689 		}
    690 
    691 	free(w, M_DEVBUF);
    692 	return (0);
    693 }
    694 
    695 int32_t
    696 pow_w(struct wtab *w, int n)
    697 {
    698 	int	i, bit;
    699 	int32_t	val;
    700 
    701 	if (n >= w->w_param_max)
    702 		return (0);
    703 
    704 	val = 1 << FP_SHIFT;
    705 	if (n <= 0)
    706 		return (val);
    707 
    708 	bit = 1;
    709 	i = 0;
    710 	while (n) {
    711 		if (n & bit) {
    712 			val = (val * w->w_tab[i]) >> FP_SHIFT;
    713 			n &= ~bit;
    714 		}
    715 		i++;
    716 		bit <<=  1;
    717 	}
    718 	return (val);
    719 }
    720 
    721 #ifdef ALTQ3_COMPAT
    722 /*
    723  * red device interface
    724  */
    725 altqdev_decl(red);
    726 
/*
 * Device open handler for /dev/red; a no-op since all real setup is
 * done at RED_IF_ATTACH time.
 */
int
redopen(dev_t dev __unused, int flag __unused, int fmt __unused,
    struct lwp *l __unused)
{
	/* everything will be done when the queueing scheme is attached. */
	return 0;
}
    734 
    735 int
    736 redclose(dev_t dev __unused, int flag __unused, int fmt __unused,
    737     struct lwp *l __unused)
    738 {
    739 	red_queue_t *rqp;
    740 	int err, error = 0;
    741 
    742 	while ((rqp = red_list) != NULL) {
    743 		/* destroy all */
    744 		err = red_detach(rqp);
    745 		if (err != 0 && error == 0)
    746 			error = err;
    747 	}
    748 
    749 	return error;
    750 }
    751 
/*
 * ioctl handler for the ALTQ3-compat RED device.
 *
 * Supports attaching/detaching RED to an interface, enabling/disabling
 * it, fetching statistics, reconfiguring parameters, and setting the
 * global defaults.  All commands except RED_GETSTATS require superuser
 * privilege.
 */
int
redioctl(dev_t dev __unused, ioctlcmd_t cmd, caddr_t addr, int flag __unused,
    struct lwp *l)
{
	red_queue_t *rqp;
	struct red_interface *ifacep;
	struct ifnet *ifp;
	struct proc *p = l->l_proc;
	int	error = 0;

	/* check super-user privilege */
	switch (cmd) {
	case RED_GETSTATS:
		break;
	default:
#if (__FreeBSD_version > 400000)
		if ((error = suser(p)) != 0)
#else
		if ((error = kauth_authorize_generic(p->p_cred,
		    KAUTH_GENERIC_ISSUSER, &p->p_acflag)) != 0)
#endif
			return (error);
		break;
	}

	switch (cmd) {

	case RED_ENABLE:
		/* turn on RED for an already-attached interface */
		ifacep = (struct red_interface *)addr;
		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
			error = EBADF;
			break;
		}
		error = altq_enable(rqp->rq_ifq);
		break;

	case RED_DISABLE:
		ifacep = (struct red_interface *)addr;
		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
			error = EBADF;
			break;
		}
		error = altq_disable(rqp->rq_ifq);
		break;

	case RED_IF_ATTACH:
		ifp = ifunit(((struct red_interface *)addr)->red_ifname);
		if (ifp == NULL) {
			error = ENXIO;
			break;
		}

		/* allocate and initialize red_queue_t */
		rqp = malloc(sizeof(red_queue_t), M_DEVBUF, M_WAITOK|M_ZERO);
		if (rqp == NULL) {
			error = ENOMEM;
			break;
		}

		rqp->rq_q = malloc(sizeof(class_queue_t), M_DEVBUF,
		    M_WAITOK|M_ZERO);
		if (rqp->rq_q == NULL) {
			free(rqp, M_DEVBUF);
			error = ENOMEM;
			break;
		}

		/* all-zero arguments select the built-in defaults */
		rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0);
		if (rqp->rq_red == NULL) {
			free(rqp->rq_q, M_DEVBUF);
			free(rqp, M_DEVBUF);
			error = ENOMEM;
			break;
		}

		rqp->rq_ifq = &ifp->if_snd;
		qtail(rqp->rq_q) = NULL;
		qlen(rqp->rq_q) = 0;
		qlimit(rqp->rq_q) = RED_LIMIT;
		qtype(rqp->rq_q) = Q_RED;

		/*
		 * set RED to this ifnet structure.
		 */
		error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp,
				    red_enqueue, red_dequeue, red_request,
				    NULL, NULL);
		if (error) {
			/* unwind the allocations above */
			red_destroy(rqp->rq_red);
			free(rqp->rq_q, M_DEVBUF);
			free(rqp, M_DEVBUF);
			break;
		}

		/* add this state to the red list */
		rqp->rq_next = red_list;
		red_list = rqp;
		break;

	case RED_IF_DETACH:
		ifacep = (struct red_interface *)addr;
		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
			error = EBADF;
			break;
		}
		error = red_detach(rqp);
		break;

	case RED_GETSTATS:
		do {
			struct red_stats *q_stats;
			red_t *rp;

			q_stats = (struct red_stats *)addr;
			if ((rqp = altq_lookup(q_stats->iface.red_ifname,
					     ALTQT_RED)) == NULL) {
				error = EBADF;
				break;
			}

			q_stats->q_len 	   = qlen(rqp->rq_q);
			q_stats->q_limit   = qlimit(rqp->rq_q);

			rp = rqp->rq_red;
			/* descale the fixed-point average */
			q_stats->q_avg 	   = rp->red_avg >> rp->red_wshift;
			q_stats->xmit_cnt  = rp->red_stats.xmit_cnt;
			q_stats->drop_cnt  = rp->red_stats.drop_cnt;
			q_stats->drop_forced   = rp->red_stats.drop_forced;
			q_stats->drop_unforced = rp->red_stats.drop_unforced;
			q_stats->marked_packets = rp->red_stats.marked_packets;

			q_stats->weight		= rp->red_weight;
			q_stats->inv_pmax	= rp->red_inv_pmax;
			q_stats->th_min		= rp->red_thmin;
			q_stats->th_max		= rp->red_thmax;

#ifdef ALTQ_FLOWVALVE
			if (rp->red_flowvalve != NULL) {
				struct flowvalve *fv = rp->red_flowvalve;
				q_stats->fv_flows    = fv->fv_flows;
				q_stats->fv_pass     = fv->fv_stats.pass;
				q_stats->fv_predrop  = fv->fv_stats.predrop;
				q_stats->fv_alloc    = fv->fv_stats.alloc;
				q_stats->fv_escape   = fv->fv_stats.escape;
			} else {
#endif /* ALTQ_FLOWVALVE */
				q_stats->fv_flows    = 0;
				q_stats->fv_pass     = 0;
				q_stats->fv_predrop  = 0;
				q_stats->fv_alloc    = 0;
				q_stats->fv_escape   = 0;
#ifdef ALTQ_FLOWVALVE
			}
#endif /* ALTQ_FLOWVALVE */
		} while (/*CONSTCOND*/ 0);
		break;

	case RED_CONFIG:
		do {
			struct red_conf *fc;
			red_t *new;
			int s, limit;

			fc = (struct red_conf *)addr;
			if ((rqp = altq_lookup(fc->iface.red_ifname,
					       ALTQT_RED)) == NULL) {
				error = EBADF;
				break;
			}
			/* build the replacement state before taking splnet */
			new = red_alloc(fc->red_weight,
					fc->red_inv_pmax,
					fc->red_thmin,
					fc->red_thmax,
					fc->red_flags,
					fc->red_pkttime);
			if (new == NULL) {
				error = ENOMEM;
				break;
			}

			s = splnet();
			red_purgeq(rqp);
			/* the hard limit must be at least th_max */
			limit = fc->red_limit;
			if (limit < fc->red_thmax)
				limit = fc->red_thmax;
			qlimit(rqp->rq_q) = limit;
			fc->red_limit = limit;	/* write back the new value */

			/* swap in the new state and release the old one */
			red_destroy(rqp->rq_red);
			rqp->rq_red = new;

			splx(s);

			/* write back new values */
			/* NOTE(review): red_limit was already written back
			 * above; this second store is redundant but harmless */
			fc->red_limit = limit;
			fc->red_inv_pmax = rqp->rq_red->red_inv_pmax;
			fc->red_thmin = rqp->rq_red->red_thmin;
			fc->red_thmax = rqp->rq_red->red_thmax;

		} while (/*CONSTCOND*/ 0);
		break;

	case RED_SETDEFAULTS:
		do {
			struct redparams *rp;

			rp = (struct redparams *)addr;

			/* new defaults apply only to future red_alloc calls */
			default_th_min = rp->th_min;
			default_th_max = rp->th_max;
			default_inv_pmax = rp->inv_pmax;
		} while (/*CONSTCOND*/ 0);
		break;

	default:
		error = EINVAL;
		break;
	}
	return error;
}
    972 
    973 static int
    974 red_detach(red_queue_t *rqp)
    975 {
    976 	red_queue_t *tmp;
    977 	int error = 0;
    978 
    979 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
    980 		altq_disable(rqp->rq_ifq);
    981 
    982 	if ((error = altq_detach(rqp->rq_ifq)))
    983 		return (error);
    984 
    985 	if (red_list == rqp)
    986 		red_list = rqp->rq_next;
    987 	else {
    988 		for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next)
    989 			if (tmp->rq_next == rqp) {
    990 				tmp->rq_next = rqp->rq_next;
    991 				break;
    992 			}
    993 		if (tmp == NULL)
    994 			printf("red_detach: no state found in red_list!\n");
    995 	}
    996 
    997 	red_destroy(rqp->rq_red);
    998 	free(rqp->rq_q, M_DEVBUF);
    999 	free(rqp, M_DEVBUF);
   1000 	return (error);
   1001 }
   1002 
   1003 /*
   1004  * enqueue routine:
   1005  *
   1006  *	returns: 0 when successfully queued.
   1007  *		 ENOBUFS when drop occurs.
   1008  */
   1009 static int
   1010 red_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr)
   1011 {
   1012 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
   1013 
   1014 	if (red_addq(rqp->rq_red, rqp->rq_q, m, pktattr) < 0)
   1015 		return ENOBUFS;
   1016 	ifq->ifq_len++;
   1017 	return 0;
   1018 }
   1019 
   1020 /*
   1021  * dequeue routine:
   1022  *	must be called in splnet.
   1023  *
   1024  *	returns: mbuf dequeued.
   1025  *		 NULL when no packet is available in the queue.
   1026  */
   1027 
   1028 static struct mbuf *
   1029 red_dequeue(struct ifaltq *ifq, int op)
   1030 {
   1031 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
   1032 	struct mbuf *m;
   1033 
   1034 	if (op == ALTDQ_POLL)
   1035 		return qhead(rqp->rq_q);
   1036 
   1037 	/* op == ALTDQ_REMOVE */
   1038 	m =  red_getq(rqp->rq_red, rqp->rq_q);
   1039 	if (m != NULL)
   1040 		ifq->ifq_len--;
   1041 	return (m);
   1042 }
   1043 
   1044 static int
   1045 red_request(struct ifaltq *ifq, int req, void *arg __unused)
   1046 {
   1047 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
   1048 
   1049 	switch (req) {
   1050 	case ALTRQ_PURGE:
   1051 		red_purgeq(rqp);
   1052 		break;
   1053 	}
   1054 	return (0);
   1055 }
   1056 
/*
 * Flush all packets held in the RED queue.  When ALTQ is enabled on
 * the interface queue, its length counter is reset to match.
 */
static void
red_purgeq(red_queue_t *rqp)
{
	_flushq(rqp->rq_q);
	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
		rqp->rq_ifq->ifq_len = 0;
}
   1064 
   1065 #ifdef ALTQ_FLOWVALVE
   1066 
   1067 #define	FV_PSHIFT	7	/* weight of average drop rate -- 1/128 */
   1068 #define	FV_PSCALE(x)	((x) << FV_PSHIFT)
   1069 #define	FV_PUNSCALE(x)	((x) >> FV_PSHIFT)
   1070 #define	FV_FSHIFT	5	/* weight of average fraction -- 1/32 */
   1071 #define	FV_FSCALE(x)	((x) << FV_FSHIFT)
   1072 #define	FV_FUNSCALE(x)	((x) >> FV_FSHIFT)
   1073 
   1074 #define	FV_TIMER	(3 * hz)	/* timer value for garbage collector */
   1075 #define	FV_FLOWLISTSIZE		64	/* how many flows in flowlist */
   1076 
   1077 #define	FV_N			10	/* update fve_f every FV_N packets */
   1078 
   1079 #define	FV_BACKOFFTHRESH	1  /* backoff threshold interval in second */
   1080 #define	FV_TTHRESH		3  /* time threshold to delete fve */
   1081 #define	FV_ALPHA		5  /* extra packet count */
   1082 
   1083 #define	FV_STATS
   1084 
   1085 #if (__FreeBSD_version > 300000) || defined(__HAVE_TIMECOUNTER)
   1086 #define	FV_TIMESTAMP(tp)	getmicrotime(tp)
   1087 #else
   1088 #define	FV_TIMESTAMP(tp)	{ (*(tp)) = time; }
   1089 #endif
   1090 
   1091 /*
   1092  * Brtt table: 127 entry table to convert drop rate (p) to
   1093  * the corresponding bandwidth fraction (f)
   1094  * the following equation is implemented to use scaled values,
   1095  * fve_p and fve_f, in the fixed point format.
   1096  *
   1097  *   Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p))
   1098  *   f = Brtt(p) / (max_th + alpha)
   1099  */
#define	BRTT_SIZE	128
#define	BRTT_SHIFT	12
#define	BRTT_MASK	0x0007f000
#define	BRTT_PMAX	(1 << (FV_PSHIFT + FP_SHIFT))

/*
 * Precomputed Brtt(p) values (see the equation above), indexed by the
 * scaled drop probability: bits selected by BRTT_MASK, shifted down by
 * BRTT_SHIFT.  Entry 0 is a placeholder; fv_p2f() substitutes index 1
 * whenever the masked value is zero.
 */
const int brtt_tab[BRTT_SIZE] = {
	0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728,
	392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361,
	225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333,
	145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612,
	98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957,
	67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440,
	47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184,
	33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611,
	24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062,
	18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487,
	14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222,
	10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844,
	8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079,
	6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746,
	5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722,
	4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924
};
   1123 
/*
 * Look up the flow-valve entry whose flow (source/destination address
 * pair) matches the arriving packet.
 *
 * The flow list is kept in LRU order, so the scan stops early at the
 * first unused entry (fve_lastdrop.tv_sec == 0) or at the first entry
 * older than FV_TTHRESH seconds (which is invalidated on the way out);
 * entries past that point are at least as old and cannot be live.
 *
 * Returns the matching fve, or NULL when there is no match.  Packets
 * of an unknown address family always return NULL (never pre-dropped).
 * Side effect: when no match is found, fv->fv_flows is refreshed with
 * the number of live entries scanned.
 */
static inline struct fve *
flowlist_lookup(struct flowvalve *fv, struct altq_pktattr *pktattr,
    struct timeval *now)
{
	struct fve *fve;
	int flows;
	struct ip *ip;
#ifdef INET6
	struct ip6_hdr *ip6;
#endif
	struct timeval tthresh;

	/* no packet attributes: nothing to match against */
	if (pktattr == NULL)
		return (NULL);

	/* entries whose last drop predates this threshold are stale */
	tthresh.tv_sec = now->tv_sec - FV_TTHRESH;
	flows = 0;
	/*
	 * search the flow list
	 */
	switch (pktattr->pattr_af) {
	case AF_INET:
		/* NOTE(review): assumes pattr_hdr points at a valid IPv4
		 * header set up by the classifier -- confirm at call sites. */
		ip = (struct ip *)pktattr->pattr_hdr;
		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
			if (fve->fve_lastdrop.tv_sec == 0)
				break;	/* rest of the list is unused */
			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
				/* stale: mark unused and stop the scan */
				fve->fve_lastdrop.tv_sec = 0;
				break;
			}
			if (fve->fve_flow.flow_af == AF_INET &&
			    fve->fve_flow.flow_ip.ip_src.s_addr ==
			    ip->ip_src.s_addr &&
			    fve->fve_flow.flow_ip.ip_dst.s_addr ==
			    ip->ip_dst.s_addr)
				return (fve);
			flows++;
		}
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
			if (fve->fve_lastdrop.tv_sec == 0)
				break;	/* rest of the list is unused */
			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
				/* stale: mark unused and stop the scan */
				fve->fve_lastdrop.tv_sec = 0;
				break;
			}
			if (fve->fve_flow.flow_af == AF_INET6 &&
			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src,
					       &ip6->ip6_src) &&
			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst,
					       &ip6->ip6_dst))
				return (fve);
			flows++;
		}
		break;
#endif /* INET6 */

	default:
		/* unknown protocol.  no drop. */
		return (NULL);
	}
	fv->fv_flows = flows;	/* save the number of active fve's */
	return (NULL);
}
   1191 
   1192 static inline struct fve *
   1193 flowlist_reclaim(struct flowvalve *fv, struct altq_pktattr *pktattr)
   1194 {
   1195 	struct fve *fve;
   1196 	struct ip *ip;
   1197 #ifdef INET6
   1198 	struct ip6_hdr *ip6;
   1199 #endif
   1200 
   1201 	/*
   1202 	 * get an entry from the tail of the LRU list.
   1203 	 */
   1204 	fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead);
   1205 
   1206 	switch (pktattr->pattr_af) {
   1207 	case AF_INET:
   1208 		ip = (struct ip *)pktattr->pattr_hdr;
   1209 		fve->fve_flow.flow_af = AF_INET;
   1210 		fve->fve_flow.flow_ip.ip_src = ip->ip_src;
   1211 		fve->fve_flow.flow_ip.ip_dst = ip->ip_dst;
   1212 		break;
   1213 #ifdef INET6
   1214 	case AF_INET6:
   1215 		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
   1216 		fve->fve_flow.flow_af = AF_INET6;
   1217 		fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src;
   1218 		fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst;
   1219 		break;
   1220 #endif
   1221 	}
   1222 
   1223 	fve->fve_state = Green;
   1224 	fve->fve_p = 0.0;
   1225 	fve->fve_f = 0.0;
   1226 	fve->fve_ifseq = fv->fv_ifseq - 1;
   1227 	fve->fve_count = 0;
   1228 
   1229 	fv->fv_flows++;
   1230 #ifdef FV_STATS
   1231 	fv->fv_stats.alloc++;
   1232 #endif
   1233 	return (fve);
   1234 }
   1235 
   1236 static inline void
   1237 flowlist_move_to_head(struct flowvalve *fv, struct fve *fve)
   1238 {
   1239 	if (TAILQ_FIRST(&fv->fv_flowlist) != fve) {
   1240 		TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru);
   1241 		TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru);
   1242 	}
   1243 }
   1244 
   1245 /*
   1246  * allocate flowvalve structure
   1247  */
   1248 static struct flowvalve *
   1249 fv_alloc(struct red *rp)
   1250 {
   1251 	struct flowvalve *fv;
   1252 	struct fve *fve;
   1253 	int i, num;
   1254 
   1255 	num = FV_FLOWLISTSIZE;
   1256 	fv = malloc(sizeof(struct flowvalve), M_DEVBUF, M_WAITOK|M_ZERO);
   1257 	if (fv == NULL)
   1258 		return (NULL);
   1259 
   1260 	fv->fv_fves = malloc(sizeof(struct fve) * num, M_DEVBUF,
   1261 	    M_WAITOK|M_ZERO);
   1262 	if (fv->fv_fves == NULL) {
   1263 		free(fv, M_DEVBUF);
   1264 		return (NULL);
   1265 	}
   1266 
   1267 	fv->fv_flows = 0;
   1268 	TAILQ_INIT(&fv->fv_flowlist);
   1269 	for (i = 0; i < num; i++) {
   1270 		fve = &fv->fv_fves[i];
   1271 		fve->fve_lastdrop.tv_sec = 0;
   1272 		TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru);
   1273 	}
   1274 
   1275 	/* initialize drop rate threshold in scaled fixed-point */
   1276 	fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax;
   1277 
   1278 	/* initialize drop rate to fraction table */
   1279 	fv->fv_p2ftab = malloc(sizeof(int) * BRTT_SIZE, M_DEVBUF, M_WAITOK);
   1280 	if (fv->fv_p2ftab == NULL) {
   1281 		free(fv->fv_fves, M_DEVBUF);
   1282 		free(fv, M_DEVBUF);
   1283 		return (NULL);
   1284 	}
   1285 	/*
   1286 	 * create the p2f table.
   1287 	 * (shift is used to keep the precision)
   1288 	 */
   1289 	for (i = 1; i < BRTT_SIZE; i++) {
   1290 		int f;
   1291 
   1292 		f = brtt_tab[i] << 8;
   1293 		fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8;
   1294 	}
   1295 
   1296 	return (fv);
   1297 }
   1298 
   1299 static void
   1300 fv_destroy(struct flowvalve *fv)
   1301 {
   1302 	free(fv->fv_p2ftab, M_DEVBUF);
   1303 	free(fv->fv_fves, M_DEVBUF);
   1304 	free(fv, M_DEVBUF);
   1305 }
   1306 
   1307 static inline int
   1308 fv_p2f(struct flowvalve *fv, int p)
   1309 {
   1310 	int val, f;
   1311 
   1312 	if (p >= BRTT_PMAX)
   1313 		f = fv->fv_p2ftab[BRTT_SIZE-1];
   1314 	else if ((val = (p & BRTT_MASK)))
   1315 		f = fv->fv_p2ftab[(val >> BRTT_SHIFT)];
   1316 	else
   1317 		f = fv->fv_p2ftab[1];
   1318 	return (f);
   1319 }
   1320 
   1321 /*
   1322  * check if an arriving packet should be pre-dropped.
   1323  * called from red_addq() when a packet arrives.
   1324  * returns 1 when the packet should be pre-dropped.
   1325  * should be called in splnet.
   1326  */
   1327 static int
   1328 fv_checkflow(struct flowvalve *fv, struct altq_pktattr *pktattr,
   1329     struct fve **fcache)
   1330 {
   1331 	struct fve *fve;
   1332 	struct timeval now;
   1333 
   1334 	fv->fv_ifseq++;
   1335 	FV_TIMESTAMP(&now);
   1336 
   1337 	if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
   1338 		/* no matching entry in the flowlist */
   1339 		return (0);
   1340 
   1341 	*fcache = fve;
   1342 
   1343 	/* update fraction f for every FV_N packets */
   1344 	if (++fve->fve_count == FV_N) {
   1345 		/*
   1346 		 * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f
   1347 		 */
   1348 		fve->fve_f =
   1349 			(FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq)
   1350 			+ fve->fve_f - FV_FUNSCALE(fve->fve_f);
   1351 		fve->fve_ifseq = fv->fv_ifseq;
   1352 		fve->fve_count = 0;
   1353 	}
   1354 
   1355 	/*
   1356 	 * overpumping test
   1357 	 */
   1358 	if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) {
   1359 		int fthresh;
   1360 
   1361 		/* calculate a threshold */
   1362 		fthresh = fv_p2f(fv, fve->fve_p);
   1363 		if (fve->fve_f > fthresh)
   1364 			fve->fve_state = Red;
   1365 	}
   1366 
   1367 	if (fve->fve_state == Red) {
   1368 		/*
   1369 		 * backoff test
   1370 		 */
   1371 		if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) {
   1372 			/* no drop for at least FV_BACKOFFTHRESH sec */
   1373 			fve->fve_p = 0;
   1374 			fve->fve_state = Green;
   1375 #ifdef FV_STATS
   1376 			fv->fv_stats.escape++;
   1377 #endif
   1378 		} else {
   1379 			/* block this flow */
   1380 			flowlist_move_to_head(fv, fve);
   1381 			fve->fve_lastdrop = now;
   1382 #ifdef FV_STATS
   1383 			fv->fv_stats.predrop++;
   1384 #endif
   1385 			return (1);
   1386 		}
   1387 	}
   1388 
   1389 	/*
   1390 	 * p = (1 - Wp) * p
   1391 	 */
   1392 	fve->fve_p -= FV_PUNSCALE(fve->fve_p);
   1393 	if (fve->fve_p < 0)
   1394 		fve->fve_p = 0;
   1395 #ifdef FV_STATS
   1396 	fv->fv_stats.pass++;
   1397 #endif
   1398 	return (0);
   1399 }
   1400 
   1401 /*
   1402  * called from red_addq when a packet is dropped by red.
   1403  * should be called in splnet.
   1404  */
   1405 static void
   1406 fv_dropbyred(struct flowvalve *fv, struct altq_pktattr *pktattr,
   1407     struct fve *fcache)
   1408 {
   1409 	struct fve *fve;
   1410 	struct timeval now;
   1411 
   1412 	if (pktattr == NULL)
   1413 		return;
   1414 	FV_TIMESTAMP(&now);
   1415 
   1416 	if (fcache != NULL)
   1417 		/* the fve of this packet is already cached */
   1418 		fve = fcache;
   1419 	else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
   1420 		fve = flowlist_reclaim(fv, pktattr);
   1421 
   1422 	flowlist_move_to_head(fv, fve);
   1423 
   1424 	/*
   1425 	 * update p:  the following line cancels the update
   1426 	 *	      in fv_checkflow() and calculate
   1427 	 *	p = Wp + (1 - Wp) * p
   1428 	 */
   1429 	fve->fve_p = (1 << FP_SHIFT) + fve->fve_p;
   1430 
   1431 	fve->fve_lastdrop = now;
   1432 }
   1433 
   1434 #endif /* ALTQ_FLOWVALVE */
   1435 
   1436 #ifdef KLD_MODULE
   1437 
/* device-switch entry points used when RED is built as a kernel module */
static struct altqsw red_sw =
	{"red", redopen, redclose, redioctl};

ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw);
MODULE_VERSION(altq_red, 1);
   1443 
   1444 #endif /* KLD_MODULE */
   1445 #endif /* ALTQ3_COMPAT */
   1446 
   1447 #endif /* ALTQ_RED */
   1448