Home | History | Annotate | Line # | Download | only in altq
altq_red.c revision 1.13.12.3
      1 /*	$NetBSD: altq_red.c,v 1.13.12.3 2006/06/09 19:52:35 peter Exp $	*/
      2 /*	$KAME: altq_red.c,v 1.20 2005/04/13 03:44:25 suz Exp $	*/
      3 
      4 /*
      5  * Copyright (C) 1997-2003
      6  *	Sony Computer Science Laboratories Inc.  All rights reserved.
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  * 1. Redistributions of source code must retain the above copyright
     12  *    notice, this list of conditions and the following disclaimer.
     13  * 2. Redistributions in binary form must reproduce the above copyright
     14  *    notice, this list of conditions and the following disclaimer in the
     15  *    documentation and/or other materials provided with the distribution.
     16  *
     17  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
     18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     20  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
     21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     27  * SUCH DAMAGE.
     28  *
     29  */
     30 /*
     31  * Copyright (c) 1990-1994 Regents of the University of California.
     32  * All rights reserved.
     33  *
     34  * Redistribution and use in source and binary forms, with or without
     35  * modification, are permitted provided that the following conditions
     36  * are met:
     37  * 1. Redistributions of source code must retain the above copyright
     38  *    notice, this list of conditions and the following disclaimer.
     39  * 2. Redistributions in binary form must reproduce the above copyright
     40  *    notice, this list of conditions and the following disclaimer in the
     41  *    documentation and/or other materials provided with the distribution.
     42  * 3. All advertising materials mentioning features or use of this software
     43  *    must display the following acknowledgement:
     44  *	This product includes software developed by the Computer Systems
     45  *	Engineering Group at Lawrence Berkeley Laboratory.
     46  * 4. Neither the name of the University nor of the Laboratory may be used
     47  *    to endorse or promote products derived from this software without
     48  *    specific prior written permission.
     49  *
     50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     60  * SUCH DAMAGE.
     61  */
     62 
     63 #include <sys/cdefs.h>
     64 __KERNEL_RCSID(0, "$NetBSD: altq_red.c,v 1.13.12.3 2006/06/09 19:52:35 peter Exp $");
     65 
     66 #ifdef _KERNEL_OPT
     67 #include "opt_altq.h"
     68 #include "opt_inet.h"
     69 #endif
     70 
     71 #ifdef ALTQ_RED	/* red is enabled by ALTQ_RED option in opt_altq.h */
     72 
     73 #include <sys/param.h>
     74 #include <sys/malloc.h>
     75 #include <sys/mbuf.h>
     76 #include <sys/socket.h>
     77 #include <sys/systm.h>
     78 #include <sys/errno.h>
     79 #include <sys/kauth.h>
     80 #if 1 /* ALTQ3_COMPAT */
     81 #include <sys/sockio.h>
     82 #include <sys/proc.h>
     83 #include <sys/kernel.h>
     84 #ifdef ALTQ_FLOWVALVE
     85 #include <sys/queue.h>
     86 #include <sys/time.h>
     87 #endif
     88 #endif /* ALTQ3_COMPAT */
     89 
     90 #include <net/if.h>
     91 
     92 #include <netinet/in.h>
     93 #include <netinet/in_systm.h>
     94 #include <netinet/ip.h>
     95 #ifdef INET6
     96 #include <netinet/ip6.h>
     97 #endif
     98 
     99 #include <net/pfvar.h>
    100 #include <altq/altq.h>
    101 #include <altq/altq_red.h>
    102 #ifdef ALTQ3_COMPAT
    103 #include <altq/altq_conf.h>
    104 #ifdef ALTQ_FLOWVALVE
    105 #include <altq/altq_flowvalve.h>
    106 #endif
    107 #endif
    108 
    109 /*
    110  * ALTQ/RED (Random Early Detection) implementation using 32-bit
    111  * fixed-point calculation.
    112  *
    113  * written by kjc using the ns code as a reference.
    114  * you can learn more about red and ns from Sally's home page at
    115  * http://www-nrg.ee.lbl.gov/floyd/
    116  *
    117  * most of the red parameter values are fixed in this implementation
    118  * to prevent fixed-point overflow/underflow.
    119  * if you change the parameters, watch out for overflow/underflow!
    120  *
    121  * the parameters used are recommended values by Sally.
    122  * the corresponding ns config looks:
    123  *	q_weight=0.00195
    124  *	minthresh=5 maxthresh=15 queue-size=60
    125  *	linterm=30
    126  *	dropmech=drop-tail
    127  *	bytes=false (can't be handled by 32-bit fixed-point)
    128  *	doubleq=false dqthresh=false
    129  *	wait=true
    130  */
    131 /*
    132  * alternative red parameters for a slow link.
    133  *
 * assume the queue length grows from zero to L and stays at L; it takes
    135  * N packets for q_avg to reach 63% of L.
    136  * when q_weight is 0.002, N is about 500 packets.
    137  * for a slow link like dial-up, 500 packets takes more than 1 minute!
    138  * when q_weight is 0.008, N is about 127 packets.
    139  * when q_weight is 0.016, N is about 63 packets.
    140  * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
    141  * are allowed for 0.016.
    142  * see Sally's paper for more details.
    143  */
    144 /* normal red parameters */
    145 #define	W_WEIGHT	512	/* inverse of weight of EWMA (511/512) */
    146 				/* q_weight = 0.00195 */
    147 
    148 /* red parameters for a slow link */
    149 #define	W_WEIGHT_1	128	/* inverse of weight of EWMA (127/128) */
    150 				/* q_weight = 0.0078125 */
    151 
    152 /* red parameters for a very slow link (e.g., dialup) */
    153 #define	W_WEIGHT_2	64	/* inverse of weight of EWMA (63/64) */
    154 				/* q_weight = 0.015625 */
    155 
    156 /* fixed-point uses 12-bit decimal places */
    157 #define	FP_SHIFT	12	/* fixed-point shift */
    158 
    159 /* red parameters for drop probability */
    160 #define	INV_P_MAX	10	/* inverse of max drop probability */
    161 #define	TH_MIN		5	/* min threshold */
    162 #define	TH_MAX		15	/* max threshold */
    163 
#define	RED_LIMIT	60	/* default max queue length */
    165 #define	RED_STATS		/* collect statistics */
    166 
    167 /*
    168  * our default policy for forced-drop is drop-tail.
    169  * (in altq-1.1.2 or earlier, the default was random-drop.
    170  * but it makes more sense to punish the cause of the surge.)
    171  * to switch to the random-drop policy, define "RED_RANDOM_DROP".
    172  */
    173 
    174 #ifdef ALTQ3_COMPAT
    175 #ifdef ALTQ_FLOWVALVE
    176 /*
 * flow-valve is an extension to protect red from unresponsive flows
    178  * and to promote end-to-end congestion control.
    179  * flow-valve observes the average drop rates of the flows that have
    180  * experienced packet drops in the recent past.
    181  * when the average drop rate exceeds the threshold, the flow is
    182  * blocked by the flow-valve.  the trapped flow should back off
    183  * exponentially to escape from the flow-valve.
    184  */
    185 #ifdef RED_RANDOM_DROP
    186 #error "random-drop can't be used with flow-valve!"
    187 #endif
    188 #endif /* ALTQ_FLOWVALVE */
    189 
    190 /* red_list keeps all red_queue_t's allocated. */
    191 static red_queue_t *red_list = NULL;
    192 
    193 #endif /* ALTQ3_COMPAT */
    194 
    195 /* default red parameter values */
    196 static int default_th_min = TH_MIN;
    197 static int default_th_max = TH_MAX;
    198 static int default_inv_pmax = INV_P_MAX;
    199 
    200 #ifdef ALTQ3_COMPAT
    201 /* internal function prototypes */
    202 static int red_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *);
    203 static struct mbuf *red_dequeue(struct ifaltq *, int);
    204 static int red_request(struct ifaltq *, int, void *);
    205 static void red_purgeq(red_queue_t *);
    206 static int red_detach(red_queue_t *);
    207 #ifdef ALTQ_FLOWVALVE
    208 static inline struct fve *flowlist_lookup(struct flowvalve *,
    209 			 struct altq_pktattr *, struct timeval *);
    210 static inline struct fve *flowlist_reclaim(struct flowvalve *,
    211 					     struct altq_pktattr *);
    212 static inline void flowlist_move_to_head(struct flowvalve *, struct fve *);
    213 static inline int fv_p2f(struct flowvalve *, int);
    214 static struct flowvalve *fv_alloc(struct red *);
    215 static void fv_destroy(struct flowvalve *);
    216 static int fv_checkflow(struct flowvalve *, struct altq_pktattr *,
    217 			struct fve **);
    218 static void fv_dropbyred(struct flowvalve *fv, struct altq_pktattr *,
    219 			 struct fve *);
    220 #endif
    221 #endif /* ALTQ3_COMPAT */
    222 
    223 /*
    224  * red support routines
    225  */
    226 red_t *
    227 red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags,
    228    int pkttime)
    229 {
    230 	red_t	*rp;
    231 	int	 w, i;
    232 	int	 npkts_per_sec;
    233 
    234 	rp = malloc(sizeof(red_t), M_DEVBUF, M_WAITOK|M_ZERO);
    235 	if (rp == NULL)
    236 		return (NULL);
    237 
    238 	rp->red_avg = 0;
    239 	rp->red_idle = 1;
    240 
    241 	if (weight == 0)
    242 		rp->red_weight = W_WEIGHT;
    243 	else
    244 		rp->red_weight = weight;
    245 	if (inv_pmax == 0)
    246 		rp->red_inv_pmax = default_inv_pmax;
    247 	else
    248 		rp->red_inv_pmax = inv_pmax;
    249 	if (th_min == 0)
    250 		rp->red_thmin = default_th_min;
    251 	else
    252 		rp->red_thmin = th_min;
    253 	if (th_max == 0)
    254 		rp->red_thmax = default_th_max;
    255 	else
    256 		rp->red_thmax = th_max;
    257 
    258 	rp->red_flags = flags;
    259 
    260 	if (pkttime == 0)
    261 		/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
    262 		rp->red_pkttime = 800;
    263 	else
    264 		rp->red_pkttime = pkttime;
    265 
    266 	if (weight == 0) {
    267 		/* when the link is very slow, adjust red parameters */
    268 		npkts_per_sec = 1000000 / rp->red_pkttime;
    269 		if (npkts_per_sec < 50) {
    270 			/* up to about 400Kbps */
    271 			rp->red_weight = W_WEIGHT_2;
    272 		} else if (npkts_per_sec < 300) {
    273 			/* up to about 2.4Mbps */
    274 			rp->red_weight = W_WEIGHT_1;
    275 		}
    276 	}
    277 
    278 	/* calculate wshift.  weight must be power of 2 */
    279 	w = rp->red_weight;
    280 	for (i = 0; w > 1; i++)
    281 		w = w >> 1;
    282 	rp->red_wshift = i;
    283 	w = 1 << rp->red_wshift;
    284 	if (w != rp->red_weight) {
    285 		printf("invalid weight value %d for red! use %d\n",
    286 		       rp->red_weight, w);
    287 		rp->red_weight = w;
    288 	}
    289 
    290 	/*
    291 	 * thmin_s and thmax_s are scaled versions of th_min and th_max
    292 	 * to be compared with avg.
    293 	 */
    294 	rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
    295 	rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
    296 
    297 	/*
    298 	 * precompute probability denominator
    299 	 *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
    300 	 */
    301 	rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
    302 			 * rp->red_inv_pmax) << FP_SHIFT;
    303 
    304 	/* allocate weight table */
    305 	rp->red_wtab = wtab_alloc(rp->red_weight);
    306 
    307 	microtime(&rp->red_last);
    308 #ifdef ALTQ3_COMPAT
    309 #ifdef ALTQ_FLOWVALVE
    310 	if (flags & REDF_FLOWVALVE)
    311 		rp->red_flowvalve = fv_alloc(rp);
    312 	/* if fv_alloc failes, flowvalve is just disabled */
    313 #endif
    314 #endif /* ALTQ3_COMPAT */
    315 	return (rp);
    316 }
    317 
/*
 * red_destroy: tear down a red state block allocated by red_alloc().
 * Releases the optional flow-valve state, drops our reference on the
 * shared weight table, then frees the structure itself.
 */
void
red_destroy(red_t *rp)
{
#ifdef ALTQ3_COMPAT
#ifdef ALTQ_FLOWVALVE
	if (rp->red_flowvalve != NULL)
		fv_destroy(rp->red_flowvalve);
#endif
#endif /* ALTQ3_COMPAT */
	/* wtab is refcounted; only freed when the last user goes away */
	wtab_destroy(rp->red_wtab);
	free(rp, M_DEVBUF);
}
    330 
    331 void
    332 red_getstats(red_t *rp, struct redstats *sp)
    333 {
    334 	sp->q_avg		= rp->red_avg >> rp->red_wshift;
    335 	sp->xmit_cnt		= rp->red_stats.xmit_cnt;
    336 	sp->drop_cnt		= rp->red_stats.drop_cnt;
    337 	sp->drop_forced		= rp->red_stats.drop_forced;
    338 	sp->drop_unforced	= rp->red_stats.drop_unforced;
    339 	sp->marked_packets	= rp->red_stats.marked_packets;
    340 }
    341 
/*
 * red_addq: enqueue a packet on queue q subject to the RED algorithm.
 *
 * Updates the EWMA of the queue length (in fixed-point, scaled by the
 * weight), then decides between no drop, an unforced (early) drop, and
 * a forced drop (avg beyond th_max or hard queue limit reached).
 *
 * Returns 0 when the packet was queued, -1 when a packet was dropped
 * (the dropped mbuf is freed here).
 */
int
red_addq(red_t *rp, class_queue_t *q, struct mbuf *m,
    struct altq_pktattr *pktattr)
{
	int avg, droptype;
	int n;
#ifdef ALTQ3_COMPAT
#ifdef ALTQ_FLOWVALVE
	struct fve *fve = NULL;

	/* a flow trapped by the flow-valve is dropped up front */
	if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0)
		if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) {
			m_freem(m);
			return (-1);
		}
#endif
#endif /* ALTQ3_COMPAT */

	avg = rp->red_avg;

	/*
	 * if we were idle, we pretend that n packets arrived during
	 * the idle period.
	 */
	if (rp->red_idle) {
		struct timeval now;
		int t;

		rp->red_idle = 0;
		microtime(&now);
		t = (now.tv_sec - rp->red_last.tv_sec);
		if (t > 60) {
			/*
			 * being idle for more than 1 minute, set avg to zero.
			 * this prevents t from overflow.
			 */
			avg = 0;
		} else {
			t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
			n = t / rp->red_pkttime - 1;

			/* the following line does (avg = (1 - Wq)^n * avg) */
			if (n > 0)
				avg = (avg >> FP_SHIFT) *
				    pow_w(rp->red_wtab, n);
		}
	}

	/* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
	avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
	rp->red_avg = avg;		/* save the new value */

	/*
	 * red_count keeps a tally of arriving traffic that has not
	 * been dropped.
	 */
	rp->red_count++;

	/* see if we drop early */
	droptype = DTYPE_NODROP;
	if (avg >= rp->red_thmin_s && qlen(q) > 1) {
		if (avg >= rp->red_thmax_s) {
			/* avg >= th_max: forced drop */
			droptype = DTYPE_FORCED;
		} else if (rp->red_old == 0) {
			/* first exceeds th_min */
			rp->red_count = 1;
			rp->red_old = 1;
		} else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
				      rp->red_probd, rp->red_count)) {
			/* mark or drop by red */
			if ((rp->red_flags & REDF_ECN) &&
			    mark_ecn(m, pktattr, rp->red_flags)) {
				/* successfully marked.  do not drop. */
				rp->red_count = 0;
#ifdef RED_STATS
				rp->red_stats.marked_packets++;
#endif
			} else {
				/* unforced drop by red */
				droptype = DTYPE_EARLY;
			}
		}
	} else {
		/* avg < th_min */
		rp->red_old = 0;
	}

	/*
	 * if the queue length hits the hard limit, it's a forced drop.
	 */
	if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
		droptype = DTYPE_FORCED;

#ifdef RED_RANDOM_DROP
	/* if successful or forced drop, enqueue this packet. */
	if (droptype != DTYPE_EARLY)
		_addq(q, m);
#else
	/* if successful, enqueue this packet. */
	if (droptype == DTYPE_NODROP)
		_addq(q, m);
#endif
	if (droptype != DTYPE_NODROP) {
		if (droptype == DTYPE_EARLY) {
			/* drop the incoming packet */
#ifdef RED_STATS
			rp->red_stats.drop_unforced++;
#endif
		} else {
			/* forced drop, select a victim packet in the queue. */
#ifdef RED_RANDOM_DROP
			m = _getq_random(q);
#endif
#ifdef RED_STATS
			rp->red_stats.drop_forced++;
#endif
		}
#ifdef RED_STATS
		PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
#endif
		/* a drop resets the run of undropped arrivals */
		rp->red_count = 0;
#ifdef ALTQ3_COMPAT
#ifdef ALTQ_FLOWVALVE
		if (rp->red_flowvalve != NULL)
			fv_dropbyred(rp->red_flowvalve, pktattr, fve);
#endif
#endif /* ALTQ3_COMPAT */
		m_freem(m);
		return (-1);
	}
	/* successfully queued */
#ifdef RED_STATS
	PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
#endif
	return (0);
}
    479 
    480 /*
    481  * early-drop probability is calculated as follows:
    482  *   prob = p_max * (avg - th_min) / (th_max - th_min)
    483  *   prob_a = prob / (2 - count*prob)
    484  *	    = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
    485  * here prob_a increases as successive undrop count increases.
    486  * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
    487  * becomes 1 when (count >= (2 / prob))).
    488  */
/*
 * drop_early: Bernoulli trial for an unforced drop.
 * fp_len is (avg - th_min) and fp_probd the precomputed denominator,
 * both in fixed-point; count is the number of packets accepted since
 * the last drop.  Returns 1 to drop (or mark), 0 to accept.
 */
int
drop_early(int fp_len, int fp_probd, int count)
{
	int	denom;		/* denominator of drop-probability */

	denom = fp_probd - count * fp_len;
	if (denom <= 0) {
		/* count exceeds the hard limit: drop or mark */
		return (1);
	}

	/*
	 * now the range of denom is [1..600] in fixed-point. (when
	 * th_max-th_min=10 and p_max=1/30)
	 * drop probability = (avg - TH_MIN) / denom
	 */
	return ((arc4random() % denom) < fp_len) ? 1 : 0;
}
    512 
/*
 * mark_ecn: try to set the CE (congestion experienced) bit on the packet
 * instead of dropping it.
 * The IP header is located via the pf mbuf tag or, in ALTQ3 compat mode,
 * via pktattr.  Returns 1 if the packet was (or already is) marked,
 * 0 otherwise (not ECN-capable, unknown AF, stale header pointer, or
 * marking disabled by flags).
 */
int
mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags)
{
	struct mbuf	*m0;
	struct m_tag	*t;
	struct altq_tag	*at;
	void		*hdr;
	int		 af;

	/* prefer header info from the pf tag; fall back to pktattr */
	t = m_tag_find(m, PACKET_TAG_PF_QID, NULL);
	if (t != NULL) {
		at = (struct altq_tag *)(t + 1);
		/* NOTE(review): t+1 is never NULL, this check is dead code */
		if (at == NULL)
			return (0);
		af = at->af;
		hdr = at->hdr;
#ifdef ALTQ3_COMPAT
	} else if (pktattr != NULL) {
		af = pktattr->pattr_af;
		hdr = pktattr->pattr_hdr;
#endif /* ALTQ3_COMPAT */
	} else
		return (0);

	if (af != AF_INET && af != AF_INET6)
		return (0);

	/* verify that pattr_hdr is within the mbuf data */
	for (m0 = m; m0 != NULL; m0 = m0->m_next)
		if (((caddr_t)hdr >= m0->m_data) &&
		    ((caddr_t)hdr < m0->m_data + m0->m_len))
			break;
	if (m0 == NULL) {
		/* ick, tag info is stale */
		return (0);
	}

	switch (af) {
	case AF_INET:
		if (flags & REDF_ECN4) {
			struct ip *ip = hdr;
			u_int8_t otos;
			int sum;

			if (ip->ip_v != 4)
				return (0);	/* version mismatch! */

			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
				return (0);	/* not-ECT */
			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
				return (1);	/* already marked */

			/*
			 * ecn-capable but not marked,
			 * mark CE and update checksum
			 */
			otos = ip->ip_tos;
			ip->ip_tos |= IPTOS_ECN_CE;
			/*
			 * update checksum (from RFC1624)
			 *	   HC' = ~(~HC + ~m + m')
			 */
			sum = ~ntohs(ip->ip_sum) & 0xffff;
			sum += (~otos & 0xffff) + ip->ip_tos;
			sum = (sum >> 16) + (sum & 0xffff);
			sum += (sum >> 16);  /* add carry */
			ip->ip_sum = htons(~sum & 0xffff);
			return (1);
		}
		break;
#ifdef INET6
	case AF_INET6:
		if (flags & REDF_ECN6) {
			struct ip6_hdr *ip6 = hdr;
			u_int32_t flowlabel;

			/* ECN bits live in the traffic class, bits 20-27
			 * of the host-order ip6_flow word */
			flowlabel = ntohl(ip6->ip6_flow);
			if ((flowlabel >> 28) != 6)
				return (0);	/* version mismatch! */
			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
			    (IPTOS_ECN_NOTECT << 20))
				return (0);	/* not-ECT */
			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
			    (IPTOS_ECN_CE << 20))
				return (1);	/* already marked */
			/*
			 * ecn-capable but not marked,  mark CE
			 */
			flowlabel |= (IPTOS_ECN_CE << 20);
			ip6->ip6_flow = htonl(flowlabel);
			return (1);
		}
		break;
#endif  /* INET6 */
	}

	/* not marked */
	return (0);
}
    616 
    617 struct mbuf *
    618 red_getq(rp, q)
    619 	red_t *rp;
    620 	class_queue_t *q;
    621 {
    622 	struct mbuf *m;
    623 
    624 	if ((m = _getq(q)) == NULL) {
    625 		if (rp->red_idle == 0) {
    626 			rp->red_idle = 1;
    627 			microtime(&rp->red_last);
    628 		}
    629 		return NULL;
    630 	}
    631 
    632 	rp->red_idle = 0;
    633 	return (m);
    634 }
    635 
    636 /*
    637  * helper routine to calibrate avg during idle.
    638  * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
    639  * here Wq = 1/weight and the code assumes Wq is close to zero.
    640  *
    641  * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
    642  */
    643 static struct wtab *wtab_list = NULL;	/* pointer to wtab list */
    644 
    645 struct wtab *
    646 wtab_alloc(int weight)
    647 {
    648 	struct wtab	*w;
    649 	int		 i;
    650 
    651 	for (w = wtab_list; w != NULL; w = w->w_next)
    652 		if (w->w_weight == weight) {
    653 			w->w_refcount++;
    654 			return (w);
    655 		}
    656 
    657 	w = malloc(sizeof(struct wtab), M_DEVBUF, M_WAITOK|M_ZERO);
    658 	if (w == NULL)
    659 		panic("wtab_alloc: malloc failed!");
    660 	w->w_weight = weight;
    661 	w->w_refcount = 1;
    662 	w->w_next = wtab_list;
    663 	wtab_list = w;
    664 
    665 	/* initialize the weight table */
    666 	w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
    667 	for (i = 1; i < 32; i++) {
    668 		w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
    669 		if (w->w_tab[i] == 0 && w->w_param_max == 0)
    670 			w->w_param_max = 1 << i;
    671 	}
    672 
    673 	return (w);
    674 }
    675 
    676 int
    677 wtab_destroy(struct wtab *w)
    678 {
    679 	struct wtab	*prev;
    680 
    681 	if (--w->w_refcount > 0)
    682 		return (0);
    683 
    684 	if (wtab_list == w)
    685 		wtab_list = w->w_next;
    686 	else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
    687 		if (prev->w_next == w) {
    688 			prev->w_next = w->w_next;
    689 			break;
    690 		}
    691 
    692 	free(w, M_DEVBUF);
    693 	return (0);
    694 }
    695 
    696 int32_t
    697 pow_w(struct wtab *w, int n)
    698 {
    699 	int	i, bit;
    700 	int32_t	val;
    701 
    702 	if (n >= w->w_param_max)
    703 		return (0);
    704 
    705 	val = 1 << FP_SHIFT;
    706 	if (n <= 0)
    707 		return (val);
    708 
    709 	bit = 1;
    710 	i = 0;
    711 	while (n) {
    712 		if (n & bit) {
    713 			val = (val * w->w_tab[i]) >> FP_SHIFT;
    714 			n &= ~bit;
    715 		}
    716 		i++;
    717 		bit <<=  1;
    718 	}
    719 	return (val);
    720 }
    721 
    722 #ifdef ALTQ3_COMPAT
    723 /*
    724  * red device interface
    725  */
    726 altqdev_decl(red);
    727 
    728 int
    729 redopen(dev, flag, fmt, l)
    730 	dev_t dev;
    731 	int flag, fmt;
    732 	struct lwp *l;
    733 {
    734 	/* everything will be done when the queueing scheme is attached. */
    735 	return 0;
    736 }
    737 
    738 int
    739 redclose(dev, flag, fmt, l)
    740 	dev_t dev;
    741 	int flag, fmt;
    742 	struct lwp *l;
    743 {
    744 	red_queue_t *rqp;
    745 	int err, error = 0;
    746 
    747 	while ((rqp = red_list) != NULL) {
    748 		/* destroy all */
    749 		err = red_detach(rqp);
    750 		if (err != 0 && error == 0)
    751 			error = err;
    752 	}
    753 
    754 	return error;
    755 }
    756 
/*
 * redioctl: ioctl handler for the ALTQ3-compat red control device.
 * Supports enabling/disabling red on an interface, attaching/detaching
 * red state to an interface's send queue, reading statistics, changing
 * the configuration, and setting the global defaults.
 * All commands except RED_GETSTATS require super-user privilege.
 */
int
redioctl(dev, cmd, addr, flag, l)
	dev_t dev;
	ioctlcmd_t cmd;
	caddr_t addr;
	int flag;
	struct lwp *l;
{
	red_queue_t *rqp;
	struct red_interface *ifacep;
	struct ifnet *ifp;
	struct proc *p = l->l_proc;
	int	error = 0;

	/* check super-user privilege */
	switch (cmd) {
	case RED_GETSTATS:
		break;
	default:
#if (__FreeBSD_version > 400000)
		if ((error = suser(p)) != 0)
#else
		if ((error = kauth_authorize_generic(p->p_cred,
		    KAUTH_GENERIC_ISSUSER, &p->p_acflag)) != 0)
#endif
			return (error);
		break;
	}

	switch (cmd) {

	case RED_ENABLE:
		/* start red on an already-attached interface */
		ifacep = (struct red_interface *)addr;
		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
			error = EBADF;
			break;
		}
		error = altq_enable(rqp->rq_ifq);
		break;

	case RED_DISABLE:
		ifacep = (struct red_interface *)addr;
		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
			error = EBADF;
			break;
		}
		error = altq_disable(rqp->rq_ifq);
		break;

	case RED_IF_ATTACH:
		/* create red state with default parameters and hook it
		 * into the interface's send queue */
		ifp = ifunit(((struct red_interface *)addr)->red_ifname);
		if (ifp == NULL) {
			error = ENXIO;
			break;
		}

		/* allocate and initialize red_queue_t */
		rqp = malloc(sizeof(red_queue_t), M_DEVBUF, M_WAITOK|M_ZERO);
		if (rqp == NULL) {
			error = ENOMEM;
			break;
		}

		rqp->rq_q = malloc(sizeof(class_queue_t), M_DEVBUF,
		    M_WAITOK|M_ZERO);
		if (rqp->rq_q == NULL) {
			free(rqp, M_DEVBUF);
			error = ENOMEM;
			break;
		}

		rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0);
		if (rqp->rq_red == NULL) {
			free(rqp->rq_q, M_DEVBUF);
			free(rqp, M_DEVBUF);
			error = ENOMEM;
			break;
		}

		rqp->rq_ifq = &ifp->if_snd;
		qtail(rqp->rq_q) = NULL;
		qlen(rqp->rq_q) = 0;
		qlimit(rqp->rq_q) = RED_LIMIT;
		qtype(rqp->rq_q) = Q_RED;

		/*
		 * set RED to this ifnet structure.
		 */
		error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp,
				    red_enqueue, red_dequeue, red_request,
				    NULL, NULL);
		if (error) {
			/* unwind everything allocated above */
			red_destroy(rqp->rq_red);
			free(rqp->rq_q, M_DEVBUF);
			free(rqp, M_DEVBUF);
			break;
		}

		/* add this state to the red list */
		rqp->rq_next = red_list;
		red_list = rqp;
		break;

	case RED_IF_DETACH:
		ifacep = (struct red_interface *)addr;
		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
			error = EBADF;
			break;
		}
		error = red_detach(rqp);
		break;

	case RED_GETSTATS:
		/* copy counters and current parameters out to userland */
		do {
			struct red_stats *q_stats;
			red_t *rp;

			q_stats = (struct red_stats *)addr;
			if ((rqp = altq_lookup(q_stats->iface.red_ifname,
					     ALTQT_RED)) == NULL) {
				error = EBADF;
				break;
			}

			q_stats->q_len 	   = qlen(rqp->rq_q);
			q_stats->q_limit   = qlimit(rqp->rq_q);

			rp = rqp->rq_red;
			q_stats->q_avg 	   = rp->red_avg >> rp->red_wshift;
			q_stats->xmit_cnt  = rp->red_stats.xmit_cnt;
			q_stats->drop_cnt  = rp->red_stats.drop_cnt;
			q_stats->drop_forced   = rp->red_stats.drop_forced;
			q_stats->drop_unforced = rp->red_stats.drop_unforced;
			q_stats->marked_packets = rp->red_stats.marked_packets;

			q_stats->weight		= rp->red_weight;
			q_stats->inv_pmax	= rp->red_inv_pmax;
			q_stats->th_min		= rp->red_thmin;
			q_stats->th_max		= rp->red_thmax;

#ifdef ALTQ_FLOWVALVE
			if (rp->red_flowvalve != NULL) {
				struct flowvalve *fv = rp->red_flowvalve;
				q_stats->fv_flows    = fv->fv_flows;
				q_stats->fv_pass     = fv->fv_stats.pass;
				q_stats->fv_predrop  = fv->fv_stats.predrop;
				q_stats->fv_alloc    = fv->fv_stats.alloc;
				q_stats->fv_escape   = fv->fv_stats.escape;
			} else {
#endif /* ALTQ_FLOWVALVE */
				q_stats->fv_flows    = 0;
				q_stats->fv_pass     = 0;
				q_stats->fv_predrop  = 0;
				q_stats->fv_alloc    = 0;
				q_stats->fv_escape   = 0;
#ifdef ALTQ_FLOWVALVE
			}
#endif /* ALTQ_FLOWVALVE */
		} while (/*CONSTCOND*/ 0);
		break;

	case RED_CONFIG:
		/* replace the red state with one built from the new
		 * parameters; the queue itself is purged under splnet */
		do {
			struct red_conf *fc;
			red_t *new;
			int s, limit;

			fc = (struct red_conf *)addr;
			if ((rqp = altq_lookup(fc->iface.red_ifname,
					       ALTQT_RED)) == NULL) {
				error = EBADF;
				break;
			}
			new = red_alloc(fc->red_weight,
					fc->red_inv_pmax,
					fc->red_thmin,
					fc->red_thmax,
					fc->red_flags,
					fc->red_pkttime);
			if (new == NULL) {
				error = ENOMEM;
				break;
			}

			s = splnet();
			red_purgeq(rqp);
			/* the hard limit must be at least th_max */
			limit = fc->red_limit;
			if (limit < fc->red_thmax)
				limit = fc->red_thmax;
			qlimit(rqp->rq_q) = limit;
			fc->red_limit = limit;	/* write back the new value */

			red_destroy(rqp->rq_red);
			rqp->rq_red = new;

			splx(s);

			/* write back new values */
			/* NOTE(review): red_limit was already written back
			 * above; this assignment is redundant */
			fc->red_limit = limit;
			fc->red_inv_pmax = rqp->rq_red->red_inv_pmax;
			fc->red_thmin = rqp->rq_red->red_thmin;
			fc->red_thmax = rqp->rq_red->red_thmax;

		} while (/*CONSTCOND*/ 0);
		break;

	case RED_SETDEFAULTS:
		/* change the defaults used by subsequent red_alloc(0,...) */
		do {
			struct redparams *rp;

			rp = (struct redparams *)addr;

			default_th_min = rp->th_min;
			default_th_max = rp->th_max;
			default_inv_pmax = rp->inv_pmax;
		} while (/*CONSTCOND*/ 0);
		break;

	default:
		error = EINVAL;
		break;
	}
	return error;
}
    981 
    982 static int
    983 red_detach(rqp)
    984 	red_queue_t *rqp;
    985 {
    986 	red_queue_t *tmp;
    987 	int error = 0;
    988 
    989 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
    990 		altq_disable(rqp->rq_ifq);
    991 
    992 	if ((error = altq_detach(rqp->rq_ifq)))
    993 		return (error);
    994 
    995 	if (red_list == rqp)
    996 		red_list = rqp->rq_next;
    997 	else {
    998 		for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next)
    999 			if (tmp->rq_next == rqp) {
   1000 				tmp->rq_next = rqp->rq_next;
   1001 				break;
   1002 			}
   1003 		if (tmp == NULL)
   1004 			printf("red_detach: no state found in red_list!\n");
   1005 	}
   1006 
   1007 	red_destroy(rqp->rq_red);
   1008 	free(rqp->rq_q, M_DEVBUF);
   1009 	free(rqp, M_DEVBUF);
   1010 	return (error);
   1011 }
   1012 
   1013 /*
   1014  * enqueue routine:
   1015  *
   1016  *	returns: 0 when successfully queued.
   1017  *		 ENOBUFS when drop occurs.
   1018  */
   1019 static int
   1020 red_enqueue(ifq, m, pktattr)
   1021 	struct ifaltq *ifq;
   1022 	struct mbuf *m;
   1023 	struct altq_pktattr *pktattr;
   1024 {
   1025 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
   1026 
   1027 	if (red_addq(rqp->rq_red, rqp->rq_q, m, pktattr) < 0)
   1028 		return ENOBUFS;
   1029 	ifq->ifq_len++;
   1030 	return 0;
   1031 }
   1032 
   1033 /*
   1034  * dequeue routine:
   1035  *	must be called in splnet.
   1036  *
   1037  *	returns: mbuf dequeued.
   1038  *		 NULL when no packet is available in the queue.
   1039  */
   1040 
   1041 static struct mbuf *
   1042 red_dequeue(ifq, op)
   1043 	struct ifaltq *ifq;
   1044 	int op;
   1045 {
   1046 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
   1047 	struct mbuf *m;
   1048 
   1049 	if (op == ALTDQ_POLL)
   1050 		return qhead(rqp->rq_q);
   1051 
   1052 	/* op == ALTDQ_REMOVE */
   1053 	m =  red_getq(rqp->rq_red, rqp->rq_q);
   1054 	if (m != NULL)
   1055 		ifq->ifq_len--;
   1056 	return (m);
   1057 }
   1058 
   1059 static int
   1060 red_request(ifq, req, arg)
   1061 	struct ifaltq *ifq;
   1062 	int req;
   1063 	void *arg;
   1064 {
   1065 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
   1066 
   1067 	switch (req) {
   1068 	case ALTRQ_PURGE:
   1069 		red_purgeq(rqp);
   1070 		break;
   1071 	}
   1072 	return (0);
   1073 }
   1074 
/*
 * drop all packets on the queue; if altq is enabled on the interface,
 * also reset the interface queue length to zero.
 */
static void
red_purgeq(rqp)
	red_queue_t *rqp;
{
	_flushq(rqp->rq_q);
	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
		rqp->rq_ifq->ifq_len = 0;
}
   1083 
#ifdef ALTQ_FLOWVALVE

/*
 * flowvalve tuning parameters.  fixed-point scaling is used for the
 * per-flow average drop rate (FV_P*) and bandwidth fraction (FV_F*).
 */
#define	FV_PSHIFT	7	/* weight of average drop rate -- 1/128 */
#define	FV_PSCALE(x)	((x) << FV_PSHIFT)
#define	FV_PUNSCALE(x)	((x) >> FV_PSHIFT)
#define	FV_FSHIFT	5	/* weight of average fraction -- 1/32 */
#define	FV_FSCALE(x)	((x) << FV_FSHIFT)
#define	FV_FUNSCALE(x)	((x) >> FV_FSHIFT)

#define	FV_TIMER	(3 * hz)	/* timer value for garbage collector */
#define	FV_FLOWLISTSIZE		64	/* how many flows in flowlist */

#define	FV_N			10	/* update fve_f every FV_N packets */

#define	FV_BACKOFFTHRESH	1  /* backoff threshold interval in second */
#define	FV_TTHRESH		3  /* time threshold to delete fve */
#define	FV_ALPHA		5  /* extra packet count */

#define	FV_STATS

/* FV_TIMESTAMP reads the current time into a struct timeval */
#if (__FreeBSD_version > 300000)
#define	FV_TIMESTAMP(tp)	getmicrotime(tp)
#else
#define	FV_TIMESTAMP(tp)	{ (*(tp)) = time; }
#endif

/*
 * Brtt table: 127 entry table to convert drop rate (p) to
 * the corresponding bandwidth fraction (f)
 * the following equation is implemented to use scaled values,
 * fve_p and fve_f, in the fixed point format.
 *
 *   Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p))
 *   f = Brtt(p) / (max_th + alpha)
 */
#define	BRTT_SIZE	128
#define	BRTT_SHIFT	12
#define	BRTT_MASK	0x0007f000
#define	BRTT_PMAX	(1 << (FV_PSHIFT + FP_SHIFT))
   1123 
/* precomputed Brtt(p) values; indexed by fv_p2f() via BRTT_MASK/BRTT_SHIFT */
const int brtt_tab[BRTT_SIZE] = {
	0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728,
	392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361,
	225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333,
	145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612,
	98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957,
	67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440,
	47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184,
	33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611,
	24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062,
	18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487,
	14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222,
	10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844,
	8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079,
	6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746,
	5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722,
	4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924
};
   1142 
/*
 * look up the flow entry matching the packet's IP (or IPv6) src/dst
 * address pair in the LRU-ordered flow list.
 * returns the matching fve, or NULL when there is no match (including
 * the unknown-protocol case).  as a side effect, expires entries whose
 * last drop is older than FV_TTHRESH and updates fv->fv_flows with the
 * count of active entries seen during an unsuccessful scan.
 */
static inline struct fve *
flowlist_lookup(fv, pktattr, now)
	struct flowvalve *fv;
	struct altq_pktattr *pktattr;
	struct timeval *now;
{
	struct fve *fve;
	int flows;
	struct ip *ip;
#ifdef INET6
	struct ip6_hdr *ip6;
#endif
	struct timeval tthresh;

	if (pktattr == NULL)
		return (NULL);

	/* entries last hit before tthresh are considered stale */
	tthresh.tv_sec = now->tv_sec - FV_TTHRESH;
	flows = 0;
	/*
	 * search the flow list
	 */
	switch (pktattr->pattr_af) {
	case AF_INET:
		ip = (struct ip *)pktattr->pattr_hdr;
		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
			/* tv_sec == 0 marks a free entry; list is LRU-ordered,
			 * so everything after this point is free too */
			if (fve->fve_lastdrop.tv_sec == 0)
				break;
			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
				/* stale: mark free and stop scanning */
				fve->fve_lastdrop.tv_sec = 0;
				break;
			}
			if (fve->fve_flow.flow_af == AF_INET &&
			    fve->fve_flow.flow_ip.ip_src.s_addr ==
			    ip->ip_src.s_addr &&
			    fve->fve_flow.flow_ip.ip_dst.s_addr ==
			    ip->ip_dst.s_addr)
				return (fve);
			flows++;
		}
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
			if (fve->fve_lastdrop.tv_sec == 0)
				break;
			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
				fve->fve_lastdrop.tv_sec = 0;
				break;
			}
			if (fve->fve_flow.flow_af == AF_INET6 &&
			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src,
					       &ip6->ip6_src) &&
			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst,
					       &ip6->ip6_dst))
				return (fve);
			flows++;
		}
		break;
#endif /* INET6 */

	default:
		/* unknown protocol.  no drop. */
		return (NULL);
	}
	fv->fv_flows = flows;	/* save the number of active fve's */
	return (NULL);
}
   1212 
   1213 static inline struct fve *
   1214 flowlist_reclaim(fv, pktattr)
   1215 	struct flowvalve *fv;
   1216 	struct altq_pktattr *pktattr;
   1217 {
   1218 	struct fve *fve;
   1219 	struct ip *ip;
   1220 #ifdef INET6
   1221 	struct ip6_hdr *ip6;
   1222 #endif
   1223 
   1224 	/*
   1225 	 * get an entry from the tail of the LRU list.
   1226 	 */
   1227 	fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead);
   1228 
   1229 	switch (pktattr->pattr_af) {
   1230 	case AF_INET:
   1231 		ip = (struct ip *)pktattr->pattr_hdr;
   1232 		fve->fve_flow.flow_af = AF_INET;
   1233 		fve->fve_flow.flow_ip.ip_src = ip->ip_src;
   1234 		fve->fve_flow.flow_ip.ip_dst = ip->ip_dst;
   1235 		break;
   1236 #ifdef INET6
   1237 	case AF_INET6:
   1238 		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
   1239 		fve->fve_flow.flow_af = AF_INET6;
   1240 		fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src;
   1241 		fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst;
   1242 		break;
   1243 #endif
   1244 	}
   1245 
   1246 	fve->fve_state = Green;
   1247 	fve->fve_p = 0.0;
   1248 	fve->fve_f = 0.0;
   1249 	fve->fve_ifseq = fv->fv_ifseq - 1;
   1250 	fve->fve_count = 0;
   1251 
   1252 	fv->fv_flows++;
   1253 #ifdef FV_STATS
   1254 	fv->fv_stats.alloc++;
   1255 #endif
   1256 	return (fve);
   1257 }
   1258 
   1259 static inline void
   1260 flowlist_move_to_head(fv, fve)
   1261 	struct flowvalve *fv;
   1262 	struct fve *fve;
   1263 {
   1264 	if (TAILQ_FIRST(&fv->fv_flowlist) != fve) {
   1265 		TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru);
   1266 		TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru);
   1267 	}
   1268 }
   1269 
   1270 /*
   1271  * allocate flowvalve structure
   1272  */
   1273 static struct flowvalve *
   1274 fv_alloc(rp)
   1275 	struct red *rp;
   1276 {
   1277 	struct flowvalve *fv;
   1278 	struct fve *fve;
   1279 	int i, num;
   1280 
   1281 	num = FV_FLOWLISTSIZE;
   1282 	fv = malloc(sizeof(struct flowvalve), M_DEVBUF, M_WAITOK|M_ZERO);
   1283 	if (fv == NULL)
   1284 		return (NULL);
   1285 
   1286 	fv->fv_fves = malloc(sizeof(struct fve) * num, M_DEVBUF,
   1287 	    M_WAITOK|M_ZERO);
   1288 	if (fv->fv_fves == NULL) {
   1289 		free(fv, M_DEVBUF);
   1290 		return (NULL);
   1291 	}
   1292 
   1293 	fv->fv_flows = 0;
   1294 	TAILQ_INIT(&fv->fv_flowlist);
   1295 	for (i = 0; i < num; i++) {
   1296 		fve = &fv->fv_fves[i];
   1297 		fve->fve_lastdrop.tv_sec = 0;
   1298 		TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru);
   1299 	}
   1300 
   1301 	/* initialize drop rate threshold in scaled fixed-point */
   1302 	fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax;
   1303 
   1304 	/* initialize drop rate to fraction table */
   1305 	fv->fv_p2ftab = malloc(sizeof(int) * BRTT_SIZE, M_DEVBUF, M_WAITOK);
   1306 	if (fv->fv_p2ftab == NULL) {
   1307 		free(fv->fv_fves, M_DEVBUF);
   1308 		free(fv, M_DEVBUF);
   1309 		return (NULL);
   1310 	}
   1311 	/*
   1312 	 * create the p2f table.
   1313 	 * (shift is used to keep the precision)
   1314 	 */
   1315 	for (i = 1; i < BRTT_SIZE; i++) {
   1316 		int f;
   1317 
   1318 		f = brtt_tab[i] << 8;
   1319 		fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8;
   1320 	}
   1321 
   1322 	return (fv);
   1323 }
   1324 
/* release all memory held by a flowvalve allocated with fv_alloc() */
static void fv_destroy(fv)
	struct flowvalve *fv;
{
	free(fv->fv_p2ftab, M_DEVBUF);
	free(fv->fv_fves, M_DEVBUF);
	free(fv, M_DEVBUF);
}
   1332 
   1333 static inline int
   1334 fv_p2f(fv, p)
   1335 	struct flowvalve	*fv;
   1336 	int	p;
   1337 {
   1338 	int val, f;
   1339 
   1340 	if (p >= BRTT_PMAX)
   1341 		f = fv->fv_p2ftab[BRTT_SIZE-1];
   1342 	else if ((val = (p & BRTT_MASK)))
   1343 		f = fv->fv_p2ftab[(val >> BRTT_SHIFT)];
   1344 	else
   1345 		f = fv->fv_p2ftab[1];
   1346 	return (f);
   1347 }
   1348 
   1349 /*
   1350  * check if an arriving packet should be pre-dropped.
   1351  * called from red_addq() when a packet arrives.
   1352  * returns 1 when the packet should be pre-dropped.
   1353  * should be called in splnet.
   1354  */
   1355 static int
   1356 fv_checkflow(fv, pktattr, fcache)
   1357 	struct flowvalve *fv;
   1358 	struct altq_pktattr *pktattr;
   1359 	struct fve **fcache;
   1360 {
   1361 	struct fve *fve;
   1362 	struct timeval now;
   1363 
   1364 	fv->fv_ifseq++;
   1365 	FV_TIMESTAMP(&now);
   1366 
   1367 	if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
   1368 		/* no matching entry in the flowlist */
   1369 		return (0);
   1370 
   1371 	*fcache = fve;
   1372 
   1373 	/* update fraction f for every FV_N packets */
   1374 	if (++fve->fve_count == FV_N) {
   1375 		/*
   1376 		 * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f
   1377 		 */
   1378 		fve->fve_f =
   1379 			(FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq)
   1380 			+ fve->fve_f - FV_FUNSCALE(fve->fve_f);
   1381 		fve->fve_ifseq = fv->fv_ifseq;
   1382 		fve->fve_count = 0;
   1383 	}
   1384 
   1385 	/*
   1386 	 * overpumping test
   1387 	 */
   1388 	if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) {
   1389 		int fthresh;
   1390 
   1391 		/* calculate a threshold */
   1392 		fthresh = fv_p2f(fv, fve->fve_p);
   1393 		if (fve->fve_f > fthresh)
   1394 			fve->fve_state = Red;
   1395 	}
   1396 
   1397 	if (fve->fve_state == Red) {
   1398 		/*
   1399 		 * backoff test
   1400 		 */
   1401 		if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) {
   1402 			/* no drop for at least FV_BACKOFFTHRESH sec */
   1403 			fve->fve_p = 0;
   1404 			fve->fve_state = Green;
   1405 #ifdef FV_STATS
   1406 			fv->fv_stats.escape++;
   1407 #endif
   1408 		} else {
   1409 			/* block this flow */
   1410 			flowlist_move_to_head(fv, fve);
   1411 			fve->fve_lastdrop = now;
   1412 #ifdef FV_STATS
   1413 			fv->fv_stats.predrop++;
   1414 #endif
   1415 			return (1);
   1416 		}
   1417 	}
   1418 
   1419 	/*
   1420 	 * p = (1 - Wp) * p
   1421 	 */
   1422 	fve->fve_p -= FV_PUNSCALE(fve->fve_p);
   1423 	if (fve->fve_p < 0)
   1424 		fve->fve_p = 0;
   1425 #ifdef FV_STATS
   1426 	fv->fv_stats.pass++;
   1427 #endif
   1428 	return (0);
   1429 }
   1430 
   1431 /*
   1432  * called from red_addq when a packet is dropped by red.
   1433  * should be called in splnet.
   1434  */
   1435 static void fv_dropbyred(fv, pktattr, fcache)
   1436 	struct flowvalve *fv;
   1437 	struct altq_pktattr *pktattr;
   1438 	struct fve *fcache;
   1439 {
   1440 	struct fve *fve;
   1441 	struct timeval now;
   1442 
   1443 	if (pktattr == NULL)
   1444 		return;
   1445 	FV_TIMESTAMP(&now);
   1446 
   1447 	if (fcache != NULL)
   1448 		/* the fve of this packet is already cached */
   1449 		fve = fcache;
   1450 	else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
   1451 		fve = flowlist_reclaim(fv, pktattr);
   1452 
   1453 	flowlist_move_to_head(fv, fve);
   1454 
   1455 	/*
   1456 	 * update p:  the following line cancels the update
   1457 	 *	      in fv_checkflow() and calculate
   1458 	 *	p = Wp + (1 - Wp) * p
   1459 	 */
   1460 	fve->fve_p = (1 << FP_SHIFT) + fve->fve_p;
   1461 
   1462 	fve->fve_lastdrop = now;
   1463 }
   1464 
   1465 #endif /* ALTQ_FLOWVALVE */
   1466 
#ifdef KLD_MODULE

/* character-device switch entry used when RED is built as a loadable module */
static struct altqsw red_sw =
	{"red", redopen, redclose, redioctl};

ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw);
MODULE_VERSION(altq_red, 1);

#endif /* KLD_MODULE */
   1476 #endif /* ALTQ3_COMPAT */
   1477 
   1478 #endif /* ALTQ_RED */
   1479