      1 /*	altq_red.c,v 1.26 2007/03/26 22:43:19 hubertf Exp	*/
      2 /*	$KAME: altq_red.c,v 1.20 2005/04/13 03:44:25 suz Exp $	*/
      3 
      4 /*
      5  * Copyright (C) 1997-2003
      6  *	Sony Computer Science Laboratories Inc.  All rights reserved.
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  * 1. Redistributions of source code must retain the above copyright
     12  *    notice, this list of conditions and the following disclaimer.
     13  * 2. Redistributions in binary form must reproduce the above copyright
     14  *    notice, this list of conditions and the following disclaimer in the
     15  *    documentation and/or other materials provided with the distribution.
     16  *
     17  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
     18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     20  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
     21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     27  * SUCH DAMAGE.
     28  *
     29  */
     30 /*
     31  * Copyright (c) 1990-1994 Regents of the University of California.
     32  * All rights reserved.
     33  *
     34  * Redistribution and use in source and binary forms, with or without
     35  * modification, are permitted provided that the following conditions
     36  * are met:
     37  * 1. Redistributions of source code must retain the above copyright
     38  *    notice, this list of conditions and the following disclaimer.
     39  * 2. Redistributions in binary form must reproduce the above copyright
     40  *    notice, this list of conditions and the following disclaimer in the
     41  *    documentation and/or other materials provided with the distribution.
     42  * 3. All advertising materials mentioning features or use of this software
     43  *    must display the following acknowledgement:
     44  *	This product includes software developed by the Computer Systems
     45  *	Engineering Group at Lawrence Berkeley Laboratory.
     46  * 4. Neither the name of the University nor of the Laboratory may be used
     47  *    to endorse or promote products derived from this software without
     48  *    specific prior written permission.
     49  *
     50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     60  * SUCH DAMAGE.
     61  */
     62 
     63 #include <sys/cdefs.h>
     64 __KERNEL_RCSID(0, "altq_red.c,v 1.26 2007/03/26 22:43:19 hubertf Exp");
     65 
     66 #ifdef _KERNEL_OPT
     67 #include "opt_altq.h"
     68 #include "opt_inet.h"
     69 #include "pf.h"
     70 #endif
     71 
     72 #ifdef ALTQ_RED	/* red is enabled by ALTQ_RED option in opt_altq.h */
     73 
     74 #include <sys/param.h>
     75 #include <sys/malloc.h>
     76 #include <sys/mbuf.h>
     77 #include <sys/socket.h>
     78 #include <sys/systm.h>
     79 #include <sys/errno.h>
     80 #include <sys/kauth.h>
     81 #if 1 /* ALTQ3_COMPAT */
     82 #include <sys/sockio.h>
     83 #include <sys/proc.h>
     84 #include <sys/kernel.h>
     85 #ifdef ALTQ_FLOWVALVE
     86 #include <sys/queue.h>
     87 #include <sys/time.h>
     88 #endif
     89 #endif /* ALTQ3_COMPAT */
     90 
     91 #include <net/if.h>
     92 
     93 #include <netinet/in.h>
     94 #include <netinet/in_systm.h>
     95 #include <netinet/ip.h>
     96 #ifdef INET6
     97 #include <netinet/ip6.h>
     98 #endif
     99 
    100 #if NPF > 0
    101 #include <net/pfvar.h>
    102 #endif
    103 #include <altq/altq.h>
    104 #include <altq/altq_red.h>
    105 #ifdef ALTQ3_COMPAT
    106 #include <altq/altq_conf.h>
    107 #ifdef ALTQ_FLOWVALVE
    108 #include <altq/altq_flowvalve.h>
    109 #endif
    110 #endif
    111 
    112 /*
    113  * ALTQ/RED (Random Early Detection) implementation using 32-bit
    114  * fixed-point calculation.
    115  *
    116  * written by kjc using the ns code as a reference.
    117  * you can learn more about red and ns from Sally's home page at
    118  * http://www-nrg.ee.lbl.gov/floyd/
    119  *
    120  * most of the red parameter values are fixed in this implementation
    121  * to prevent fixed-point overflow/underflow.
    122  * if you change the parameters, watch out for overflow/underflow!
    123  *
    124  * the parameters used are the values recommended by Sally.
    125  * the corresponding ns config looks like:
    126  *	q_weight=0.00195
    127  *	minthresh=5 maxthresh=15 queue-size=60
    128  *	linterm=30
    129  *	dropmech=drop-tail
    130  *	bytes=false (can't be handled by 32-bit fixed-point)
    131  *	doubleq=false dqthresh=false
    132  *	wait=true
    133  */
    134 /*
    135  * alternative red parameters for a slow link.
    136  *
    137  * assume the queue length jumps from zero to L and then stays at L;
    138  * it takes N packets for q_avg to reach 63% of L.
    139  * when q_weight is 0.002, N is about 500 packets.
    140  * for a slow link like dial-up, 500 packets take more than 1 minute!
    141  * when q_weight is 0.008, N is about 127 packets.
    142  * when q_weight is 0.016, N is about 63 packets.
    143  * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
    144  * are allowed for 0.016.
    145  * see Sally's paper for more details.
    146  */
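/*
 * (added note, a rough derivation of the figures above)
 * with an EWMA, a step input of size L feeds into avg as
 *	avg_N = (1 - (1 - q_weight)^N) * L
 * so avg reaches 63% of L when (1 - q_weight)^N ~= 1/e, i.e.
 * N ~= 1/q_weight, which gives roughly the packet counts quoted above.
 */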
    147 /* normal red parameters */
    148 #define	W_WEIGHT	512	/* inverse of weight of EWMA (511/512) */
    149 				/* q_weight = 0.00195 */
    150 
    151 /* red parameters for a slow link */
    152 #define	W_WEIGHT_1	128	/* inverse of weight of EWMA (127/128) */
    153 				/* q_weight = 0.0078125 */
    154 
    155 /* red parameters for a very slow link (e.g., dialup) */
    156 #define	W_WEIGHT_2	64	/* inverse of weight of EWMA (63/64) */
    157 				/* q_weight = 0.015625 */
    158 
    159 /* fixed-point uses 12-bit decimal places */
    160 #define	FP_SHIFT	12	/* fixed-point shift */
    161 
    162 /* red parameters for drop probability */
    163 #define	INV_P_MAX	10	/* inverse of max drop probability */
    164 #define	TH_MIN		5	/* min threshold */
    165 #define	TH_MAX		15	/* max threshold */
    166 
    167 #define	RED_LIMIT	60	/* default max queue length */
    168 #define	RED_STATS		/* collect statistics */
    169 
    170 /*
    171  * our default policy for forced-drop is drop-tail.
    172  * (in altq-1.1.2 or earlier, the default was random-drop.
    173  * but it makes more sense to punish the cause of the surge.)
    174  * to switch to the random-drop policy, define "RED_RANDOM_DROP".
    175  */
    176 
    177 #ifdef ALTQ3_COMPAT
    178 #ifdef ALTQ_FLOWVALVE
    179 /*
    180  * flow-valve is an extension to protect red from unresponsive flows
    181  * and to promote end-to-end congestion control.
    182  * flow-valve observes the average drop rates of the flows that have
    183  * experienced packet drops in the recent past.
    184  * when the average drop rate exceeds the threshold, the flow is
    185  * blocked by the flow-valve.  the trapped flow should back off
    186  * exponentially to escape from the flow-valve.
    187  */
    188 #ifdef RED_RANDOM_DROP
    189 #error "random-drop can't be used with flow-valve!"
    190 #endif
    191 #endif /* ALTQ_FLOWVALVE */
    192 
    193 /* red_list keeps all red_queue_t's allocated. */
    194 static red_queue_t *red_list = NULL;
    195 
    196 #endif /* ALTQ3_COMPAT */
    197 
    198 /* default red parameter values */
    199 static int default_th_min = TH_MIN;
    200 static int default_th_max = TH_MAX;
    201 static int default_inv_pmax = INV_P_MAX;
    202 
    203 #ifdef ALTQ3_COMPAT
    204 /* internal function prototypes */
    205 static int red_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *);
    206 static struct mbuf *red_dequeue(struct ifaltq *, int);
    207 static int red_request(struct ifaltq *, int, void *);
    208 static void red_purgeq(red_queue_t *);
    209 static int red_detach(red_queue_t *);
    210 #ifdef ALTQ_FLOWVALVE
    211 static inline struct fve *flowlist_lookup(struct flowvalve *,
    212 			 struct altq_pktattr *, struct timeval *);
    213 static inline struct fve *flowlist_reclaim(struct flowvalve *,
    214 					     struct altq_pktattr *);
    215 static inline void flowlist_move_to_head(struct flowvalve *, struct fve *);
    216 static inline int fv_p2f(struct flowvalve *, int);
    217 static struct flowvalve *fv_alloc(struct red *);
    218 static void fv_destroy(struct flowvalve *);
    219 static int fv_checkflow(struct flowvalve *, struct altq_pktattr *,
    220 			struct fve **);
    221 static void fv_dropbyred(struct flowvalve *fv, struct altq_pktattr *,
    222 			 struct fve *);
    223 #endif
    224 #endif /* ALTQ3_COMPAT */
    225 
    226 /*
    227  * red support routines
    228  */
    229 red_t *
    230 red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags,
    231    int pkttime)
    232 {
    233 	red_t	*rp;
    234 	int	 w, i;
    235 	int	 npkts_per_sec;
    236 
    237 	rp = malloc(sizeof(red_t), M_DEVBUF, M_WAITOK|M_ZERO);
    238 	if (rp == NULL)
    239 		return (NULL);
    240 
    241 	rp->red_avg = 0;
    242 	rp->red_idle = 1;
    243 
    244 	if (weight == 0)
    245 		rp->red_weight = W_WEIGHT;
    246 	else
    247 		rp->red_weight = weight;
    248 	if (inv_pmax == 0)
    249 		rp->red_inv_pmax = default_inv_pmax;
    250 	else
    251 		rp->red_inv_pmax = inv_pmax;
    252 	if (th_min == 0)
    253 		rp->red_thmin = default_th_min;
    254 	else
    255 		rp->red_thmin = th_min;
    256 	if (th_max == 0)
    257 		rp->red_thmax = default_th_max;
    258 	else
    259 		rp->red_thmax = th_max;
    260 
    261 	rp->red_flags = flags;
    262 
    263 	if (pkttime == 0)
    264 		/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
    265 		rp->red_pkttime = 800;
    266 	else
    267 		rp->red_pkttime = pkttime;
    268 
    269 	if (weight == 0) {
    270 		/* when the link is very slow, adjust red parameters */
    271 		npkts_per_sec = 1000000 / rp->red_pkttime;
    272 		if (npkts_per_sec < 50) {
    273 			/* up to about 400Kbps */
    274 			rp->red_weight = W_WEIGHT_2;
    275 		} else if (npkts_per_sec < 300) {
    276 			/* up to about 2.4Mbps */
    277 			rp->red_weight = W_WEIGHT_1;
    278 		}
    279 	}
    280 
    281 	/* calculate wshift.  weight must be a power of 2 */
    282 	w = rp->red_weight;
    283 	for (i = 0; w > 1; i++)
    284 		w = w >> 1;
    285 	rp->red_wshift = i;
    286 	w = 1 << rp->red_wshift;
    287 	if (w != rp->red_weight) {
    288 		printf("invalid weight value %d for red! use %d\n",
    289 		       rp->red_weight, w);
    290 		rp->red_weight = w;
    291 	}
    292 
    293 	/*
    294 	 * thmin_s and thmax_s are scaled versions of th_min and th_max
    295 	 * to be compared with avg.
    296 	 */
    297 	rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
    298 	rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
    299 
    300 	/*
    301 	 * precompute probability denominator
    302 	 *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
    303 	 */
    304 	rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
    305 			 * rp->red_inv_pmax) << FP_SHIFT;
    306 
    307 	/* allocate weight table */
    308 	rp->red_wtab = wtab_alloc(rp->red_weight);
    309 
    310 	microtime(&rp->red_last);
    311 #ifdef ALTQ3_COMPAT
    312 #ifdef ALTQ_FLOWVALVE
    313 	if (flags & REDF_FLOWVALVE)
    314 		rp->red_flowvalve = fv_alloc(rp);
    315 	/* if fv_alloc fails, flowvalve is just disabled */
    316 #endif
    317 #endif /* ALTQ3_COMPAT */
    318 	return (rp);
    319 }
    320 
    321 void
    322 red_destroy(red_t *rp)
    323 {
    324 #ifdef ALTQ3_COMPAT
    325 #ifdef ALTQ_FLOWVALVE
    326 	if (rp->red_flowvalve != NULL)
    327 		fv_destroy(rp->red_flowvalve);
    328 #endif
    329 #endif /* ALTQ3_COMPAT */
    330 	wtab_destroy(rp->red_wtab);
    331 	free(rp, M_DEVBUF);
    332 }
    333 
    334 void
    335 red_getstats(red_t *rp, struct redstats *sp)
    336 {
    337 	sp->q_avg		= rp->red_avg >> rp->red_wshift;
    338 	sp->xmit_cnt		= rp->red_stats.xmit_cnt;
    339 	sp->drop_cnt		= rp->red_stats.drop_cnt;
    340 	sp->drop_forced		= rp->red_stats.drop_forced;
    341 	sp->drop_unforced	= rp->red_stats.drop_unforced;
    342 	sp->marked_packets	= rp->red_stats.marked_packets;
    343 }
    344 
    345 int
    346 red_addq(red_t *rp, class_queue_t *q, struct mbuf *m,
    347     struct altq_pktattr *pktattr)
    348 {
    349 	int avg, droptype;
    350 	int n;
    351 #ifdef ALTQ3_COMPAT
    352 #ifdef ALTQ_FLOWVALVE
    353 	struct fve *fve = NULL;
    354 
    355 	if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0)
    356 		if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) {
    357 			m_freem(m);
    358 			return (-1);
    359 		}
    360 #endif
    361 #endif /* ALTQ3_COMPAT */
    362 
    363 	avg = rp->red_avg;
    364 
    365 	/*
    366 	 * if we were idle, we pretend that n packets arrived during
    367 	 * the idle period.
    368 	 */
    369 	if (rp->red_idle) {
    370 		struct timeval now;
    371 		int t;
    372 
    373 		rp->red_idle = 0;
    374 		microtime(&now);
    375 		t = (now.tv_sec - rp->red_last.tv_sec);
    376 		if (t > 60) {
    377 			/*
    378 			 * if we have been idle for more than 1 minute, set avg to zero.
    379 			 * this prevents t from overflowing.
    380 			 */
    381 			avg = 0;
    382 		} else {
    383 			t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
    384 			n = t / rp->red_pkttime - 1;
    385 
    386 			/* the following line does (avg = (1 - Wq)^n * avg) */
    387 			if (n > 0)
    388 				avg = (avg >> FP_SHIFT) *
    389 				    pow_w(rp->red_wtab, n);
    390 		}
    391 	}
    392 
    393 	/* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
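	/*
	 * (added note) red_avg holds avg scaled by 2^(red_wshift + FP_SHIFT).
	 * with Wq = 1/2^red_wshift, the EWMA
	 *	avg = (1 - Wq) * avg + Wq * qlen
	 * multiplied through by that scale factor becomes
	 *	avg_s = avg_s - (avg_s >> red_wshift) + (qlen << FP_SHIFT)
	 * which is what the next statement computes.
	 */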
    394 	avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
    395 	rp->red_avg = avg;		/* save the new value */
    396 
    397 	/*
    398 	 * red_count keeps a tally of arriving traffic that has not
    399 	 * been dropped.
    400 	 */
    401 	rp->red_count++;
    402 
    403 	/* see if we drop early */
    404 	droptype = DTYPE_NODROP;
    405 	if (avg >= rp->red_thmin_s && qlen(q) > 1) {
    406 		if (avg >= rp->red_thmax_s) {
    407 			/* avg >= th_max: forced drop */
    408 			droptype = DTYPE_FORCED;
    409 		} else if (rp->red_old == 0) {
    410 			/* first exceeds th_min */
    411 			rp->red_count = 1;
    412 			rp->red_old = 1;
    413 		} else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
    414 				      rp->red_probd, rp->red_count)) {
    415 			/* mark or drop by red */
    416 			if ((rp->red_flags & REDF_ECN) &&
    417 			    mark_ecn(m, pktattr, rp->red_flags)) {
    418 				/* successfully marked.  do not drop. */
    419 				rp->red_count = 0;
    420 #ifdef RED_STATS
    421 				rp->red_stats.marked_packets++;
    422 #endif
    423 			} else {
    424 				/* unforced drop by red */
    425 				droptype = DTYPE_EARLY;
    426 			}
    427 		}
    428 	} else {
    429 		/* avg < th_min */
    430 		rp->red_old = 0;
    431 	}
    432 
    433 	/*
    434 	 * if the queue length hits the hard limit, it's a forced drop.
    435 	 */
    436 	if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
    437 		droptype = DTYPE_FORCED;
    438 
    439 #ifdef RED_RANDOM_DROP
    440 	/* if successful or forced drop, enqueue this packet. */
    441 	if (droptype != DTYPE_EARLY)
    442 		_addq(q, m);
    443 #else
    444 	/* if successful, enqueue this packet. */
    445 	if (droptype == DTYPE_NODROP)
    446 		_addq(q, m);
    447 #endif
    448 	if (droptype != DTYPE_NODROP) {
    449 		if (droptype == DTYPE_EARLY) {
    450 			/* drop the incoming packet */
    451 #ifdef RED_STATS
    452 			rp->red_stats.drop_unforced++;
    453 #endif
    454 		} else {
    455 			/* forced drop, select a victim packet in the queue. */
    456 #ifdef RED_RANDOM_DROP
    457 			m = _getq_random(q);
    458 #endif
    459 #ifdef RED_STATS
    460 			rp->red_stats.drop_forced++;
    461 #endif
    462 		}
    463 #ifdef RED_STATS
    464 		PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
    465 #endif
    466 		rp->red_count = 0;
    467 #ifdef ALTQ3_COMPAT
    468 #ifdef ALTQ_FLOWVALVE
    469 		if (rp->red_flowvalve != NULL)
    470 			fv_dropbyred(rp->red_flowvalve, pktattr, fve);
    471 #endif
    472 #endif /* ALTQ3_COMPAT */
    473 		m_freem(m);
    474 		return (-1);
    475 	}
    476 	/* successfully queued */
    477 #ifdef RED_STATS
    478 	PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
    479 #endif
    480 	return (0);
    481 }
    482 
    483 /*
    484  * early-drop probability is calculated as follows:
    485  *   prob = p_max * (avg - th_min) / (th_max - th_min)
    486  *   prob_a = prob / (2 - count*prob)
    487  *	    = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
    488  * here prob_a increases as the count of consecutively undropped packets increases.
    489  * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
    490  * becomes 1 when (count >= (2 / prob))).
    491  */
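/*
 * (added note) a worked example with this file's defaults
 * (th_min=5, th_max=15, inv_p_max=10, i.e. p_max = 0.1):
 *	fp_probd = (2 * 10 * 10) << FP_SHIFT = 200 << FP_SHIFT
 * with avg - th_min = 5 (halfway between the thresholds),
 * fp_len = 5 << FP_SHIFT, so
 *	count =  0:  P = 5/200 = 0.025  (= prob/2)
 *	count = 20:  P = 5/100 = 0.05   (= prob, count == 1/prob)
 *	count = 40:  d <= 0, drop/mark  (count == 2/prob)
 * matching the behavior described above.
 */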
    492 int
    493 drop_early(int fp_len, int fp_probd, int count)
    494 {
    495 	int	d;		/* denominator of drop-probability */
    496 
    497 	d = fp_probd - count * fp_len;
    498 	if (d <= 0)
    499 		/* count exceeds the hard limit: drop or mark */
    500 		return (1);
    501 
    502 	/*
    503 	 * now the range of d is [1..600] in fixed-point. (when
    504 	 * th_max-th_min=10 and p_max=1/30)
    505 	 * drop probability = (avg - TH_MIN) / d
    506 	 */
    507 
    508 	if ((arc4random() % d) < fp_len) {
    509 		/* drop or mark */
    510 		return (1);
    511 	}
    512 	/* no drop/mark */
    513 	return (0);
    514 }
    515 
    516 /*
    517  * try to set the CE bit in the packet.
    518  *    returns 1 if successfully marked, 0 otherwise.
    519  */
    520 int
    521 mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags)
    522 {
    523 	struct mbuf	*m0;
    524 	struct m_tag	*t;
    525 	struct altq_tag	*at;
    526 	void		*hdr;
    527 	int		 af;
    528 
    529 	t = m_tag_find(m, PACKET_TAG_PF_QID, NULL);
    530 	if (t != NULL) {
    531 		at = (struct altq_tag *)(t + 1);
    532 		if (at == NULL)
    533 			return (0);
    534 		af = at->af;
    535 		hdr = at->hdr;
    536 #ifdef ALTQ3_COMPAT
    537 	} else if (pktattr != NULL) {
    538 		af = pktattr->pattr_af;
    539 		hdr = pktattr->pattr_hdr;
    540 #endif /* ALTQ3_COMPAT */
    541 	} else
    542 		return (0);
    543 
    544 	if (af != AF_INET && af != AF_INET6)
    545 		return (0);
    546 
    547 	/* verify that pattr_hdr is within the mbuf data */
    548 	for (m0 = m; m0 != NULL; m0 = m0->m_next)
    549 		if (((char *)hdr >= m0->m_data) &&
    550 		    ((char *)hdr < m0->m_data + m0->m_len))
    551 			break;
    552 	if (m0 == NULL) {
    553 		/* ick, tag info is stale */
    554 		return (0);
    555 	}
    556 
    557 	switch (af) {
    558 	case AF_INET:
    559 		if (flags & REDF_ECN4) {
    560 			struct ip *ip = hdr;
    561 			u_int8_t otos;
    562 			int sum;
    563 
    564 			if (ip->ip_v != 4)
    565 				return (0);	/* version mismatch! */
    566 
    567 			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
    568 				return (0);	/* not-ECT */
    569 			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
    570 				return (1);	/* already marked */
    571 
    572 			/*
    573 			 * ecn-capable but not marked,
    574 			 * mark CE and update checksum
    575 			 */
    576 			otos = ip->ip_tos;
    577 			ip->ip_tos |= IPTOS_ECN_CE;
    578 			/*
    579 			 * update checksum (from RFC1624)
    580 			 *	   HC' = ~(~HC + ~m + m')
    581 			 */
    582 			sum = ~ntohs(ip->ip_sum) & 0xffff;
    583 			sum += (~otos & 0xffff) + ip->ip_tos;
    584 			sum = (sum >> 16) + (sum & 0xffff);
    585 			sum += (sum >> 16);  /* add carry */
    586 			ip->ip_sum = htons(~sum & 0xffff);
    587 			return (1);
    588 		}
    589 		break;
    590 #ifdef INET6
    591 	case AF_INET6:
    592 		if (flags & REDF_ECN6) {
    593 			struct ip6_hdr *ip6 = hdr;
    594 			u_int32_t flowlabel;
    595 
    596 			flowlabel = ntohl(ip6->ip6_flow);
    597 			if ((flowlabel >> 28) != 6)
    598 				return (0);	/* version mismatch! */
    599 			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
    600 			    (IPTOS_ECN_NOTECT << 20))
    601 				return (0);	/* not-ECT */
    602 			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
    603 			    (IPTOS_ECN_CE << 20))
    604 				return (1);	/* already marked */
    605 			/*
    606 			 * ecn-capable but not marked,  mark CE
    607 			 */
    608 			flowlabel |= (IPTOS_ECN_CE << 20);
    609 			ip6->ip6_flow = htonl(flowlabel);
    610 			return (1);
    611 		}
    612 		break;
    613 #endif  /* INET6 */
    614 	}
    615 
    616 	/* not marked */
    617 	return (0);
    618 }
    619 
    620 struct mbuf *
    621 red_getq(red_t *rp, class_queue_t *q)
    622 {
    623 	struct mbuf *m;
    624 
    625 	if ((m = _getq(q)) == NULL) {
    626 		if (rp->red_idle == 0) {
    627 			rp->red_idle = 1;
    628 			microtime(&rp->red_last);
    629 		}
    630 		return NULL;
    631 	}
    632 
    633 	rp->red_idle = 0;
    634 	return (m);
    635 }
    636 
    637 /*
    638  * helper routine to calibrate avg during idle.
    639  * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
    640  * here Wq = 1/weight and the code assumes Wq is close to zero.
    641  *
    642  * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
    643  */
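/*
 * (added note) pow_w() walks the binary expansion of n; e.g. for
 * n = 5 = 4 + 1,
 *	(1 - Wq)^5 = (1 - Wq)^4 * (1 - Wq) = w_tab[2] * w_tab[0]
 * with each product rescaled by >> FP_SHIFT.  w_param_max records the
 * first power of two whose table entry underflows to zero, so pow_w()
 * can return 0 immediately for larger n.
 */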
    644 static struct wtab *wtab_list = NULL;	/* pointer to wtab list */
    645 
    646 struct wtab *
    647 wtab_alloc(int weight)
    648 {
    649 	struct wtab	*w;
    650 	int		 i;
    651 
    652 	for (w = wtab_list; w != NULL; w = w->w_next)
    653 		if (w->w_weight == weight) {
    654 			w->w_refcount++;
    655 			return (w);
    656 		}
    657 
    658 	w = malloc(sizeof(struct wtab), M_DEVBUF, M_WAITOK|M_ZERO);
    659 	if (w == NULL)
    660 		panic("wtab_alloc: malloc failed!");
    661 	w->w_weight = weight;
    662 	w->w_refcount = 1;
    663 	w->w_next = wtab_list;
    664 	wtab_list = w;
    665 
    666 	/* initialize the weight table */
    667 	w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
    668 	for (i = 1; i < 32; i++) {
    669 		w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
    670 		if (w->w_tab[i] == 0 && w->w_param_max == 0)
    671 			w->w_param_max = 1 << i;
    672 	}
    673 
    674 	return (w);
    675 }
    676 
    677 int
    678 wtab_destroy(struct wtab *w)
    679 {
    680 	struct wtab	*prev;
    681 
    682 	if (--w->w_refcount > 0)
    683 		return (0);
    684 
    685 	if (wtab_list == w)
    686 		wtab_list = w->w_next;
    687 	else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
    688 		if (prev->w_next == w) {
    689 			prev->w_next = w->w_next;
    690 			break;
    691 		}
    692 
    693 	free(w, M_DEVBUF);
    694 	return (0);
    695 }
    696 
    697 int32_t
    698 pow_w(struct wtab *w, int n)
    699 {
    700 	int	i, bit;
    701 	int32_t	val;
    702 
    703 	if (n >= w->w_param_max)
    704 		return (0);
    705 
    706 	val = 1 << FP_SHIFT;
    707 	if (n <= 0)
    708 		return (val);
    709 
    710 	bit = 1;
    711 	i = 0;
    712 	while (n) {
    713 		if (n & bit) {
    714 			val = (val * w->w_tab[i]) >> FP_SHIFT;
    715 			n &= ~bit;
    716 		}
    717 		i++;
    718 		bit <<=  1;
    719 	}
    720 	return (val);
    721 }
    722 
    723 #ifdef ALTQ3_COMPAT
    724 /*
    725  * red device interface
    726  */
    727 altqdev_decl(red);
    728 
    729 int
    730 redopen(dev_t dev, int flag, int fmt,
    731     struct lwp *l)
    732 {
    733 	/* everything will be done when the queueing scheme is attached. */
    734 	return 0;
    735 }
    736 
    737 int
    738 redclose(dev_t dev, int flag, int fmt,
    739     struct lwp *l)
    740 {
    741 	red_queue_t *rqp;
    742 	int err, error = 0;
    743 
    744 	while ((rqp = red_list) != NULL) {
    745 		/* destroy all */
    746 		err = red_detach(rqp);
    747 		if (err != 0 && error == 0)
    748 			error = err;
    749 	}
    750 
    751 	return error;
    752 }
    753 
    754 int
    755 redioctl(dev_t dev, ioctlcmd_t cmd, void *addr, int flag,
    756     struct lwp *l)
    757 {
    758 	red_queue_t *rqp;
    759 	struct red_interface *ifacep;
    760 	struct ifnet *ifp;
    761 	struct proc *p = l->l_proc;
    762 	int	error = 0;
    763 
    764 	/* check super-user privilege */
    765 	switch (cmd) {
    766 	case RED_GETSTATS:
    767 		break;
    768 	default:
    769 #if (__FreeBSD_version > 400000)
    770 		if ((error = suser(p)) != 0)
    771 #else
    772 		if ((error = kauth_authorize_network(p->p_cred,
    773 		    KAUTH_NETWORK_ALTQ, KAUTH_REQ_NETWORK_ALTQ_RED, NULL,
    774 		    NULL, NULL)) != 0)
    775 #endif
    776 			return (error);
    777 		break;
    778 	}
    779 
    780 	switch (cmd) {
    781 
    782 	case RED_ENABLE:
    783 		ifacep = (struct red_interface *)addr;
    784 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
    785 			error = EBADF;
    786 			break;
    787 		}
    788 		error = altq_enable(rqp->rq_ifq);
    789 		break;
    790 
    791 	case RED_DISABLE:
    792 		ifacep = (struct red_interface *)addr;
    793 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
    794 			error = EBADF;
    795 			break;
    796 		}
    797 		error = altq_disable(rqp->rq_ifq);
    798 		break;
    799 
    800 	case RED_IF_ATTACH:
    801 		ifp = ifunit(((struct red_interface *)addr)->red_ifname);
    802 		if (ifp == NULL) {
    803 			error = ENXIO;
    804 			break;
    805 		}
    806 
    807 		/* allocate and initialize red_queue_t */
    808 		rqp = malloc(sizeof(red_queue_t), M_DEVBUF, M_WAITOK|M_ZERO);
    809 		if (rqp == NULL) {
    810 			error = ENOMEM;
    811 			break;
    812 		}
    813 
    814 		rqp->rq_q = malloc(sizeof(class_queue_t), M_DEVBUF,
    815 		    M_WAITOK|M_ZERO);
    816 		if (rqp->rq_q == NULL) {
    817 			free(rqp, M_DEVBUF);
    818 			error = ENOMEM;
    819 			break;
    820 		}
    821 
    822 		rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0);
    823 		if (rqp->rq_red == NULL) {
    824 			free(rqp->rq_q, M_DEVBUF);
    825 			free(rqp, M_DEVBUF);
    826 			error = ENOMEM;
    827 			break;
    828 		}
    829 
    830 		rqp->rq_ifq = &ifp->if_snd;
    831 		qtail(rqp->rq_q) = NULL;
    832 		qlen(rqp->rq_q) = 0;
    833 		qlimit(rqp->rq_q) = RED_LIMIT;
    834 		qtype(rqp->rq_q) = Q_RED;
    835 
    836 		/*
    837 		 * set RED to this ifnet structure.
    838 		 */
    839 		error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp,
    840 				    red_enqueue, red_dequeue, red_request,
    841 				    NULL, NULL);
    842 		if (error) {
    843 			red_destroy(rqp->rq_red);
    844 			free(rqp->rq_q, M_DEVBUF);
    845 			free(rqp, M_DEVBUF);
    846 			break;
    847 		}
    848 
    849 		/* add this state to the red list */
    850 		rqp->rq_next = red_list;
    851 		red_list = rqp;
    852 		break;
    853 
    854 	case RED_IF_DETACH:
    855 		ifacep = (struct red_interface *)addr;
    856 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
    857 			error = EBADF;
    858 			break;
    859 		}
    860 		error = red_detach(rqp);
    861 		break;
    862 
    863 	case RED_GETSTATS:
    864 		do {
    865 			struct red_stats *q_stats;
    866 			red_t *rp;
    867 
    868 			q_stats = (struct red_stats *)addr;
    869 			if ((rqp = altq_lookup(q_stats->iface.red_ifname,
    870 					     ALTQT_RED)) == NULL) {
    871 				error = EBADF;
    872 				break;
    873 			}
    874 
    875 			q_stats->q_len 	   = qlen(rqp->rq_q);
    876 			q_stats->q_limit   = qlimit(rqp->rq_q);
    877 
    878 			rp = rqp->rq_red;
    879 			q_stats->q_avg 	   = rp->red_avg >> rp->red_wshift;
    880 			q_stats->xmit_cnt  = rp->red_stats.xmit_cnt;
    881 			q_stats->drop_cnt  = rp->red_stats.drop_cnt;
    882 			q_stats->drop_forced   = rp->red_stats.drop_forced;
    883 			q_stats->drop_unforced = rp->red_stats.drop_unforced;
    884 			q_stats->marked_packets = rp->red_stats.marked_packets;
    885 
    886 			q_stats->weight		= rp->red_weight;
    887 			q_stats->inv_pmax	= rp->red_inv_pmax;
    888 			q_stats->th_min		= rp->red_thmin;
    889 			q_stats->th_max		= rp->red_thmax;
    890 
    891 #ifdef ALTQ_FLOWVALVE
    892 			if (rp->red_flowvalve != NULL) {
    893 				struct flowvalve *fv = rp->red_flowvalve;
    894 				q_stats->fv_flows    = fv->fv_flows;
    895 				q_stats->fv_pass     = fv->fv_stats.pass;
    896 				q_stats->fv_predrop  = fv->fv_stats.predrop;
    897 				q_stats->fv_alloc    = fv->fv_stats.alloc;
    898 				q_stats->fv_escape   = fv->fv_stats.escape;
    899 			} else {
    900 #endif /* ALTQ_FLOWVALVE */
    901 				q_stats->fv_flows    = 0;
    902 				q_stats->fv_pass     = 0;
    903 				q_stats->fv_predrop  = 0;
    904 				q_stats->fv_alloc    = 0;
    905 				q_stats->fv_escape   = 0;
    906 #ifdef ALTQ_FLOWVALVE
    907 			}
    908 #endif /* ALTQ_FLOWVALVE */
    909 		} while (/*CONSTCOND*/ 0);
    910 		break;
    911 
    912 	case RED_CONFIG:
    913 		do {
    914 			struct red_conf *fc;
    915 			red_t *new;
    916 			int s, limit;
    917 
    918 			fc = (struct red_conf *)addr;
    919 			if ((rqp = altq_lookup(fc->iface.red_ifname,
    920 					       ALTQT_RED)) == NULL) {
    921 				error = EBADF;
    922 				break;
    923 			}
    924 			new = red_alloc(fc->red_weight,
    925 					fc->red_inv_pmax,
    926 					fc->red_thmin,
    927 					fc->red_thmax,
    928 					fc->red_flags,
    929 					fc->red_pkttime);
    930 			if (new == NULL) {
    931 				error = ENOMEM;
    932 				break;
    933 			}
    934 
    935 			s = splnet();
    936 			red_purgeq(rqp);
    937 			limit = fc->red_limit;
    938 			if (limit < fc->red_thmax)
    939 				limit = fc->red_thmax;
    940 			qlimit(rqp->rq_q) = limit;
    941 			fc->red_limit = limit;	/* write back the new value */
    942 
    943 			red_destroy(rqp->rq_red);
    944 			rqp->rq_red = new;
    945 
    946 			splx(s);
    947 
    948 			/* write back new values */
    949 			fc->red_limit = limit;
    950 			fc->red_inv_pmax = rqp->rq_red->red_inv_pmax;
    951 			fc->red_thmin = rqp->rq_red->red_thmin;
    952 			fc->red_thmax = rqp->rq_red->red_thmax;
    953 
    954 		} while (/*CONSTCOND*/ 0);
    955 		break;
    956 
    957 	case RED_SETDEFAULTS:
    958 		do {
    959 			struct redparams *rp;
    960 
    961 			rp = (struct redparams *)addr;
    962 
    963 			default_th_min = rp->th_min;
    964 			default_th_max = rp->th_max;
    965 			default_inv_pmax = rp->inv_pmax;
    966 		} while (/*CONSTCOND*/ 0);
    967 		break;
    968 
    969 	default:
    970 		error = EINVAL;
    971 		break;
    972 	}
    973 	return error;
    974 }
    975 
    976 static int
    977 red_detach(red_queue_t *rqp)
    978 {
    979 	red_queue_t *tmp;
    980 	int error = 0;
    981 
    982 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
    983 		altq_disable(rqp->rq_ifq);
    984 
    985 	if ((error = altq_detach(rqp->rq_ifq)))
    986 		return (error);
    987 
    988 	if (red_list == rqp)
    989 		red_list = rqp->rq_next;
    990 	else {
    991 		for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next)
    992 			if (tmp->rq_next == rqp) {
    993 				tmp->rq_next = rqp->rq_next;
    994 				break;
    995 			}
    996 		if (tmp == NULL)
    997 			printf("red_detach: no state found in red_list!\n");
    998 	}
    999 
   1000 	red_destroy(rqp->rq_red);
   1001 	free(rqp->rq_q, M_DEVBUF);
   1002 	free(rqp, M_DEVBUF);
   1003 	return (error);
   1004 }
   1005 
   1006 /*
   1007  * enqueue routine:
   1008  *
   1009  *	returns: 0 when successfully queued.
   1010  *		 ENOBUFS when drop occurs.
   1011  */
   1012 static int
   1013 red_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr)
   1014 {
   1015 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
   1016 
   1017 	if (red_addq(rqp->rq_red, rqp->rq_q, m, pktattr) < 0)
   1018 		return ENOBUFS;
   1019 	ifq->ifq_len++;
   1020 	return 0;
   1021 }
   1022 
   1023 /*
   1024  * dequeue routine:
   1025  *	must be called in splnet.
   1026  *
   1027  *	returns: mbuf dequeued.
   1028  *		 NULL when no packet is available in the queue.
   1029  */
   1030 
   1031 static struct mbuf *
   1032 red_dequeue(struct ifaltq *ifq, int op)
   1033 {
   1034 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
   1035 	struct mbuf *m;
   1036 
   1037 	if (op == ALTDQ_POLL)
   1038 		return qhead(rqp->rq_q);
   1039 
   1040 	/* op == ALTDQ_REMOVE */
   1041 	m =  red_getq(rqp->rq_red, rqp->rq_q);
   1042 	if (m != NULL)
   1043 		ifq->ifq_len--;
   1044 	return (m);
   1045 }
   1046 
   1047 static int
   1048 red_request(struct ifaltq *ifq, int req, void *arg)
   1049 {
   1050 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
   1051 
   1052 	switch (req) {
   1053 	case ALTRQ_PURGE:
   1054 		red_purgeq(rqp);
   1055 		break;
   1056 	}
   1057 	return (0);
   1058 }
   1059 
   1060 static void
   1061 red_purgeq(red_queue_t *rqp)
   1062 {
   1063 	_flushq(rqp->rq_q);
   1064 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
   1065 		rqp->rq_ifq->ifq_len = 0;
   1066 }
   1067 
   1068 #ifdef ALTQ_FLOWVALVE
   1069 
   1070 #define	FV_PSHIFT	7	/* weight of average drop rate -- 1/128 */
   1071 #define	FV_PSCALE(x)	((x) << FV_PSHIFT)
   1072 #define	FV_PUNSCALE(x)	((x) >> FV_PSHIFT)
   1073 #define	FV_FSHIFT	5	/* weight of average fraction -- 1/32 */
   1074 #define	FV_FSCALE(x)	((x) << FV_FSHIFT)
   1075 #define	FV_FUNSCALE(x)	((x) >> FV_FSHIFT)
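/*
 * (added note, as read from the code below) fve_p (the average drop
 * rate) appears to be kept scaled by 2^(FP_SHIFT + FV_PSHIFT) and
 * fve_f (the bandwidth fraction) by 2^(FP_SHIFT + FV_FSHIFT):
 * fv_dropbyred() adds Wp = 1/2^FV_PSHIFT to fve_p as (1 << FP_SHIFT),
 * and fv_alloc() scales fv_pthresh by the same factor.
 */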
   1076 
   1077 #define	FV_TIMER	(3 * hz)	/* timer value for garbage collector */
   1078 #define	FV_FLOWLISTSIZE		64	/* how many flows in flowlist */
   1079 
   1080 #define	FV_N			10	/* update fve_f every FV_N packets */
   1081 
   1082 #define	FV_BACKOFFTHRESH	1  /* backoff threshold interval in seconds */
   1083 #define	FV_TTHRESH		3  /* time threshold to delete fve */
   1084 #define	FV_ALPHA		5  /* extra packet count */
   1085 
   1086 #define	FV_STATS
   1087 
   1088 #define	FV_TIMESTAMP(tp)	getmicrotime(tp)
   1089 
   1090 /*
   1091  * Brtt table: 128-entry table (entry 0 unused) to convert drop rate (p) to
   1092  * the corresponding bandwidth fraction (f)
   1093  * the following equation is implemented to use scaled values,
   1094  * fve_p and fve_f, in the fixed point format.
   1095  *
   1096  *   Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p))
   1097  *   f = Brtt(p) / (max_th + alpha)
   1098  */
   1099 #define	BRTT_SIZE	128
   1100 #define	BRTT_SHIFT	12
   1101 #define	BRTT_MASK	0x0007f000
   1102 #define	BRTT_PMAX	(1 << (FV_PSHIFT + FP_SHIFT))
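/*
 * (added note) fv_p2f() below indexes brtt_tab[] with
 *	(p & BRTT_MASK) >> BRTT_SHIFT
 * so, with the fve_p scaling above, entry i roughly corresponds to a
 * drop rate of i/128, and BRTT_PMAX corresponds to a drop rate of 1.
 */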
   1103 
   1104 const int brtt_tab[BRTT_SIZE] = {
   1105 	0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728,
   1106 	392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361,
   1107 	225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333,
   1108 	145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612,
   1109 	98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957,
   1110 	67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440,
   1111 	47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184,
   1112 	33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611,
   1113 	24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062,
   1114 	18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487,
   1115 	14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222,
   1116 	10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844,
   1117 	8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079,
   1118 	6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746,
   1119 	5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722,
   1120 	4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924
   1121 };
   1122 
   1123 static inline struct fve *
   1124 flowlist_lookup(struct flowvalve *fv, struct altq_pktattr *pktattr,
   1125     struct timeval *now)
   1126 {
   1127 	struct fve *fve;
   1128 	int flows;
   1129 	struct ip *ip;
   1130 #ifdef INET6
   1131 	struct ip6_hdr *ip6;
   1132 #endif
   1133 	struct timeval tthresh;
   1134 
   1135 	if (pktattr == NULL)
   1136 		return (NULL);
   1137 
   1138 	tthresh.tv_sec = now->tv_sec - FV_TTHRESH;
   1139 	flows = 0;
   1140 	/*
   1141 	 * search the flow list
   1142 	 */
   1143 	switch (pktattr->pattr_af) {
   1144 	case AF_INET:
   1145 		ip = (struct ip *)pktattr->pattr_hdr;
   1146 		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
   1147 			if (fve->fve_lastdrop.tv_sec == 0)
   1148 				break;
   1149 			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
   1150 				fve->fve_lastdrop.tv_sec = 0;
   1151 				break;
   1152 			}
   1153 			if (fve->fve_flow.flow_af == AF_INET &&
   1154 			    fve->fve_flow.flow_ip.ip_src.s_addr ==
   1155 			    ip->ip_src.s_addr &&
   1156 			    fve->fve_flow.flow_ip.ip_dst.s_addr ==
   1157 			    ip->ip_dst.s_addr)
   1158 				return (fve);
   1159 			flows++;
   1160 		}
   1161 		break;
   1162 #ifdef INET6
   1163 	case AF_INET6:
   1164 		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
   1165 		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
   1166 			if (fve->fve_lastdrop.tv_sec == 0)
   1167 				break;
   1168 			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
   1169 				fve->fve_lastdrop.tv_sec = 0;
   1170 				break;
   1171 			}
   1172 			if (fve->fve_flow.flow_af == AF_INET6 &&
   1173 			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src,
   1174 					       &ip6->ip6_src) &&
   1175 			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst,
   1176 					       &ip6->ip6_dst))
   1177 				return (fve);
   1178 			flows++;
   1179 		}
   1180 		break;
   1181 #endif /* INET6 */
   1182 
   1183 	default:
   1184 		/* unknown protocol.  no drop. */
   1185 		return (NULL);
   1186 	}
   1187 	fv->fv_flows = flows;	/* save the number of active fve's */
   1188 	return (NULL);
   1189 }
   1190 
   1191 static inline struct fve *
   1192 flowlist_reclaim(struct flowvalve *fv, struct altq_pktattr *pktattr)
   1193 {
   1194 	struct fve *fve;
   1195 	struct ip *ip;
   1196 #ifdef INET6
   1197 	struct ip6_hdr *ip6;
   1198 #endif
   1199 
   1200 	/*
   1201 	 * get an entry from the tail of the LRU list.
   1202 	 */
   1203 	fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead);
   1204 
   1205 	switch (pktattr->pattr_af) {
   1206 	case AF_INET:
   1207 		ip = (struct ip *)pktattr->pattr_hdr;
   1208 		fve->fve_flow.flow_af = AF_INET;
   1209 		fve->fve_flow.flow_ip.ip_src = ip->ip_src;
   1210 		fve->fve_flow.flow_ip.ip_dst = ip->ip_dst;
   1211 		break;
   1212 #ifdef INET6
   1213 	case AF_INET6:
   1214 		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
   1215 		fve->fve_flow.flow_af = AF_INET6;
   1216 		fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src;
   1217 		fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst;
   1218 		break;
   1219 #endif
   1220 	}
   1221 
   1222 	fve->fve_state = Green;
   1223 	fve->fve_p = 0.0;
   1224 	fve->fve_f = 0.0;
   1225 	fve->fve_ifseq = fv->fv_ifseq - 1;
   1226 	fve->fve_count = 0;
   1227 
   1228 	fv->fv_flows++;
   1229 #ifdef FV_STATS
   1230 	fv->fv_stats.alloc++;
   1231 #endif
   1232 	return (fve);
   1233 }
   1234 
   1235 static inline void
   1236 flowlist_move_to_head(struct flowvalve *fv, struct fve *fve)
   1237 {
   1238 	if (TAILQ_FIRST(&fv->fv_flowlist) != fve) {
   1239 		TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru);
   1240 		TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru);
   1241 	}
   1242 }
   1243 
   1244 /*
   1245  * allocate flowvalve structure
   1246  */
   1247 static struct flowvalve *
   1248 fv_alloc(struct red *rp)
   1249 {
   1250 	struct flowvalve *fv;
   1251 	struct fve *fve;
   1252 	int i, num;
   1253 
   1254 	num = FV_FLOWLISTSIZE;
   1255 	fv = malloc(sizeof(struct flowvalve), M_DEVBUF, M_WAITOK|M_ZERO);
   1256 	if (fv == NULL)
   1257 		return (NULL);
   1258 
   1259 	fv->fv_fves = malloc(sizeof(struct fve) * num, M_DEVBUF,
   1260 	    M_WAITOK|M_ZERO);
   1261 	if (fv->fv_fves == NULL) {
   1262 		free(fv, M_DEVBUF);
   1263 		return (NULL);
   1264 	}
   1265 
   1266 	fv->fv_flows = 0;
   1267 	TAILQ_INIT(&fv->fv_flowlist);
   1268 	for (i = 0; i < num; i++) {
   1269 		fve = &fv->fv_fves[i];
   1270 		fve->fve_lastdrop.tv_sec = 0;
   1271 		TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru);
   1272 	}
   1273 
   1274 	/* initialize drop rate threshold in scaled fixed-point */
   1275 	fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax;
   1276 
   1277 	/* initialize drop rate to fraction table */
   1278 	fv->fv_p2ftab = malloc(sizeof(int) * BRTT_SIZE, M_DEVBUF, M_WAITOK);
   1279 	if (fv->fv_p2ftab == NULL) {
   1280 		free(fv->fv_fves, M_DEVBUF);
   1281 		free(fv, M_DEVBUF);
   1282 		return (NULL);
   1283 	}
   1284 	/*
   1285 	 * create the p2f table.
   1286 	 * (shift is used to keep the precision)
   1287 	 */
   1288 	for (i = 1; i < BRTT_SIZE; i++) {
   1289 		int f;
   1290 
   1291 		f = brtt_tab[i] << 8;
   1292 		fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8;
   1293 	}
   1294 
   1295 	return (fv);
   1296 }
   1297 
   1298 static void
   1299 fv_destroy(struct flowvalve *fv)
   1300 {
   1301 	free(fv->fv_p2ftab, M_DEVBUF);
   1302 	free(fv->fv_fves, M_DEVBUF);
   1303 	free(fv, M_DEVBUF);
   1304 }
   1305 
   1306 static inline int
   1307 fv_p2f(struct flowvalve *fv, int p)
   1308 {
   1309 	int val, f;
   1310 
   1311 	if (p >= BRTT_PMAX)
   1312 		f = fv->fv_p2ftab[BRTT_SIZE-1];
   1313 	else if ((val = (p & BRTT_MASK)))
   1314 		f = fv->fv_p2ftab[(val >> BRTT_SHIFT)];
   1315 	else
   1316 		f = fv->fv_p2ftab[1];
   1317 	return (f);
   1318 }
   1319 
   1320 /*
   1321  * check if an arriving packet should be pre-dropped.
   1322  * called from red_addq() when a packet arrives.
   1323  * returns 1 when the packet should be pre-dropped.
   1324  * should be called in splnet.
   1325  */
   1326 static int
   1327 fv_checkflow(struct flowvalve *fv, struct altq_pktattr *pktattr,
   1328     struct fve **fcache)
   1329 {
   1330 	struct fve *fve;
   1331 	struct timeval now;
   1332 
   1333 	fv->fv_ifseq++;
   1334 	FV_TIMESTAMP(&now);
   1335 
   1336 	if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
   1337 		/* no matching entry in the flowlist */
   1338 		return (0);
   1339 
   1340 	*fcache = fve;
   1341 
   1342 	/* update fraction f for every FV_N packets */
   1343 	if (++fve->fve_count == FV_N) {
   1344 		/*
   1345 		 * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f
   1346 		 */
   1347 		fve->fve_f =
   1348 			(FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq)
   1349 			+ fve->fve_f - FV_FUNSCALE(fve->fve_f);
   1350 		fve->fve_ifseq = fv->fv_ifseq;
   1351 		fve->fve_count = 0;
   1352 	}
   1353 
   1354 	/*
   1355 	 * overpumping test
   1356 	 */
   1357 	if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) {
   1358 		int fthresh;
   1359 
   1360 		/* calculate a threshold */
   1361 		fthresh = fv_p2f(fv, fve->fve_p);
   1362 		if (fve->fve_f > fthresh)
   1363 			fve->fve_state = Red;
   1364 	}
   1365 
   1366 	if (fve->fve_state == Red) {
   1367 		/*
   1368 		 * backoff test
   1369 		 */
   1370 		if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) {
   1371 			/* no drop for at least FV_BACKOFFTHRESH sec */
   1372 			fve->fve_p = 0;
   1373 			fve->fve_state = Green;
   1374 #ifdef FV_STATS
   1375 			fv->fv_stats.escape++;
   1376 #endif
   1377 		} else {
   1378 			/* block this flow */
   1379 			flowlist_move_to_head(fv, fve);
   1380 			fve->fve_lastdrop = now;
   1381 #ifdef FV_STATS
   1382 			fv->fv_stats.predrop++;
   1383 #endif
   1384 			return (1);
   1385 		}
   1386 	}
   1387 
   1388 	/*
   1389 	 * p = (1 - Wp) * p
   1390 	 */
   1391 	fve->fve_p -= FV_PUNSCALE(fve->fve_p);
   1392 	if (fve->fve_p < 0)
   1393 		fve->fve_p = 0;
   1394 #ifdef FV_STATS
   1395 	fv->fv_stats.pass++;
   1396 #endif
   1397 	return (0);
   1398 }
   1399 
   1400 /*
   1401  * called from red_addq when a packet is dropped by red.
   1402  * should be called in splnet.
   1403  */
   1404 static void
   1405 fv_dropbyred(struct flowvalve *fv, struct altq_pktattr *pktattr,
   1406     struct fve *fcache)
   1407 {
   1408 	struct fve *fve;
   1409 	struct timeval now;
   1410 
   1411 	if (pktattr == NULL)
   1412 		return;
   1413 	FV_TIMESTAMP(&now);
   1414 
   1415 	if (fcache != NULL)
   1416 		/* the fve of this packet is already cached */
   1417 		fve = fcache;
   1418 	else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
   1419 		fve = flowlist_reclaim(fv, pktattr);
   1420 
   1421 	flowlist_move_to_head(fv, fve);
   1422 
   1423 	/*
   1424 	 * update p:  the following line cancels the update
   1425 	 *	      in fv_checkflow() and calculates
   1426 	 *	p = Wp + (1 - Wp) * p
   1427 	 */
   1428 	fve->fve_p = (1 << FP_SHIFT) + fve->fve_p;
   1429 
   1430 	fve->fve_lastdrop = now;
   1431 }
   1432 
   1433 #endif /* ALTQ_FLOWVALVE */
   1434 
   1435 #ifdef KLD_MODULE
   1436 
   1437 static struct altqsw red_sw =
   1438 	{"red", redopen, redclose, redioctl};
   1439 
   1440 ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw);
   1441 MODULE_VERSION(altq_red, 1);
   1442 
   1443 #endif /* KLD_MODULE */
   1444 #endif /* ALTQ3_COMPAT */
   1445 
   1446 #endif /* ALTQ_RED */
   1447