Home | History | Annotate | Line # | Download | only in altq
altq_red.c revision 1.30
      1 /*	$NetBSD: altq_red.c,v 1.30 2016/04/20 08:58:48 knakahara Exp $	*/
      2 /*	$KAME: altq_red.c,v 1.20 2005/04/13 03:44:25 suz Exp $	*/
      3 
      4 /*
      5  * Copyright (C) 1997-2003
      6  *	Sony Computer Science Laboratories Inc.  All rights reserved.
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  * 1. Redistributions of source code must retain the above copyright
     12  *    notice, this list of conditions and the following disclaimer.
     13  * 2. Redistributions in binary form must reproduce the above copyright
     14  *    notice, this list of conditions and the following disclaimer in the
     15  *    documentation and/or other materials provided with the distribution.
     16  *
     17  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
     18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     20  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
     21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     27  * SUCH DAMAGE.
     28  *
     29  */
     30 /*
     31  * Copyright (c) 1990-1994 Regents of the University of California.
     32  * All rights reserved.
     33  *
     34  * Redistribution and use in source and binary forms, with or without
     35  * modification, are permitted provided that the following conditions
     36  * are met:
     37  * 1. Redistributions of source code must retain the above copyright
     38  *    notice, this list of conditions and the following disclaimer.
     39  * 2. Redistributions in binary form must reproduce the above copyright
     40  *    notice, this list of conditions and the following disclaimer in the
     41  *    documentation and/or other materials provided with the distribution.
     42  * 3. All advertising materials mentioning features or use of this software
     43  *    must display the following acknowledgement:
     44  *	This product includes software developed by the Computer Systems
     45  *	Engineering Group at Lawrence Berkeley Laboratory.
     46  * 4. Neither the name of the University nor of the Laboratory may be used
     47  *    to endorse or promote products derived from this software without
     48  *    specific prior written permission.
     49  *
     50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     60  * SUCH DAMAGE.
     61  */
     62 
     63 #include <sys/cdefs.h>
     64 __KERNEL_RCSID(0, "$NetBSD: altq_red.c,v 1.30 2016/04/20 08:58:48 knakahara Exp $");
     65 
     66 #ifdef _KERNEL_OPT
     67 #include "opt_altq.h"
     68 #include "opt_inet.h"
     69 #include "pf.h"
     70 #endif
     71 
     72 #ifdef ALTQ_RED	/* red is enabled by ALTQ_RED option in opt_altq.h */
     73 
     74 #include <sys/param.h>
     75 #include <sys/malloc.h>
     76 #include <sys/mbuf.h>
     77 #include <sys/socket.h>
     78 #include <sys/systm.h>
     79 #include <sys/errno.h>
     80 #include <sys/kauth.h>
     81 #if 1 /* ALTQ3_COMPAT */
     82 #include <sys/sockio.h>
     83 #include <sys/proc.h>
     84 #include <sys/kernel.h>
     85 #ifdef ALTQ_FLOWVALVE
     86 #include <sys/queue.h>
     87 #include <sys/time.h>
     88 #endif
     89 #endif /* ALTQ3_COMPAT */
     90 #include <sys/cprng.h>
     91 
     92 #include <net/if.h>
     93 
     94 #include <netinet/in.h>
     95 #include <netinet/in_systm.h>
     96 #include <netinet/ip.h>
     97 #ifdef INET6
     98 #include <netinet/ip6.h>
     99 #endif
    100 
    101 #if NPF > 0
    102 #include <net/pfvar.h>
    103 #endif
    104 #include <altq/altq.h>
    105 #include <altq/altq_red.h>
    106 #ifdef ALTQ3_COMPAT
    107 #include <altq/altq_conf.h>
    108 #ifdef ALTQ_FLOWVALVE
    109 #include <altq/altq_flowvalve.h>
    110 #endif
    111 #endif
    112 
    113 /*
    114  * ALTQ/RED (Random Early Detection) implementation using 32-bit
    115  * fixed-point calculation.
    116  *
    117  * written by kjc using the ns code as a reference.
    118  * you can learn more about red and ns from Sally's home page at
    119  * http://www-nrg.ee.lbl.gov/floyd/
    120  *
    121  * most of the red parameter values are fixed in this implementation
    122  * to prevent fixed-point overflow/underflow.
    123  * if you change the parameters, watch out for overflow/underflow!
    124  *
    125  * the parameters used are recommended values by Sally.
    126  * the corresponding ns config looks:
    127  *	q_weight=0.00195
    128  *	minthresh=5 maxthresh=15 queue-size=60
    129  *	linterm=30
    130  *	dropmech=drop-tail
    131  *	bytes=false (can't be handled by 32-bit fixed-point)
    132  *	doubleq=false dqthresh=false
    133  *	wait=true
    134  */
    135 /*
    136  * alternative red parameters for a slow link.
    137  *
    138  * assume the queue length becomes from zero to L and keeps L, it takes
    139  * N packets for q_avg to reach 63% of L.
    140  * when q_weight is 0.002, N is about 500 packets.
    141  * for a slow link like dial-up, 500 packets takes more than 1 minute!
    142  * when q_weight is 0.008, N is about 127 packets.
    143  * when q_weight is 0.016, N is about 63 packets.
    144  * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
    145  * are allowed for 0.016.
    146  * see Sally's paper for more details.
    147  */
    148 /* normal red parameters */
    149 #define	W_WEIGHT	512	/* inverse of weight of EWMA (511/512) */
    150 				/* q_weight = 0.00195 */
    151 
    152 /* red parameters for a slow link */
    153 #define	W_WEIGHT_1	128	/* inverse of weight of EWMA (127/128) */
    154 				/* q_weight = 0.0078125 */
    155 
    156 /* red parameters for a very slow link (e.g., dialup) */
    157 #define	W_WEIGHT_2	64	/* inverse of weight of EWMA (63/64) */
    158 				/* q_weight = 0.015625 */
    159 
    160 /* fixed-point uses 12-bit decimal places */
    161 #define	FP_SHIFT	12	/* fixed-point shift */
    162 
    163 /* red parameters for drop probability */
    164 #define	INV_P_MAX	10	/* inverse of max drop probability */
    165 #define	TH_MIN		5	/* min threshold */
    166 #define	TH_MAX		15	/* max threshold */
    167 
    168 #define	RED_LIMIT	60	/* default max queue lenght */
    169 #define	RED_STATS		/* collect statistics */
    170 
    171 /*
    172  * our default policy for forced-drop is drop-tail.
    173  * (in altq-1.1.2 or earlier, the default was random-drop.
    174  * but it makes more sense to punish the cause of the surge.)
    175  * to switch to the random-drop policy, define "RED_RANDOM_DROP".
    176  */
    177 
    178 #ifdef ALTQ3_COMPAT
    179 #ifdef ALTQ_FLOWVALVE
    180 /*
    181  * flow-valve is an extention to protect red from unresponsive flows
    182  * and to promote end-to-end congestion control.
    183  * flow-valve observes the average drop rates of the flows that have
    184  * experienced packet drops in the recent past.
    185  * when the average drop rate exceeds the threshold, the flow is
    186  * blocked by the flow-valve.  the trapped flow should back off
    187  * exponentially to escape from the flow-valve.
    188  */
    189 #ifdef RED_RANDOM_DROP
    190 #error "random-drop can't be used with flow-valve!"
    191 #endif
    192 #endif /* ALTQ_FLOWVALVE */
    193 
    194 /* red_list keeps all red_queue_t's allocated. */
    195 static red_queue_t *red_list = NULL;
    196 
    197 #endif /* ALTQ3_COMPAT */
    198 
    199 /* default red parameter values */
    200 static int default_th_min = TH_MIN;
    201 static int default_th_max = TH_MAX;
    202 static int default_inv_pmax = INV_P_MAX;
    203 
    204 #ifdef ALTQ3_COMPAT
    205 /* internal function prototypes */
    206 static int red_enqueue(struct ifaltq *, struct mbuf *);
    207 static struct mbuf *red_dequeue(struct ifaltq *, int);
    208 static int red_request(struct ifaltq *, int, void *);
    209 static void red_purgeq(red_queue_t *);
    210 static int red_detach(red_queue_t *);
    211 #ifdef ALTQ_FLOWVALVE
    212 static inline struct fve *flowlist_lookup(struct flowvalve *,
    213 			 struct altq_pktattr *, struct timeval *);
    214 static inline struct fve *flowlist_reclaim(struct flowvalve *,
    215 					     struct altq_pktattr *);
    216 static inline void flowlist_move_to_head(struct flowvalve *, struct fve *);
    217 static inline int fv_p2f(struct flowvalve *, int);
    218 static struct flowvalve *fv_alloc(struct red *);
    219 static void fv_destroy(struct flowvalve *);
    220 static int fv_checkflow(struct flowvalve *, struct altq_pktattr *,
    221 			struct fve **);
    222 static void fv_dropbyred(struct flowvalve *fv, struct altq_pktattr *,
    223 			 struct fve *);
    224 #endif
    225 #endif /* ALTQ3_COMPAT */
    226 
    227 /*
    228  * red support routines
    229  */
    230 red_t *
    231 red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags,
    232    int pkttime)
    233 {
    234 	red_t	*rp;
    235 	int	 w, i;
    236 	int	 npkts_per_sec;
    237 
    238 	rp = malloc(sizeof(red_t), M_DEVBUF, M_WAITOK|M_ZERO);
    239 	if (rp == NULL)
    240 		return (NULL);
    241 
    242 	rp->red_avg = 0;
    243 	rp->red_idle = 1;
    244 
    245 	if (weight == 0)
    246 		rp->red_weight = W_WEIGHT;
    247 	else
    248 		rp->red_weight = weight;
    249 	if (inv_pmax == 0)
    250 		rp->red_inv_pmax = default_inv_pmax;
    251 	else
    252 		rp->red_inv_pmax = inv_pmax;
    253 	if (th_min == 0)
    254 		rp->red_thmin = default_th_min;
    255 	else
    256 		rp->red_thmin = th_min;
    257 	if (th_max == 0)
    258 		rp->red_thmax = default_th_max;
    259 	else
    260 		rp->red_thmax = th_max;
    261 
    262 	rp->red_flags = flags;
    263 
    264 	if (pkttime == 0)
    265 		/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
    266 		rp->red_pkttime = 800;
    267 	else
    268 		rp->red_pkttime = pkttime;
    269 
    270 	if (weight == 0) {
    271 		/* when the link is very slow, adjust red parameters */
    272 		npkts_per_sec = 1000000 / rp->red_pkttime;
    273 		if (npkts_per_sec < 50) {
    274 			/* up to about 400Kbps */
    275 			rp->red_weight = W_WEIGHT_2;
    276 		} else if (npkts_per_sec < 300) {
    277 			/* up to about 2.4Mbps */
    278 			rp->red_weight = W_WEIGHT_1;
    279 		}
    280 	}
    281 
    282 	/* calculate wshift.  weight must be power of 2 */
    283 	w = rp->red_weight;
    284 	for (i = 0; w > 1; i++)
    285 		w = w >> 1;
    286 	rp->red_wshift = i;
    287 	w = 1 << rp->red_wshift;
    288 	if (w != rp->red_weight) {
    289 		printf("invalid weight value %d for red! use %d\n",
    290 		       rp->red_weight, w);
    291 		rp->red_weight = w;
    292 	}
    293 
    294 	/*
    295 	 * thmin_s and thmax_s are scaled versions of th_min and th_max
    296 	 * to be compared with avg.
    297 	 */
    298 	rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
    299 	rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
    300 
    301 	/*
    302 	 * precompute probability denominator
    303 	 *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
    304 	 */
    305 	rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
    306 			 * rp->red_inv_pmax) << FP_SHIFT;
    307 
    308 	/* allocate weight table */
    309 	rp->red_wtab = wtab_alloc(rp->red_weight);
    310 
    311 	microtime(&rp->red_last);
    312 #ifdef ALTQ3_COMPAT
    313 #ifdef ALTQ_FLOWVALVE
    314 	if (flags & REDF_FLOWVALVE)
    315 		rp->red_flowvalve = fv_alloc(rp);
    316 	/* if fv_alloc failes, flowvalve is just disabled */
    317 #endif
    318 #endif /* ALTQ3_COMPAT */
    319 	return (rp);
    320 }
    321 
    322 void
    323 red_destroy(red_t *rp)
    324 {
    325 #ifdef ALTQ3_COMPAT
    326 #ifdef ALTQ_FLOWVALVE
    327 	if (rp->red_flowvalve != NULL)
    328 		fv_destroy(rp->red_flowvalve);
    329 #endif
    330 #endif /* ALTQ3_COMPAT */
    331 	wtab_destroy(rp->red_wtab);
    332 	free(rp, M_DEVBUF);
    333 }
    334 
    335 void
    336 red_getstats(red_t *rp, struct redstats *sp)
    337 {
    338 	sp->q_avg		= rp->red_avg >> rp->red_wshift;
    339 	sp->xmit_cnt		= rp->red_stats.xmit_cnt;
    340 	sp->drop_cnt		= rp->red_stats.drop_cnt;
    341 	sp->drop_forced		= rp->red_stats.drop_forced;
    342 	sp->drop_unforced	= rp->red_stats.drop_unforced;
    343 	sp->marked_packets	= rp->red_stats.marked_packets;
    344 }
    345 
    346 int
    347 red_addq(red_t *rp, class_queue_t *q, struct mbuf *m,
    348     struct altq_pktattr *pktattr)
    349 {
    350 	int avg, droptype;
    351 	int n;
    352 #ifdef ALTQ3_COMPAT
    353 #ifdef ALTQ_FLOWVALVE
    354 	struct fve *fve = NULL;
    355 
    356 	if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0)
    357 		if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) {
    358 			m_freem(m);
    359 			return (-1);
    360 		}
    361 #endif
    362 #endif /* ALTQ3_COMPAT */
    363 
    364 	avg = rp->red_avg;
    365 
    366 	/*
    367 	 * if we were idle, we pretend that n packets arrived during
    368 	 * the idle period.
    369 	 */
    370 	if (rp->red_idle) {
    371 		struct timeval now;
    372 		int t;
    373 
    374 		rp->red_idle = 0;
    375 		microtime(&now);
    376 		t = (now.tv_sec - rp->red_last.tv_sec);
    377 		if (t > 60) {
    378 			/*
    379 			 * being idle for more than 1 minute, set avg to zero.
    380 			 * this prevents t from overflow.
    381 			 */
    382 			avg = 0;
    383 		} else {
    384 			t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
    385 			n = t / rp->red_pkttime - 1;
    386 
    387 			/* the following line does (avg = (1 - Wq)^n * avg) */
    388 			if (n > 0)
    389 				avg = (avg >> FP_SHIFT) *
    390 				    pow_w(rp->red_wtab, n);
    391 		}
    392 	}
    393 
    394 	/* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
    395 	avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
    396 	rp->red_avg = avg;		/* save the new value */
    397 
    398 	/*
    399 	 * red_count keeps a tally of arriving traffic that has not
    400 	 * been dropped.
    401 	 */
    402 	rp->red_count++;
    403 
    404 	/* see if we drop early */
    405 	droptype = DTYPE_NODROP;
    406 	if (avg >= rp->red_thmin_s && qlen(q) > 1) {
    407 		if (avg >= rp->red_thmax_s) {
    408 			/* avg >= th_max: forced drop */
    409 			droptype = DTYPE_FORCED;
    410 		} else if (rp->red_old == 0) {
    411 			/* first exceeds th_min */
    412 			rp->red_count = 1;
    413 			rp->red_old = 1;
    414 		} else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
    415 				      rp->red_probd, rp->red_count)) {
    416 			/* mark or drop by red */
    417 			if ((rp->red_flags & REDF_ECN) &&
    418 			    mark_ecn(m, pktattr, rp->red_flags)) {
    419 				/* successfully marked.  do not drop. */
    420 				rp->red_count = 0;
    421 #ifdef RED_STATS
    422 				rp->red_stats.marked_packets++;
    423 #endif
    424 			} else {
    425 				/* unforced drop by red */
    426 				droptype = DTYPE_EARLY;
    427 			}
    428 		}
    429 	} else {
    430 		/* avg < th_min */
    431 		rp->red_old = 0;
    432 	}
    433 
    434 	/*
    435 	 * if the queue length hits the hard limit, it's a forced drop.
    436 	 */
    437 	if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
    438 		droptype = DTYPE_FORCED;
    439 
    440 #ifdef RED_RANDOM_DROP
    441 	/* if successful or forced drop, enqueue this packet. */
    442 	if (droptype != DTYPE_EARLY)
    443 		_addq(q, m);
    444 #else
    445 	/* if successful, enqueue this packet. */
    446 	if (droptype == DTYPE_NODROP)
    447 		_addq(q, m);
    448 #endif
    449 	if (droptype != DTYPE_NODROP) {
    450 		if (droptype == DTYPE_EARLY) {
    451 			/* drop the incoming packet */
    452 #ifdef RED_STATS
    453 			rp->red_stats.drop_unforced++;
    454 #endif
    455 		} else {
    456 			/* forced drop, select a victim packet in the queue. */
    457 #ifdef RED_RANDOM_DROP
    458 			m = _getq_random(q);
    459 #endif
    460 #ifdef RED_STATS
    461 			rp->red_stats.drop_forced++;
    462 #endif
    463 		}
    464 #ifdef RED_STATS
    465 		PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
    466 #endif
    467 		rp->red_count = 0;
    468 #ifdef ALTQ3_COMPAT
    469 #ifdef ALTQ_FLOWVALVE
    470 		if (rp->red_flowvalve != NULL)
    471 			fv_dropbyred(rp->red_flowvalve, pktattr, fve);
    472 #endif
    473 #endif /* ALTQ3_COMPAT */
    474 		m_freem(m);
    475 		return (-1);
    476 	}
    477 	/* successfully queued */
    478 #ifdef RED_STATS
    479 	PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
    480 #endif
    481 	return (0);
    482 }
    483 
    484 /*
    485  * early-drop probability is calculated as follows:
    486  *   prob = p_max * (avg - th_min) / (th_max - th_min)
    487  *   prob_a = prob / (2 - count*prob)
    488  *	    = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
    489  * here prob_a increases as successive undrop count increases.
    490  * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
    491  * becomes 1 when (count >= (2 / prob))).
    492  */
    493 int
    494 drop_early(int fp_len, int fp_probd, int count)
    495 {
    496 	int	d;		/* denominator of drop-probability */
    497 
    498 	d = fp_probd - count * fp_len;
    499 	if (d <= 0)
    500 		/* count exceeds the hard limit: drop or mark */
    501 		return (1);
    502 
    503 	/*
    504 	 * now the range of d is [1..600] in fixed-point. (when
    505 	 * th_max-th_min=10 and p_max=1/30)
    506 	 * drop probability = (avg - TH_MIN) / d
    507 	 */
    508 
    509 	if ((cprng_fast32() % d) < fp_len) {
    510 		/* drop or mark */
    511 		return (1);
    512 	}
    513 	/* no drop/mark */
    514 	return (0);
    515 }
    516 
    517 /*
    518  * try to mark CE bit to the packet.
    519  *    returns 1 if successfully marked, 0 otherwise.
    520  */
    521 int
    522 mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags)
    523 {
    524 	struct mbuf	*m0;
    525 	struct m_tag	*t;
    526 	struct altq_tag	*at;
    527 	void		*hdr;
    528 	int		 af;
    529 
    530 	t = m_tag_find(m, PACKET_TAG_ALTQ_QID, NULL);
    531 	if (t != NULL) {
    532 		at = (struct altq_tag *)(t + 1);
    533 		if (at == NULL)
    534 			return (0);
    535 		af = at->af;
    536 		hdr = at->hdr;
    537 #ifdef ALTQ3_COMPAT
    538 	} else if (pktattr != NULL) {
    539 		af = pktattr->pattr_af;
    540 		hdr = pktattr->pattr_hdr;
    541 #endif /* ALTQ3_COMPAT */
    542 	} else
    543 		return (0);
    544 
    545 	if (af != AF_INET && af != AF_INET6)
    546 		return (0);
    547 
    548 	/* verify that pattr_hdr is within the mbuf data */
    549 	for (m0 = m; m0 != NULL; m0 = m0->m_next)
    550 		if (((char *)hdr >= m0->m_data) &&
    551 		    ((char *)hdr < m0->m_data + m0->m_len))
    552 			break;
    553 	if (m0 == NULL) {
    554 		/* ick, tag info is stale */
    555 		return (0);
    556 	}
    557 
    558 	switch (af) {
    559 	case AF_INET:
    560 		if (flags & REDF_ECN4) {
    561 			struct ip *ip = hdr;
    562 			u_int8_t otos;
    563 			int sum;
    564 
    565 			if (ip->ip_v != 4)
    566 				return (0);	/* version mismatch! */
    567 
    568 			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
    569 				return (0);	/* not-ECT */
    570 			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
    571 				return (1);	/* already marked */
    572 
    573 			/*
    574 			 * ecn-capable but not marked,
    575 			 * mark CE and update checksum
    576 			 */
    577 			otos = ip->ip_tos;
    578 			ip->ip_tos |= IPTOS_ECN_CE;
    579 			/*
    580 			 * update checksum (from RFC1624)
    581 			 *	   HC' = ~(~HC + ~m + m')
    582 			 */
    583 			sum = ~ntohs(ip->ip_sum) & 0xffff;
    584 			sum += (~otos & 0xffff) + ip->ip_tos;
    585 			sum = (sum >> 16) + (sum & 0xffff);
    586 			sum += (sum >> 16);  /* add carry */
    587 			ip->ip_sum = htons(~sum & 0xffff);
    588 			return (1);
    589 		}
    590 		break;
    591 #ifdef INET6
    592 	case AF_INET6:
    593 		if (flags & REDF_ECN6) {
    594 			struct ip6_hdr *ip6 = hdr;
    595 			u_int32_t flowlabel;
    596 
    597 			flowlabel = ntohl(ip6->ip6_flow);
    598 			if ((flowlabel >> 28) != 6)
    599 				return (0);	/* version mismatch! */
    600 			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
    601 			    (IPTOS_ECN_NOTECT << 20))
    602 				return (0);	/* not-ECT */
    603 			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
    604 			    (IPTOS_ECN_CE << 20))
    605 				return (1);	/* already marked */
    606 			/*
    607 			 * ecn-capable but not marked,  mark CE
    608 			 */
    609 			flowlabel |= (IPTOS_ECN_CE << 20);
    610 			ip6->ip6_flow = htonl(flowlabel);
    611 			return (1);
    612 		}
    613 		break;
    614 #endif  /* INET6 */
    615 	}
    616 
    617 	/* not marked */
    618 	return (0);
    619 }
    620 
    621 struct mbuf *
    622 red_getq(red_t *rp, class_queue_t *q)
    623 {
    624 	struct mbuf *m;
    625 
    626 	if ((m = _getq(q)) == NULL) {
    627 		if (rp->red_idle == 0) {
    628 			rp->red_idle = 1;
    629 			microtime(&rp->red_last);
    630 		}
    631 		return NULL;
    632 	}
    633 
    634 	rp->red_idle = 0;
    635 	return (m);
    636 }
    637 
    638 /*
    639  * helper routine to calibrate avg during idle.
    640  * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
    641  * here Wq = 1/weight and the code assumes Wq is close to zero.
    642  *
    643  * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
    644  */
    645 static struct wtab *wtab_list = NULL;	/* pointer to wtab list */
    646 
    647 struct wtab *
    648 wtab_alloc(int weight)
    649 {
    650 	struct wtab	*w;
    651 	int		 i;
    652 
    653 	for (w = wtab_list; w != NULL; w = w->w_next)
    654 		if (w->w_weight == weight) {
    655 			w->w_refcount++;
    656 			return (w);
    657 		}
    658 
    659 	w = malloc(sizeof(struct wtab), M_DEVBUF, M_WAITOK|M_ZERO);
    660 	if (w == NULL)
    661 		panic("wtab_alloc: malloc failed!");
    662 	w->w_weight = weight;
    663 	w->w_refcount = 1;
    664 	w->w_next = wtab_list;
    665 	wtab_list = w;
    666 
    667 	/* initialize the weight table */
    668 	w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
    669 	for (i = 1; i < 32; i++) {
    670 		w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
    671 		if (w->w_tab[i] == 0 && w->w_param_max == 0)
    672 			w->w_param_max = 1 << i;
    673 	}
    674 
    675 	return (w);
    676 }
    677 
    678 int
    679 wtab_destroy(struct wtab *w)
    680 {
    681 	struct wtab	*prev;
    682 
    683 	if (--w->w_refcount > 0)
    684 		return (0);
    685 
    686 	if (wtab_list == w)
    687 		wtab_list = w->w_next;
    688 	else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
    689 		if (prev->w_next == w) {
    690 			prev->w_next = w->w_next;
    691 			break;
    692 		}
    693 
    694 	free(w, M_DEVBUF);
    695 	return (0);
    696 }
    697 
    698 int32_t
    699 pow_w(struct wtab *w, int n)
    700 {
    701 	int	i, bit;
    702 	int32_t	val;
    703 
    704 	if (n >= w->w_param_max)
    705 		return (0);
    706 
    707 	val = 1 << FP_SHIFT;
    708 	if (n <= 0)
    709 		return (val);
    710 
    711 	bit = 1;
    712 	i = 0;
    713 	while (n) {
    714 		if (n & bit) {
    715 			val = (val * w->w_tab[i]) >> FP_SHIFT;
    716 			n &= ~bit;
    717 		}
    718 		i++;
    719 		bit <<=  1;
    720 	}
    721 	return (val);
    722 }
    723 
    724 #ifdef ALTQ3_COMPAT
    725 /*
    726  * red device interface
    727  */
    728 altqdev_decl(red);
    729 
    730 int
    731 redopen(dev_t dev, int flag, int fmt,
    732     struct lwp *l)
    733 {
    734 	/* everything will be done when the queueing scheme is attached. */
    735 	return 0;
    736 }
    737 
    738 int
    739 redclose(dev_t dev, int flag, int fmt,
    740     struct lwp *l)
    741 {
    742 	red_queue_t *rqp;
    743 	int err, error = 0;
    744 
    745 	while ((rqp = red_list) != NULL) {
    746 		/* destroy all */
    747 		err = red_detach(rqp);
    748 		if (err != 0 && error == 0)
    749 			error = err;
    750 	}
    751 
    752 	return error;
    753 }
    754 
    755 int
    756 redioctl(dev_t dev, ioctlcmd_t cmd, void *addr, int flag,
    757     struct lwp *l)
    758 {
    759 	red_queue_t *rqp;
    760 	struct red_interface *ifacep;
    761 	struct ifnet *ifp;
    762 	struct proc *p = l->l_proc;
    763 	int	error = 0;
    764 
    765 	/* check super-user privilege */
    766 	switch (cmd) {
    767 	case RED_GETSTATS:
    768 		break;
    769 	default:
    770 #if (__FreeBSD_version > 400000)
    771 		if ((error = suser(p)) != 0)
    772 #else
    773 		if ((error = kauth_authorize_network(p->p_cred,
    774 		    KAUTH_NETWORK_ALTQ, KAUTH_REQ_NETWORK_ALTQ_RED, NULL,
    775 		    NULL, NULL)) != 0)
    776 #endif
    777 			return (error);
    778 		break;
    779 	}
    780 
    781 	switch (cmd) {
    782 
    783 	case RED_ENABLE:
    784 		ifacep = (struct red_interface *)addr;
    785 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
    786 			error = EBADF;
    787 			break;
    788 		}
    789 		error = altq_enable(rqp->rq_ifq);
    790 		break;
    791 
    792 	case RED_DISABLE:
    793 		ifacep = (struct red_interface *)addr;
    794 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
    795 			error = EBADF;
    796 			break;
    797 		}
    798 		error = altq_disable(rqp->rq_ifq);
    799 		break;
    800 
    801 	case RED_IF_ATTACH:
    802 		ifp = ifunit(((struct red_interface *)addr)->red_ifname);
    803 		if (ifp == NULL) {
    804 			error = ENXIO;
    805 			break;
    806 		}
    807 
    808 		/* allocate and initialize red_queue_t */
    809 		rqp = malloc(sizeof(red_queue_t), M_DEVBUF, M_WAITOK|M_ZERO);
    810 		if (rqp == NULL) {
    811 			error = ENOMEM;
    812 			break;
    813 		}
    814 
    815 		rqp->rq_q = malloc(sizeof(class_queue_t), M_DEVBUF,
    816 		    M_WAITOK|M_ZERO);
    817 		if (rqp->rq_q == NULL) {
    818 			free(rqp, M_DEVBUF);
    819 			error = ENOMEM;
    820 			break;
    821 		}
    822 
    823 		rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0);
    824 		if (rqp->rq_red == NULL) {
    825 			free(rqp->rq_q, M_DEVBUF);
    826 			free(rqp, M_DEVBUF);
    827 			error = ENOMEM;
    828 			break;
    829 		}
    830 
    831 		rqp->rq_ifq = &ifp->if_snd;
    832 		qtail(rqp->rq_q) = NULL;
    833 		qlen(rqp->rq_q) = 0;
    834 		qlimit(rqp->rq_q) = RED_LIMIT;
    835 		qtype(rqp->rq_q) = Q_RED;
    836 
    837 		/*
    838 		 * set RED to this ifnet structure.
    839 		 */
    840 		error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp,
    841 				    red_enqueue, red_dequeue, red_request,
    842 				    NULL, NULL);
    843 		if (error) {
    844 			red_destroy(rqp->rq_red);
    845 			free(rqp->rq_q, M_DEVBUF);
    846 			free(rqp, M_DEVBUF);
    847 			break;
    848 		}
    849 
    850 		/* add this state to the red list */
    851 		rqp->rq_next = red_list;
    852 		red_list = rqp;
    853 		break;
    854 
    855 	case RED_IF_DETACH:
    856 		ifacep = (struct red_interface *)addr;
    857 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
    858 			error = EBADF;
    859 			break;
    860 		}
    861 		error = red_detach(rqp);
    862 		break;
    863 
    864 	case RED_GETSTATS:
    865 		do {
    866 			struct red_stats *q_stats;
    867 			red_t *rp;
    868 
    869 			q_stats = (struct red_stats *)addr;
    870 			if ((rqp = altq_lookup(q_stats->iface.red_ifname,
    871 					     ALTQT_RED)) == NULL) {
    872 				error = EBADF;
    873 				break;
    874 			}
    875 
    876 			q_stats->q_len 	   = qlen(rqp->rq_q);
    877 			q_stats->q_limit   = qlimit(rqp->rq_q);
    878 
    879 			rp = rqp->rq_red;
    880 			q_stats->q_avg 	   = rp->red_avg >> rp->red_wshift;
    881 			q_stats->xmit_cnt  = rp->red_stats.xmit_cnt;
    882 			q_stats->drop_cnt  = rp->red_stats.drop_cnt;
    883 			q_stats->drop_forced   = rp->red_stats.drop_forced;
    884 			q_stats->drop_unforced = rp->red_stats.drop_unforced;
    885 			q_stats->marked_packets = rp->red_stats.marked_packets;
    886 
    887 			q_stats->weight		= rp->red_weight;
    888 			q_stats->inv_pmax	= rp->red_inv_pmax;
    889 			q_stats->th_min		= rp->red_thmin;
    890 			q_stats->th_max		= rp->red_thmax;
    891 
    892 #ifdef ALTQ_FLOWVALVE
    893 			if (rp->red_flowvalve != NULL) {
    894 				struct flowvalve *fv = rp->red_flowvalve;
    895 				q_stats->fv_flows    = fv->fv_flows;
    896 				q_stats->fv_pass     = fv->fv_stats.pass;
    897 				q_stats->fv_predrop  = fv->fv_stats.predrop;
    898 				q_stats->fv_alloc    = fv->fv_stats.alloc;
    899 				q_stats->fv_escape   = fv->fv_stats.escape;
    900 			} else {
    901 #endif /* ALTQ_FLOWVALVE */
    902 				q_stats->fv_flows    = 0;
    903 				q_stats->fv_pass     = 0;
    904 				q_stats->fv_predrop  = 0;
    905 				q_stats->fv_alloc    = 0;
    906 				q_stats->fv_escape   = 0;
    907 #ifdef ALTQ_FLOWVALVE
    908 			}
    909 #endif /* ALTQ_FLOWVALVE */
    910 		} while (/*CONSTCOND*/ 0);
    911 		break;
    912 
    913 	case RED_CONFIG:
    914 		do {
    915 			struct red_conf *fc;
    916 			red_t *new;
    917 			int s, limit;
    918 
    919 			fc = (struct red_conf *)addr;
    920 			if ((rqp = altq_lookup(fc->iface.red_ifname,
    921 					       ALTQT_RED)) == NULL) {
    922 				error = EBADF;
    923 				break;
    924 			}
    925 			new = red_alloc(fc->red_weight,
    926 					fc->red_inv_pmax,
    927 					fc->red_thmin,
    928 					fc->red_thmax,
    929 					fc->red_flags,
    930 					fc->red_pkttime);
    931 			if (new == NULL) {
    932 				error = ENOMEM;
    933 				break;
    934 			}
    935 
    936 			s = splnet();
    937 			red_purgeq(rqp);
    938 			limit = fc->red_limit;
    939 			if (limit < fc->red_thmax)
    940 				limit = fc->red_thmax;
    941 			qlimit(rqp->rq_q) = limit;
    942 			fc->red_limit = limit;	/* write back the new value */
    943 
    944 			red_destroy(rqp->rq_red);
    945 			rqp->rq_red = new;
    946 
    947 			splx(s);
    948 
    949 			/* write back new values */
    950 			fc->red_limit = limit;
    951 			fc->red_inv_pmax = rqp->rq_red->red_inv_pmax;
    952 			fc->red_thmin = rqp->rq_red->red_thmin;
    953 			fc->red_thmax = rqp->rq_red->red_thmax;
    954 
    955 		} while (/*CONSTCOND*/ 0);
    956 		break;
    957 
    958 	case RED_SETDEFAULTS:
    959 		do {
    960 			struct redparams *rp;
    961 
    962 			rp = (struct redparams *)addr;
    963 
    964 			default_th_min = rp->th_min;
    965 			default_th_max = rp->th_max;
    966 			default_inv_pmax = rp->inv_pmax;
    967 		} while (/*CONSTCOND*/ 0);
    968 		break;
    969 
    970 	default:
    971 		error = EINVAL;
    972 		break;
    973 	}
    974 	return error;
    975 }
    976 
    977 static int
    978 red_detach(red_queue_t *rqp)
    979 {
    980 	red_queue_t *tmp;
    981 	int error = 0;
    982 
    983 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
    984 		altq_disable(rqp->rq_ifq);
    985 
    986 	if ((error = altq_detach(rqp->rq_ifq)))
    987 		return (error);
    988 
    989 	if (red_list == rqp)
    990 		red_list = rqp->rq_next;
    991 	else {
    992 		for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next)
    993 			if (tmp->rq_next == rqp) {
    994 				tmp->rq_next = rqp->rq_next;
    995 				break;
    996 			}
    997 		if (tmp == NULL)
    998 			printf("red_detach: no state found in red_list!\n");
    999 	}
   1000 
   1001 	red_destroy(rqp->rq_red);
   1002 	free(rqp->rq_q, M_DEVBUF);
   1003 	free(rqp, M_DEVBUF);
   1004 	return (error);
   1005 }
   1006 
   1007 /*
   1008  * enqueue routine:
   1009  *
   1010  *	returns: 0 when successfully queued.
   1011  *		 ENOBUFS when drop occurs.
   1012  */
   1013 static int
   1014 red_enqueue(struct ifaltq *ifq, struct mbuf *m)
   1015 {
   1016 	struct altq_pktattr pktattr;
   1017 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
   1018 
   1019 	pktattr.pattr_class = m->m_pkthdr.pattr_class;
   1020 	pktattr.pattr_af = m->m_pkthdr.pattr_af;
   1021 	pktattr.pattr_hdr = m->m_pkthdr.pattr_hdr;
   1022 
   1023 	if (red_addq(rqp->rq_red, rqp->rq_q, m, &pktattr) < 0)
   1024 		return ENOBUFS;
   1025 	ifq->ifq_len++;
   1026 	return 0;
   1027 }
   1028 
   1029 /*
   1030  * dequeue routine:
   1031  *	must be called in splnet.
   1032  *
   1033  *	returns: mbuf dequeued.
   1034  *		 NULL when no packet is available in the queue.
   1035  */
   1036 
   1037 static struct mbuf *
   1038 red_dequeue(struct ifaltq *ifq, int op)
   1039 {
   1040 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
   1041 	struct mbuf *m;
   1042 
   1043 	if (op == ALTDQ_POLL)
   1044 		return qhead(rqp->rq_q);
   1045 
   1046 	/* op == ALTDQ_REMOVE */
   1047 	m =  red_getq(rqp->rq_red, rqp->rq_q);
   1048 	if (m != NULL)
   1049 		ifq->ifq_len--;
   1050 	return (m);
   1051 }
   1052 
   1053 static int
   1054 red_request(struct ifaltq *ifq, int req, void *arg)
   1055 {
   1056 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
   1057 
   1058 	switch (req) {
   1059 	case ALTRQ_PURGE:
   1060 		red_purgeq(rqp);
   1061 		break;
   1062 	}
   1063 	return (0);
   1064 }
   1065 
   1066 static void
   1067 red_purgeq(red_queue_t *rqp)
   1068 {
   1069 	_flushq(rqp->rq_q);
   1070 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
   1071 		rqp->rq_ifq->ifq_len = 0;
   1072 }
   1073 
   1074 #ifdef ALTQ_FLOWVALVE
   1075 
   1076 #define	FV_PSHIFT	7	/* weight of average drop rate -- 1/128 */
   1077 #define	FV_PSCALE(x)	((x) << FV_PSHIFT)
   1078 #define	FV_PUNSCALE(x)	((x) >> FV_PSHIFT)
   1079 #define	FV_FSHIFT	5	/* weight of average fraction -- 1/32 */
   1080 #define	FV_FSCALE(x)	((x) << FV_FSHIFT)
   1081 #define	FV_FUNSCALE(x)	((x) >> FV_FSHIFT)
   1082 
   1083 #define	FV_TIMER	(3 * hz)	/* timer value for garbage collector */
   1084 #define	FV_FLOWLISTSIZE		64	/* how many flows in flowlist */
   1085 
   1086 #define	FV_N			10	/* update fve_f every FV_N packets */
   1087 
   1088 #define	FV_BACKOFFTHRESH	1  /* backoff threshold interval in second */
   1089 #define	FV_TTHRESH		3  /* time threshold to delete fve */
   1090 #define	FV_ALPHA		5  /* extra packet count */
   1091 
   1092 #define	FV_STATS
   1093 
   1094 #define	FV_TIMESTAMP(tp)	getmicrotime(tp)
   1095 
   1096 /*
   1097  * Brtt table: 127 entry table to convert drop rate (p) to
   1098  * the corresponding bandwidth fraction (f)
   1099  * the following equation is implemented to use scaled values,
   1100  * fve_p and fve_f, in the fixed point format.
   1101  *
   1102  *   Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p))
   1103  *   f = Brtt(p) / (max_th + alpha)
   1104  */
   1105 #define	BRTT_SIZE	128
   1106 #define	BRTT_SHIFT	12
   1107 #define	BRTT_MASK	0x0007f000
   1108 #define	BRTT_PMAX	(1 << (FV_PSHIFT + FP_SHIFT))
   1109 
   1110 const int brtt_tab[BRTT_SIZE] = {
   1111 	0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728,
   1112 	392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361,
   1113 	225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333,
   1114 	145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612,
   1115 	98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957,
   1116 	67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440,
   1117 	47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184,
   1118 	33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611,
   1119 	24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062,
   1120 	18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487,
   1121 	14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222,
   1122 	10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844,
   1123 	8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079,
   1124 	6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746,
   1125 	5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722,
   1126 	4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924
   1127 };
   1128 
   1129 static inline struct fve *
   1130 flowlist_lookup(struct flowvalve *fv, struct altq_pktattr *pktattr,
   1131     struct timeval *now)
   1132 {
   1133 	struct fve *fve;
   1134 	int flows;
   1135 	struct ip *ip;
   1136 #ifdef INET6
   1137 	struct ip6_hdr *ip6;
   1138 #endif
   1139 	struct timeval tthresh;
   1140 
   1141 	if (pktattr == NULL)
   1142 		return (NULL);
   1143 
   1144 	tthresh.tv_sec = now->tv_sec - FV_TTHRESH;
   1145 	flows = 0;
   1146 	/*
   1147 	 * search the flow list
   1148 	 */
   1149 	switch (pktattr->pattr_af) {
   1150 	case AF_INET:
   1151 		ip = (struct ip *)pktattr->pattr_hdr;
   1152 		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
   1153 			if (fve->fve_lastdrop.tv_sec == 0)
   1154 				break;
   1155 			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
   1156 				fve->fve_lastdrop.tv_sec = 0;
   1157 				break;
   1158 			}
   1159 			if (fve->fve_flow.flow_af == AF_INET &&
   1160 			    fve->fve_flow.flow_ip.ip_src.s_addr ==
   1161 			    ip->ip_src.s_addr &&
   1162 			    fve->fve_flow.flow_ip.ip_dst.s_addr ==
   1163 			    ip->ip_dst.s_addr)
   1164 				return (fve);
   1165 			flows++;
   1166 		}
   1167 		break;
   1168 #ifdef INET6
   1169 	case AF_INET6:
   1170 		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
   1171 		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
   1172 			if (fve->fve_lastdrop.tv_sec == 0)
   1173 				break;
   1174 			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
   1175 				fve->fve_lastdrop.tv_sec = 0;
   1176 				break;
   1177 			}
   1178 			if (fve->fve_flow.flow_af == AF_INET6 &&
   1179 			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src,
   1180 					       &ip6->ip6_src) &&
   1181 			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst,
   1182 					       &ip6->ip6_dst))
   1183 				return (fve);
   1184 			flows++;
   1185 		}
   1186 		break;
   1187 #endif /* INET6 */
   1188 
   1189 	default:
   1190 		/* unknown protocol.  no drop. */
   1191 		return (NULL);
   1192 	}
   1193 	fv->fv_flows = flows;	/* save the number of active fve's */
   1194 	return (NULL);
   1195 }
   1196 
   1197 static inline struct fve *
   1198 flowlist_reclaim(struct flowvalve *fv, struct altq_pktattr *pktattr)
   1199 {
   1200 	struct fve *fve;
   1201 	struct ip *ip;
   1202 #ifdef INET6
   1203 	struct ip6_hdr *ip6;
   1204 #endif
   1205 
   1206 	/*
   1207 	 * get an entry from the tail of the LRU list.
   1208 	 */
   1209 	fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead);
   1210 
   1211 	switch (pktattr->pattr_af) {
   1212 	case AF_INET:
   1213 		ip = (struct ip *)pktattr->pattr_hdr;
   1214 		fve->fve_flow.flow_af = AF_INET;
   1215 		fve->fve_flow.flow_ip.ip_src = ip->ip_src;
   1216 		fve->fve_flow.flow_ip.ip_dst = ip->ip_dst;
   1217 		break;
   1218 #ifdef INET6
   1219 	case AF_INET6:
   1220 		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
   1221 		fve->fve_flow.flow_af = AF_INET6;
   1222 		fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src;
   1223 		fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst;
   1224 		break;
   1225 #endif
   1226 	}
   1227 
   1228 	fve->fve_state = Green;
   1229 	fve->fve_p = 0.0;
   1230 	fve->fve_f = 0.0;
   1231 	fve->fve_ifseq = fv->fv_ifseq - 1;
   1232 	fve->fve_count = 0;
   1233 
   1234 	fv->fv_flows++;
   1235 #ifdef FV_STATS
   1236 	fv->fv_stats.alloc++;
   1237 #endif
   1238 	return (fve);
   1239 }
   1240 
   1241 static inline void
   1242 flowlist_move_to_head(struct flowvalve *fv, struct fve *fve)
   1243 {
   1244 	if (TAILQ_FIRST(&fv->fv_flowlist) != fve) {
   1245 		TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru);
   1246 		TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru);
   1247 	}
   1248 }
   1249 
   1250 /*
   1251  * allocate flowvalve structure
   1252  */
   1253 static struct flowvalve *
   1254 fv_alloc(struct red *rp)
   1255 {
   1256 	struct flowvalve *fv;
   1257 	struct fve *fve;
   1258 	int i, num;
   1259 
   1260 	num = FV_FLOWLISTSIZE;
   1261 	fv = malloc(sizeof(struct flowvalve), M_DEVBUF, M_WAITOK|M_ZERO);
   1262 	if (fv == NULL)
   1263 		return (NULL);
   1264 
   1265 	fv->fv_fves = malloc(sizeof(struct fve) * num, M_DEVBUF,
   1266 	    M_WAITOK|M_ZERO);
   1267 	if (fv->fv_fves == NULL) {
   1268 		free(fv, M_DEVBUF);
   1269 		return (NULL);
   1270 	}
   1271 
   1272 	fv->fv_flows = 0;
   1273 	TAILQ_INIT(&fv->fv_flowlist);
   1274 	for (i = 0; i < num; i++) {
   1275 		fve = &fv->fv_fves[i];
   1276 		fve->fve_lastdrop.tv_sec = 0;
   1277 		TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru);
   1278 	}
   1279 
   1280 	/* initialize drop rate threshold in scaled fixed-point */
   1281 	fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax;
   1282 
   1283 	/* initialize drop rate to fraction table */
   1284 	fv->fv_p2ftab = malloc(sizeof(int) * BRTT_SIZE, M_DEVBUF, M_WAITOK);
   1285 	if (fv->fv_p2ftab == NULL) {
   1286 		free(fv->fv_fves, M_DEVBUF);
   1287 		free(fv, M_DEVBUF);
   1288 		return (NULL);
   1289 	}
   1290 	/*
   1291 	 * create the p2f table.
   1292 	 * (shift is used to keep the precision)
   1293 	 */
   1294 	for (i = 1; i < BRTT_SIZE; i++) {
   1295 		int f;
   1296 
   1297 		f = brtt_tab[i] << 8;
   1298 		fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8;
   1299 	}
   1300 
   1301 	return (fv);
   1302 }
   1303 
   1304 static void
   1305 fv_destroy(struct flowvalve *fv)
   1306 {
   1307 	free(fv->fv_p2ftab, M_DEVBUF);
   1308 	free(fv->fv_fves, M_DEVBUF);
   1309 	free(fv, M_DEVBUF);
   1310 }
   1311 
   1312 static inline int
   1313 fv_p2f(struct flowvalve *fv, int p)
   1314 {
   1315 	int val, f;
   1316 
   1317 	if (p >= BRTT_PMAX)
   1318 		f = fv->fv_p2ftab[BRTT_SIZE-1];
   1319 	else if ((val = (p & BRTT_MASK)))
   1320 		f = fv->fv_p2ftab[(val >> BRTT_SHIFT)];
   1321 	else
   1322 		f = fv->fv_p2ftab[1];
   1323 	return (f);
   1324 }
   1325 
   1326 /*
   1327  * check if an arriving packet should be pre-dropped.
   1328  * called from red_addq() when a packet arrives.
   1329  * returns 1 when the packet should be pre-dropped.
   1330  * should be called in splnet.
   1331  */
   1332 static int
   1333 fv_checkflow(struct flowvalve *fv, struct altq_pktattr *pktattr,
   1334     struct fve **fcache)
   1335 {
   1336 	struct fve *fve;
   1337 	struct timeval now;
   1338 
   1339 	fv->fv_ifseq++;
   1340 	FV_TIMESTAMP(&now);
   1341 
   1342 	if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
   1343 		/* no matching entry in the flowlist */
   1344 		return (0);
   1345 
   1346 	*fcache = fve;
   1347 
   1348 	/* update fraction f for every FV_N packets */
   1349 	if (++fve->fve_count == FV_N) {
   1350 		/*
   1351 		 * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f
   1352 		 */
   1353 		fve->fve_f =
   1354 			(FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq)
   1355 			+ fve->fve_f - FV_FUNSCALE(fve->fve_f);
   1356 		fve->fve_ifseq = fv->fv_ifseq;
   1357 		fve->fve_count = 0;
   1358 	}
   1359 
   1360 	/*
   1361 	 * overpumping test
   1362 	 */
   1363 	if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) {
   1364 		int fthresh;
   1365 
   1366 		/* calculate a threshold */
   1367 		fthresh = fv_p2f(fv, fve->fve_p);
   1368 		if (fve->fve_f > fthresh)
   1369 			fve->fve_state = Red;
   1370 	}
   1371 
   1372 	if (fve->fve_state == Red) {
   1373 		/*
   1374 		 * backoff test
   1375 		 */
   1376 		if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) {
   1377 			/* no drop for at least FV_BACKOFFTHRESH sec */
   1378 			fve->fve_p = 0;
   1379 			fve->fve_state = Green;
   1380 #ifdef FV_STATS
   1381 			fv->fv_stats.escape++;
   1382 #endif
   1383 		} else {
   1384 			/* block this flow */
   1385 			flowlist_move_to_head(fv, fve);
   1386 			fve->fve_lastdrop = now;
   1387 #ifdef FV_STATS
   1388 			fv->fv_stats.predrop++;
   1389 #endif
   1390 			return (1);
   1391 		}
   1392 	}
   1393 
   1394 	/*
   1395 	 * p = (1 - Wp) * p
   1396 	 */
   1397 	fve->fve_p -= FV_PUNSCALE(fve->fve_p);
   1398 	if (fve->fve_p < 0)
   1399 		fve->fve_p = 0;
   1400 #ifdef FV_STATS
   1401 	fv->fv_stats.pass++;
   1402 #endif
   1403 	return (0);
   1404 }
   1405 
   1406 /*
   1407  * called from red_addq when a packet is dropped by red.
   1408  * should be called in splnet.
   1409  */
   1410 static void
   1411 fv_dropbyred(struct flowvalve *fv, struct altq_pktattr *pktattr,
   1412     struct fve *fcache)
   1413 {
   1414 	struct fve *fve;
   1415 	struct timeval now;
   1416 
   1417 	if (pktattr == NULL)
   1418 		return;
   1419 	FV_TIMESTAMP(&now);
   1420 
   1421 	if (fcache != NULL)
   1422 		/* the fve of this packet is already cached */
   1423 		fve = fcache;
   1424 	else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
   1425 		fve = flowlist_reclaim(fv, pktattr);
   1426 
   1427 	flowlist_move_to_head(fv, fve);
   1428 
   1429 	/*
   1430 	 * update p:  the following line cancels the update
   1431 	 *	      in fv_checkflow() and calculate
   1432 	 *	p = Wp + (1 - Wp) * p
   1433 	 */
   1434 	fve->fve_p = (1 << FP_SHIFT) + fve->fve_p;
   1435 
   1436 	fve->fve_lastdrop = now;
   1437 }
   1438 
   1439 #endif /* ALTQ_FLOWVALVE */
   1440 
   1441 #ifdef KLD_MODULE
   1442 
   1443 static struct altqsw red_sw =
   1444 	{"red", redopen, redclose, redioctl};
   1445 
   1446 ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw);
   1447 MODULE_VERSION(altq_red, 1);
   1448 
   1449 #endif /* KLD_MODULE */
   1450 #endif /* ALTQ3_COMPAT */
   1451 
   1452 #endif /* ALTQ_RED */
   1453