Home | History | Annotate | Line # | Download | only in altq
altq_red.c revision 1.13.12.2
      1 /*	$NetBSD: altq_red.c,v 1.13.12.2 2006/03/18 18:18:53 peter Exp $	*/
      2 /*	$KAME: altq_red.c,v 1.20 2005/04/13 03:44:25 suz Exp $	*/
      3 
      4 /*
      5  * Copyright (C) 1997-2003
      6  *	Sony Computer Science Laboratories Inc.  All rights reserved.
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  * 1. Redistributions of source code must retain the above copyright
     12  *    notice, this list of conditions and the following disclaimer.
     13  * 2. Redistributions in binary form must reproduce the above copyright
     14  *    notice, this list of conditions and the following disclaimer in the
     15  *    documentation and/or other materials provided with the distribution.
     16  *
     17  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
     18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     20  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
     21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     27  * SUCH DAMAGE.
     28  *
     29  */
     30 /*
     31  * Copyright (c) 1990-1994 Regents of the University of California.
     32  * All rights reserved.
     33  *
     34  * Redistribution and use in source and binary forms, with or without
     35  * modification, are permitted provided that the following conditions
     36  * are met:
     37  * 1. Redistributions of source code must retain the above copyright
     38  *    notice, this list of conditions and the following disclaimer.
     39  * 2. Redistributions in binary form must reproduce the above copyright
     40  *    notice, this list of conditions and the following disclaimer in the
     41  *    documentation and/or other materials provided with the distribution.
     42  * 3. All advertising materials mentioning features or use of this software
     43  *    must display the following acknowledgement:
     44  *	This product includes software developed by the Computer Systems
     45  *	Engineering Group at Lawrence Berkeley Laboratory.
     46  * 4. Neither the name of the University nor of the Laboratory may be used
     47  *    to endorse or promote products derived from this software without
     48  *    specific prior written permission.
     49  *
     50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     60  * SUCH DAMAGE.
     61  */
     62 
     63 #include <sys/cdefs.h>
     64 __KERNEL_RCSID(0, "$NetBSD: altq_red.c,v 1.13.12.2 2006/03/18 18:18:53 peter Exp $");
     65 
     66 #ifdef _KERNEL_OPT
     67 #include "opt_altq.h"
     68 #include "opt_inet.h"
     69 #endif
     70 
     71 #ifdef ALTQ_RED	/* red is enabled by ALTQ_RED option in opt_altq.h */
     72 
     73 #include <sys/param.h>
     74 #include <sys/malloc.h>
     75 #include <sys/mbuf.h>
     76 #include <sys/socket.h>
     77 #include <sys/systm.h>
     78 #include <sys/errno.h>
     79 #if 1 /* ALTQ3_COMPAT */
     80 #include <sys/sockio.h>
     81 #include <sys/proc.h>
     82 #include <sys/kernel.h>
     83 #ifdef ALTQ_FLOWVALVE
     84 #include <sys/queue.h>
     85 #include <sys/time.h>
     86 #endif
     87 #endif /* ALTQ3_COMPAT */
     88 
     89 #include <net/if.h>
     90 
     91 #include <netinet/in.h>
     92 #include <netinet/in_systm.h>
     93 #include <netinet/ip.h>
     94 #ifdef INET6
     95 #include <netinet/ip6.h>
     96 #endif
     97 
     98 #include <net/pfvar.h>
     99 #include <altq/altq.h>
    100 #include <altq/altq_red.h>
    101 #ifdef ALTQ3_COMPAT
    102 #include <altq/altq_conf.h>
    103 #ifdef ALTQ_FLOWVALVE
    104 #include <altq/altq_flowvalve.h>
    105 #endif
    106 #endif
    107 
    108 /*
    109  * ALTQ/RED (Random Early Detection) implementation using 32-bit
    110  * fixed-point calculation.
    111  *
    112  * written by kjc using the ns code as a reference.
    113  * you can learn more about red and ns from Sally's home page at
    114  * http://www-nrg.ee.lbl.gov/floyd/
    115  *
    116  * most of the red parameter values are fixed in this implementation
    117  * to prevent fixed-point overflow/underflow.
    118  * if you change the parameters, watch out for overflow/underflow!
    119  *
    120  * the parameters used are recommended values by Sally.
    121  * the corresponding ns config looks:
    122  *	q_weight=0.00195
    123  *	minthresh=5 maxthresh=15 queue-size=60
    124  *	linterm=30
    125  *	dropmech=drop-tail
    126  *	bytes=false (can't be handled by 32-bit fixed-point)
    127  *	doubleq=false dqthresh=false
    128  *	wait=true
    129  */
    130 /*
    131  * alternative red parameters for a slow link.
    132  *
 * assume the queue length goes from zero to L and stays at L; it takes
    134  * N packets for q_avg to reach 63% of L.
    135  * when q_weight is 0.002, N is about 500 packets.
    136  * for a slow link like dial-up, 500 packets takes more than 1 minute!
    137  * when q_weight is 0.008, N is about 127 packets.
    138  * when q_weight is 0.016, N is about 63 packets.
    139  * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
    140  * are allowed for 0.016.
    141  * see Sally's paper for more details.
    142  */
    143 /* normal red parameters */
    144 #define	W_WEIGHT	512	/* inverse of weight of EWMA (511/512) */
    145 				/* q_weight = 0.00195 */
    146 
    147 /* red parameters for a slow link */
    148 #define	W_WEIGHT_1	128	/* inverse of weight of EWMA (127/128) */
    149 				/* q_weight = 0.0078125 */
    150 
    151 /* red parameters for a very slow link (e.g., dialup) */
    152 #define	W_WEIGHT_2	64	/* inverse of weight of EWMA (63/64) */
    153 				/* q_weight = 0.015625 */
    154 
    155 /* fixed-point uses 12-bit decimal places */
    156 #define	FP_SHIFT	12	/* fixed-point shift */
    157 
    158 /* red parameters for drop probability */
    159 #define	INV_P_MAX	10	/* inverse of max drop probability */
    160 #define	TH_MIN		5	/* min threshold */
    161 #define	TH_MAX		15	/* max threshold */
    162 
#define	RED_LIMIT	60	/* default max queue length */
    164 #define	RED_STATS		/* collect statistics */
    165 
    166 /*
    167  * our default policy for forced-drop is drop-tail.
    168  * (in altq-1.1.2 or earlier, the default was random-drop.
    169  * but it makes more sense to punish the cause of the surge.)
    170  * to switch to the random-drop policy, define "RED_RANDOM_DROP".
    171  */
    172 
    173 #ifdef ALTQ3_COMPAT
    174 #ifdef ALTQ_FLOWVALVE
    175 /*
 * flow-valve is an extension to protect red from unresponsive flows
    177  * and to promote end-to-end congestion control.
    178  * flow-valve observes the average drop rates of the flows that have
    179  * experienced packet drops in the recent past.
    180  * when the average drop rate exceeds the threshold, the flow is
    181  * blocked by the flow-valve.  the trapped flow should back off
    182  * exponentially to escape from the flow-valve.
    183  */
    184 #ifdef RED_RANDOM_DROP
    185 #error "random-drop can't be used with flow-valve!"
    186 #endif
    187 #endif /* ALTQ_FLOWVALVE */
    188 
    189 /* red_list keeps all red_queue_t's allocated. */
    190 static red_queue_t *red_list = NULL;
    191 
    192 #endif /* ALTQ3_COMPAT */
    193 
    194 /* default red parameter values */
    195 static int default_th_min = TH_MIN;
    196 static int default_th_max = TH_MAX;
    197 static int default_inv_pmax = INV_P_MAX;
    198 
    199 #ifdef ALTQ3_COMPAT
    200 /* internal function prototypes */
    201 static int red_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *);
    202 static struct mbuf *red_dequeue(struct ifaltq *, int);
    203 static int red_request(struct ifaltq *, int, void *);
    204 static void red_purgeq(red_queue_t *);
    205 static int red_detach(red_queue_t *);
    206 #ifdef ALTQ_FLOWVALVE
    207 static inline struct fve *flowlist_lookup(struct flowvalve *,
    208 			 struct altq_pktattr *, struct timeval *);
    209 static inline struct fve *flowlist_reclaim(struct flowvalve *,
    210 					     struct altq_pktattr *);
    211 static inline void flowlist_move_to_head(struct flowvalve *, struct fve *);
    212 static inline int fv_p2f(struct flowvalve *, int);
    213 static struct flowvalve *fv_alloc(struct red *);
    214 static void fv_destroy(struct flowvalve *);
    215 static int fv_checkflow(struct flowvalve *, struct altq_pktattr *,
    216 			struct fve **);
    217 static void fv_dropbyred(struct flowvalve *fv, struct altq_pktattr *,
    218 			 struct fve *);
    219 #endif
    220 #endif /* ALTQ3_COMPAT */
    221 
    222 /*
    223  * red support routines
    224  */
    225 red_t *
    226 red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags,
    227    int pkttime)
    228 {
    229 	red_t	*rp;
    230 	int	 w, i;
    231 	int	 npkts_per_sec;
    232 
    233 	MALLOC(rp, red_t *, sizeof(red_t), M_DEVBUF, M_WAITOK);
    234 	if (rp == NULL)
    235 		return (NULL);
    236 	(void)memset(rp, 0, sizeof(red_t));
    237 
    238 	rp->red_avg = 0;
    239 	rp->red_idle = 1;
    240 
    241 	if (weight == 0)
    242 		rp->red_weight = W_WEIGHT;
    243 	else
    244 		rp->red_weight = weight;
    245 	if (inv_pmax == 0)
    246 		rp->red_inv_pmax = default_inv_pmax;
    247 	else
    248 		rp->red_inv_pmax = inv_pmax;
    249 	if (th_min == 0)
    250 		rp->red_thmin = default_th_min;
    251 	else
    252 		rp->red_thmin = th_min;
    253 	if (th_max == 0)
    254 		rp->red_thmax = default_th_max;
    255 	else
    256 		rp->red_thmax = th_max;
    257 
    258 	rp->red_flags = flags;
    259 
    260 	if (pkttime == 0)
    261 		/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
    262 		rp->red_pkttime = 800;
    263 	else
    264 		rp->red_pkttime = pkttime;
    265 
    266 	if (weight == 0) {
    267 		/* when the link is very slow, adjust red parameters */
    268 		npkts_per_sec = 1000000 / rp->red_pkttime;
    269 		if (npkts_per_sec < 50) {
    270 			/* up to about 400Kbps */
    271 			rp->red_weight = W_WEIGHT_2;
    272 		} else if (npkts_per_sec < 300) {
    273 			/* up to about 2.4Mbps */
    274 			rp->red_weight = W_WEIGHT_1;
    275 		}
    276 	}
    277 
    278 	/* calculate wshift.  weight must be power of 2 */
    279 	w = rp->red_weight;
    280 	for (i = 0; w > 1; i++)
    281 		w = w >> 1;
    282 	rp->red_wshift = i;
    283 	w = 1 << rp->red_wshift;
    284 	if (w != rp->red_weight) {
    285 		printf("invalid weight value %d for red! use %d\n",
    286 		       rp->red_weight, w);
    287 		rp->red_weight = w;
    288 	}
    289 
    290 	/*
    291 	 * thmin_s and thmax_s are scaled versions of th_min and th_max
    292 	 * to be compared with avg.
    293 	 */
    294 	rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
    295 	rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
    296 
    297 	/*
    298 	 * precompute probability denominator
    299 	 *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
    300 	 */
    301 	rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
    302 			 * rp->red_inv_pmax) << FP_SHIFT;
    303 
    304 	/* allocate weight table */
    305 	rp->red_wtab = wtab_alloc(rp->red_weight);
    306 
    307 	microtime(&rp->red_last);
    308 #ifdef ALTQ3_COMPAT
    309 #ifdef ALTQ_FLOWVALVE
    310 	if (flags & REDF_FLOWVALVE)
    311 		rp->red_flowvalve = fv_alloc(rp);
    312 	/* if fv_alloc failes, flowvalve is just disabled */
    313 #endif
    314 #endif /* ALTQ3_COMPAT */
    315 	return (rp);
    316 }
    317 
/*
 * red_destroy: release a red state allocated by red_alloc().
 */
void
red_destroy(red_t *rp)
{
#ifdef ALTQ3_COMPAT
#ifdef ALTQ_FLOWVALVE
	/* tear down the flow-valve state attached in red_alloc(), if any */
	if (rp->red_flowvalve != NULL)
		fv_destroy(rp->red_flowvalve);
#endif
#endif /* ALTQ3_COMPAT */
	/* drop our reference on the shared weight table, then free the state */
	wtab_destroy(rp->red_wtab);
	FREE(rp, M_DEVBUF);
}
    330 
    331 void
    332 red_getstats(red_t *rp, struct redstats *sp)
    333 {
    334 	sp->q_avg		= rp->red_avg >> rp->red_wshift;
    335 	sp->xmit_cnt		= rp->red_stats.xmit_cnt;
    336 	sp->drop_cnt		= rp->red_stats.drop_cnt;
    337 	sp->drop_forced		= rp->red_stats.drop_forced;
    338 	sp->drop_unforced	= rp->red_stats.drop_unforced;
    339 	sp->marked_packets	= rp->red_stats.marked_packets;
    340 }
    341 
/*
 * red_addq: run the RED algorithm on an arriving packet.
 *
 * updates the average queue length estimator, decides whether to
 * enqueue, ECN-mark, or drop the packet, and performs the action.
 * returns 0 when the packet was enqueued, -1 when it was dropped
 * (the mbuf is freed in that case).
 */
int
red_addq(red_t *rp, class_queue_t *q, struct mbuf *m,
    struct altq_pktattr *pktattr)
{
	int avg, droptype;
	int n;
#ifdef ALTQ3_COMPAT
#ifdef ALTQ_FLOWVALVE
	struct fve *fve = NULL;

	/* a flow currently trapped by the flow-valve is dropped up front */
	if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0)
		if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) {
			m_freem(m);
			return (-1);
		}
#endif
#endif /* ALTQ3_COMPAT */

	avg = rp->red_avg;

	/*
	 * if we were idle, we pretend that n packets arrived during
	 * the idle period.
	 */
	if (rp->red_idle) {
		struct timeval now;
		int t;

		rp->red_idle = 0;
		microtime(&now);
		t = (now.tv_sec - rp->red_last.tv_sec);
		if (t > 60) {
			/*
			 * being idle for more than 1 minute, set avg to zero.
			 * this prevents t from overflow.
			 */
			avg = 0;
		} else {
			/* idle time in usec, converted to packet slots */
			t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
			n = t / rp->red_pkttime - 1;

			/* the following line does (avg = (1 - Wq)^n * avg) */
			if (n > 0)
				avg = (avg >> FP_SHIFT) *
				    pow_w(rp->red_wtab, n);
		}
	}

	/* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
	avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
	rp->red_avg = avg;		/* save the new value */

	/*
	 * red_count keeps a tally of arriving traffic that has not
	 * been dropped.
	 */
	rp->red_count++;

	/* see if we drop early */
	droptype = DTYPE_NODROP;
	if (avg >= rp->red_thmin_s && qlen(q) > 1) {
		if (avg >= rp->red_thmax_s) {
			/* avg >= th_max: forced drop */
			droptype = DTYPE_FORCED;
		} else if (rp->red_old == 0) {
			/* first exceeds th_min */
			rp->red_count = 1;
			rp->red_old = 1;
		} else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
				      rp->red_probd, rp->red_count)) {
			/* mark or drop by red */
			if ((rp->red_flags & REDF_ECN) &&
			    mark_ecn(m, pktattr, rp->red_flags)) {
				/* successfully marked.  do not drop. */
				rp->red_count = 0;
#ifdef RED_STATS
				rp->red_stats.marked_packets++;
#endif
			} else {
				/* unforced drop by red */
				droptype = DTYPE_EARLY;
			}
		}
	} else {
		/* avg < th_min */
		rp->red_old = 0;
	}

	/*
	 * if the queue length hits the hard limit, it's a forced drop.
	 */
	if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
		droptype = DTYPE_FORCED;

#ifdef RED_RANDOM_DROP
	/* if successful or forced drop, enqueue this packet. */
	if (droptype != DTYPE_EARLY)
		_addq(q, m);
#else
	/* if successful, enqueue this packet. */
	if (droptype == DTYPE_NODROP)
		_addq(q, m);
#endif
	if (droptype != DTYPE_NODROP) {
		if (droptype == DTYPE_EARLY) {
			/* drop the incoming packet */
#ifdef RED_STATS
			rp->red_stats.drop_unforced++;
#endif
		} else {
			/* forced drop, select a victim packet in the queue. */
#ifdef RED_RANDOM_DROP
			m = _getq_random(q);
#endif
#ifdef RED_STATS
			rp->red_stats.drop_forced++;
#endif
		}
#ifdef RED_STATS
		PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
#endif
		/* a drop resets the undropped-arrival tally */
		rp->red_count = 0;
#ifdef ALTQ3_COMPAT
#ifdef ALTQ_FLOWVALVE
		if (rp->red_flowvalve != NULL)
			fv_dropbyred(rp->red_flowvalve, pktattr, fve);
#endif
#endif /* ALTQ3_COMPAT */
		m_freem(m);
		return (-1);
	}
	/* successfully queued */
#ifdef RED_STATS
	PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
#endif
	return (0);
}
    479 
    480 /*
    481  * early-drop probability is calculated as follows:
    482  *   prob = p_max * (avg - th_min) / (th_max - th_min)
    483  *   prob_a = prob / (2 - count*prob)
    484  *	    = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
    485  * here prob_a increases as successive undrop count increases.
    486  * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
    487  * becomes 1 when (count >= (2 / prob))).
    488  */
    489 int
    490 drop_early(int fp_len, int fp_probd, int count)
    491 {
    492 	int	d;		/* denominator of drop-probability */
    493 
    494 	d = fp_probd - count * fp_len;
    495 	if (d <= 0)
    496 		/* count exceeds the hard limit: drop or mark */
    497 		return (1);
    498 
    499 	/*
    500 	 * now the range of d is [1..600] in fixed-point. (when
    501 	 * th_max-th_min=10 and p_max=1/30)
    502 	 * drop probability = (avg - TH_MIN) / d
    503 	 */
    504 
    505 	if ((arc4random() % d) < fp_len) {
    506 		/* drop or mark */
    507 		return (1);
    508 	}
    509 	/* no drop/mark */
    510 	return (0);
    511 }
    512 
/*
 * try to mark CE bit to the packet.
 *    returns 1 if successfully marked, 0 otherwise.
 *
 * the IP header is located either via the pf mbuf tag or, under
 * ALTQ3_COMPAT, via the packet attribute.  marking is only attempted
 * when the corresponding REDF_ECN4/REDF_ECN6 flag is set.
 */
int
mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags)
{
	struct mbuf	*m0;
	struct m_tag	*t;
	struct altq_tag	*at;
	void		*hdr;
	int		 af;

	t = m_tag_find(m, PACKET_TAG_PF_QID, NULL);
	if (t != NULL) {
		/* the altq tag payload follows the m_tag header */
		at = (struct altq_tag *)(t + 1);
		/* NOTE(review): at is t+1 and t != NULL here, so this
		 * check can never fire; it looks like dead code. */
		if (at == NULL)
			return (0);
		af = at->af;
		hdr = at->hdr;
#ifdef ALTQ3_COMPAT
	} else if (pktattr != NULL) {
		af = pktattr->pattr_af;
		hdr = pktattr->pattr_hdr;
#endif /* ALTQ3_COMPAT */
	} else
		return (0);

	if (af != AF_INET && af != AF_INET6)
		return (0);

	/* verify that pattr_hdr is within the mbuf data */
	for (m0 = m; m0 != NULL; m0 = m0->m_next)
		if (((caddr_t)hdr >= m0->m_data) &&
		    ((caddr_t)hdr < m0->m_data + m0->m_len))
			break;
	if (m0 == NULL) {
		/* ick, tag info is stale */
		return (0);
	}

	switch (af) {
	case AF_INET:
		if (flags & REDF_ECN4) {
			struct ip *ip = hdr;
			u_int8_t otos;
			int sum;

			if (ip->ip_v != 4)
				return (0);	/* version mismatch! */

			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
				return (0);	/* not-ECT */
			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
				return (1);	/* already marked */

			/*
			 * ecn-capable but not marked,
			 * mark CE and update checksum
			 */
			otos = ip->ip_tos;
			ip->ip_tos |= IPTOS_ECN_CE;
			/*
			 * update checksum (from RFC1624)
			 *	   HC' = ~(~HC + ~m + m')
			 */
			sum = ~ntohs(ip->ip_sum) & 0xffff;
			sum += (~otos & 0xffff) + ip->ip_tos;
			/* fold the 32-bit sum back into 16 bits */
			sum = (sum >> 16) + (sum & 0xffff);
			sum += (sum >> 16);  /* add carry */
			ip->ip_sum = htons(~sum & 0xffff);
			return (1);
		}
		break;
#ifdef INET6
	case AF_INET6:
		if (flags & REDF_ECN6) {
			struct ip6_hdr *ip6 = hdr;
			u_int32_t flowlabel;

			/* the ECN bits live in the traffic class field,
			 * bits 20-21 of the host-order ip6_flow word */
			flowlabel = ntohl(ip6->ip6_flow);
			if ((flowlabel >> 28) != 6)
				return (0);	/* version mismatch! */
			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
			    (IPTOS_ECN_NOTECT << 20))
				return (0);	/* not-ECT */
			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
			    (IPTOS_ECN_CE << 20))
				return (1);	/* already marked */
			/*
			 * ecn-capable but not marked,  mark CE
			 */
			flowlabel |= (IPTOS_ECN_CE << 20);
			ip6->ip6_flow = htonl(flowlabel);
			return (1);
		}
		break;
#endif  /* INET6 */
	}

	/* not marked */
	return (0);
}
    616 
    617 struct mbuf *
    618 red_getq(rp, q)
    619 	red_t *rp;
    620 	class_queue_t *q;
    621 {
    622 	struct mbuf *m;
    623 
    624 	if ((m = _getq(q)) == NULL) {
    625 		if (rp->red_idle == 0) {
    626 			rp->red_idle = 1;
    627 			microtime(&rp->red_last);
    628 		}
    629 		return NULL;
    630 	}
    631 
    632 	rp->red_idle = 0;
    633 	return (m);
    634 }
    635 
    636 /*
    637  * helper routine to calibrate avg during idle.
    638  * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
    639  * here Wq = 1/weight and the code assumes Wq is close to zero.
    640  *
    641  * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
    642  */
static struct wtab *wtab_list = NULL;	/* pointer to wtab list */

/*
 * wtab_alloc: return a (possibly shared) weight table for "weight".
 * tables are reference-counted and kept on wtab_list so that queues
 * with the same weight share one table.
 */
struct wtab *
wtab_alloc(int weight)
{
	struct wtab	*w;
	int		 i;

	/* reuse an existing table for this weight if one exists */
	for (w = wtab_list; w != NULL; w = w->w_next)
		if (w->w_weight == weight) {
			w->w_refcount++;
			return (w);
		}

	MALLOC(w, struct wtab *, sizeof(struct wtab), M_DEVBUF, M_WAITOK);
	if (w == NULL)
		panic("wtab_alloc: malloc failed!");
	(void)memset(w, 0, sizeof(struct wtab));
	w->w_weight = weight;
	w->w_refcount = 1;
	w->w_next = wtab_list;
	wtab_list = w;

	/* initialize the weight table: w_tab[0] = (1 - 1/weight) fixed-point */
	w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
	for (i = 1; i < 32; i++) {
		/* w_tab[i] = (1 - Wq)^(2^i), computed by repeated squaring */
		w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
		/* once the entry underflows to 0, remember the cutoff
		 * exponent so pow_w() can short-circuit to 0 */
		if (w->w_tab[i] == 0 && w->w_param_max == 0)
			w->w_param_max = 1 << i;
	}

	return (w);
}
    676 
    677 int
    678 wtab_destroy(struct wtab *w)
    679 {
    680 	struct wtab	*prev;
    681 
    682 	if (--w->w_refcount > 0)
    683 		return (0);
    684 
    685 	if (wtab_list == w)
    686 		wtab_list = w->w_next;
    687 	else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
    688 		if (prev->w_next == w) {
    689 			prev->w_next = w->w_next;
    690 			break;
    691 		}
    692 
    693 	FREE(w, M_DEVBUF);
    694 	return (0);
    695 }
    696 
    697 int32_t
    698 pow_w(struct wtab *w, int n)
    699 {
    700 	int	i, bit;
    701 	int32_t	val;
    702 
    703 	if (n >= w->w_param_max)
    704 		return (0);
    705 
    706 	val = 1 << FP_SHIFT;
    707 	if (n <= 0)
    708 		return (val);
    709 
    710 	bit = 1;
    711 	i = 0;
    712 	while (n) {
    713 		if (n & bit) {
    714 			val = (val * w->w_tab[i]) >> FP_SHIFT;
    715 			n &= ~bit;
    716 		}
    717 		i++;
    718 		bit <<=  1;
    719 	}
    720 	return (val);
    721 }
    722 
    723 #ifdef ALTQ3_COMPAT
/*
 * red device interface
 */
altqdev_decl(red);

/*
 * redopen: open the red control device.  nothing to set up here;
 * state is created when a queueing scheme is attached via ioctl.
 */
int
redopen(dev, flag, fmt, l)
	dev_t dev;
	int flag, fmt;
	struct lwp *l;
{
	/* everything will be done when the queueing scheme is attached. */
	return 0;
}
    738 
    739 int
    740 redclose(dev, flag, fmt, l)
    741 	dev_t dev;
    742 	int flag, fmt;
    743 	struct lwp *l;
    744 {
    745 	red_queue_t *rqp;
    746 	int err, error = 0;
    747 
    748 	while ((rqp = red_list) != NULL) {
    749 		/* destroy all */
    750 		err = red_detach(rqp);
    751 		if (err != 0 && error == 0)
    752 			error = err;
    753 	}
    754 
    755 	return error;
    756 }
    757 
/*
 * redioctl: control interface for the red device.
 * handles attach/detach, enable/disable, statistics retrieval,
 * per-interface reconfiguration, and default-parameter updates.
 */
int
redioctl(dev, cmd, addr, flag, l)
	dev_t dev;
	ioctlcmd_t cmd;
	caddr_t addr;
	int flag;
	struct lwp *l;
{
	red_queue_t *rqp;
	struct red_interface *ifacep;
	struct ifnet *ifp;
	struct proc *p = l->l_proc;
	int	error = 0;

	/* check super-user privilege */
	switch (cmd) {
	case RED_GETSTATS:
		/* statistics are readable without privilege */
		break;
	default:
#if (__FreeBSD_version > 400000)
		if ((error = suser(p)) != 0)
#else
		if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
#endif
			return (error);
		break;
	}

	switch (cmd) {

	case RED_ENABLE:
		/* start red on an already-attached interface */
		ifacep = (struct red_interface *)addr;
		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
			error = EBADF;
			break;
		}
		error = altq_enable(rqp->rq_ifq);
		break;

	case RED_DISABLE:
		/* stop red but keep the state attached */
		ifacep = (struct red_interface *)addr;
		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
			error = EBADF;
			break;
		}
		error = altq_disable(rqp->rq_ifq);
		break;

	case RED_IF_ATTACH:
		ifp = ifunit(((struct red_interface *)addr)->red_ifname);
		if (ifp == NULL) {
			error = ENXIO;
			break;
		}

		/* allocate and initialize red_queue_t */
		MALLOC(rqp, red_queue_t *, sizeof(red_queue_t), M_DEVBUF, M_WAITOK);
		if (rqp == NULL) {
			error = ENOMEM;
			break;
		}
		(void)memset(rqp, 0, sizeof(red_queue_t));

		MALLOC(rqp->rq_q, class_queue_t *, sizeof(class_queue_t),
		       M_DEVBUF, M_WAITOK);
		if (rqp->rq_q == NULL) {
			FREE(rqp, M_DEVBUF);
			error = ENOMEM;
			break;
		}
		(void)memset(rqp->rq_q, 0, sizeof(class_queue_t));

		/* all-zero arguments select the built-in defaults */
		rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0);
		if (rqp->rq_red == NULL) {
			FREE(rqp->rq_q, M_DEVBUF);
			FREE(rqp, M_DEVBUF);
			error = ENOMEM;
			break;
		}

		rqp->rq_ifq = &ifp->if_snd;
		qtail(rqp->rq_q) = NULL;
		qlen(rqp->rq_q) = 0;
		qlimit(rqp->rq_q) = RED_LIMIT;
		qtype(rqp->rq_q) = Q_RED;

		/*
		 * set RED to this ifnet structure.
		 */
		error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp,
				    red_enqueue, red_dequeue, red_request,
				    NULL, NULL);
		if (error) {
			/* unwind the allocations made above */
			red_destroy(rqp->rq_red);
			FREE(rqp->rq_q, M_DEVBUF);
			FREE(rqp, M_DEVBUF);
			break;
		}

		/* add this state to the red list */
		rqp->rq_next = red_list;
		red_list = rqp;
		break;

	case RED_IF_DETACH:
		ifacep = (struct red_interface *)addr;
		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
			error = EBADF;
			break;
		}
		error = red_detach(rqp);
		break;

	case RED_GETSTATS:
		do {
			struct red_stats *q_stats;
			red_t *rp;

			q_stats = (struct red_stats *)addr;
			if ((rqp = altq_lookup(q_stats->iface.red_ifname,
					     ALTQT_RED)) == NULL) {
				error = EBADF;
				break;
			}

			q_stats->q_len 	   = qlen(rqp->rq_q);
			q_stats->q_limit   = qlimit(rqp->rq_q);

			rp = rqp->rq_red;
			/* descale the fixed-point average to packets */
			q_stats->q_avg 	   = rp->red_avg >> rp->red_wshift;
			q_stats->xmit_cnt  = rp->red_stats.xmit_cnt;
			q_stats->drop_cnt  = rp->red_stats.drop_cnt;
			q_stats->drop_forced   = rp->red_stats.drop_forced;
			q_stats->drop_unforced = rp->red_stats.drop_unforced;
			q_stats->marked_packets = rp->red_stats.marked_packets;

			q_stats->weight		= rp->red_weight;
			q_stats->inv_pmax	= rp->red_inv_pmax;
			q_stats->th_min		= rp->red_thmin;
			q_stats->th_max		= rp->red_thmax;

#ifdef ALTQ_FLOWVALVE
			if (rp->red_flowvalve != NULL) {
				struct flowvalve *fv = rp->red_flowvalve;
				q_stats->fv_flows    = fv->fv_flows;
				q_stats->fv_pass     = fv->fv_stats.pass;
				q_stats->fv_predrop  = fv->fv_stats.predrop;
				q_stats->fv_alloc    = fv->fv_stats.alloc;
				q_stats->fv_escape   = fv->fv_stats.escape;
			} else {
#endif /* ALTQ_FLOWVALVE */
				q_stats->fv_flows    = 0;
				q_stats->fv_pass     = 0;
				q_stats->fv_predrop  = 0;
				q_stats->fv_alloc    = 0;
				q_stats->fv_escape   = 0;
#ifdef ALTQ_FLOWVALVE
			}
#endif /* ALTQ_FLOWVALVE */
		} while (/*CONSTCOND*/ 0);
		break;

	case RED_CONFIG:
		do {
			struct red_conf *fc;
			red_t *new;
			int s, limit;

			fc = (struct red_conf *)addr;
			if ((rqp = altq_lookup(fc->iface.red_ifname,
					       ALTQT_RED)) == NULL) {
				error = EBADF;
				break;
			}
			/* build the replacement state before touching
			 * the live one, so failure leaves it intact */
			new = red_alloc(fc->red_weight,
					fc->red_inv_pmax,
					fc->red_thmin,
					fc->red_thmax,
					fc->red_flags,
					fc->red_pkttime);
			if (new == NULL) {
				error = ENOMEM;
				break;
			}

			/* swap in the new state at splnet */
			s = splnet();
			red_purgeq(rqp);
			limit = fc->red_limit;
			/* the hard limit must not be below th_max */
			if (limit < fc->red_thmax)
				limit = fc->red_thmax;
			qlimit(rqp->rq_q) = limit;
			fc->red_limit = limit;	/* write back the new value */

			red_destroy(rqp->rq_red);
			rqp->rq_red = new;

			splx(s);

			/* write back new values */
			/* NOTE(review): fc->red_limit was already written
			 * back above; this second store is redundant. */
			fc->red_limit = limit;
			fc->red_inv_pmax = rqp->rq_red->red_inv_pmax;
			fc->red_thmin = rqp->rq_red->red_thmin;
			fc->red_thmax = rqp->rq_red->red_thmax;

		} while (/*CONSTCOND*/ 0);
		break;

	case RED_SETDEFAULTS:
		/* update the defaults used by future red_alloc(0,...) calls */
		do {
			struct redparams *rp;

			rp = (struct redparams *)addr;

			default_th_min = rp->th_min;
			default_th_max = rp->th_max;
			default_inv_pmax = rp->inv_pmax;
		} while (/*CONSTCOND*/ 0);
		break;

	default:
		error = EINVAL;
		break;
	}
	return error;
}
    983 
    984 static int
    985 red_detach(rqp)
    986 	red_queue_t *rqp;
    987 {
    988 	red_queue_t *tmp;
    989 	int error = 0;
    990 
    991 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
    992 		altq_disable(rqp->rq_ifq);
    993 
    994 	if ((error = altq_detach(rqp->rq_ifq)))
    995 		return (error);
    996 
    997 	if (red_list == rqp)
    998 		red_list = rqp->rq_next;
    999 	else {
   1000 		for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next)
   1001 			if (tmp->rq_next == rqp) {
   1002 				tmp->rq_next = rqp->rq_next;
   1003 				break;
   1004 			}
   1005 		if (tmp == NULL)
   1006 			printf("red_detach: no state found in red_list!\n");
   1007 	}
   1008 
   1009 	red_destroy(rqp->rq_red);
   1010 	FREE(rqp->rq_q, M_DEVBUF);
   1011 	FREE(rqp, M_DEVBUF);
   1012 	return (error);
   1013 }
   1014 
   1015 /*
   1016  * enqueue routine:
   1017  *
   1018  *	returns: 0 when successfully queued.
   1019  *		 ENOBUFS when drop occurs.
   1020  */
   1021 static int
   1022 red_enqueue(ifq, m, pktattr)
   1023 	struct ifaltq *ifq;
   1024 	struct mbuf *m;
   1025 	struct altq_pktattr *pktattr;
   1026 {
   1027 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
   1028 
   1029 	if (red_addq(rqp->rq_red, rqp->rq_q, m, pktattr) < 0)
   1030 		return ENOBUFS;
   1031 	ifq->ifq_len++;
   1032 	return 0;
   1033 }
   1034 
   1035 /*
   1036  * dequeue routine:
   1037  *	must be called in splnet.
   1038  *
   1039  *	returns: mbuf dequeued.
   1040  *		 NULL when no packet is available in the queue.
   1041  */
   1042 
   1043 static struct mbuf *
   1044 red_dequeue(ifq, op)
   1045 	struct ifaltq *ifq;
   1046 	int op;
   1047 {
   1048 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
   1049 	struct mbuf *m;
   1050 
   1051 	if (op == ALTDQ_POLL)
   1052 		return qhead(rqp->rq_q);
   1053 
   1054 	/* op == ALTDQ_REMOVE */
   1055 	m =  red_getq(rqp->rq_red, rqp->rq_q);
   1056 	if (m != NULL)
   1057 		ifq->ifq_len--;
   1058 	return (m);
   1059 }
   1060 
   1061 static int
   1062 red_request(ifq, req, arg)
   1063 	struct ifaltq *ifq;
   1064 	int req;
   1065 	void *arg;
   1066 {
   1067 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
   1068 
   1069 	switch (req) {
   1070 	case ALTRQ_PURGE:
   1071 		red_purgeq(rqp);
   1072 		break;
   1073 	}
   1074 	return (0);
   1075 }
   1076 
   1077 static void
   1078 red_purgeq(rqp)
   1079 	red_queue_t *rqp;
   1080 {
   1081 	_flushq(rqp->rq_q);
   1082 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
   1083 		rqp->rq_ifq->ifq_len = 0;
   1084 }
   1085 
   1086 #ifdef ALTQ_FLOWVALVE
   1087 
/*
 * flowvalve tuning parameters.  fve_p (average drop rate) and fve_f
 * (average bandwidth fraction) are kept in scaled fixed-point; the
 * FV_*SCALE/FV_*UNSCALE macros convert to and from the scaled form.
 */
#define	FV_PSHIFT	7	/* weight of average drop rate -- 1/128 */
#define	FV_PSCALE(x)	((x) << FV_PSHIFT)
#define	FV_PUNSCALE(x)	((x) >> FV_PSHIFT)
#define	FV_FSHIFT	5	/* weight of average fraction -- 1/32 */
#define	FV_FSCALE(x)	((x) << FV_FSHIFT)
#define	FV_FUNSCALE(x)	((x) >> FV_FSHIFT)

#define	FV_TIMER	(3 * hz)	/* timer value for garbage collector */
#define	FV_FLOWLISTSIZE		64	/* how many flows in flowlist */

#define	FV_N			10	/* update fve_f every FV_N packets */

#define	FV_BACKOFFTHRESH	1  /* backoff threshold interval in second */
#define	FV_TTHRESH		3  /* time threshold to delete fve */
#define	FV_ALPHA		5  /* extra packet count */

/* define FV_STATS to collect pass/predrop/alloc/escape counters */
#define	FV_STATS

#if (__FreeBSD_version > 300000)
#define	FV_TIMESTAMP(tp)	getmicrotime(tp)
#else
#define	FV_TIMESTAMP(tp)	{ (*(tp)) = time; }
#endif
   1111 
   1112 /*
   1113  * Brtt table: 127 entry table to convert drop rate (p) to
   1114  * the corresponding bandwidth fraction (f)
   1115  * the following equation is implemented to use scaled values,
   1116  * fve_p and fve_f, in the fixed point format.
   1117  *
   1118  *   Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p))
   1119  *   f = Brtt(p) / (max_th + alpha)
   1120  */
   1121 #define	BRTT_SIZE	128
   1122 #define	BRTT_SHIFT	12
   1123 #define	BRTT_MASK	0x0007f000
   1124 #define	BRTT_PMAX	(1 << (FV_PSHIFT + FP_SHIFT))
   1125 
   1126 const int brtt_tab[BRTT_SIZE] = {
   1127 	0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728,
   1128 	392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361,
   1129 	225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333,
   1130 	145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612,
   1131 	98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957,
   1132 	67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440,
   1133 	47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184,
   1134 	33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611,
   1135 	24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062,
   1136 	18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487,
   1137 	14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222,
   1138 	10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844,
   1139 	8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079,
   1140 	6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746,
   1141 	5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722,
   1142 	4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924
   1143 };
   1144 
/*
 * Look up the flow-valve entry matching the packet's flow (IPv4 or
 * IPv6 src/dst address pair).  The flow list is kept in LRU order,
 * so the scan stops at the first unused or expired entry.  As a side
 * effect, expired entries are marked unused and fv->fv_flows is
 * refreshed with the number of active entries seen.
 * Returns the matching fve, or NULL when there is no match (or the
 * protocol is unknown).
 */
static inline struct fve *
flowlist_lookup(fv, pktattr, now)
	struct flowvalve *fv;
	struct altq_pktattr *pktattr;
	struct timeval *now;
{
	struct fve *fve;
	int flows;
	struct ip *ip;
#ifdef INET6
	struct ip6_hdr *ip6;
#endif
	struct timeval tthresh;

	if (pktattr == NULL)
		return (NULL);

	/* entries not updated within FV_TTHRESH seconds are stale */
	tthresh.tv_sec = now->tv_sec - FV_TTHRESH;
	flows = 0;
	/*
	 * search the flow list
	 */
	switch (pktattr->pattr_af) {
	case AF_INET:
		ip = (struct ip *)pktattr->pattr_hdr;
		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
			if (fve->fve_lastdrop.tv_sec == 0)
				/* unused entry; all later entries are unused too (LRU order) */
				break;
			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
				/* stale entry: mark unused and stop the scan */
				fve->fve_lastdrop.tv_sec = 0;
				break;
			}
			if (fve->fve_flow.flow_af == AF_INET &&
			    fve->fve_flow.flow_ip.ip_src.s_addr ==
			    ip->ip_src.s_addr &&
			    fve->fve_flow.flow_ip.ip_dst.s_addr ==
			    ip->ip_dst.s_addr)
				return (fve);
			flows++;
		}
		break;
#ifdef INET6
	case AF_INET6:
		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
			if (fve->fve_lastdrop.tv_sec == 0)
				/* unused entry; all later entries are unused too (LRU order) */
				break;
			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
				/* stale entry: mark unused and stop the scan */
				fve->fve_lastdrop.tv_sec = 0;
				break;
			}
			if (fve->fve_flow.flow_af == AF_INET6 &&
			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src,
					       &ip6->ip6_src) &&
			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst,
					       &ip6->ip6_dst))
				return (fve);
			flows++;
		}
		break;
#endif /* INET6 */

	default:
		/* unknown protocol.  no drop. */
		return (NULL);
	}
	fv->fv_flows = flows;	/* save the number of active fve's */
	return (NULL);
}
   1214 
   1215 static inline struct fve *
   1216 flowlist_reclaim(fv, pktattr)
   1217 	struct flowvalve *fv;
   1218 	struct altq_pktattr *pktattr;
   1219 {
   1220 	struct fve *fve;
   1221 	struct ip *ip;
   1222 #ifdef INET6
   1223 	struct ip6_hdr *ip6;
   1224 #endif
   1225 
   1226 	/*
   1227 	 * get an entry from the tail of the LRU list.
   1228 	 */
   1229 	fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead);
   1230 
   1231 	switch (pktattr->pattr_af) {
   1232 	case AF_INET:
   1233 		ip = (struct ip *)pktattr->pattr_hdr;
   1234 		fve->fve_flow.flow_af = AF_INET;
   1235 		fve->fve_flow.flow_ip.ip_src = ip->ip_src;
   1236 		fve->fve_flow.flow_ip.ip_dst = ip->ip_dst;
   1237 		break;
   1238 #ifdef INET6
   1239 	case AF_INET6:
   1240 		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
   1241 		fve->fve_flow.flow_af = AF_INET6;
   1242 		fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src;
   1243 		fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst;
   1244 		break;
   1245 #endif
   1246 	}
   1247 
   1248 	fve->fve_state = Green;
   1249 	fve->fve_p = 0.0;
   1250 	fve->fve_f = 0.0;
   1251 	fve->fve_ifseq = fv->fv_ifseq - 1;
   1252 	fve->fve_count = 0;
   1253 
   1254 	fv->fv_flows++;
   1255 #ifdef FV_STATS
   1256 	fv->fv_stats.alloc++;
   1257 #endif
   1258 	return (fve);
   1259 }
   1260 
   1261 static inline void
   1262 flowlist_move_to_head(fv, fve)
   1263 	struct flowvalve *fv;
   1264 	struct fve *fve;
   1265 {
   1266 	if (TAILQ_FIRST(&fv->fv_flowlist) != fve) {
   1267 		TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru);
   1268 		TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru);
   1269 	}
   1270 }
   1271 
   1272 /*
   1273  * allocate flowvalve structure
   1274  */
   1275 static struct flowvalve *
   1276 fv_alloc(rp)
   1277 	struct red *rp;
   1278 {
   1279 	struct flowvalve *fv;
   1280 	struct fve *fve;
   1281 	int i, num;
   1282 
   1283 	num = FV_FLOWLISTSIZE;
   1284 	MALLOC(fv, struct flowvalve *, sizeof(struct flowvalve),
   1285 	       M_DEVBUF, M_WAITOK);
   1286 	if (fv == NULL)
   1287 		return (NULL);
   1288 	(void)memset(fv, 0, sizeof(struct flowvalve));
   1289 
   1290 	MALLOC(fv->fv_fves, struct fve *, sizeof(struct fve) * num,
   1291 	       M_DEVBUF, M_WAITOK);
   1292 	if (fv->fv_fves == NULL) {
   1293 		FREE(fv, M_DEVBUF);
   1294 		return (NULL);
   1295 	}
   1296 	(void)memset(fv->fv_fves, 0, sizeof(struct fve) * num);
   1297 
   1298 	fv->fv_flows = 0;
   1299 	TAILQ_INIT(&fv->fv_flowlist);
   1300 	for (i = 0; i < num; i++) {
   1301 		fve = &fv->fv_fves[i];
   1302 		fve->fve_lastdrop.tv_sec = 0;
   1303 		TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru);
   1304 	}
   1305 
   1306 	/* initialize drop rate threshold in scaled fixed-point */
   1307 	fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax;
   1308 
   1309 	/* initialize drop rate to fraction table */
   1310 	MALLOC(fv->fv_p2ftab, int *, sizeof(int) * BRTT_SIZE,
   1311 	       M_DEVBUF, M_WAITOK);
   1312 	if (fv->fv_p2ftab == NULL) {
   1313 		FREE(fv->fv_fves, M_DEVBUF);
   1314 		FREE(fv, M_DEVBUF);
   1315 		return (NULL);
   1316 	}
   1317 	/*
   1318 	 * create the p2f table.
   1319 	 * (shift is used to keep the precision)
   1320 	 */
   1321 	for (i = 1; i < BRTT_SIZE; i++) {
   1322 		int f;
   1323 
   1324 		f = brtt_tab[i] << 8;
   1325 		fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8;
   1326 	}
   1327 
   1328 	return (fv);
   1329 }
   1330 
   1331 static void fv_destroy(fv)
   1332 	struct flowvalve *fv;
   1333 {
   1334 	FREE(fv->fv_p2ftab, M_DEVBUF);
   1335 	FREE(fv->fv_fves, M_DEVBUF);
   1336 	FREE(fv, M_DEVBUF);
   1337 }
   1338 
   1339 static inline int
   1340 fv_p2f(fv, p)
   1341 	struct flowvalve	*fv;
   1342 	int	p;
   1343 {
   1344 	int val, f;
   1345 
   1346 	if (p >= BRTT_PMAX)
   1347 		f = fv->fv_p2ftab[BRTT_SIZE-1];
   1348 	else if ((val = (p & BRTT_MASK)))
   1349 		f = fv->fv_p2ftab[(val >> BRTT_SHIFT)];
   1350 	else
   1351 		f = fv->fv_p2ftab[1];
   1352 	return (f);
   1353 }
   1354 
   1355 /*
   1356  * check if an arriving packet should be pre-dropped.
   1357  * called from red_addq() when a packet arrives.
   1358  * returns 1 when the packet should be pre-dropped.
   1359  * should be called in splnet.
   1360  */
   1361 static int
   1362 fv_checkflow(fv, pktattr, fcache)
   1363 	struct flowvalve *fv;
   1364 	struct altq_pktattr *pktattr;
   1365 	struct fve **fcache;
   1366 {
   1367 	struct fve *fve;
   1368 	struct timeval now;
   1369 
   1370 	fv->fv_ifseq++;
   1371 	FV_TIMESTAMP(&now);
   1372 
   1373 	if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
   1374 		/* no matching entry in the flowlist */
   1375 		return (0);
   1376 
   1377 	*fcache = fve;
   1378 
   1379 	/* update fraction f for every FV_N packets */
   1380 	if (++fve->fve_count == FV_N) {
   1381 		/*
   1382 		 * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f
   1383 		 */
   1384 		fve->fve_f =
   1385 			(FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq)
   1386 			+ fve->fve_f - FV_FUNSCALE(fve->fve_f);
   1387 		fve->fve_ifseq = fv->fv_ifseq;
   1388 		fve->fve_count = 0;
   1389 	}
   1390 
   1391 	/*
   1392 	 * overpumping test
   1393 	 */
   1394 	if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) {
   1395 		int fthresh;
   1396 
   1397 		/* calculate a threshold */
   1398 		fthresh = fv_p2f(fv, fve->fve_p);
   1399 		if (fve->fve_f > fthresh)
   1400 			fve->fve_state = Red;
   1401 	}
   1402 
   1403 	if (fve->fve_state == Red) {
   1404 		/*
   1405 		 * backoff test
   1406 		 */
   1407 		if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) {
   1408 			/* no drop for at least FV_BACKOFFTHRESH sec */
   1409 			fve->fve_p = 0;
   1410 			fve->fve_state = Green;
   1411 #ifdef FV_STATS
   1412 			fv->fv_stats.escape++;
   1413 #endif
   1414 		} else {
   1415 			/* block this flow */
   1416 			flowlist_move_to_head(fv, fve);
   1417 			fve->fve_lastdrop = now;
   1418 #ifdef FV_STATS
   1419 			fv->fv_stats.predrop++;
   1420 #endif
   1421 			return (1);
   1422 		}
   1423 	}
   1424 
   1425 	/*
   1426 	 * p = (1 - Wp) * p
   1427 	 */
   1428 	fve->fve_p -= FV_PUNSCALE(fve->fve_p);
   1429 	if (fve->fve_p < 0)
   1430 		fve->fve_p = 0;
   1431 #ifdef FV_STATS
   1432 	fv->fv_stats.pass++;
   1433 #endif
   1434 	return (0);
   1435 }
   1436 
   1437 /*
   1438  * called from red_addq when a packet is dropped by red.
   1439  * should be called in splnet.
   1440  */
   1441 static void fv_dropbyred(fv, pktattr, fcache)
   1442 	struct flowvalve *fv;
   1443 	struct altq_pktattr *pktattr;
   1444 	struct fve *fcache;
   1445 {
   1446 	struct fve *fve;
   1447 	struct timeval now;
   1448 
   1449 	if (pktattr == NULL)
   1450 		return;
   1451 	FV_TIMESTAMP(&now);
   1452 
   1453 	if (fcache != NULL)
   1454 		/* the fve of this packet is already cached */
   1455 		fve = fcache;
   1456 	else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
   1457 		fve = flowlist_reclaim(fv, pktattr);
   1458 
   1459 	flowlist_move_to_head(fv, fve);
   1460 
   1461 	/*
   1462 	 * update p:  the following line cancels the update
   1463 	 *	      in fv_checkflow() and calculate
   1464 	 *	p = Wp + (1 - Wp) * p
   1465 	 */
   1466 	fve->fve_p = (1 << FP_SHIFT) + fve->fve_p;
   1467 
   1468 	fve->fve_lastdrop = now;
   1469 }
   1470 
   1471 #endif /* ALTQ_FLOWVALVE */
   1472 
#ifdef KLD_MODULE

/* character-device switch entry used when RED is built as a loadable module */
static struct altqsw red_sw =
	{"red", redopen, redclose, redioctl};

ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw);
MODULE_VERSION(altq_red, 1);

#endif /* KLD_MODULE */
   1482 #endif /* ALTQ3_COMPAT */
   1483 
   1484 #endif /* ALTQ_RED */
   1485