/*	$NetBSD: altq_rmclass.c,v 1.21 2008/07/15 16:18:08 christos Exp $	*/
/*	$KAME: altq_rmclass.c,v 1.19 2005/04/13 03:44:25 suz Exp $	*/

/*
 * Copyright (c) 1991-1997 Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the Network Research
 *      Group at Lawrence Berkeley Laboratory.
 * 4. Neither the name of the University nor of the Laboratory may be used
 *    to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * LBL code modified by speer@eng.sun.com, May 1997.
 * For questions and/or comments, please send mail to cbq@ee.lbl.gov
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: altq_rmclass.c,v 1.21 2008/07/15 16:18:08 christos Exp $");

/* #ident "@(#)rm_class.c  1.48     97/12/05 SMI" */

#ifdef _KERNEL_OPT
#include "opt_altq.h"
#include "opt_inet.h"
#endif

#ifdef ALTQ_CBQ	/* cbq is enabled by ALTQ_CBQ option in opt_altq.h */

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/time.h>
#ifdef ALTQ3_COMPAT
#include <sys/kernel.h>
#endif

#include <net/if.h>
#ifdef ALTQ3_COMPAT
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#endif

#include <altq/altq.h>
#include <altq/altq_rmclass.h>
#include <altq/altq_rmclass_debug.h>
#include <altq/altq_red.h>
#include <altq/altq_rio.h>

/*
 * Local Macros
 */

#define	reset_cutoff(ifd)	{ ifd->cutoff_ = RM_MAXDEPTH; }

/*
 * Local routines.
 */

static int	rmc_satisfied(struct rm_class *, struct timeval *);
static void	rmc_wrr_set_weights(struct rm_ifdat *);
static void	rmc_depth_compute(struct rm_class *);
static void	rmc_depth_recompute(rm_class_t *);

static mbuf_t	*_rmc_wrr_dequeue_next(struct rm_ifdat *, int);
static mbuf_t	*_rmc_prr_dequeue_next(struct rm_ifdat *, int);

static int	_rmc_addq(rm_class_t *, mbuf_t *);
static void	_rmc_dropq(rm_class_t *);
static mbuf_t	*_rmc_getq(rm_class_t *);
static mbuf_t	*_rmc_pollq(rm_class_t *);

static int	rmc_under_limit(struct rm_class *, struct timeval *);
static void	rmc_tl_satisfied(struct rm_ifdat *, struct timeval *);
static void	rmc_drop_action(struct rm_class *);
static void	rmc_restart(struct rm_class *);
static void	rmc_root_overlimit(struct rm_class *, struct rm_class *);

#define	BORROW_OFFTIME
/*
 * BORROW_OFFTIME (experimental):
 * borrow the offtime of the class being borrowed from.
 * the reason is that when its own offtime is set, the class is unable
 * to borrow much, especially when cutoff is taking effect.
 * but when the borrowed class is overloaded (avgidle is close to minidle),
 * use the borrowing class's own offtime to avoid overload.
 */
#define	ADJUST_CUTOFF
/*
 * ADJUST_CUTOFF (experimental):
 * if no underlimit class is found due to cutoff, increase cutoff and
 * retry the scheduling loop.
 * also, don't invoke delay_actions while cutoff is taking effect,
 * since a sleeping class won't have a chance to be scheduled in the
 * next loop.
 *
 * the heuristics for setting the top-level variable (cutoff_) become:
 *	1. if a packet arrives for a not-overlimit class, set cutoff
 *	   to the depth of the class.
 *	2. if cutoff is i, and a packet arrives for an overlimit class
 *	   with an underlimit ancestor at a lower level than i (say j),
 *	   then set cutoff to j.
 *	3. at scheduling a packet, if there is no underlimit class
 *	   due to the current cutoff level, increase cutoff by 1 and
 *	   then try to schedule again.
 */
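
/*
 * For illustration (a sketch, not an exhaustive trace): take a
 * two-level tree in which leaf L (depth 0) borrows from root R
 * (depth 1), with cutoff_ initially reset to RM_MAXDEPTH.  A packet
 * arriving for an overlimit L whose ancestor R is underlimit sets
 * cutoff_ to R's depth, 1 (rule 2).  A packet arriving for an
 * underlimit leaf then drags cutoff_ down to 0 (rule 1), which blocks
 * borrowing from R.  If the scheduler later finds no underlimit class
 * at the current cutoff, it increments cutoff_ and rescans (rule 3).
 */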

/*
 * rm_class_t *
 * rmc_newclass(...) - Create a new resource management class at priority
 * 'pri' on the interface given by 'ifd'.
 *
 * nsecPerByte  is the data rate of the interface in nanoseconds/byte.
 *              E.g., 800 for a 10Mb/s ethernet.  If the class gets less
 *              than 100% of the bandwidth, this number should be the
 *              'effective' rate for the class.  Let f be the
 *              bandwidth fraction allocated to this class, and let
 *              nsPerByte be the data rate of the output link in
 *              nanoseconds/byte.  Then nsecPerByte is set to
 *              nsPerByte / f.  E.g., 1600 (= 800 / .5)
 *              for a class that gets 50% of an ethernet's bandwidth.
 *
 * action       the routine to call when the class is over limit.
 *
 * maxq         max allowable queue size for class (in packets).
 *
 * parent       parent class pointer.
 *
 * borrow       class to borrow from (should be either 'parent' or null).
 *
 * maxidle      max value allowed for class 'idle' time estimate (this
 *              parameter determines how large an initial burst of packets
 *              can be before overlimit action is invoked).
 *
 * offtime      how long 'delay' action will delay when class goes over
 *              limit (this parameter determines the steady-state burst
 *              size when a class is running over its limit).
 *
 * Maxidle and offtime have to be computed from the following:  If the
 * average packet size is s, the bandwidth fraction allocated to this
 * class is f, we want to allow b packet bursts, and the gain of the
 * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then:
 *
 *   ptime = s * nsPerByte * (1 - f) / f
 *   maxidle = ptime * (1 - g^b) / g^b
 *   minidle = -ptime * (1 / (f - 1))
 *   offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1))
 *
 * Operationally, it's convenient to specify maxidle & offtime in units
 * independent of the link bandwidth so the maxidle & offtime passed to
 * this routine are the above values multiplied by 8*f/(1000*nsPerByte).
 * (The constant factor is a scale factor needed to make the parameters
 * integers.  This scaling also means that the 'unscaled' values of
 * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds,
 * not nanoseconds.)  Also note that the 'idle' filter computation keeps
 * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of
 * maxidle also must be scaled upward by this value.  Thus, the passed
 * values for maxidle and offtime can be computed as follows:
 *
 * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte)
 * offtime = offtime * 8 / (1000 * nsecPerByte)
 *
 * When USE_HRTIME is employed, then maxidle and offtime become:
 * 	maxidle = maxidle * (8.0 / nsecPerByte);
 * 	offtime = offtime * (8.0 / nsecPerByte);
 */
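
/*
 * A worked instance of the scaling above (illustration only; it
 * assumes RM_FILTER_GAIN is 5, so g = 31/32): for a class entitled to
 * half of a 10Mb/s ethernet, nsecPerByte = 1600.  An unscaled maxidle
 * of 1 ms = 1000000 ns is passed in as
 *	maxidle = 1000000 * 2^5 * 8 / (1000 * 1600) = 160
 * and stored below as
 *	cl->maxidle_ = (160 * 1600) / 8 = 32000
 * i.e. 1000 us scaled up by 2^RM_FILTER_GAIN, matching the scaled
 * avgidle_ estimate it is compared against.
 */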
struct rm_class *
rmc_newclass(int pri, struct rm_ifdat *ifd, u_int nsecPerByte,
    void (*action)(rm_class_t *, rm_class_t *), int maxq,
    struct rm_class *parent, struct rm_class *borrow, u_int maxidle,
    int minidle, u_int offtime, int pktsize, int flags)
{
	struct rm_class	*cl;
	struct rm_class	*peer;
	int		 s;

	if (pri >= RM_MAXPRIO)
		return (NULL);
#ifndef ALTQ_RED
	if (flags & RMCF_RED) {
#ifdef ALTQ_DEBUG
		printf("rmc_newclass: RED not configured for CBQ!\n");
#endif
		return (NULL);
	}
#endif
#ifndef ALTQ_RIO
	if (flags & RMCF_RIO) {
#ifdef ALTQ_DEBUG
		printf("rmc_newclass: RIO not configured for CBQ!\n");
#endif
		return (NULL);
	}
#endif

	cl = malloc(sizeof(struct rm_class), M_DEVBUF, M_WAITOK|M_ZERO);
	if (cl == NULL)
		return (NULL);
	CALLOUT_INIT(&cl->callout_);

	cl->q_ = malloc(sizeof(class_queue_t), M_DEVBUF, M_WAITOK|M_ZERO);
	if (cl->q_ == NULL) {
		free(cl, M_DEVBUF);
		return (NULL);
	}

	/*
	 * Class initialization.
	 */
	cl->children_ = NULL;
	cl->parent_ = parent;
	cl->borrow_ = borrow;
	cl->leaf_ = 1;
	cl->ifdat_ = ifd;
	cl->pri_ = pri;
	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
	cl->depth_ = 0;
	cl->qthresh_ = 0;
	cl->ns_per_byte_ = nsecPerByte;

	qlimit(cl->q_) = maxq;
	qtype(cl->q_) = Q_DROPHEAD;
	qlen(cl->q_) = 0;
	cl->flags_ = flags;

#if 1 /* minidle is also scaled in ALTQ */
	cl->minidle_ = (minidle * (int)nsecPerByte) / 8;
	if (cl->minidle_ > 0)
		cl->minidle_ = 0;
#else
	cl->minidle_ = minidle;
#endif
	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
	if (cl->maxidle_ == 0)
		cl->maxidle_ = 1;
#if 1 /* offtime is also scaled in ALTQ */
	cl->avgidle_ = cl->maxidle_;
	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
	if (cl->offtime_ == 0)
		cl->offtime_ = 1;
#else
	cl->avgidle_ = 0;
	cl->offtime_ = (offtime * nsecPerByte) / 8;
#endif
	cl->overlimit = action;

#ifdef ALTQ_RED
	if (flags & (RMCF_RED|RMCF_RIO)) {
		int red_flags, red_pkttime;

		red_flags = 0;
		if (flags & RMCF_ECN)
			red_flags |= REDF_ECN;
		if (flags & RMCF_FLOWVALVE)
			red_flags |= REDF_FLOWVALVE;
#ifdef ALTQ_RIO
		if (flags & RMCF_CLEARDSCP)
			red_flags |= RIOF_CLEARDSCP;
#endif
		red_pkttime = nsecPerByte * pktsize / 1000;

		if (flags & RMCF_RED) {
			cl->red_ = red_alloc(0, 0,
			    qlimit(cl->q_) * 10/100,
			    qlimit(cl->q_) * 30/100,
			    red_flags, red_pkttime);
			if (cl->red_ != NULL)
				qtype(cl->q_) = Q_RED;
		}
#ifdef ALTQ_RIO
		else {
			cl->red_ = (red_t *)rio_alloc(0, NULL,
						      red_flags, red_pkttime);
			if (cl->red_ != NULL)
				qtype(cl->q_) = Q_RIO;
		}
#endif
	}
#endif /* ALTQ_RED */

	/*
	 * put the class into the class tree
	 */
	s = splnet();
	if ((peer = ifd->active_[pri]) != NULL) {
		/* find the last class at this pri */
		cl->peer_ = peer;
		while (peer->peer_ != ifd->active_[pri])
			peer = peer->peer_;
		peer->peer_ = cl;
	} else {
		ifd->active_[pri] = cl;
		cl->peer_ = cl;
	}

	if (cl->parent_) {
		cl->next_ = parent->children_;
		parent->children_ = cl;
		parent->leaf_ = 0;
	}

	/*
	 * Compute the depth of this class and its ancestors in the class
	 * hierarchy.
	 */
	rmc_depth_compute(cl);

	/*
	 * If CBQ's WRR is enabled, then initialize the class WRR state.
	 */
	if (ifd->wrr_) {
		ifd->num_[pri]++;
		ifd->alloc_[pri] += cl->allotment_;
		rmc_wrr_set_weights(ifd);
	}
	splx(s);
	return (cl);
}
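
#ifdef notdef
/*
 * Usage sketch (never compiled; the numeric parameters are
 * illustrative assumptions, not tuned values): create a leaf class
 * entitled to half of a 10Mb/s link under the root built by
 * rmc_init().
 */
static struct rm_class *
example_leaf_class(struct rm_ifdat *ifd)
{
	/* 800 ns/byte link at a 50% share => effective 1600 ns/byte */
	return rmc_newclass(1 /* pri */, ifd, 1600, rmc_delay_action,
	    50 /* maxq */, ifd->root_ /* parent */, ifd->root_ /* borrow */,
	    160 /* maxidle */, -40 /* minidle */, 10 /* offtime */,
	    1000 /* avg pktsize */, 0 /* flags */);
}
#endif /* notdef */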

int
rmc_modclass(struct rm_class *cl, u_int nsecPerByte, int maxq, u_int maxidle,
    int minidle, u_int offtime, int pktsize)
{
	struct rm_ifdat	*ifd;
	u_int		 old_allotment;
	int		 s;

	ifd = cl->ifdat_;
	old_allotment = cl->allotment_;

	s = splnet();
	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
	cl->qthresh_ = 0;
	cl->ns_per_byte_ = nsecPerByte;

	qlimit(cl->q_) = maxq;

#if 1 /* minidle is also scaled in ALTQ */
	cl->minidle_ = (minidle * nsecPerByte) / 8;
	if (cl->minidle_ > 0)
		cl->minidle_ = 0;
#else
	cl->minidle_ = minidle;
#endif
	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
	if (cl->maxidle_ == 0)
		cl->maxidle_ = 1;
#if 1 /* offtime is also scaled in ALTQ */
	cl->avgidle_ = cl->maxidle_;
	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
	if (cl->offtime_ == 0)
		cl->offtime_ = 1;
#else
	cl->avgidle_ = 0;
	cl->offtime_ = (offtime * nsecPerByte) / 8;
#endif

	/*
	 * If CBQ's WRR is enabled, then initialize the class WRR state.
	 */
	if (ifd->wrr_) {
		ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment;
		rmc_wrr_set_weights(ifd);
	}
	splx(s);
	return (0);
}

/*
 * static void
 * rmc_wrr_set_weights(struct rm_ifdat *ifdat) - This function computes
 *	the appropriate round robin weights for the CBQ weighted round
 *	robin algorithm.
 *
 *	Returns: NONE
 */

static void
rmc_wrr_set_weights(struct rm_ifdat *ifd)
{
	int		i;
	struct rm_class	*cl, *clh;

	for (i = 0; i < RM_MAXPRIO; i++) {
		/*
		 * This is inverted from that of the simulator to
		 * maintain precision.
		 */
		if (ifd->num_[i] == 0)
			ifd->M_[i] = 0;
		else
			ifd->M_[i] = ifd->alloc_[i] /
				(ifd->num_[i] * ifd->maxpkt_);
		/*
		 * Compute the weighted allotment for each class.
		 * This takes the expensive div instruction out
		 * of the main loop for the wrr scheduling path.
		 * These only get recomputed when a class comes or
		 * goes.
		 */
		if (ifd->active_[i] != NULL) {
			clh = cl = ifd->active_[i];
			do {
				/* safe-guard for slow link or alloc_ == 0 */
				if (ifd->M_[i] == 0)
					cl->w_allotment_ = 0;
				else
					cl->w_allotment_ = cl->allotment_ /
						ifd->M_[i];
				cl = cl->peer_;
			} while ((cl != NULL) && (cl != clh));
		}
	}
}
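
/*
 * Worked numbers for the computation above (illustration only): with
 * two classes at one priority level, alloc_ = 1250000 B/s and
 * maxpkt_ = 1500, M_ = 1250000 / (2 * 1500) = 416.  A class whose
 * allotment_ is 625000 B/s then gets w_allotment_ = 625000 / 416 =
 * 1502, i.e. roughly one maximum-length packet per round.
 */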

int
rmc_get_weight(struct rm_ifdat *ifd, int pri)
{
	if ((pri >= 0) && (pri < RM_MAXPRIO))
		return (ifd->M_[pri]);
	else
		return (0);
}

/*
 * static void
 * rmc_depth_compute(struct rm_class *cl) - This function computes the
 *	appropriate depth of class 'cl' and its ancestors.
 *
 *	Returns:	NONE
 */

static void
rmc_depth_compute(struct rm_class *cl)
{
	rm_class_t	*t = cl, *p;

	/*
	 * Recompute the depth for the branch of the tree.
	 */
	while (t != NULL) {
		p = t->parent_;
		if (p && (t->depth_ >= p->depth_)) {
			p->depth_ = t->depth_ + 1;
			t = p;
		} else
			t = NULL;
	}
}

/*
 * static void
 * rmc_depth_recompute(struct rm_class *cl) - This function re-computes
 *	the depth of the tree after a class has been deleted.
 *
 *	Returns:	NONE
 */

static void
rmc_depth_recompute(rm_class_t *cl)
{
#if 1 /* ALTQ */
	rm_class_t	*p, *t;

	p = cl;
	while (p != NULL) {
		if ((t = p->children_) == NULL) {
			p->depth_ = 0;
		} else {
			int cdepth = 0;

			while (t != NULL) {
				if (t->depth_ > cdepth)
					cdepth = t->depth_;
				t = t->next_;
			}

			if (p->depth_ == cdepth + 1)
				/* no change to this parent */
				return;

			p->depth_ = cdepth + 1;
		}

		p = p->parent_;
	}
#else
	rm_class_t	*t;

	if (cl->depth_ >= 1) {
		if (cl->children_ == NULL) {
			cl->depth_ = 0;
		} else if ((t = cl->children_) != NULL) {
			while (t != NULL) {
				if (t->children_ != NULL)
					rmc_depth_recompute(t);
				t = t->next_;
			}
		} else
			rmc_depth_compute(cl);
	}
#endif
}

/*
 * void
 * rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This
 *	function deletes a class from the link-sharing structure and frees
 *	all resources associated with the class.
 *
 *	Returns: NONE
 */

void
rmc_delete_class(struct rm_ifdat *ifd, struct rm_class *cl)
{
	struct rm_class	*p, *head, *previous;
	int		 s;

	ASSERT(cl->children_ == NULL);

	if (cl->sleeping_)
		CALLOUT_STOP(&cl->callout_);

	s = splnet();
	/*
	 * Free packets in the packet queue.
	 * XXX - this may not be a desired behavior.  Packets should be
	 *		re-queued.
	 */
	rmc_dropall(cl);

	/*
	 * If the class has a parent, then remove the class from the
	 * parent's children chain.
	 */
	if (cl->parent_ != NULL) {
		head = cl->parent_->children_;
		p = previous = head;
		if (head->next_ == NULL) {
			ASSERT(head == cl);
			cl->parent_->children_ = NULL;
			cl->parent_->leaf_ = 1;
		} else while (p != NULL) {
			if (p == cl) {
				if (cl == head)
					cl->parent_->children_ = cl->next_;
				else
					previous->next_ = cl->next_;
				cl->next_ = NULL;
				p = NULL;
			} else {
				previous = p;
				p = p->next_;
			}
		}
	}

	/*
	 * Delete class from class priority peer list.
	 */
	if ((p = ifd->active_[cl->pri_]) != NULL) {
		/*
		 * If there is more than one member of this priority
		 * level, then look for class(cl) in the priority level.
		 */
		if (p != p->peer_) {
			while (p->peer_ != cl)
				p = p->peer_;
			p->peer_ = cl->peer_;

			if (ifd->active_[cl->pri_] == cl)
				ifd->active_[cl->pri_] = cl->peer_;
		} else {
			ASSERT(p == cl);
			ifd->active_[cl->pri_] = NULL;
		}
	}

	/*
	 * Recompute the WRR weights.
	 */
	if (ifd->wrr_) {
		ifd->alloc_[cl->pri_] -= cl->allotment_;
		ifd->num_[cl->pri_]--;
		rmc_wrr_set_weights(ifd);
	}

	/*
	 * Re-compute the depth of the tree.
	 */
#if 1 /* ALTQ */
	rmc_depth_recompute(cl->parent_);
#else
	rmc_depth_recompute(ifd->root_);
#endif

	splx(s);

	/*
	 * Free the class structure.
	 */
	if (cl->red_ != NULL) {
#ifdef ALTQ_RIO
		if (q_is_rio(cl->q_))
			rio_destroy((rio_t *)cl->red_);
#endif
#ifdef ALTQ_RED
		if (q_is_red(cl->q_))
			red_destroy(cl->red_);
#endif
	}
	free(cl->q_, M_DEVBUF);
	free(cl, M_DEVBUF);
}


/*
 * int
 * rmc_init(...) - Initialize the resource management data structures
 *	associated with the output portion of interface 'ifp'.  'ifd' is
 *	where the structures will be built (for backwards compatibility, the
 *	structures aren't kept in the ifnet struct).  'nsecPerByte'
 *	gives the link speed (inverse of bandwidth) in nanoseconds/byte.
 *	'restart' is the driver-specific routine that the generic 'delay
 *	until under limit' action will call to restart output.  `maxq'
 *	is the queue size of the 'link' & 'default' classes.  'maxqueued'
 *	is the maximum number of packets that the resource management
 *	code will allow to be queued 'downstream' (this is typically 1).
 *
 *	Returns:	0 on success
 */

int
rmc_init(struct ifaltq *ifq, struct rm_ifdat *ifd, u_int nsecPerByte,
    void (*restart)(struct ifaltq *), int maxq, int maxqueued, u_int maxidle,
    int minidle, u_int offtime, int flags)
{
	int i, mtu;

	/*
	 * Initialize the CBQ tracing/debug facility.
	 */
	CBQTRACEINIT();

	mtu = ifq->altq_ifp->if_mtu;
	if (mtu < 1) {
		printf("altq: %s: invalid MTU (interface not initialized?)\n",
		    ifq->altq_ifp->if_xname);
		return (EINVAL);
	}

	(void)memset((char *)ifd, 0, sizeof (*ifd));
	ifd->ifq_ = ifq;
	ifd->restart = restart;
	ifd->maxqueued_ = maxqueued;
	ifd->ns_per_byte_ = nsecPerByte;
	ifd->maxpkt_ = mtu;
	ifd->wrr_ = (flags & RMCF_WRR) ? 1 : 0;
	ifd->efficient_ = (flags & RMCF_EFFICIENT) ? 1 : 0;
#if 1
	ifd->maxiftime_ = mtu * nsecPerByte / 1000 * 16;
	if (mtu * nsecPerByte > 10 * 1000000)
		ifd->maxiftime_ /= 4;
#endif

	reset_cutoff(ifd);
	CBQTRACE(rmc_init, 'INIT', ifd->cutoff_);

	/*
	 * Initialize the CBQ's WRR state.
	 */
	for (i = 0; i < RM_MAXPRIO; i++) {
		ifd->alloc_[i] = 0;
		ifd->M_[i] = 0;
		ifd->num_[i] = 0;
		ifd->na_[i] = 0;
		ifd->active_[i] = NULL;
	}

	/*
	 * Initialize current packet state.
	 */
	ifd->qi_ = 0;
	ifd->qo_ = 0;
	for (i = 0; i < RM_MAXQUEUED; i++) {
		ifd->class_[i] = NULL;
		ifd->curlen_[i] = 0;
		ifd->borrowed_[i] = NULL;
	}

	/*
	 * Create the root class of the link-sharing structure.
	 */
	if ((ifd->root_ = rmc_newclass(0, ifd,
				       nsecPerByte,
				       rmc_root_overlimit, maxq, 0, 0,
				       maxidle, minidle, offtime,
				       0, 0)) == NULL) {
		printf("rmc_init: root class not allocated\n");
		return (ENOMEM);
	}
	ifd->root_->depth_ = 0;

	return (0);
}
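
#ifdef notdef
/*
 * Wiring sketch (never compiled): how a driver might attach CBQ to a
 * 10Mb/s interface.  'example_restart' and all numeric values are
 * illustrative assumptions only.
 */
static void	example_restart(struct ifaltq *);
static struct rm_ifdat example_ifd;

static int
example_attach(struct ifaltq *ifq)
{
	/* 800 ns/byte = 10Mb/s; 50-packet root queue; 1 pkt downstream */
	return rmc_init(ifq, &example_ifd, 800, example_restart,
	    50 /* maxq */, 1 /* maxqueued */, 160 /* maxidle */,
	    -40 /* minidle */, 10 /* offtime */,
	    RMCF_WRR | RMCF_EFFICIENT);
}
#endif /* notdef */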

/*
 * int
 * rmc_queue_packet(struct rm_class *cl, mbuf_t *m) - Add packet given by
 *	mbuf 'm' to queue for resource class 'cl'.  This routine is called
 *	by a driver's if_output routine.  This routine must be called with
 *	output packet completion interrupts locked out (to avoid racing with
 *	rmc_dequeue_next).
 *
 *	Returns:	0 on successful queueing
 *			-1 when packet drop occurs
 */
int
rmc_queue_packet(struct rm_class *cl, mbuf_t *m)
{
	struct timeval	 now;
	struct rm_ifdat *ifd = cl->ifdat_;
	int		 cpri = cl->pri_;
	int		 is_empty = qempty(cl->q_);

	RM_GETTIME(now);
	if (ifd->cutoff_ > 0) {
		if (TV_LT(&cl->undertime_, &now)) {
			if (ifd->cutoff_ > cl->depth_)
				ifd->cutoff_ = cl->depth_;
			CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_);
		}
#if 1 /* ALTQ */
		else {
			/*
			 * the class is overlimit. if the class has
			 * underlimit ancestors, set cutoff to the lowest
			 * depth among them.
			 */
			struct rm_class *borrow = cl->borrow_;

			while (borrow != NULL &&
			       borrow->depth_ < ifd->cutoff_) {
				if (TV_LT(&borrow->undertime_, &now)) {
					ifd->cutoff_ = borrow->depth_;
					CBQTRACE(rmc_queue_packet, 'ffob', ifd->cutoff_);
					break;
				}
				borrow = borrow->borrow_;
			}
		}
#else /* !ALTQ */
		else if ((ifd->cutoff_ > 1) && cl->borrow_) {
			if (TV_LT(&cl->borrow_->undertime_, &now)) {
				ifd->cutoff_ = cl->borrow_->depth_;
				CBQTRACE(rmc_queue_packet, 'ffob',
					 cl->borrow_->depth_);
			}
		}
#endif /* !ALTQ */
	}

	if (_rmc_addq(cl, m) < 0)
		/* failed */
		return (-1);

	if (is_empty) {
		CBQTRACE(rmc_queue_packet, 'ytpe', cl->stats_.handle);
		ifd->na_[cpri]++;
	}

	if (qlen(cl->q_) > qlimit(cl->q_)) {
		/* note: qlimit can be set to 0 or 1 */
		rmc_drop_action(cl);
		return (-1);
	}
	return (0);
}
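
/*
 * Caller-side note: the classifier that maps an mbuf to its rm_class
 * lives outside this file.  Whatever the return value,
 * rmc_queue_packet() has consumed the mbuf; a -1 return only reports
 * that a drop occurred (for the drop-head queues set up by
 * rmc_newclass() the freed packet may be an older one at the head
 * rather than 'm' itself).
 */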

/*
 * void
 * rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) - Check all
 *	classes to see if they are satisfied.
 */

static void
rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now)
{
	int		 i;
	rm_class_t	*p, *bp;

	for (i = RM_MAXPRIO - 1; i >= 0; i--) {
		if ((bp = ifd->active_[i]) != NULL) {
			p = bp;
			do {
				if (!rmc_satisfied(p, now)) {
					ifd->cutoff_ = p->depth_;
					return;
				}
				p = p->peer_;
			} while (p != bp);
		}
	}

	reset_cutoff(ifd);
}

/*
 * rmc_satisfied - Return 1 if the class is satisfied, 0 otherwise.
 */

static int
rmc_satisfied(struct rm_class *cl, struct timeval *now)
{
	rm_class_t	*p;

	if (cl == NULL)
		return (1);
	if (TV_LT(now, &cl->undertime_))
		return (1);
	if (cl->depth_ == 0) {
		if (!cl->sleeping_ && (qlen(cl->q_) > cl->qthresh_))
			return (0);
		else
			return (1);
	}
	if (cl->children_ != NULL) {
		p = cl->children_;
		while (p != NULL) {
			if (!rmc_satisfied(p, now))
				return (0);
			p = p->next_;
		}
	}

	return (1);
}

/*
 * Return 1 if class 'cl' is under limit or can borrow from a parent,
 * 0 if overlimit.  As a side-effect, this routine will invoke the
 * class overlimit action if the class is overlimit.
 */

static int
rmc_under_limit(struct rm_class *cl, struct timeval *now)
{
	rm_class_t	*p = cl;
	rm_class_t	*top;
	struct rm_ifdat	*ifd = cl->ifdat_;

	ifd->borrowed_[ifd->qi_] = NULL;
	/*
	 * If cl is the root class, then always return that it is
	 * underlimit.  Otherwise, check to see if the class is underlimit.
	 */
	if (cl->parent_ == NULL)
		return (1);

	if (cl->sleeping_) {
		if (TV_LT(now, &cl->undertime_))
			return (0);

		CALLOUT_STOP(&cl->callout_);
		cl->sleeping_ = 0;
		cl->undertime_.tv_sec = 0;
		return (1);
	}

	top = NULL;
	while (cl->undertime_.tv_sec && TV_LT(now, &cl->undertime_)) {
		if (((cl = cl->borrow_) == NULL) ||
		    (cl->depth_ > ifd->cutoff_)) {
#ifdef ADJUST_CUTOFF
			if (cl != NULL)
				/* cutoff is taking effect, just
				   return false without calling
				   the delay action. */
				return (0);
#endif
#ifdef BORROW_OFFTIME
			/*
			 * check if the class can borrow offtime too.
			 * borrow offtime from the top of the borrow
			 * chain if the top class is not overloaded.
			 */
			if (cl != NULL) {
				/* cutoff is taking effect, use this class as top. */
				top = cl;
				CBQTRACE(rmc_under_limit, 'ffou', ifd->cutoff_);
			}
			if (top != NULL && top->avgidle_ == top->minidle_)
				top = NULL;
			p->overtime_ = *now;
			(p->overlimit)(p, top);
#else
			p->overtime_ = *now;
			(p->overlimit)(p, NULL);
#endif
			return (0);
		}
		top = cl;
	}

	if (cl != p)
		ifd->borrowed_[ifd->qi_] = cl;
	return (1);
}

/*
 * _rmc_wrr_dequeue_next() - This is the scheduler for WRR, as opposed
 *	to packet-by-packet round robin.
 *
 * The heart of the weighted round-robin scheduler, which decides which
 * class next gets to send a packet.  Highest priority first, then
 * weighted round-robin within priorities.
 *
 * Each able-to-send class gets to send until its byte allocation is
 * exhausted.  Thus, the active pointer is only changed after a class has
 * exhausted its allocation.
 *
 * If the scheduler finds no class that is underlimit or able to borrow,
 * then the first class found that had a nonzero queue and is allowed to
 * borrow gets to send.
 */

static mbuf_t *
_rmc_wrr_dequeue_next(struct rm_ifdat *ifd, int op)
{
	struct rm_class	*cl = NULL, *first = NULL;
	u_int		 deficit;
	int		 cpri;
	mbuf_t		*m;
	struct timeval	 now;

	RM_GETTIME(now);

	/*
	 * if the driver polls the top of the queue and then removes
	 * the polled packet, we must return the same packet.
	 */
	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
		cl = ifd->pollcache_;
		cpri = cl->pri_;
		if (ifd->efficient_) {
			/* check if this class is overlimit */
			if (cl->undertime_.tv_sec != 0 &&
			    rmc_under_limit(cl, &now) == 0)
				first = cl;
		}
		ifd->pollcache_ = NULL;
		goto _wrr_out;
	}
	else {
		/* mode == ALTDQ_POLL || pollcache == NULL */
		ifd->pollcache_ = NULL;
		ifd->borrowed_[ifd->qi_] = NULL;
	}
#ifdef ADJUST_CUTOFF
 _again:
#endif
	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
		if (ifd->na_[cpri] == 0)
			continue;
		deficit = 0;
		/*
		 * Loop through twice for a priority level, if some class
		 * was unable to send a packet the first round because
		 * of the weighted round-robin mechanism.
		 * During the second loop at this level, deficit==2.
		 * (This second loop is not needed if for every class,
		 * "M[cl->pri_]" times "cl->allotment" is greater than
		 * the byte size for the largest packet in the class.)
		 */
 _wrr_loop:
		cl = ifd->active_[cpri];
		ASSERT(cl != NULL);
		do {
			if ((deficit < 2) && (cl->bytes_alloc_ <= 0))
				cl->bytes_alloc_ += cl->w_allotment_;
			if (!qempty(cl->q_)) {
				if ((cl->undertime_.tv_sec == 0) ||
				    rmc_under_limit(cl, &now)) {
					if (cl->bytes_alloc_ > 0 || deficit > 1)
						goto _wrr_out;

					/* underlimit but no alloc */
					deficit = 1;
#if 1
					ifd->borrowed_[ifd->qi_] = NULL;
#endif
				}
				else if (first == NULL && cl->borrow_ != NULL)
					first = cl; /* borrowing candidate */
			}

			cl->bytes_alloc_ = 0;
			cl = cl->peer_;
		} while (cl != ifd->active_[cpri]);

		if (deficit == 1) {
			/* first loop found an underlimit class with deficit */
			/* Loop on same priority level, with new deficit.  */
			deficit = 2;
			goto _wrr_loop;
		}
	}

#ifdef ADJUST_CUTOFF
	/*
	 * no underlimit class found.  if cutoff is taking effect,
	 * increase cutoff and try again.
	 */
	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
		ifd->cutoff_++;
		CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_);
		goto _again;
	}
#endif /* ADJUST_CUTOFF */
	/*
	 * If LINK_EFFICIENCY is turned on, then the first overlimit
	 * class we encounter will send a packet if all the classes
	 * of the link-sharing structure are overlimit.
	 */
	reset_cutoff(ifd);
	CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_);

	if (!ifd->efficient_ || first == NULL)
		return (NULL);

	cl = first;
	cpri = cl->pri_;
#if 0	/* too time-consuming for nothing */
	if (cl->sleeping_)
		CALLOUT_STOP(&cl->callout_);
	cl->sleeping_ = 0;
	cl->undertime_.tv_sec = 0;
#endif
	ifd->borrowed_[ifd->qi_] = cl->borrow_;
	ifd->cutoff_ = cl->borrow_->depth_;

	/*
	 * Dequeue the packet and do the bookkeeping...
	 */
 _wrr_out:
	if (op == ALTDQ_REMOVE) {
		m = _rmc_getq(cl);
		if (m == NULL)
			panic("_rmc_wrr_dequeue_next");
		if (qempty(cl->q_))
			ifd->na_[cpri]--;

		/*
		 * Update class statistics and link data.
		 */
		if (cl->bytes_alloc_ > 0)
			cl->bytes_alloc_ -= m_pktlen(m);

		if ((cl->bytes_alloc_ <= 0) || first == cl)
			ifd->active_[cl->pri_] = cl->peer_;
		else
			ifd->active_[cl->pri_] = cl;

		ifd->class_[ifd->qi_] = cl;
		ifd->curlen_[ifd->qi_] = m_pktlen(m);
		ifd->now_[ifd->qi_] = now;
		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
		ifd->queued_++;
	} else {
		/* mode == ALTDQ_POLL */
		m = _rmc_pollq(cl);
		ifd->pollcache_ = cl;
	}
	return (m);
}

/*
 * Dequeue & return next packet from the highest priority class that
 * has a packet to send & has enough allocation to send it.  This
 * routine is called by a driver whenever it needs a new packet to
 * output.
 */
static mbuf_t *
_rmc_prr_dequeue_next(struct rm_ifdat *ifd, int op)
{
	mbuf_t		*m;
	int		 cpri;
	struct rm_class	*cl, *first = NULL;
	struct timeval	 now;

	RM_GETTIME(now);

	/*
	 * if the driver polls the top of the queue and then removes
	 * the polled packet, we must return the same packet.
	 */
	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
		cl = ifd->pollcache_;
		cpri = cl->pri_;
		ifd->pollcache_ = NULL;
		goto _prr_out;
	} else {
		/* mode == ALTDQ_POLL || pollcache == NULL */
		ifd->pollcache_ = NULL;
		ifd->borrowed_[ifd->qi_] = NULL;
	}
#ifdef ADJUST_CUTOFF
 _again:
#endif
	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
		if (ifd->na_[cpri] == 0)
			continue;
		cl = ifd->active_[cpri];
		ASSERT(cl != NULL);
		do {
			if (!qempty(cl->q_)) {
				if ((cl->undertime_.tv_sec == 0) ||
				    rmc_under_limit(cl, &now))
					goto _prr_out;
				if (first == NULL && cl->borrow_ != NULL)
					first = cl;
			}
			cl = cl->peer_;
		} while (cl != ifd->active_[cpri]);
	}

#ifdef ADJUST_CUTOFF
	/*
	 * no underlimit class found.  if cutoff is taking effect, increase
	 * cutoff and try again.
	 */
	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
		ifd->cutoff_++;
		goto _again;
	}
#endif /* ADJUST_CUTOFF */
	/*
	 * If LINK_EFFICIENCY is turned on, then the first overlimit
	 * class we encounter will send a packet if all the classes
	 * of the link-sharing structure are overlimit.
	 */
	reset_cutoff(ifd);
	if (!ifd->efficient_ || first == NULL)
		return (NULL);

	cl = first;
	cpri = cl->pri_;
#if 0	/* too time-consuming for nothing */
	if (cl->sleeping_)
		CALLOUT_STOP(&cl->callout_);
	cl->sleeping_ = 0;
	cl->undertime_.tv_sec = 0;
#endif
	ifd->borrowed_[ifd->qi_] = cl->borrow_;
	ifd->cutoff_ = cl->borrow_->depth_;

	/*
	 * Dequeue the packet and do the bookkeeping...
	 */
 _prr_out:
	if (op == ALTDQ_REMOVE) {
		m = _rmc_getq(cl);
		if (m == NULL)
			panic("_rmc_prr_dequeue_next");
		if (qempty(cl->q_))
			ifd->na_[cpri]--;

		ifd->active_[cpri] = cl->peer_;

		ifd->class_[ifd->qi_] = cl;
		ifd->curlen_[ifd->qi_] = m_pktlen(m);
		ifd->now_[ifd->qi_] = now;
		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
		ifd->queued_++;
	} else {
		/* mode == ALTDQ_POLL */
		m = _rmc_pollq(cl);
		ifd->pollcache_ = cl;
	}
	return (m);
}

/*
 * mbuf_t *
 * rmc_dequeue_next(struct rm_ifdat *ifd, int mode) - this function
 *	is invoked by the packet driver to get the next packet to be
 *	dequeued and output on the link.  If WRR is enabled, then the
 *	WRR dequeue next routine will determine the next packet to be sent.
 *	Otherwise, packet-by-packet round robin is invoked.
 *
 *	Returns:	NULL, if a packet is not available or if all
 *			classes are overlimit.
 *
 *			Otherwise, Pointer to the next packet.
 */

mbuf_t *
rmc_dequeue_next(struct rm_ifdat *ifd, int mode)
{
	if (ifd->queued_ >= ifd->maxqueued_)
		return (NULL);
	else if (ifd->wrr_)
		return (_rmc_wrr_dequeue_next(ifd, mode));
	else
		return (_rmc_prr_dequeue_next(ifd, mode));
}
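
#ifdef notdef
/*
 * Driver-side sketch (never compiled) of the poll/remove protocol
 * assumed by the pollcache_ logic above; 'example_hw_busy' and
 * 'example_hw_transmit' stand in for driver internals.
 */
static void
example_start(struct rm_ifdat *ifd)
{
	mbuf_t	*m;

	/* poll first so a busy device never strands a dequeued packet */
	while ((m = rmc_dequeue_next(ifd, ALTDQ_POLL)) != NULL) {
		if (example_hw_busy())
			break;
		m = rmc_dequeue_next(ifd, ALTDQ_REMOVE);
		example_hw_transmit(m);
		/* most drivers lack an xmit-complete upcall; see below */
		rmc_update_class_util(ifd);
	}
}
#endif /* notdef */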

/*
 * Update the utilization estimate for the packet that just completed.
 * The packet's class & the parent(s) of that class all get their
 * estimators updated.  This routine is called by the driver's output-
 * packet-completion interrupt service routine.
 */
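
/*
 * Estimator arithmetic in brief (assuming RM_FILTER_GAIN is 5, so the
 * filter gain g = 31/32 and avgidle_ is kept scaled by 2^5): each
 * completion measures idle = (gap since the last send) - pkt_time and
 * folds it in as avgidle += idle - (avgidle >> 5), which is the EWMA
 * avg = g*avg + (1-g)*idle on the scaled value.  A class whose
 * avgidle_ drops to zero or below is overlimit and is assigned an
 * undertime_ far enough out to pull avgidle_ back to zero.
 */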

/*
 * a macro to approximate "divide by 1000" that gives 0.000999,
 * if a value has enough effective digits.
 * (on pentium, mul takes 9 cycles but div takes 46!)
 */
#define	NSEC_TO_USEC(t)	(((t) >> 10) + ((t) >> 16) + ((t) >> 17))
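
/*
 * sanity check on the approximation (exact arithmetic):
 *	2^-10 + 2^-16 + 2^-17 = 0.0009765625 + 0.0000152587890625
 *	    + 0.00000762939453125 = 0.00099945...
 * i.e. within about 0.06% of 1/1000.
 */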
void
rmc_update_class_util(struct rm_ifdat *ifd)
{
	int		 idle, avgidle, pktlen;
	int		 pkt_time, tidle;
	rm_class_t	*cl, *borrowed;
	rm_class_t	*borrows;
	struct timeval	*nowp;

	/*
	 * Get the most recent completed class.
	 */
	if ((cl = ifd->class_[ifd->qo_]) == NULL)
		return;

	pktlen = ifd->curlen_[ifd->qo_];
	borrowed = ifd->borrowed_[ifd->qo_];
	borrows = borrowed;

	PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);

	/*
	 * Run estimator on class and its ancestors.
	 */
	/*
	 * rmc_update_class_util is designed to be called when the
	 * transfer is completed from an xmit complete interrupt,
	 * but most drivers don't implement an upcall for that.
	 * so, just use estimated completion time.
	 * as a result, ifd->qi_ and ifd->qo_ are always synced.
	 */
	nowp = &ifd->now_[ifd->qo_];
	/* get pkt_time (for link) in usec */
#if 1  /* use approximation */
	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_;
	pkt_time = NSEC_TO_USEC(pkt_time);
#else
	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000;
#endif
#if 1 /* ALTQ4PPP */
	if (TV_LT(nowp, &ifd->ifnow_)) {
		int iftime;

		/*
		 * make sure the estimated completion time does not go
		 * too far.  it can happen when the link layer supports
		 * data compression or the interface speed is set to
		 * a much lower value.
		 */
		TV_DELTA(&ifd->ifnow_, nowp, iftime);
		if (iftime+pkt_time < ifd->maxiftime_) {
			TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
		} else {
			TV_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_);
		}
	} else {
		TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
	}
#else
	if (TV_LT(nowp, &ifd->ifnow_)) {
		TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
	} else {
		TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
	}
#endif

	while (cl != NULL) {
		TV_DELTA(&ifd->ifnow_, &cl->last_, idle);
		if (idle >= 2000000)
			/*
			 * this class is idle enough, reset avgidle.
			 * (TV_DELTA returns 2000000 us when delta is large.)
			 */
			cl->avgidle_ = cl->maxidle_;

		/* get pkt_time (for class) in usec */
#if 1  /* use approximation */
		pkt_time = pktlen * cl->ns_per_byte_;
		pkt_time = NSEC_TO_USEC(pkt_time);
#else
		pkt_time = pktlen * cl->ns_per_byte_ / 1000;
#endif
		idle -= pkt_time;

		avgidle = cl->avgidle_;
		avgidle += idle - (avgidle >> RM_FILTER_GAIN);
		cl->avgidle_ = avgidle;

		/* Are we overlimit ? */
		if (avgidle <= 0) {
			CBQTRACE(rmc_update_class_util, 'milo', cl->stats_.handle);
#if 1 /* ALTQ */
			/*
			 * need some lower bound for avgidle, otherwise
			 * a borrowing class gets unbounded penalty.
			 */
			if (avgidle < cl->minidle_)
				avgidle = cl->avgidle_ = cl->minidle_;
#endif
			/* set next idle to make avgidle 0 */
			tidle = pkt_time +
				(((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN);
			TV_ADD_DELTA(nowp, tidle, &cl->undertime_);
			++cl->stats_.over;
		} else {
			cl->avgidle_ =
			    (avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle;
			cl->undertime_.tv_sec = 0;
			if (cl->sleeping_) {
				CALLOUT_STOP(&cl->callout_);
				cl->sleeping_ = 0;
			}
		}

		if (borrows != NULL) {
			if (borrows != cl)
				++cl->stats_.borrows;
			else
				borrows = NULL;
		}
		cl->last_ = ifd->ifnow_;
		cl->last_pkttime_ = pkt_time;

#if 1
		if (cl->parent_ == NULL) {
			/* take stats of root class */
			PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
		}
#endif

		cl = cl->parent_;
	}

	/*
	 * Check to see if cutoff needs to be set to a new level.
	 */
	cl = ifd->class_[ifd->qo_];
	if (borrowed && (ifd->cutoff_ >= borrowed->depth_)) {
#if 1 /* ALTQ */
		if ((qlen(cl->q_) <= 0) || TV_LT(nowp, &borrowed->undertime_)) {
			rmc_tl_satisfied(ifd, nowp);
			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
		} else {
			ifd->cutoff_ = borrowed->depth_;
			CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
		}
#else /* !ALTQ */
		if ((qlen(cl->q_) <= 1) || TV_LT(&now, &borrowed->undertime_)) {
			reset_cutoff(ifd);
#ifdef notdef
			rmc_tl_satisfied(ifd, &now);
#endif
			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
		} else {
			ifd->cutoff_ = borrowed->depth_;
			CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
		}
#endif /* !ALTQ */
	}

	/*
	 * Release class slot
	 */
	ifd->borrowed_[ifd->qo_] = NULL;
	ifd->class_[ifd->qo_] = NULL;
	ifd->qo_ = (ifd->qo_ + 1) % ifd->maxqueued_;
	ifd->queued_--;
}

/*
 * void
 * rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific)
 *	over-limit action routines.  These get invoked by rmc_under_limit()
 *	if a class with packets to send is over its bandwidth limit & can't
 *	borrow from a parent class.
 *
 *	Returns: NONE
 */

static void
rmc_drop_action(struct rm_class *cl)
{
	struct rm_ifdat	*ifd = cl->ifdat_;

	ASSERT(qlen(cl->q_) > 0);
	_rmc_dropq(cl);
	if (qempty(cl->q_))
		ifd->na_[cl->pri_]--;
}

void
rmc_dropall(struct rm_class *cl)
{
	struct rm_ifdat	*ifd = cl->ifdat_;

	if (!qempty(cl->q_)) {
		_flushq(cl->q_);

		ifd->na_[cl->pri_]--;
	}
}

#if (__FreeBSD_version > 300000)
static int tvhzto(struct timeval *);

static int
tvhzto(struct timeval *tv)
{
	struct timeval t2;

	getmicrotime(&t2);
	t2.tv_sec = tv->tv_sec - t2.tv_sec;
	t2.tv_usec = tv->tv_usec - t2.tv_usec;
	return (tvtohz(&t2));
}
#endif /* __FreeBSD_version > 300000 */

/*
 * void
 * rmc_delay_action(struct rm_class *cl, struct rm_class *borrow) - This
 *	function is the generic CBQ delay action routine.  It is invoked
 *	via rmc_under_limit when the class is discovered to be overlimit.
 *
 *	If the delay action is the result of the borrowed class being
 *	overlimit, then delay for the offtime of that overlimit borrowed
 *	class.
 *
 *	Returns: NONE
 */

void
rmc_delay_action(struct rm_class *cl, struct rm_class *borrow)
{
	int	ndelay, t, extradelay;

	cl->stats_.overactions++;
	TV_DELTA(&cl->undertime_, &cl->overtime_, ndelay);
#ifndef BORROW_OFFTIME
	ndelay += cl->offtime_;
#endif

	if (!cl->sleeping_) {
		CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle);
#ifdef BORROW_OFFTIME
		if (borrow != NULL)
			extradelay = borrow->offtime_;
		else
#endif
			extradelay = cl->offtime_;

#ifdef ALTQ
		/*
		 * XXX recalculate suspend time:
		 * current undertime is (tidle + pkt_time) calculated
		 * from the last transmission.
		 *	tidle: time required to bring avgidle back to 0
		 *	pkt_time: target waiting time for this class
		 * we need to replace pkt_time by offtime
		 */
		extradelay -= cl->last_pkttime_;
#endif
		if (extradelay > 0) {
			TV_ADD_DELTA(&cl->undertime_, extradelay, &cl->undertime_);
			ndelay += extradelay;
		}

		cl->sleeping_ = 1;
		cl->stats_.delays++;

		/*
		 * Since packets are phased randomly with respect to the
		 * clock, 1 tick (the next clock tick) can be an arbitrarily
		 * short time so we have to wait for at least two ticks.
		 * NOTE:  If there's no other traffic, we need the timer as
		 * a 'backstop' to restart this class.
		 */
		if (ndelay > tick * 2) {
#ifdef __FreeBSD__
			/* FreeBSD rounds up the tick */
			t = tvhzto(&cl->undertime_);
#else
			/* other BSDs round down the tick */
			t = tvhzto(&cl->undertime_) + 1;
#endif
		} else
			t = 2;
		CALLOUT_RESET(&cl->callout_, t,
			      (timeout_t *)rmc_restart, (void *)cl);
	}
}

/*
 * void
 * rmc_restart() - is just a helper routine for rmc_delay_action -- it is
 *	called by the system timer code & is responsible for checking if the
 *	class is still sleeping (it might have been restarted as a side
 *	effect of the queue scan on a packet arrival) and, if so, restarting
 *	output for the class.  Inspecting the class state & restarting output
 *	require locking the class structure.  In general the driver is
 *	responsible for locking but this is the only routine that is not
 *	called directly or indirectly from the interface driver so it has
 *	to know about system locking conventions.  Under BSD, locking is done
 *	by raising IPL to splnet so that's what's implemented here.  On a
 *	different system this would probably need to be changed.
 *
 *	Returns:	NONE
 */

static void
rmc_restart(struct rm_class *cl)
{
	struct rm_ifdat	*ifd = cl->ifdat_;
	int		 s;

	s = splnet();
	if (cl->sleeping_) {
		cl->sleeping_ = 0;
		cl->undertime_.tv_sec = 0;

		if (ifd->queued_ < ifd->maxqueued_ && ifd->restart != NULL) {
			CBQTRACE(rmc_restart, 'trts', cl->stats_.handle);
			(ifd->restart)(ifd->ifq_);
		}
	}
	splx(s);
}

/*
 * void
 * rmc_root_overlimit(struct rm_class *cl, struct rm_class *borrow) - This
 *	is the generic overlimit handling routine for the root class of the
 *	link sharing structure.
 *
 *	Returns: NONE
 */

static void
rmc_root_overlimit(struct rm_class *cl,
    struct rm_class *borrow)
{
	panic("rmc_root_overlimit");
}

/*
 * Packet Queue handling routines.  These localize the effects on the
 *	rest of the code of whether queues are RED queues or droptail
 *	queues.
 */
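
/*
 * A note on the dispatch below: cl->red_ stores the RED state or,
 * cast to rio_t, the RIO state; qtype(cl->q_), set in rmc_newclass(),
 * selects which branch is taken.  With neither RED nor RIO configured
 * the class falls through to the plain FIFO routines at the bottom of
 * this file.
 */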

static int
_rmc_addq(rm_class_t *cl, mbuf_t *m)
{
#ifdef ALTQ_RIO
	if (q_is_rio(cl->q_))
		return rio_addq((rio_t *)cl->red_, cl->q_, m, cl->pktattr_);
#endif
#ifdef ALTQ_RED
	if (q_is_red(cl->q_))
		return red_addq(cl->red_, cl->q_, m, cl->pktattr_);
#endif /* ALTQ_RED */

	if (cl->flags_ & RMCF_CLEARDSCP)
		write_dsfield(m, cl->pktattr_, 0);

	_addq(cl->q_, m);
	return (0);
}

/* note: _rmc_dropq is not called for red */
static void
_rmc_dropq(rm_class_t *cl)
{
	mbuf_t	*m;

	if ((m = _getq(cl->q_)) != NULL)
		m_freem(m);
}

static mbuf_t *
_rmc_getq(rm_class_t *cl)
{
#ifdef ALTQ_RIO
	if (q_is_rio(cl->q_))
		return rio_getq((rio_t *)cl->red_, cl->q_);
#endif
#ifdef ALTQ_RED
	if (q_is_red(cl->q_))
		return red_getq(cl->red_, cl->q_);
#endif
	return _getq(cl->q_);
}

static mbuf_t *
_rmc_pollq(rm_class_t *cl)
{
	return qhead(cl->q_);
}

#ifdef CBQ_TRACE

struct cbqtrace		 cbqtrace_buffer[NCBQTRACE+1];
struct cbqtrace		*cbqtrace_ptr = NULL;
int			 cbqtrace_count;

/*
 * DDB hook to trace cbq events:
 *  the last 1024 events are held in a circular buffer.
 *  use "call cbqtrace_dump(N)" to display 20 events from Nth event.
 */
void cbqtrace_dump(int);
static char *rmc_funcname(void *);

static struct rmc_funcs {
	void	*func;
	char	*name;
} rmc_funcs[] =
{
	rmc_init,		"rmc_init",
	rmc_queue_packet,	"rmc_queue_packet",
	rmc_under_limit,	"rmc_under_limit",
	rmc_update_class_util,	"rmc_update_class_util",
	rmc_delay_action,	"rmc_delay_action",
	rmc_restart,		"rmc_restart",
	_rmc_wrr_dequeue_next,	"_rmc_wrr_dequeue_next",
	NULL,			NULL
};

static char *
rmc_funcname(void *func)
{
	struct rmc_funcs *fp;

	for (fp = rmc_funcs; fp->func != NULL; fp++)
		if (fp->func == func)
			return (fp->name);
	return ("unknown");
}

void
cbqtrace_dump(int counter)
{
	int	 i, *p;
	char	*cp;

	counter = counter % NCBQTRACE;
	p = (int *)&cbqtrace_buffer[counter];

	for (i=0; i<20; i++) {
		printf("[0x%x] ", *p++);
		printf("%s: ", rmc_funcname((void *)*p++));
		cp = (char *)p++;
		printf("%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]);
		printf("%d\n",*p++);

		if (p >= (int *)&cbqtrace_buffer[NCBQTRACE])
			p = (int *)cbqtrace_buffer;
	}
}
#endif /* CBQ_TRACE */
#endif /* ALTQ_CBQ */

#if defined(ALTQ_CBQ) || defined(ALTQ_RED) || defined(ALTQ_RIO) || defined(ALTQ_HFSC) || defined(ALTQ_PRIQ)
#if !defined(__GNUC__) || defined(ALTQ_DEBUG)

void
_addq(class_queue_t *q, mbuf_t *m)
{
	mbuf_t	*m0;

	if ((m0 = qtail(q)) != NULL)
		m->m_nextpkt = m0->m_nextpkt;
	else
		m0 = m;
	m0->m_nextpkt = m;
	qtail(q) = m;
	qlen(q)++;
}
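
/*
 * Layout note for the queue routines here: the queue is a circular,
 * singly-linked list threaded through m_nextpkt, with qtail(q)
 * pointing at the most recently added mbuf and qtail(q)->m_nextpkt at
 * the head (qtail(q) is NULL when the queue is empty).  _getq() can
 * therefore unlink the head in constant time, while _getq_tail()
 * below must walk the ring to find the tail's predecessor.
 */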

mbuf_t *
_getq(class_queue_t *q)
{
	mbuf_t	*m, *m0;

	if ((m = qtail(q)) == NULL)
		return (NULL);
	if ((m0 = m->m_nextpkt) != m)
		m->m_nextpkt = m0->m_nextpkt;
	else {
		ASSERT(qlen(q) == 1);
		qtail(q) = NULL;
	}
	qlen(q)--;
	m0->m_nextpkt = NULL;
	return (m0);
}

/* drop a packet at the tail of the queue */
mbuf_t *
_getq_tail(class_queue_t *q)
{
	mbuf_t	*m, *m0, *prev;

	if ((m = m0 = qtail(q)) == NULL)
		return NULL;
	do {
		prev = m0;
		m0 = m0->m_nextpkt;
	} while (m0 != m);
	prev->m_nextpkt = m->m_nextpkt;
	if (prev == m)  {
		ASSERT(qlen(q) == 1);
		qtail(q) = NULL;
	} else
		qtail(q) = prev;
	qlen(q)--;
	m->m_nextpkt = NULL;
	return (m);
}

/* randomly select a packet in the queue */
mbuf_t *
_getq_random(class_queue_t *q)
{
	struct mbuf	*m;
	int		 i, n;

	if ((m = qtail(q)) == NULL)
		return NULL;
	if (m->m_nextpkt == m) {
		ASSERT(qlen(q) == 1);
		qtail(q) = NULL;
	} else {
		struct mbuf *prev = NULL;

		n = arc4random() % qlen(q) + 1;
		for (i = 0; i < n; i++) {
			prev = m;
			m = m->m_nextpkt;
		}
		prev->m_nextpkt = m->m_nextpkt;
		if (m == qtail(q))
			qtail(q) = prev;
	}
	qlen(q)--;
	m->m_nextpkt = NULL;
	return (m);
}

void
_removeq(class_queue_t *q, mbuf_t *m)
{
	mbuf_t	*m0, *prev;

	m0 = qtail(q);
	do {
		prev = m0;
		m0 = m0->m_nextpkt;
	} while (m0 != m);
	prev->m_nextpkt = m->m_nextpkt;
	if (prev == m)
		qtail(q) = NULL;
	else if (qtail(q) == m)
		qtail(q) = prev;
	qlen(q)--;
}

void
_flushq(class_queue_t *q)
{
	mbuf_t *m;

	while ((m = _getq(q)) != NULL)
		m_freem(m);
	ASSERT(qlen(q) == 0);
}

#endif /* !__GNUC__ || ALTQ_DEBUG */
#endif /* ALTQ_CBQ || ALTQ_RED || ALTQ_RIO || ALTQ_HFSC || ALTQ_PRIQ */