      1 /*	$NetBSD: altq_rmclass.c,v 1.23 2021/07/13 07:59:48 ozaki-r Exp $	*/
      2 /*	$KAME: altq_rmclass.c,v 1.19 2005/04/13 03:44:25 suz Exp $	*/
      3 
      4 /*
      5  * Copyright (c) 1991-1997 Regents of the University of California.
      6  * All rights reserved.
      7  *
      8  * Redistribution and use in source and binary forms, with or without
      9  * modification, are permitted provided that the following conditions
     10  * are met:
     11  * 1. Redistributions of source code must retain the above copyright
     12  *    notice, this list of conditions and the following disclaimer.
     13  * 2. Redistributions in binary form must reproduce the above copyright
     14  *    notice, this list of conditions and the following disclaimer in the
     15  *    documentation and/or other materials provided with the distribution.
     16  * 3. All advertising materials mentioning features or use of this software
     17  *    must display the following acknowledgement:
     18  *      This product includes software developed by the Network Research
     19  *      Group at Lawrence Berkeley Laboratory.
     20  * 4. Neither the name of the University nor of the Laboratory may be used
     21  *    to endorse or promote products derived from this software without
     22  *    specific prior written permission.
     23  *
     24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     34  * SUCH DAMAGE.
     35  *
     36  * LBL code modified by speer@eng.sun.com, May 1997.
     37  * For questions and/or comments, please send mail to cbq@ee.lbl.gov
     38  */
     39 
     40 #include <sys/cdefs.h>
     41 __KERNEL_RCSID(0, "$NetBSD: altq_rmclass.c,v 1.23 2021/07/13 07:59:48 ozaki-r Exp $");
     42 
     43 /* #ident "@(#)rm_class.c  1.48     97/12/05 SMI" */
     44 
     45 #ifdef _KERNEL_OPT
     46 #include "opt_altq.h"
     47 #include "opt_inet.h"
     48 #endif
     49 
     50 #ifdef ALTQ_CBQ	/* cbq is enabled by ALTQ_CBQ option in opt_altq.h */
     51 
     52 #include <sys/param.h>
     53 #include <sys/malloc.h>
     54 #include <sys/mbuf.h>
     55 #include <sys/socket.h>
     56 #include <sys/systm.h>
     57 #include <sys/errno.h>
     58 #include <sys/time.h>
     59 #ifdef ALTQ3_COMPAT
     60 #include <sys/kernel.h>
     61 #endif
     62 #include <sys/cprng.h>
     63 
     64 #include <net/if.h>
     65 #ifdef ALTQ3_COMPAT
     66 #include <netinet/in.h>
     67 #include <netinet/in_systm.h>
     68 #include <netinet/ip.h>
     69 #endif
     70 
     71 #include <altq/altq.h>
     72 #include <altq/altq_rmclass.h>
     73 #include <altq/altq_rmclass_debug.h>
     74 #include <altq/altq_red.h>
     75 #include <altq/altq_rio.h>
     76 
     77 /*
     78  * Local Macros
     79  */
     80 
     81 #define	reset_cutoff(ifd)	{ ifd->cutoff_ = RM_MAXDEPTH; }
     82 
     83 /*
     84  * Local routines.
     85  */
     86 
     87 static int	rmc_satisfied(struct rm_class *, struct timeval *);
     88 static void	rmc_wrr_set_weights(struct rm_ifdat *);
     89 static void	rmc_depth_compute(struct rm_class *);
     90 static void	rmc_depth_recompute(rm_class_t *);
     91 
     92 static mbuf_t	*_rmc_wrr_dequeue_next(struct rm_ifdat *, int);
     93 static mbuf_t	*_rmc_prr_dequeue_next(struct rm_ifdat *, int);
     94 
     95 static int	_rmc_addq(rm_class_t *, mbuf_t *);
     96 static void	_rmc_dropq(rm_class_t *);
     97 static mbuf_t	*_rmc_getq(rm_class_t *);
     98 static mbuf_t	*_rmc_pollq(rm_class_t *);
     99 
    100 static int	rmc_under_limit(struct rm_class *, struct timeval *);
    101 static void	rmc_tl_satisfied(struct rm_ifdat *, struct timeval *);
    102 static void	rmc_drop_action(struct rm_class *);
    103 static void	rmc_restart(struct rm_class *);
    104 static void	rmc_root_overlimit(struct rm_class *, struct rm_class *);
    105 
    106 #define	BORROW_OFFTIME
    107 /*
    108  * BORROW_OFFTIME (experimental):
    109  * borrow the offtime of the class being borrowed from.
    110  * the reason is that when its own offtime is set, the class is unable
    111  * to borrow much, especially when cutoff is taking effect.
    112  * but when the borrowed class is overloaded (avgidle is close to minidle),
    113  * fall back to the borrowing class's own offtime to avoid overload.
    114  */
    115 #define	ADJUST_CUTOFF
    116 /*
    117  * ADJUST_CUTOFF (experimental):
    118  * if no underlimit class is found due to cutoff, increase cutoff and
    119  * retry the scheduling loop.
    120  * also, don't invoke delay_actions while cutoff is taking effect,
    121  * since a sleeping class won't have a chance to be scheduled in the
    122  * next loop.
    123  *
    124  * the heuristics for setting the top-level variable (cutoff_) become:
    125  *	1. if a packet arrives for a not-overlimit class, set cutoff
    126  *	   to the depth of the class.
    127  *	2. if cutoff is i, and a packet arrives for an overlimit class
    128  *	   with an underlimit ancestor at a lower level than i (say j),
    129  *	   then set cutoff to j.
    130  *	3. when scheduling a packet, if there is no underlimit class
    131  *	   due to the current cutoff level, increase cutoff by 1 and
    132  *	   then try to schedule again.
    133  */
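        /*
         * illustrative example of the cutoff_ heuristics (hypothetical
         * hierarchy, not from the original sources): take a root at depth 2,
         * one interior class at depth 1, and leaves at depth 0.  a packet
         * arriving for an underlimit leaf sets cutoff_ to 0 (rule 1), so no
         * class may borrow.  if a packet then arrives for an overlimit leaf
         * whose depth-1 parent is underlimit, rule 2 raises cutoff_ to 1,
         * letting that leaf borrow from its parent but not from the root.
         * rule 3 raises cutoff_ further only when the scheduler finds
         * nothing to send at the current level.
         */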
    134 
    135 /*
    136  * rm_class_t *
    137  * rmc_newclass(...) - Create a new resource management class at priority
    138  * 'pri' on the interface given by 'ifd'.
    139  *
    140  * nsecPerByte  is the data rate of the interface in nanoseconds/byte.
    141  *              E.g., 800 for a 10Mb/s ethernet.  If the class gets less
    142  *              than 100% of the bandwidth, this number should be the
    143  *              'effective' rate for the class.  Let f be the
    144  *              bandwidth fraction allocated to this class, and let
    145  *              nsPerByte be the data rate of the output link in
    146  *              nanoseconds/byte.  Then nsecPerByte is set to
    147  *              nsPerByte / f.  E.g., 1600 (= 800 / .5)
    148  *              for a class that gets 50% of an ethernet's bandwidth.
    149  *
    150  * action       the routine to call when the class is over limit.
    151  *
    152  * maxq         max allowable queue size for class (in packets).
    153  *
    154  * parent       parent class pointer.
    155  *
    156  * borrow       class to borrow from (should be either 'parent' or null).
    157  *
    158  * maxidle      max value allowed for class 'idle' time estimate (this
    159  *              parameter determines how large an initial burst of packets
    160  *              can be before the overlimit action is invoked).
    161  *
    162  * offtime      how long 'delay' action will delay when class goes over
    163  *              limit (this parameter determines the steady-state burst
    164  *              size when a class is running over its limit).
    165  *
    166  * Maxidle and offtime have to be computed from the following:  If the
    167  * average packet size is s, the bandwidth fraction allocated to this
    168  * class is f, we want to allow b packet bursts, and the gain of the
    169  * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then:
    170  *
    171  *   ptime = s * nsPerByte * (1 - f) / f
    172  *   maxidle = ptime * (1 - g^b) / g^b
    173  *   minidle = -ptime * (1 / (f - 1))
    174  *   offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1))
    175  *
    176  * Operationally, it's convenient to specify maxidle & offtime in units
    177  * independent of the link bandwidth so the maxidle & offtime passed to
    178  * this routine are the above values multiplied by 8*f/(1000*nsPerByte).
    179  * (The constant factor is a scale factor needed to make the parameters
    180  * integers.  This scaling also means that the 'unscaled' values of
    181  * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds,
    182  * not nanoseconds.)  Also note that the 'idle' filter computation keeps
    183  * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of
    184  * maxidle also must be scaled upward by this value.  Thus, the passed
    185  * values for maxidle and offtime can be computed as follows:
    186  *
    187  * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte)
    188  * offtime = offtime * 8 / (1000 * nsecPerByte)
    189  *
    190  * When USE_HRTIME is employed, then maxidle and offtime become:
    191  * 	maxidle = maxidle * (8.0 / nsecPerByte);
    192  * 	offtime = offtime * (8.0 / nsecPerByte);
    193  */
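        /*
         * worked example of the scaling above (hypothetical numbers): for a
         * class given 50% of a 10Mb/s ethernet, nsecPerByte = 800 / .5 =
         * 1600.  a target maxidle of 2 ms = 2,000,000 ns is passed in as
         *	maxidle = 2000000 * 2^RM_FILTER_GAIN * 8 / (1000 * 1600)
         *		= 10 * 2^RM_FILTER_GAIN
         * and the code below recovers (maxidle * nsecPerByte) / 8 =
         * 2000 * 2^RM_FILTER_GAIN, i.e. 2000 usec scaled upward by the
         * filter gain, matching the internal scaling of the 'idle' estimate.
         */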
    194 struct rm_class *
    195 rmc_newclass(int pri, struct rm_ifdat *ifd, u_int nsecPerByte,
    196     void (*action)(rm_class_t *, rm_class_t *), int maxq,
    197     struct rm_class *parent, struct rm_class *borrow, u_int maxidle,
    198     int minidle, u_int offtime, int pktsize, int flags)
    199 {
    200 	struct rm_class	*cl;
    201 	struct rm_class	*peer;
    202 	int		 s;
    203 
    204 	if (pri >= RM_MAXPRIO)
    205 		return (NULL);
    206 #ifndef ALTQ_RED
    207 	if (flags & RMCF_RED) {
    208 #ifdef ALTQ_DEBUG
    209 		printf("rmc_newclass: RED not configured for CBQ!\n");
    210 #endif
    211 		return (NULL);
    212 	}
    213 #endif
    214 #ifndef ALTQ_RIO
    215 	if (flags & RMCF_RIO) {
    216 #ifdef ALTQ_DEBUG
    217 		printf("rmc_newclass: RIO not configured for CBQ!\n");
    218 #endif
    219 		return (NULL);
    220 	}
    221 #endif
    222 
    223 	cl = malloc(sizeof(struct rm_class), M_DEVBUF, M_WAITOK|M_ZERO);
    224 	if (cl == NULL)
    225 		return (NULL);
    226 	CALLOUT_INIT(&cl->callout_);
    227 
    228 	cl->q_ = malloc(sizeof(class_queue_t), M_DEVBUF, M_WAITOK|M_ZERO);
    229 	if (cl->q_ == NULL) {
    230 		free(cl, M_DEVBUF);
    231 		return (NULL);
    232 	}
    233 
    234 	/*
    235 	 * Class initialization.
    236 	 */
    237 	cl->children_ = NULL;
    238 	cl->parent_ = parent;
    239 	cl->borrow_ = borrow;
    240 	cl->leaf_ = 1;
    241 	cl->ifdat_ = ifd;
    242 	cl->pri_ = pri;
    243 	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
    244 	cl->depth_ = 0;
    245 	cl->qthresh_ = 0;
    246 	cl->ns_per_byte_ = nsecPerByte;
    247 
    248 	qlimit(cl->q_) = maxq;
    249 	qtype(cl->q_) = Q_DROPHEAD;
    250 	qlen(cl->q_) = 0;
    251 	cl->flags_ = flags;
    252 
    253 #if 1 /* minidle is also scaled in ALTQ */
    254 	cl->minidle_ = (minidle * (int)nsecPerByte) / 8;
    255 	if (cl->minidle_ > 0)
    256 		cl->minidle_ = 0;
    257 #else
    258 	cl->minidle_ = minidle;
    259 #endif
    260 	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
    261 	if (cl->maxidle_ == 0)
    262 		cl->maxidle_ = 1;
    263 #if 1 /* offtime is also scaled in ALTQ */
    264 	cl->avgidle_ = cl->maxidle_;
    265 	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
    266 	if (cl->offtime_ == 0)
    267 		cl->offtime_ = 1;
    268 #else
    269 	cl->avgidle_ = 0;
    270 	cl->offtime_ = (offtime * nsecPerByte) / 8;
    271 #endif
    272 	cl->overlimit = action;
    273 
    274 #ifdef ALTQ_RED
    275 	if (flags & (RMCF_RED|RMCF_RIO)) {
    276 		int red_flags, red_pkttime;
    277 
    278 		red_flags = 0;
    279 		if (flags & RMCF_ECN)
    280 			red_flags |= REDF_ECN;
    281 		if (flags & RMCF_FLOWVALVE)
    282 			red_flags |= REDF_FLOWVALVE;
    283 #ifdef ALTQ_RIO
    284 		if (flags & RMCF_CLEARDSCP)
    285 			red_flags |= RIOF_CLEARDSCP;
    286 #endif
    287 		red_pkttime = nsecPerByte * pktsize  / 1000;
    288 
    289 		if (flags & RMCF_RED) {
    290 			cl->red_ = red_alloc(0, 0,
    291 			    qlimit(cl->q_) * 10/100,
    292 			    qlimit(cl->q_) * 30/100,
    293 			    red_flags, red_pkttime);
    294 			if (cl->red_ != NULL)
    295 				qtype(cl->q_) = Q_RED;
    296 		}
    297 #ifdef ALTQ_RIO
    298 		else {
    299 			cl->red_ = (red_t *)rio_alloc(0, NULL,
    300 						      red_flags, red_pkttime);
    301 			if (cl->red_ != NULL)
    302 				qtype(cl->q_) = Q_RIO;
    303 		}
    304 #endif
    305 	}
    306 #endif /* ALTQ_RED */
    307 
    308 	/*
    309 	 * put the class into the class tree
    310 	 */
    311 	s = splnet();
    312 	if ((peer = ifd->active_[pri]) != NULL) {
    313 		/* find the last class at this pri */
    314 		cl->peer_ = peer;
    315 		while (peer->peer_ != ifd->active_[pri])
    316 			peer = peer->peer_;
    317 		peer->peer_ = cl;
    318 	} else {
    319 		ifd->active_[pri] = cl;
    320 		cl->peer_ = cl;
    321 	}
    322 
    323 	if (cl->parent_) {
    324 		cl->next_ = parent->children_;
    325 		parent->children_ = cl;
    326 		parent->leaf_ = 0;
    327 	}
    328 
    329 	/*
    330 	 * Compute the depth of this class and its ancestors in the class
    331 	 * hierarchy.
    332 	 */
    333 	rmc_depth_compute(cl);
    334 
    335 	/*
    336 	 * If CBQ's WRR is enabled, then initialize the class WRR state.
    337 	 */
    338 	if (ifd->wrr_) {
    339 		ifd->num_[pri]++;
    340 		ifd->alloc_[pri] += cl->allotment_;
    341 		rmc_wrr_set_weights(ifd);
    342 	}
    343 	splx(s);
    344 	return (cl);
    345 }
    346 
    347 int
    348 rmc_modclass(struct rm_class *cl, u_int nsecPerByte, int maxq, u_int maxidle,
    349     int minidle, u_int offtime, int pktsize)
    350 {
    351 	struct rm_ifdat	*ifd;
    352 	u_int		 old_allotment;
    353 	int		 s;
    354 
    355 	ifd = cl->ifdat_;
    356 	old_allotment = cl->allotment_;
    357 
    358 	s = splnet();
    359 	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
    360 	cl->qthresh_ = 0;
    361 	cl->ns_per_byte_ = nsecPerByte;
    362 
    363 	qlimit(cl->q_) = maxq;
    364 
    365 #if 1 /* minidle is also scaled in ALTQ */
    366 	cl->minidle_ = (minidle * nsecPerByte) / 8;
    367 	if (cl->minidle_ > 0)
    368 		cl->minidle_ = 0;
    369 #else
    370 	cl->minidle_ = minidle;
    371 #endif
    372 	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
    373 	if (cl->maxidle_ == 0)
    374 		cl->maxidle_ = 1;
    375 #if 1 /* offtime is also scaled in ALTQ */
    376 	cl->avgidle_ = cl->maxidle_;
    377 	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
    378 	if (cl->offtime_ == 0)
    379 		cl->offtime_ = 1;
    380 #else
    381 	cl->avgidle_ = 0;
    382 	cl->offtime_ = (offtime * nsecPerByte) / 8;
    383 #endif
    384 
    385 	/*
    386 	 * If CBQ's WRR is enabled, then initialize the class WRR state.
    387 	 */
    388 	if (ifd->wrr_) {
    389 		ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment;
    390 		rmc_wrr_set_weights(ifd);
    391 	}
    392 	splx(s);
    393 	return (0);
    394 }
    395 
    396 /*
    397  * static void
    398  * rmc_wrr_set_weights(struct rm_ifdat *ifdat) - This function computes
    399  *	the appropriate round robin weights for the CBQ weighted round robin
    400  *	algorithm.
    401  *
    402  *	Returns: NONE
    403  */
    404 
    405 static void
    406 rmc_wrr_set_weights(struct rm_ifdat *ifd)
    407 {
    408 	int		i;
    409 	struct rm_class	*cl, *clh;
    410 
    411 	for (i = 0; i < RM_MAXPRIO; i++) {
    412 		/*
    413 		 * This is inverted from that of the simulator to
    414 		 * maintain precision.
    415 		 */
    416 		if (ifd->num_[i] == 0)
    417 			ifd->M_[i] = 0;
    418 		else
    419 			ifd->M_[i] = ifd->alloc_[i] /
    420 				(ifd->num_[i] * ifd->maxpkt_);
    421 		/*
    422 		 * Compute the weighted allotment for each class.
    423 		 * This takes the expensive div instruction out
    424 		 * of the main loop for the wrr scheduling path.
    425 		 * These only get recomputed when a class comes or
    426 		 * goes.
    427 		 */
    428 		if (ifd->active_[i] != NULL) {
    429 			clh = cl = ifd->active_[i];
    430 			do {
    431 				/* safe-guard for slow link or alloc_ == 0 */
    432 				if (ifd->M_[i] == 0)
    433 					cl->w_allotment_ = 0;
    434 				else
    435 					cl->w_allotment_ = cl->allotment_ /
    436 						ifd->M_[i];
    437 				cl = cl->peer_;
    438 			} while ((cl != NULL) && (cl != clh));
    439 		}
    440 	}
    441 }
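        /*
         * worked example (hypothetical numbers): two classes at one priority
         * with allotments of 500000 and 250000 bytes/sec on an interface
         * with maxpkt_ = 1500 give
         *	M_[i] = (500000 + 250000) / (2 * 1500) = 250
         * so w_allotment_ becomes 500000 / 250 = 2000 bytes for the first
         * class and 250000 / 250 = 1000 bytes for the second: a 2:1 share
         * per WRR round, each round totalling num_[i] * maxpkt_ = 3000 bytes.
         */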
    442 
    443 int
    444 rmc_get_weight(struct rm_ifdat *ifd, int pri)
    445 {
    446 	if ((pri >= 0) && (pri < RM_MAXPRIO))
    447 		return (ifd->M_[pri]);
    448 	else
    449 		return (0);
    450 }
    451 
    452 /*
    453  * static void
    454  * rmc_depth_compute(struct rm_class *cl) - This function computes the
    455  *	appropriate depth of class 'cl' and its ancestors.
    456  *
    457  *	Returns:	NONE
    458  */
    459 
    460 static void
    461 rmc_depth_compute(struct rm_class *cl)
    462 {
    463 	rm_class_t	*t = cl, *p;
    464 
    465 	/*
    466 	 * Recompute the depth for the branch of the tree.
    467 	 */
    468 	while (t != NULL) {
    469 		p = t->parent_;
    470 		if (p && (t->depth_ >= p->depth_)) {
    471 			p->depth_ = t->depth_ + 1;
    472 			t = p;
    473 		} else
    474 			t = NULL;
    475 	}
    476 }
    477 
    478 /*
    479  * static void
    480  * rmc_depth_recompute(struct rm_class *cl) - This function re-computes
    481  *	the depth of the tree after a class has been deleted.
    482  *
    483  *	Returns:	NONE
    484  */
    485 
    486 static void
    487 rmc_depth_recompute(rm_class_t *cl)
    488 {
    489 #if 1 /* ALTQ */
    490 	rm_class_t	*p, *t;
    491 
    492 	p = cl;
    493 	while (p != NULL) {
    494 		if ((t = p->children_) == NULL) {
    495 			p->depth_ = 0;
    496 		} else {
    497 			int cdepth = 0;
    498 
    499 			while (t != NULL) {
    500 				if (t->depth_ > cdepth)
    501 					cdepth = t->depth_;
    502 				t = t->next_;
    503 			}
    504 
    505 			if (p->depth_ == cdepth + 1)
    506 				/* no change to this parent */
    507 				return;
    508 
    509 			p->depth_ = cdepth + 1;
    510 		}
    511 
    512 		p = p->parent_;
    513 	}
    514 #else
    515 	rm_class_t	*t;
    516 
    517 	if (cl->depth_ >= 1) {
    518 		if (cl->children_ == NULL) {
    519 			cl->depth_ = 0;
    520 		} else if ((t = cl->children_) != NULL) {
    521 			while (t != NULL) {
    522 				if (t->children_ != NULL)
    523 					rmc_depth_recompute(t);
    524 				t = t->next_;
    525 			}
    526 		} else
    527 			rmc_depth_compute(cl);
    528 	}
    529 #endif
    530 }
    531 
    532 /*
    533  * void
    534  * rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This
    535  *	function deletes a class from the link-sharing structure and frees
    536  *	all resources associated with the class.
    537  *
    538  *	Returns: NONE
    539  */
    540 
    541 void
    542 rmc_delete_class(struct rm_ifdat *ifd, struct rm_class *cl)
    543 {
    544 	struct rm_class	*p, *head, *previous;
    545 	int		 s;
    546 
    547 	ASSERT(cl->children_ == NULL);
    548 
    549 	if (cl->sleeping_)
    550 		CALLOUT_STOP(&cl->callout_);
    551 
    552 	s = splnet();
    553 	/*
    554 	 * Free packets in the packet queue.
    555 	 * XXX - this may not be a desired behavior.  Packets should be
    556 	 *		re-queued.
    557 	 */
    558 	rmc_dropall(cl);
    559 
    560 	/*
    561 	 * If the class has a parent, then remove the class from the
    562 	 * parent's children chain.
    563 	 */
    564 	if (cl->parent_ != NULL) {
    565 		head = cl->parent_->children_;
    566 		p = previous = head;
    567 		if (head->next_ == NULL) {
    568 			ASSERT(head == cl);
    569 			cl->parent_->children_ = NULL;
    570 			cl->parent_->leaf_ = 1;
    571 		} else while (p != NULL) {
    572 			if (p == cl) {
    573 				if (cl == head)
    574 					cl->parent_->children_ = cl->next_;
    575 				else
    576 					previous->next_ = cl->next_;
    577 				cl->next_ = NULL;
    578 				p = NULL;
    579 			} else {
    580 				previous = p;
    581 				p = p->next_;
    582 			}
    583 		}
    584 	}
    585 
    586 	/*
    587 	 * Delete class from class priority peer list.
    588 	 */
    589 	if ((p = ifd->active_[cl->pri_]) != NULL) {
    590 		/*
    591 		 * If there is more than one member of this priority
    592 		 * level, then look for class(cl) in the priority level.
    593 		 */
    594 		if (p != p->peer_) {
    595 			while (p->peer_ != cl)
    596 				p = p->peer_;
    597 			p->peer_ = cl->peer_;
    598 
    599 			if (ifd->active_[cl->pri_] == cl)
    600 				ifd->active_[cl->pri_] = cl->peer_;
    601 		} else {
    602 			ASSERT(p == cl);
    603 			ifd->active_[cl->pri_] = NULL;
    604 		}
    605 	}
    606 
    607 	/*
    608 	 * Recompute the WRR weights.
    609 	 */
    610 	if (ifd->wrr_) {
    611 		ifd->alloc_[cl->pri_] -= cl->allotment_;
    612 		ifd->num_[cl->pri_]--;
    613 		rmc_wrr_set_weights(ifd);
    614 	}
    615 
    616 	/*
    617 	 * Re-compute the depth of the tree.
    618 	 */
    619 #if 1 /* ALTQ */
    620 	rmc_depth_recompute(cl->parent_);
    621 #else
    622 	rmc_depth_recompute(ifd->root_);
    623 #endif
    624 
    625 	splx(s);
    626 
    627 	/*
    628 	 * Free the class structure.
    629 	 */
    630 	if (cl->red_ != NULL) {
    631 #ifdef ALTQ_RIO
    632 		if (q_is_rio(cl->q_))
    633 			rio_destroy((rio_t *)cl->red_);
    634 #endif
    635 #ifdef ALTQ_RED
    636 		if (q_is_red(cl->q_))
    637 			red_destroy(cl->red_);
    638 #endif
    639 	}
    640 	free(cl->q_, M_DEVBUF);
    641 	free(cl, M_DEVBUF);
    642 }
    643 
    644 
    645 /*
    646  * int
    647  * rmc_init(...) - Initialize the resource management data structures
    648  *	associated with the output portion of interface 'ifp'.  'ifd' is
    649  *	where the structures will be built (for backwards compatibility, the
    650  *	structures aren't kept in the ifnet struct).  'nsecPerByte'
    651  *	gives the link speed (inverse of bandwidth) in nanoseconds/byte.
    652  *	'restart' is the driver-specific routine that the generic 'delay
    653  *	until under limit' action will call to restart output.  `maxq'
    654  *	is the queue size of the 'link' & 'default' classes.  'maxqueued'
    655  *	is the maximum number of packets that the resource management
    656  *	code will allow to be queued 'downstream' (this is typically 1).
    657  *
    658  *	Returns:	0 on success
    659  */
    660 
    661 int
    662 rmc_init(struct ifaltq *ifq, struct rm_ifdat *ifd, u_int nsecPerByte,
    663     void (*restart)(struct ifaltq *), int maxq, int maxqueued, u_int maxidle,
    664     int minidle, u_int offtime, int flags)
    665 {
    666 	int i, mtu;
    667 
    668 	/*
    669 	 * Initialize the CBQ tracing/debug facility.
    670 	 */
    671 	CBQTRACEINIT();
    672 
    673 	mtu = ifq->altq_ifp->if_mtu;
    674 	if (mtu < 1) {
    675 		printf("altq: %s: invalid MTU (interface not initialized?)\n",
    676 		    ifq->altq_ifp->if_xname);
    677 		return (EINVAL);
    678 	}
    679 
    680 	(void)memset((char *)ifd, 0, sizeof (*ifd));
    681 	ifd->ifq_ = ifq;
    682 	ifd->restart = restart;
    683 	ifd->maxqueued_ = maxqueued;
    684 	ifd->ns_per_byte_ = nsecPerByte;
    685 	ifd->maxpkt_ = mtu;
    686 	ifd->wrr_ = (flags & RMCF_WRR) ? 1 : 0;
    687 	ifd->efficient_ = (flags & RMCF_EFFICIENT) ? 1 : 0;
    688 #if 1
    689 	ifd->maxiftime_ = mtu * nsecPerByte / 1000 * 16;
    690 	if (mtu * nsecPerByte > 10 * 1000000)
    691 		ifd->maxiftime_ /= 4;
    692 #endif
    693 
    694 	reset_cutoff(ifd);
    695 	CBQTRACE(rmc_init, 'INIT', ifd->cutoff_);
    696 
    697 	/*
    698 	 * Initialize the CBQ's WRR state.
    699 	 */
    700 	for (i = 0; i < RM_MAXPRIO; i++) {
    701 		ifd->alloc_[i] = 0;
    702 		ifd->M_[i] = 0;
    703 		ifd->num_[i] = 0;
    704 		ifd->na_[i] = 0;
    705 		ifd->active_[i] = NULL;
    706 	}
    707 
    708 	/*
    709 	 * Initialize current packet state.
    710 	 */
    711 	ifd->qi_ = 0;
    712 	ifd->qo_ = 0;
    713 	for (i = 0; i < RM_MAXQUEUED; i++) {
    714 		ifd->class_[i] = NULL;
    715 		ifd->curlen_[i] = 0;
    716 		ifd->borrowed_[i] = NULL;
    717 	}
    718 
    719 	/*
    720 	 * Create the root class of the link-sharing structure.
    721 	 */
    722 	if ((ifd->root_ = rmc_newclass(0, ifd,
    723 				       nsecPerByte,
    724 				       rmc_root_overlimit, maxq, 0, 0,
    725 				       maxidle, minidle, offtime,
    726 				       0, 0)) == NULL) {
    727 		printf("rmc_init: root class not allocated\n");
    728 		return (ENOMEM);
    729 	}
    730 	ifd->root_->depth_ = 0;
    731 
    732 	return (0);
    733 }
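        /*
         * usage sketch (hypothetical driver, callback and values; only the
         * rmc_* calls are from this file): a driver for a 10Mb/s link with a
         * configured 'struct ifaltq *ifq' might bring up the link-sharing
         * structure along these lines, passing maxidle/minidle/offtime
         * values pre-scaled as described above rmc_newclass():
         *
         *	struct rm_ifdat ifd;
         *	struct rm_class *cl;
         *
         *	if (rmc_init(ifq, &ifd, 800, mydrv_restart, 64, 1,
         *	    maxidle, minidle, offtime, RMCF_WRR) != 0)
         *		return;
         *	cl = rmc_newclass(1, &ifd, 1600, rmc_delay_action, 64,
         *	    ifd.root_, ifd.root_, maxidle, minidle, offtime, 1000, 0);
         *
         * here 800 ns/byte is the link rate, 1600 gives the class a 50%
         * share, and rmc_delay_action is the stock overlimit action defined
         * below.
         */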
    734 
    735 /*
    736  * void
    737  * rmc_queue_packet(struct rm_class *cl, mbuf_t *m) - Add packet given by
    738  *	mbuf 'm' to queue for resource class 'cl'.  This routine is called
    739  *	by a driver's if_output routine.  This routine must be called with
    740  *	output packet completion interrupts locked out (to avoid racing with
    741  *	rmc_dequeue_next).
    742  *
    743  *	Returns:	0 on successful queueing
    744  *			-1 when packet drop occurs
    745  */
    746 int
    747 rmc_queue_packet(struct rm_class *cl, mbuf_t *m)
    748 {
    749 	struct timeval	 now;
    750 	struct rm_ifdat *ifd = cl->ifdat_;
    751 	int		 cpri = cl->pri_;
    752 	int		 is_empty = qempty(cl->q_);
    753 
    754 	RM_GETTIME(now);
    755 	if (ifd->cutoff_ > 0) {
    756 		if (TV_LT(&cl->undertime_, &now)) {
    757 			if (ifd->cutoff_ > cl->depth_)
    758 				ifd->cutoff_ = cl->depth_;
    759 			CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_);
    760 		}
    761 #if 1 /* ALTQ */
    762 		else {
    763 			/*
    764 			 * the class is overlimit. if the class has
    765 			 * underlimit ancestors, set cutoff to the lowest
    766 			 * depth among them.
    767 			 */
    768 			struct rm_class *borrow = cl->borrow_;
    769 
    770 			while (borrow != NULL &&
    771 			       borrow->depth_ < ifd->cutoff_) {
    772 				if (TV_LT(&borrow->undertime_, &now)) {
    773 					ifd->cutoff_ = borrow->depth_;
    774 					CBQTRACE(rmc_queue_packet, 'ffob', ifd->cutoff_);
    775 					break;
    776 				}
    777 				borrow = borrow->borrow_;
    778 			}
    779 		}
    780 #else /* !ALTQ */
    781 		else if ((ifd->cutoff_ > 1) && cl->borrow_) {
    782 			if (TV_LT(&cl->borrow_->undertime_, &now)) {
    783 				ifd->cutoff_ = cl->borrow_->depth_;
    784 				CBQTRACE(rmc_queue_packet, 'ffob',
    785 					 cl->borrow_->depth_);
    786 			}
    787 		}
    788 #endif /* !ALTQ */
    789 	}
    790 
    791 	if (_rmc_addq(cl, m) < 0)
    792 		/* failed */
    793 		return (-1);
    794 
    795 	if (is_empty) {
    796 		CBQTRACE(rmc_queue_packet, 'ytpe', cl->stats_.handle);
    797 		ifd->na_[cpri]++;
    798 	}
    799 
    800 	if (qlen(cl->q_) > qlimit(cl->q_)) {
    801 		/* note: qlimit can be set to 0 or 1 */
    802 		rmc_drop_action(cl);
    803 		return (-1);
    804 	}
    805 	return (0);
    806 }
    807 
    808 /*
    809  * void
    810  * rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) - Check all
    811  *	classes to see if they are satisfied.
    812  */
    813 
    814 static void
    815 rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now)
    816 {
    817 	int		 i;
    818 	rm_class_t	*p, *bp;
    819 
    820 	for (i = RM_MAXPRIO - 1; i >= 0; i--) {
    821 		if ((bp = ifd->active_[i]) != NULL) {
    822 			p = bp;
    823 			do {
    824 				if (!rmc_satisfied(p, now)) {
    825 					ifd->cutoff_ = p->depth_;
    826 					return;
    827 				}
    828 				p = p->peer_;
    829 			} while (p != bp);
    830 		}
    831 	}
    832 
    833 	reset_cutoff(ifd);
    834 }
    835 
    836 /*
    837  * rmc_satisfied - Return 1 if the class is satisfied, 0 otherwise.
    838  */
    839 
    840 static int
    841 rmc_satisfied(struct rm_class *cl, struct timeval *now)
    842 {
    843 	rm_class_t	*p;
    844 
    845 	if (cl == NULL)
    846 		return (1);
    847 	if (TV_LT(now, &cl->undertime_))
    848 		return (1);
    849 	if (cl->depth_ == 0) {
    850 		if (!cl->sleeping_ && (qlen(cl->q_) > cl->qthresh_))
    851 			return (0);
    852 		else
    853 			return (1);
    854 	}
    855 	if (cl->children_ != NULL) {
    856 		p = cl->children_;
    857 		while (p != NULL) {
    858 			if (!rmc_satisfied(p, now))
    859 				return (0);
    860 			p = p->next_;
    861 		}
    862 	}
    863 
    864 	return (1);
    865 }
    866 
    867 /*
    868  * Return 1 if class 'cl' is under limit or can borrow from a parent,
    869  * 0 if overlimit.  As a side-effect, this routine will invoke the
    870  * class overlimit action if the class is overlimit.
    871  */
    872 
    873 static int
    874 rmc_under_limit(struct rm_class *cl, struct timeval *now)
    875 {
    876 	rm_class_t	*p = cl;
    877 	rm_class_t	*top;
    878 	struct rm_ifdat	*ifd = cl->ifdat_;
    879 
    880 	ifd->borrowed_[ifd->qi_] = NULL;
    881 	/*
    882 	 * If cl is the root class, then always return that it is
    883 	 * underlimit.  Otherwise, check to see if the class is underlimit.
    884 	 */
    885 	if (cl->parent_ == NULL)
    886 		return (1);
    887 
    888 	if (cl->sleeping_) {
    889 		if (TV_LT(now, &cl->undertime_))
    890 			return (0);
    891 
    892 		CALLOUT_STOP(&cl->callout_);
    893 		cl->sleeping_ = 0;
    894 		cl->undertime_.tv_sec = 0;
    895 		return (1);
    896 	}
    897 
    898 	top = NULL;
    899 	while (cl->undertime_.tv_sec && TV_LT(now, &cl->undertime_)) {
    900 		if (((cl = cl->borrow_) == NULL) ||
    901 		    (cl->depth_ > ifd->cutoff_)) {
    902 #ifdef ADJUST_CUTOFF
    903 			if (cl != NULL)
    904 				/* cutoff is taking effect, just
    905 				   return false without calling
    906 				   the delay action. */
    907 				return (0);
    908 #endif
    909 #ifdef BORROW_OFFTIME
    910 			/*
    911 			 * check if the class can borrow offtime too.
    912 			 * borrow offtime from the top of the borrow
    913 			 * chain if the top class is not overloaded.
    914 			 */
    915 			if (cl != NULL) {
    916 				/* cutoff is taking effect, use this class as top. */
    917 				top = cl;
    918 				CBQTRACE(rmc_under_limit, 'ffou', ifd->cutoff_);
    919 			}
    920 			if (top != NULL && top->avgidle_ == top->minidle_)
    921 				top = NULL;
    922 			p->overtime_ = *now;
    923 			(p->overlimit)(p, top);
    924 #else
    925 			p->overtime_ = *now;
    926 			(p->overlimit)(p, NULL);
    927 #endif
    928 			return (0);
    929 		}
    930 		top = cl;
    931 	}
    932 
    933 	if (cl != p)
    934 		ifd->borrowed_[ifd->qi_] = cl;
    935 	return (1);
    936 }
    937 
    938 /*
    939  * _rmc_wrr_dequeue_next() - This is the scheduler for WRR, as opposed
    940  *	to packet-by-packet round robin.
    941  *
    942  * The heart of the weighted round-robin scheduler, which decides which
    943  * class next gets to send a packet.  Highest priority first, then
    944  * weighted round-robin within priorities.
    945  *
    946  * Each able-to-send class gets to send until its byte allocation is
    947  * exhausted.  Thus, the active pointer is only changed after a class has
    948  * exhausted its allocation.
    949  *
    950  * If the scheduler finds no class that is underlimit or able to borrow,
    951  * then the first class found that had a nonzero queue and is allowed to
    952  * borrow gets to send.
    953  */
    954 
    955 static mbuf_t *
    956 _rmc_wrr_dequeue_next(struct rm_ifdat *ifd, int op)
    957 {
    958 	struct rm_class	*cl = NULL, *first = NULL;
    959 	u_int		 deficit;
    960 	int		 cpri;
    961 	mbuf_t		*m;
    962 	struct timeval	 now;
    963 
    964 	RM_GETTIME(now);
    965 
    966 	/*
    967 	 * if the driver polls the top of the queue and then removes
    968 	 * the polled packet, we must return the same packet.
    969 	 */
    970 	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
    971 		cl = ifd->pollcache_;
    972 		cpri = cl->pri_;
    973 		if (ifd->efficient_) {
    974 			/* check if this class is overlimit */
    975 			if (cl->undertime_.tv_sec != 0 &&
    976 			    rmc_under_limit(cl, &now) == 0)
    977 				first = cl;
    978 		}
    979 		ifd->pollcache_ = NULL;
    980 		goto _wrr_out;
    981 	}
    982 	else {
    983 		/* mode == ALTDQ_POLL || pollcache == NULL */
    984 		ifd->pollcache_ = NULL;
    985 		ifd->borrowed_[ifd->qi_] = NULL;
    986 	}
    987 #ifdef ADJUST_CUTOFF
    988  _again:
    989 #endif
    990 	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
    991 		if (ifd->na_[cpri] == 0)
    992 			continue;
    993 		deficit = 0;
    994 		/*
    995 		 * Loop through twice for a priority level, if some class
    996 		 * was unable to send a packet the first round because
    997 		 * of the weighted round-robin mechanism.
    998 		 * During the second loop at this level, deficit==2.
    999 		 * (This second loop is not needed if for every class,
   1000 		 * "M[cl->pri_]" times "cl->allotment" is greater than
   1001 		 * the byte size for the largest packet in the class.)
   1002 		 */
   1003  _wrr_loop:
   1004 		cl = ifd->active_[cpri];
   1005 		ASSERT(cl != NULL);
   1006 		do {
   1007 			if ((deficit < 2) && (cl->bytes_alloc_ <= 0))
   1008 				cl->bytes_alloc_ += cl->w_allotment_;
   1009 			if (!qempty(cl->q_)) {
   1010 				if ((cl->undertime_.tv_sec == 0) ||
   1011 				    rmc_under_limit(cl, &now)) {
   1012 					if (cl->bytes_alloc_ > 0 || deficit > 1)
   1013 						goto _wrr_out;
   1014 
   1015 					/* underlimit but no alloc */
   1016 					deficit = 1;
   1017 #if 1
   1018 					ifd->borrowed_[ifd->qi_] = NULL;
   1019 #endif
   1020 				}
   1021 				else if (first == NULL && cl->borrow_ != NULL)
   1022 					first = cl; /* borrowing candidate */
   1023 			}
   1024 
   1025 			cl->bytes_alloc_ = 0;
   1026 			cl = cl->peer_;
   1027 		} while (cl != ifd->active_[cpri]);
   1028 
   1029 		if (deficit == 1) {
   1030 			/* first loop found an underlimit class with deficit */
   1031 			/* Loop on same priority level, with new deficit.  */
   1032 			deficit = 2;
   1033 			goto _wrr_loop;
   1034 		}
   1035 	}
   1036 
   1037 #ifdef ADJUST_CUTOFF
   1038 	/*
   1039 	 * no underlimit class found.  if cutoff is taking effect,
   1040 	 * increase cutoff and try again.
   1041 	 */
   1042 	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
   1043 		ifd->cutoff_++;
   1044 		CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_);
   1045 		goto _again;
   1046 	}
   1047 #endif /* ADJUST_CUTOFF */
   1048 	/*
   1049 	 * If LINK_EFFICIENCY is turned on, then the first overlimit
   1050 	 * class we encounter will send a packet if all the classes
   1051 	 * of the link-sharing structure are overlimit.
   1052 	 */
   1053 	reset_cutoff(ifd);
   1054 	CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_);
   1055 
   1056 	if (!ifd->efficient_ || first == NULL)
   1057 		return (NULL);
   1058 
   1059 	cl = first;
   1060 	cpri = cl->pri_;
   1061 #if 0	/* too time-consuming for nothing */
   1062 	if (cl->sleeping_)
   1063 		CALLOUT_STOP(&cl->callout_);
   1064 	cl->sleeping_ = 0;
   1065 	cl->undertime_.tv_sec = 0;
   1066 #endif
   1067 	ifd->borrowed_[ifd->qi_] = cl->borrow_;
   1068 	ifd->cutoff_ = cl->borrow_->depth_;
   1069 
   1070 	/*
   1071 	 * Dequeue the packet and do the bookkeeping...
   1072 	 */
   1073  _wrr_out:
   1074 	if (op == ALTDQ_REMOVE) {
   1075 		m = _rmc_getq(cl);
   1076 		if (m == NULL)
   1077 			panic("_rmc_wrr_dequeue_next");
   1078 		if (qempty(cl->q_))
   1079 			ifd->na_[cpri]--;
   1080 
   1081 		/*
   1082 		 * Update class statistics and link data.
   1083 		 */
   1084 		if (cl->bytes_alloc_ > 0)
   1085 			cl->bytes_alloc_ -= m_pktlen(m);
   1086 
   1087 		if ((cl->bytes_alloc_ <= 0) || first == cl)
   1088 			ifd->active_[cl->pri_] = cl->peer_;
   1089 		else
   1090 			ifd->active_[cl->pri_] = cl;
   1091 
   1092 		ifd->class_[ifd->qi_] = cl;
   1093 		ifd->curlen_[ifd->qi_] = m_pktlen(m);
   1094 		ifd->now_[ifd->qi_] = now;
   1095 		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
   1096 		ifd->queued_++;
   1097 	} else {
   1098 		/* mode == ALTDQ_POLL */
   1099 		m = _rmc_pollq(cl);
   1100 		ifd->pollcache_ = cl;
   1101 	}
   1102 	return (m);
   1103 }
   1104 
   1105 /*
   1106  * Dequeue & return next packet from the highest priority class that
   1107  * has a packet to send & has enough allocation to send it.  This
   1108  * routine is called by a driver whenever it needs a new packet to
   1109  * output.
   1110  */
   1111 static mbuf_t *
   1112 _rmc_prr_dequeue_next(struct rm_ifdat *ifd, int op)
   1113 {
   1114 	mbuf_t		*m;
   1115 	int		 cpri;
   1116 	struct rm_class	*cl, *first = NULL;
   1117 	struct timeval	 now;
   1118 
   1119 	RM_GETTIME(now);
   1120 
   1121 	/*
   1122 	 * if the driver polls the top of the queue and then removes
   1123 	 * the polled packet, we must return the same packet.
   1124 	 */
   1125 	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
   1126 		cl = ifd->pollcache_;
   1127 		cpri = cl->pri_;
   1128 		ifd->pollcache_ = NULL;
   1129 		goto _prr_out;
   1130 	} else {
   1131 		/* mode == ALTDQ_POLL || pollcache == NULL */
   1132 		ifd->pollcache_ = NULL;
   1133 		ifd->borrowed_[ifd->qi_] = NULL;
   1134 	}
   1135 #ifdef ADJUST_CUTOFF
   1136  _again:
   1137 #endif
   1138 	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
   1139 		if (ifd->na_[cpri] == 0)
   1140 			continue;
   1141 		cl = ifd->active_[cpri];
   1142 		ASSERT(cl != NULL);
   1143 		do {
   1144 			if (!qempty(cl->q_)) {
   1145 				if ((cl->undertime_.tv_sec == 0) ||
   1146 				    rmc_under_limit(cl, &now))
   1147 					goto _prr_out;
   1148 				if (first == NULL && cl->borrow_ != NULL)
   1149 					first = cl;
   1150 			}
   1151 			cl = cl->peer_;
   1152 		} while (cl != ifd->active_[cpri]);
   1153 	}
   1154 
   1155 #ifdef ADJUST_CUTOFF
   1156 	/*
   1157 	 * no underlimit class found.  if cutoff is taking effect, increase
   1158 	 * cutoff and try again.
   1159 	 */
   1160 	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
   1161 		ifd->cutoff_++;
   1162 		goto _again;
   1163 	}
   1164 #endif /* ADJUST_CUTOFF */
   1165 	/*
   1166 	 * If LINK_EFFICIENCY is turned on, then the first overlimit
   1167 	 * class we encounter will send a packet if all the classes
   1168 	 * of the link-sharing structure are overlimit.
   1169 	 */
   1170 	reset_cutoff(ifd);
   1171 	if (!ifd->efficient_ || first == NULL)
   1172 		return (NULL);
   1173 
   1174 	cl = first;
   1175 	cpri = cl->pri_;
   1176 #if 0	/* too time-consuming for nothing */
   1177 	if (cl->sleeping_)
   1178 		CALLOUT_STOP(&cl->callout_);
   1179 	cl->sleeping_ = 0;
   1180 	cl->undertime_.tv_sec = 0;
   1181 #endif
   1182 	ifd->borrowed_[ifd->qi_] = cl->borrow_;
   1183 	ifd->cutoff_ = cl->borrow_->depth_;
   1184 
   1185 	/*
   1186 	 * Dequeue the packet and do the bookkeeping...
   1187 	 */
   1188  _prr_out:
   1189 	if (op == ALTDQ_REMOVE) {
   1190 		m = _rmc_getq(cl);
   1191 		if (m == NULL)
   1192 			panic("_rmc_prr_dequeue_next");
   1193 		if (qempty(cl->q_))
   1194 			ifd->na_[cpri]--;
   1195 
   1196 		ifd->active_[cpri] = cl->peer_;
   1197 
   1198 		ifd->class_[ifd->qi_] = cl;
   1199 		ifd->curlen_[ifd->qi_] = m_pktlen(m);
   1200 		ifd->now_[ifd->qi_] = now;
   1201 		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
   1202 		ifd->queued_++;
   1203 	} else {
   1204 		/* mode == ALTDQ_POLL */
   1205 		m = _rmc_pollq(cl);
   1206 		ifd->pollcache_ = cl;
   1207 	}
   1208 	return (m);
   1209 }
   1210 
   1211 /*
   1212  * mbuf_t *
   1213  * rmc_dequeue_next(struct rm_ifdat *ifd, int mode) - this function
   1214  *	is invoked by the packet driver to get the next packet to be
   1215  *	dequeued and output on the link.  If WRR is enabled, then the
   1216  *	WRR dequeue next routine will determine the next packet to send.
   1217  *	Otherwise, packet-by-packet round robin is invoked.
   1218  *
   1219  *	Returns:	NULL, if a packet is not available or if all
   1220  *			classes are overlimit.
   1221  *
   1222  *			Otherwise, a pointer to the next packet.
   1223  */
   1224 
   1225 mbuf_t *
   1226 rmc_dequeue_next(struct rm_ifdat *ifd, int mode)
   1227 {
   1228 	if (ifd->queued_ >= ifd->maxqueued_)
   1229 		return (NULL);
   1230 	else if (ifd->wrr_)
   1231 		return (_rmc_wrr_dequeue_next(ifd, mode));
   1232 	else
   1233 		return (_rmc_prr_dequeue_next(ifd, mode));
   1234 }
   1235 
   1236 /*
   1237  * Update the utilization estimate for the packet that just completed.
   1238  * The packet's class & the parent(s) of that class all get their
   1239  * estimators updated.  This routine is called by the driver's output-
   1240  * packet-completion interrupt service routine.
   1241  */
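        /*
         * note on the estimator below: the update
         *	avgidle += idle - (avgidle >> RM_FILTER_GAIN)
         * is the exponential filter
         *	avgidle = (1 - 2^-RM_FILTER_GAIN) * avgidle + idle
         * i.e. an average of the per-packet idle times kept scaled upward
         * by 2^RM_FILTER_GAIN, consistent with the scaling described above
         * rmc_newclass().
         */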
   1242 
   1243 /*
   1244  * a macro to approximate "divide by 1000" with a factor of 0.000999,
   1245  * for values with enough effective digits.
   1246  * (on pentium, mul takes 9 cycles but div takes 46!)
   1247  */
   1248 #define	NSEC_TO_USEC(t)	(((t) >> 10) + ((t) >> 16) + ((t) >> 17))
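        /*
         * e.g. NSEC_TO_USEC(1000000) = 976 + 15 + 7 = 998 vs. the exact
         * 1000, since 1/1024 + 1/65536 + 1/131072 is approximately 0.000999.
         */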
   1249 void
   1250 rmc_update_class_util(struct rm_ifdat *ifd)
   1251 {
   1252 	int		 idle, avgidle, pktlen;
   1253 	int		 pkt_time, tidle;
   1254 	rm_class_t	*cl, *cl0, *borrowed;
   1255 	rm_class_t	*borrows;
   1256 	struct timeval	*nowp;
   1257 
   1258 	/*
   1259 	 * Get the most recent completed class.
   1260 	 */
   1261 	if ((cl = ifd->class_[ifd->qo_]) == NULL)
   1262 		return;
   1263 
   1264 	cl0 = cl;
   1265 	pktlen = ifd->curlen_[ifd->qo_];
   1266 	borrowed = ifd->borrowed_[ifd->qo_];
   1267 	borrows = borrowed;
   1268 
   1269 	PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
   1270 
   1271 	/*
   1272 	 * Run estimator on class and its ancestors.
   1273 	 */
   1274 	/*
   1275 	 * rmc_update_class_util is designed to be called when the
   1276 	 * transfer is completed from an xmit complete interrupt,
   1277 	 * but most drivers don't implement an upcall for that.
   1278 	 * so, just use estimated completion time.
   1279 	 * as a result, ifd->qi_ and ifd->qo_ are always synced.
   1280 	 */
   1281 	nowp = &ifd->now_[ifd->qo_];
   1282 	/* get pkt_time (for link) in usec */
   1283 #if 1  /* use approximation */
   1284 	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_;
   1285 	pkt_time = NSEC_TO_USEC(pkt_time);
   1286 #else
   1287 	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000;
   1288 #endif
   1289 #if 1 /* ALTQ4PPP */
   1290 	if (TV_LT(nowp, &ifd->ifnow_)) {
   1291 		int iftime;
   1292 
   1293 		/*
   1294 		 * make sure the estimated completion time does not go
   1295 		 * too far.  it can happen when the link layer supports
   1296 		 * data compression or the interface speed is set to
   1297 		 * a much lower value.
   1298 		 */
   1299 		TV_DELTA(&ifd->ifnow_, nowp, iftime);
   1300 		if (iftime+pkt_time < ifd->maxiftime_) {
   1301 			TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
   1302 		} else {
   1303 			TV_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_);
   1304 		}
   1305 	} else {
   1306 		TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
   1307 	}
   1308 #else
   1309 	if (TV_LT(nowp, &ifd->ifnow_)) {
   1310 		TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
   1311 	} else {
   1312 		TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
   1313 	}
   1314 #endif
   1315 
   1316 	while (cl != NULL) {
   1317 		TV_DELTA(&ifd->ifnow_, &cl->last_, idle);
   1318 		if (idle >= 2000000)
   1319 			/*
   1320 			 * this class is idle enough, reset avgidle.
   1321 			 * (TV_DELTA returns 2000000 us when delta is large.)
   1322 			 */
   1323 			cl->avgidle_ = cl->maxidle_;
   1324 
   1325 		/* get pkt_time (for class) in usec */
   1326 #if 1  /* use approximation */
   1327 		pkt_time = pktlen * cl->ns_per_byte_;
   1328 		pkt_time = NSEC_TO_USEC(pkt_time);
   1329 #else
   1330 		pkt_time = pktlen * cl->ns_per_byte_ / 1000;
   1331 #endif
   1332 		idle -= pkt_time;
   1333 
   1334 		avgidle = cl->avgidle_;
   1335 		avgidle += idle - (avgidle >> RM_FILTER_GAIN);
   1336 		cl->avgidle_ = avgidle;
   1337 
   1338 		/* Are we overlimit ? */
   1339 		if (avgidle <= 0) {
   1340 			CBQTRACE(rmc_update_class_util, 'milo', cl->stats_.handle);
   1341 #if 1 /* ALTQ */
   1342 			/*
   1343 			 * need some lower bound for avgidle, otherwise
   1344 			 * a borrowing class gets unbounded penalty.
   1345 			 */
   1346 			if (avgidle < cl->minidle_)
   1347 				avgidle = cl->avgidle_ = cl->minidle_;
   1348 #endif
   1349 			/* set next idle to make avgidle 0 */
   1350 			tidle = pkt_time +
   1351 				(((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN);
   1352 			TV_ADD_DELTA(nowp, tidle, &cl->undertime_);
   1353 			++cl->stats_.over;
   1354 		} else {
   1355 			cl->avgidle_ =
   1356 			    (avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle;
   1357 			cl->undertime_.tv_sec = 0;
   1358 			if (cl->sleeping_) {
   1359 				CALLOUT_STOP(&cl->callout_);
   1360 				cl->sleeping_ = 0;
   1361 			}
   1362 		}
   1363 
   1364 		if (borrows != NULL) {
   1365 			if (borrows != cl)
   1366 				++cl->stats_.borrows;
   1367 			else
   1368 				borrows = NULL;
   1369 		}
   1370 		cl->last_ = ifd->ifnow_;
   1371 		cl->last_pkttime_ = pkt_time;
   1372 
   1373 #if 1
   1374 		if (cl->parent_ == NULL && cl != cl0) {
   1375 			/* take stats of root class */
   1376 			PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
   1377 		}
   1378 #endif
   1379 
   1380 		cl = cl->parent_;
   1381 	}
   1382 
   1383 	/*
   1384 	 * Check to see if cutoff needs to be set to a new level.
   1385 	 */
   1386 	cl = ifd->class_[ifd->qo_];
   1387 	if (borrowed && (ifd->cutoff_ >= borrowed->depth_)) {
   1388 #if 1 /* ALTQ */
   1389 		if ((qlen(cl->q_) <= 0) || TV_LT(nowp, &borrowed->undertime_)) {
   1390 			rmc_tl_satisfied(ifd, nowp);
   1391 			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
   1392 		} else {
   1393 			ifd->cutoff_ = borrowed->depth_;
   1394 			CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
   1395 		}
   1396 #else /* !ALTQ */
   1397 		if ((qlen(cl->q_) <= 1) || TV_LT(&now, &borrowed->undertime_)) {
   1398 			reset_cutoff(ifd);
   1399 #ifdef notdef
   1400 			rmc_tl_satisfied(ifd, &now);
   1401 #endif
   1402 			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
   1403 		} else {
   1404 			ifd->cutoff_ = borrowed->depth_;
   1405 			CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
   1406 		}
   1407 #endif /* !ALTQ */
   1408 	}
   1409 
   1410 	/*
   1411 	 * Release class slot
   1412 	 */
   1413 	ifd->borrowed_[ifd->qo_] = NULL;
   1414 	ifd->class_[ifd->qo_] = NULL;
   1415 	ifd->qo_ = (ifd->qo_ + 1) % ifd->maxqueued_;
   1416 	ifd->queued_--;
   1417 }
   1418 
   1419 /*
   1420  * void
   1421  * rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific)
   1422  *	over-limit action routines.  These get invoked by rmc_under_limit()
   1423  *	if a class with packets to send is over its bandwidth limit & can't
   1424  *	borrow from a parent class.
   1425  *
   1426  *	Returns: NONE
   1427  */
   1428 
   1429 static void
   1430 rmc_drop_action(struct rm_class *cl)
   1431 {
   1432 	struct rm_ifdat	*ifd = cl->ifdat_;
   1433 
   1434 	ASSERT(qlen(cl->q_) > 0);
   1435 	_rmc_dropq(cl);
   1436 	if (qempty(cl->q_))
   1437 		ifd->na_[cl->pri_]--;
   1438 }
   1439 
   1440 void
   1441 rmc_dropall(struct rm_class *cl)
   1442 {
   1443 	struct rm_ifdat	*ifd = cl->ifdat_;
   1444 
   1445 	if (!qempty(cl->q_)) {
   1446 		_flushq(cl->q_);
   1447 
   1448 		ifd->na_[cl->pri_]--;
   1449 	}
   1450 }
   1451 
   1452 #if (__FreeBSD_version > 300000)
   1453 static int tvhzto(struct timeval *);
   1454 
   1455 static int
   1456 tvhzto(struct timeval *tv)
   1457 {
   1458 	struct timeval t2;
   1459 
   1460 	getmicrotime(&t2);
   1461 	t2.tv_sec = tv->tv_sec - t2.tv_sec;
   1462 	t2.tv_usec = tv->tv_usec - t2.tv_usec;
   1463 	return (tvtohz(&t2));
   1464 }
   1465 #endif /* __FreeBSD_version > 300000 */
   1466 
   1467 /*
   1468  * void
   1469  * rmc_delay_action(struct rm_class *cl) - This function is the generic CBQ
   1470  *	delay action routine.  It is invoked via rmc_under_limit when the
   1471  *	packet is discovered to be overlimit.
   1472  *
   1473  *	If the delay action is the result of the borrow class being
   1474  *	overlimit, then delay for the offtime of the class borrowed from.
   1475  *
   1476  *	Returns: NONE
   1477  */
   1478 
   1479 void
   1480 rmc_delay_action(struct rm_class *cl, struct rm_class *borrow)
   1481 {
   1482 	int	ndelay, t, extradelay;
   1483 
   1484 	cl->stats_.overactions++;
   1485 	TV_DELTA(&cl->undertime_, &cl->overtime_, ndelay);
   1486 #ifndef BORROW_OFFTIME
   1487 	ndelay += cl->offtime_;
   1488 #endif
   1489 
   1490 	if (!cl->sleeping_) {
   1491 		CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle);
   1492 #ifdef BORROW_OFFTIME
   1493 		if (borrow != NULL)
   1494 			extradelay = borrow->offtime_;
   1495 		else
   1496 #endif
   1497 			extradelay = cl->offtime_;
   1498 
   1499 #ifdef ALTQ
   1500 		/*
   1501 		 * XXX recalculate suspend time:
   1502 		 * current undertime is (tidle + pkt_time) calculated
   1503 		 * from the last transmission.
   1504 		 *	tidle: time required to bring avgidle back to 0
   1505 		 *	pkt_time: target waiting time for this class
   1506 		 * we need to replace pkt_time by offtime
   1507 		 */
   1508 		extradelay -= cl->last_pkttime_;
   1509 #endif
   1510 		if (extradelay > 0) {
   1511 			TV_ADD_DELTA(&cl->undertime_, extradelay, &cl->undertime_);
   1512 			ndelay += extradelay;
   1513 		}
   1514 
   1515 		cl->sleeping_ = 1;
   1516 		cl->stats_.delays++;
   1517 
   1518 		/*
   1519 		 * Since packets are phased randomly with respect to the
   1520 		 * clock, 1 tick (the next clock tick) can be an arbitrarily
   1521 		 * short time so we have to wait for at least two ticks.
   1522 		 * NOTE:  If there's no other traffic, we need the timer as
   1523 		 * a 'backstop' to restart this class.
   1524 		 */
   1525 		if (ndelay > tick * 2) {
   1526 #ifdef __FreeBSD__
   1527 			/* FreeBSD rounds up the tick */
   1528 			t = tvhzto(&cl->undertime_);
   1529 #else
   1530 			/* other BSDs round down the tick */
   1531 			t = tvhzto(&cl->undertime_) + 1;
   1532 #endif
   1533 		} else
   1534 			t = 2;
   1535 		CALLOUT_RESET(&cl->callout_, t,
   1536 			      (timeout_t *)rmc_restart, (void *)cl);
   1537 	}
   1538 }
   1539 
   1540 /*
   1541  * void
   1542  * rmc_restart() - is just a helper routine for rmc_delay_action -- it is
   1543  *	called by the system timer code & is responsible for checking if the
   1544  *	class is still sleeping (it might have been restarted as a side
   1545  *	effect of the queue scan on a packet arrival) and, if so, restarting
   1546  *	output for the class.  Inspecting the class state & restarting output
   1547  *	require locking the class structure.  In general the driver is
   1548  *	responsible for locking but this is the only routine that is not
   1549  *	called directly or indirectly from the interface driver so it has
   1550  *	to know about system locking conventions.  Under BSD, locking is done
   1551  *	by raising IPL to splnet so that's what's implemented here.  On a
   1552  *	different system this would probably need to be changed.
   1553  *
   1554  *	Returns:	NONE
   1555  */
   1556 
   1557 static void
   1558 rmc_restart(struct rm_class *cl)
   1559 {
   1560 	struct rm_ifdat	*ifd = cl->ifdat_;
   1561 	int		 s;
   1562 
   1563 	s = splnet();
   1564 	if (cl->sleeping_) {
   1565 		cl->sleeping_ = 0;
   1566 		cl->undertime_.tv_sec = 0;
   1567 
   1568 		if (ifd->queued_ < ifd->maxqueued_ && ifd->restart != NULL) {
   1569 			CBQTRACE(rmc_restart, 'trts', cl->stats_.handle);
   1570 			(ifd->restart)(ifd->ifq_);
   1571 		}
   1572 	}
   1573 	splx(s);
   1574 }
   1575 
   1576 /*
   1577  * void
   1578  * rmc_root_overlimit(struct rm_class *cl) - This is the generic overlimit
   1579  *	handling routine for the root class of the link sharing structure.
   1580  *
   1581  *	Returns: NONE
   1582  */
   1583 
   1584 static void
   1585 rmc_root_overlimit(struct rm_class *cl,
   1586     struct rm_class *borrow)
   1587 {
   1588 	panic("rmc_root_overlimit");
   1589 }
   1590 
   1591 /*
   1592  * Packet Queue handling routines.  Eventually, this is to localize the
   1593  *	effect on the rest of the code of whether queues are RED queues or
   1594  *	droptail queues.
   1595  */
   1596 
   1597 static int
   1598 _rmc_addq(rm_class_t *cl, mbuf_t *m)
   1599 {
   1600 #ifdef ALTQ_RIO
   1601 	if (q_is_rio(cl->q_))
   1602 		return rio_addq((rio_t *)cl->red_, cl->q_, m, cl->pktattr_);
   1603 #endif
   1604 #ifdef ALTQ_RED
   1605 	if (q_is_red(cl->q_))
   1606 		return red_addq(cl->red_, cl->q_, m, cl->pktattr_);
   1607 #endif /* ALTQ_RED */
   1608 
   1609 	if (cl->flags_ & RMCF_CLEARDSCP)
   1610 		write_dsfield(m, cl->pktattr_, 0);
   1611 
   1612 	_addq(cl->q_, m);
   1613 	return (0);
   1614 }
   1615 
   1616 /* note: _rmc_dropq is not called for red */
   1617 static void
   1618 _rmc_dropq(rm_class_t *cl)
   1619 {
   1620 	mbuf_t	*m;
   1621 
   1622 	if ((m = _getq(cl->q_)) != NULL)
   1623 		m_freem(m);
   1624 }
   1625 
   1626 static mbuf_t *
   1627 _rmc_getq(rm_class_t *cl)
   1628 {
   1629 #ifdef ALTQ_RIO
   1630 	if (q_is_rio(cl->q_))
   1631 		return rio_getq((rio_t *)cl->red_, cl->q_);
   1632 #endif
   1633 #ifdef ALTQ_RED
   1634 	if (q_is_red(cl->q_))
   1635 		return red_getq(cl->red_, cl->q_);
   1636 #endif
   1637 	return _getq(cl->q_);
   1638 }
   1639 
   1640 static mbuf_t *
   1641 _rmc_pollq(rm_class_t *cl)
   1642 {
   1643 	return qhead(cl->q_);
   1644 }
   1645 
   1646 #ifdef CBQ_TRACE
   1647 
   1648 struct cbqtrace		 cbqtrace_buffer[NCBQTRACE+1];
   1649 struct cbqtrace		*cbqtrace_ptr = NULL;
   1650 int			 cbqtrace_count;
   1651 
   1652 /*
   1653  * DDB hook to trace cbq events:
   1654  *  the last 1024 events are held in a circular buffer.
   1655  *  use "call cbqtrace_dump(N)" to display 20 events starting at the Nth event.
   1656  */
   1657 void cbqtrace_dump(int);
   1658 static char *rmc_funcname(void *);
   1659 
   1660 static struct rmc_funcs {
   1661 	void	*func;
   1662 	char	*name;
   1663 } rmc_funcs[] =
   1664 {
   1665 	rmc_init,		"rmc_init",
   1666 	rmc_queue_packet,	"rmc_queue_packet",
   1667 	rmc_under_limit,	"rmc_under_limit",
   1668 	rmc_update_class_util,	"rmc_update_class_util",
   1669 	rmc_delay_action,	"rmc_delay_action",
   1670 	rmc_restart,		"rmc_restart",
   1671 	_rmc_wrr_dequeue_next,	"_rmc_wrr_dequeue_next",
   1672 	NULL,			NULL
   1673 };
   1674 
   1675 static char *
   1676 rmc_funcname(void *func)
   1677 {
   1678 	struct rmc_funcs *fp;
   1679 
   1680 	for (fp = rmc_funcs; fp->func != NULL; fp++)
   1681 		if (fp->func == func)
   1682 			return (fp->name);
   1683 	return ("unknown");
   1684 }
   1685 
   1686 void
   1687 cbqtrace_dump(int counter)
   1688 {
   1689 	int	 i, *p;
   1690 	char	*cp;
   1691 
   1692 	counter = counter % NCBQTRACE;
   1693 	p = (int *)&cbqtrace_buffer[counter];
   1694 
   1695 	for (i=0; i<20; i++) {
   1696 		printf("[0x%x] ", *p++);
   1697 		printf("%s: ", rmc_funcname((void *)*p++));
   1698 		cp = (char *)p++;
   1699 		printf("%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]);
   1700 		printf("%d\n",*p++);
   1701 
   1702 		if (p >= (int *)&cbqtrace_buffer[NCBQTRACE])
   1703 			p = (int *)cbqtrace_buffer;
   1704 	}
   1705 }
   1706 #endif /* CBQ_TRACE */
   1707 #endif /* ALTQ_CBQ */
   1708 
   1709 #if defined(ALTQ_CBQ) || defined(ALTQ_RED) || defined(ALTQ_RIO) || defined(ALTQ_HFSC) || defined(ALTQ_PRIQ)
   1710 #if !defined(__GNUC__) || defined(ALTQ_DEBUG)
   1711 
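        /*
         * note: these routines keep each class queue as a circular,
         * singly-linked list of mbufs in which qtail(q) points at the last
         * packet and qtail(q)->m_nextpkt is thus the head; an empty queue
         * has qtail(q) == NULL.  e.g. after _addq() of m1 then m2:
         *	qtail(q) == m2, m2->m_nextpkt == m1, m1->m_nextpkt == m2.
         */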
   1712 void
   1713 _addq(class_queue_t *q, mbuf_t *m)
   1714 {
   1715         mbuf_t	*m0;
   1716 
   1717 	if ((m0 = qtail(q)) != NULL)
   1718 		m->m_nextpkt = m0->m_nextpkt;
   1719 	else
   1720 		m0 = m;
   1721 	m0->m_nextpkt = m;
   1722 	qtail(q) = m;
   1723 	qlen(q)++;
   1724 }
   1725 
   1726 mbuf_t *
   1727 _getq(class_queue_t *q)
   1728 {
   1729 	mbuf_t	*m, *m0;
   1730 
   1731 	if ((m = qtail(q)) == NULL)
   1732 		return (NULL);
   1733 	if ((m0 = m->m_nextpkt) != m)
   1734 		m->m_nextpkt = m0->m_nextpkt;
   1735 	else {
   1736 		ASSERT(qlen(q) == 1);
   1737 		qtail(q) = NULL;
   1738 	}
   1739 	qlen(q)--;
   1740 	m0->m_nextpkt = NULL;
   1741 	return (m0);
   1742 }
   1743 
   1744 /* drop a packet at the tail of the queue */
   1745 mbuf_t *
   1746 _getq_tail(class_queue_t *q)
   1747 {
   1748 	mbuf_t	*m, *m0, *prev;
   1749 
   1750 	if ((m = m0 = qtail(q)) == NULL)
   1751 		return NULL;
   1752 	do {
   1753 		prev = m0;
   1754 		m0 = m0->m_nextpkt;
   1755 	} while (m0 != m);
   1756 	prev->m_nextpkt = m->m_nextpkt;
   1757 	if (prev == m)  {
   1758 		ASSERT(qlen(q) == 1);
   1759 		qtail(q) = NULL;
   1760 	} else
   1761 		qtail(q) = prev;
   1762 	qlen(q)--;
   1763 	m->m_nextpkt = NULL;
   1764 	return (m);
   1765 }
   1766 
   1767 /* randomly select a packet in the queue */
   1768 mbuf_t *
   1769 _getq_random(class_queue_t *q)
   1770 {
   1771 	struct mbuf	*m;
   1772 	int		 i, n;
   1773 
   1774 	if ((m = qtail(q)) == NULL)
   1775 		return NULL;
   1776 	if (m->m_nextpkt == m) {
   1777 		ASSERT(qlen(q) == 1);
   1778 		qtail(q) = NULL;
   1779 	} else {
   1780 		struct mbuf *prev = NULL;
   1781 
   1782 		n = cprng_fast32() % qlen(q) + 1;
   1783 		for (i = 0; i < n; i++) {
   1784 			prev = m;
   1785 			m = m->m_nextpkt;
   1786 		}
   1787 		prev->m_nextpkt = m->m_nextpkt;
   1788 		if (m == qtail(q))
   1789 			qtail(q) = prev;
   1790 	}
   1791 	qlen(q)--;
   1792 	m->m_nextpkt = NULL;
   1793 	return (m);
   1794 }
   1795 
   1796 void
   1797 _removeq(class_queue_t *q, mbuf_t *m)
   1798 {
   1799 	mbuf_t	*m0, *prev;
   1800 
   1801 	m0 = qtail(q);
   1802 	do {
   1803 		prev = m0;
   1804 		m0 = m0->m_nextpkt;
   1805 	} while (m0 != m);
   1806 	prev->m_nextpkt = m->m_nextpkt;
   1807 	if (prev == m)
   1808 		qtail(q) = NULL;
   1809 	else if (qtail(q) == m)
   1810 		qtail(q) = prev;
   1811 	qlen(q)--;
   1812 }
   1813 
   1814 void
   1815 _flushq(class_queue_t *q)
   1816 {
   1817 	mbuf_t *m;
   1818 
   1819 	while ((m = _getq(q)) != NULL)
   1820 		m_freem(m);
   1821 	ASSERT(qlen(q) == 0);
   1822 }
   1823 
   1824 #endif /* !__GNUC__ || ALTQ_DEBUG */
   1825 #endif /* ALTQ_CBQ || ALTQ_RED || ALTQ_RIO || ALTQ_HFSC || ALTQ_PRIQ */
   1826