/*	$NetBSD: bufq_priocscan.c,v 1.18 2014/01/28 12:50:54 martin Exp $	*/

/*-
 * Copyright (c)2004,2005,2006,2008,2009,2011,2012 YAMAMOTO Takashi,
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: bufq_priocscan.c,v 1.18 2014/01/28 12:50:54 martin Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/bufq_impl.h>
#include <sys/kmem.h>
#include <sys/rbtree.h>

#undef	PRIOCSCAN_USE_GLOBAL_POSITION

/*
 * Cyclical scan (CSCAN)
 */
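/*
 * Requests are kept in a red-black tree sorted by disk address and are
 * served in ascending order.  When the scan passes the highest pending
 * address it wraps around to the lowest one, so the "head" always sweeps
 * in a single direction.  Compared to a plain elevator (SCAN), this tends
 * to give more uniform response times across the disk.
 */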

struct cscan_key {
	daddr_t	k_rawblkno;
	int k_cylinder;
};

struct cscan_queue {
	rb_tree_t cq_buffers;		/* ordered list of buffers */
#if !defined(PRIOCSCAN_USE_GLOBAL_POSITION)
	struct cscan_key cq_lastkey;	/* key of last request */
#endif /* !defined(PRIOCSCAN_USE_GLOBAL_POSITION) */
	int cq_sortby;			/* BUFQ_SORT_MASK */
	rb_tree_ops_t cq_ops;
};

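/*
 * buf_cmp: a three-way comparator built on buf_inorder(), which only
 * answers "does b1 sort before b2?".  Returns positive, negative, or zero
 * as b1 sorts after, before, or equal to b2.
 */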
static signed int
buf_cmp(const struct buf *b1, const struct buf *b2, int sortby)
{

	if (buf_inorder(b2, b1, sortby)) {
		return 1;	/* b1 > b2 */
	}
	if (buf_inorder(b1, b2, sortby)) {
		return -1;	/* b1 < b2 */
	}
	return 0;
}

/* return positive if n1 > n2 */
static signed int
cscan_tree_compare_nodes(void *context, const void *n1, const void *n2)
{
	const struct cscan_queue * const q = context;
	const struct buf * const b1 = n1;
	const struct buf * const b2 = n2;
	const int sortby = q->cq_sortby;
	const int diff = buf_cmp(b1, b2, sortby);

	if (diff != 0) {
		return diff;
	}

	/*
	 * XXX rawblkno/cylinder might not be unique.  eg. unbuffered i/o
	 */
	if (b1 > b2) {
		return 1;
	}
	if (b1 < b2) {
		return -1;
	}
	return 0;
}

/* return positive if n1 > k2 */
static signed int
cscan_tree_compare_key(void *context, const void *n1, const void *k2)
{
	const struct cscan_queue * const q = context;
	const struct buf * const b1 = n1;
	const struct cscan_key * const key = k2;
	const struct buf tmp = {
		.b_rawblkno = key->k_rawblkno,
		.b_cylinder = key->k_cylinder,
	};
	const struct buf *b2 = &tmp;
	const int sortby = q->cq_sortby;

	return buf_cmp(b1, b2, sortby);
}

static void __unused
cscan_dump(struct cscan_queue *cq)
{
	const int sortby = cq->cq_sortby;
	struct buf *bp;

	RB_TREE_FOREACH(bp, &cq->cq_buffers) {
		if (sortby == BUFQ_SORT_RAWBLOCK) {
			printf(" %jd", (intmax_t)bp->b_rawblkno);
		} else {
			printf(" %jd/%jd",
			    (intmax_t)bp->b_cylinder, (intmax_t)bp->b_rawblkno);
		}
	}
}

static inline bool
cscan_empty(struct cscan_queue *q)
{

	/* XXX this might do more work than necessary */
	return rb_tree_iterate(&q->cq_buffers, NULL, RB_DIR_LEFT) == NULL;
}

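/*
 * cscan_put: insert a request into the tree.  The insertion can never
 * collide with an existing node because cscan_tree_compare_nodes() falls
 * back to comparing buffer addresses, so rb_tree_insert_node() is
 * expected to return the node we handed it.
 */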
static void
cscan_put(struct cscan_queue *q, struct buf *bp)
{
	struct buf *obp __diagused;

	obp = rb_tree_insert_node(&q->cq_buffers, bp);
	KASSERT(obp == bp); /* see cscan_tree_compare_nodes */
}

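/*
 * cscan_get: pick the next request in CSCAN order.  Look for the first
 * buffer at or beyond the last served position ("key"); if there is
 * none, wrap around to the leftmost (lowest) buffer in the tree.  When
 * removing, advance the key just past the end of the chosen request so
 * the next lookup continues the sweep.
 */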
static struct buf *
cscan_get(struct cscan_queue *q, int remove, struct cscan_key *key)
{
	struct buf *bp;

	bp = rb_tree_find_node_geq(&q->cq_buffers, key);
	KDASSERT(bp == NULL || cscan_tree_compare_key(q, bp, key) >= 0);
	if (bp == NULL) {
		bp = rb_tree_iterate(&q->cq_buffers, NULL, RB_DIR_LEFT);
		KDASSERT(cscan_tree_compare_key(q, bp, key) < 0);
	}
	if (bp != NULL && remove) {
#if defined(DEBUG)
		struct buf *nbp;
#endif /* defined(DEBUG) */

		rb_tree_remove_node(&q->cq_buffers, bp);
		/*
		 * remember the head position.
		 */
		key->k_cylinder = bp->b_cylinder;
		key->k_rawblkno = bp->b_rawblkno + (bp->b_bcount >> DEV_BSHIFT);
#if defined(DEBUG)
		nbp = rb_tree_find_node_geq(&q->cq_buffers, key);
		if (nbp != NULL && cscan_tree_compare_nodes(q, nbp, bp) < 0) {
			panic("%s: wrong order %p < %p\n", __func__,
			    nbp, bp);
		}
#endif /* defined(DEBUG) */
	}
	return bp;
}

static void
cscan_init(struct cscan_queue *q, int sortby)
{
	static const rb_tree_ops_t cscan_tree_ops = {
		.rbto_compare_nodes = cscan_tree_compare_nodes,
		.rbto_compare_key = cscan_tree_compare_key,
		.rbto_node_offset = offsetof(struct buf, b_u.u_rbnode),
		.rbto_context = NULL,
	};

	q->cq_sortby = sortby;
	/* XXX copy ops to workaround rbtree.h API limitation */
	q->cq_ops = cscan_tree_ops;
	q->cq_ops.rbto_context = q;
	rb_tree_init(&q->cq_buffers, &q->cq_ops);
}

/*
 * Per-priority CSCAN.
 *
 * XXX probably we should have a way to raise the priority of
 * on-queue requests.
 */
#define	PRIOCSCAN_NQUEUE	3
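
/*
 * One CSCAN queue per buffer priority: index 0 serves BPRIO_TIMECRITICAL,
 * 1 serves BPRIO_TIMELIMITED, and 2 serves BPRIO_TIMENONCRITICAL (see
 * bufq_priocscan_selectqueue() below).  A lower index means higher
 * priority and a larger burst allowance.
 */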

struct priocscan_queue {
	struct cscan_queue q_queue;
	unsigned int q_burst;
};

struct bufq_priocscan {
	struct priocscan_queue bq_queue[PRIOCSCAN_NQUEUE];

#if defined(PRIOCSCAN_USE_GLOBAL_POSITION)
	/*
	 * XXX using a "global" head position can reduce positioning time
	 * when switching between queues, although it might affect fairness.
	 */
	struct cscan_key bq_lastkey;
#endif
};

/*
 * how many requests to serve from a queue while requests are pending
 * on other queues.
 *
 * XXX tune
 * be careful: while making these values larger likely
 * increases the total throughput, it can also increase latency
 * for some workloads.
 */
const int priocscan_burst[] = {
	64, 16, 4
};
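
/*
 * For example, with all three queues continuously backed up, each cycle
 * serves up to 64 time-critical, 16 time-limited, and 4 non-critical
 * requests, i.e. roughly a 64:16:4 split of the bandwidth by request
 * count (not by bytes).
 */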

static void bufq_priocscan_init(struct bufq_state *);
static void bufq_priocscan_put(struct bufq_state *, struct buf *);
static struct buf *bufq_priocscan_get(struct bufq_state *, int);

BUFQ_DEFINE(priocscan, 40, bufq_priocscan_init);
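
/*
 * BUFQ_DEFINE registers this strategy under the name "priocscan" with
 * priority 40.  A driver would typically select it indirectly, e.g.
 * (a sketch; "sc_bufq" is a hypothetical softc member):
 *
 *	bufq_alloc(&sc->sc_bufq, "priocscan", BUFQ_SORT_RAWBLOCK);
 *
 * after which bufq_put()/bufq_get() dispatch to the routines below.
 */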

static inline struct cscan_queue *bufq_priocscan_selectqueue(
    struct bufq_priocscan *, const struct buf *);

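/*
 * bufq_priocscan_selectqueue: map a buffer's priority (BIO_GETPRIO) to
 * the corresponding CSCAN queue; time-critical i/o goes to queue 0,
 * which has the largest burst allowance.
 */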
static inline struct cscan_queue *
bufq_priocscan_selectqueue(struct bufq_priocscan *q, const struct buf *bp)
{
	static const int priocscan_priomap[] = {
		[BPRIO_TIMENONCRITICAL] = 2,
		[BPRIO_TIMELIMITED] = 1,
		[BPRIO_TIMECRITICAL] = 0
	};

	return &q->bq_queue[priocscan_priomap[BIO_GETPRIO(bp)]].q_queue;
}

static void
bufq_priocscan_put(struct bufq_state *bufq, struct buf *bp)
{
	struct bufq_priocscan *q = bufq->bq_private;
	struct cscan_queue *cq;

	cq = bufq_priocscan_selectqueue(q, bp);
	cscan_put(cq, bp);
}

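/*
 * bufq_priocscan_get: select the next request to serve.  In outline:
 * if only one queue is non-empty, serve it without charging its burst
 * count; otherwise serve the highest priority non-empty queue that still
 * has burst count left, consuming one unit per request.  When every
 * non-empty queue has exhausted its burst, refill all the burst counts
 * and start over from the highest priority queue.
 */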
static struct buf *
bufq_priocscan_get(struct bufq_state *bufq, int remove)
{
	struct bufq_priocscan *q = bufq->bq_private;
	struct priocscan_queue *pq, *npq;
	struct priocscan_queue *first; /* highest priority non-empty queue */
	const struct priocscan_queue *epq;
	struct buf *bp;
	bool single; /* true if there's only one non-empty queue */

	/*
	 * find the highest priority non-empty queue.
	 */
	pq = &q->bq_queue[0];
	epq = pq + PRIOCSCAN_NQUEUE;
	for (; pq < epq; pq++) {
		if (!cscan_empty(&pq->q_queue)) {
			break;
		}
	}
	if (pq == epq) {
		/*
		 * all our queues are empty.  there's nothing to serve.
		 */
		return NULL;
	}
	first = pq;

	/*
	 * scan the rest of the queues.
	 *
	 * if we have two or more non-empty queues, we serve the highest
	 * priority one with a non-zero burst count.
	 */
	single = true;
	for (npq = pq + 1; npq < epq; npq++) {
		if (!cscan_empty(&npq->q_queue)) {
			/*
			 * we found another non-empty queue.
			 * it means that a queue needs to consume its burst
			 * count to be served.
			 */
			single = false;

			/*
			 * check if our current candidate queue has already
			 * exhausted its burst count.
			 */
			if (pq->q_burst > 0) {
				break;
			}
			pq = npq;
		}
	}
	if (single) {
		/*
		 * there's only one non-empty queue.
		 * just serve it without consuming its burst count.
		 */
		KASSERT(pq == first);
	} else {
		/*
		 * there are two or more non-empty queues.
		 */
		if (pq->q_burst == 0) {
			/*
			 * no queue can be served; every non-empty queue
			 * has already exhausted its burst count.
			 */
			unsigned int i;
#ifdef DEBUG
			for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
				pq = &q->bq_queue[i];
				if (!cscan_empty(&pq->q_queue) && pq->q_burst) {
					panic("%s: inconsistent burst count",
					    __func__);
				}
			}
#endif /* DEBUG */
			/*
			 * reset burst counts.
			 */
			if (remove) {
				for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
					pq = &q->bq_queue[i];
					pq->q_burst = priocscan_burst[i];
				}
			}

			/*
			 * serve the highest priority non-empty queue.
			 */
			pq = first;
		}
		/*
		 * consume the burst count.
		 *
		 * XXX account only by number of requests.  is it good enough?
		 */
		if (remove) {
			KASSERT(pq->q_burst > 0);
			pq->q_burst--;
		}
	}

	/*
	 * finally, get a request from the selected queue.
	 */
	KDASSERT(!cscan_empty(&pq->q_queue));
	bp = cscan_get(&pq->q_queue, remove,
#if defined(PRIOCSCAN_USE_GLOBAL_POSITION)
	    &q->bq_lastkey
#else /* defined(PRIOCSCAN_USE_GLOBAL_POSITION) */
	    &pq->q_queue.cq_lastkey
#endif /* defined(PRIOCSCAN_USE_GLOBAL_POSITION) */
	    );
	KDASSERT(bp != NULL);
	KDASSERT(&pq->q_queue == bufq_priocscan_selectqueue(q, bp));

	return bp;
}

static struct buf *
bufq_priocscan_cancel(struct bufq_state *bufq, struct buf *bp)
{
	struct bufq_priocscan * const q = bufq->bq_private;
	unsigned int i;

	for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
		struct cscan_queue * const cq = &q->bq_queue[i].q_queue;
		struct buf *it;

		/*
		 * XXX probably could be faster but the cancel functionality
		 * is not widely used anyway.
		 */
		RB_TREE_FOREACH(it, &cq->cq_buffers) {
			if (it == bp) {
				rb_tree_remove_node(&cq->cq_buffers, bp);
				return bp;
			}
		}
	}
	return NULL;
}

static void
bufq_priocscan_fini(struct bufq_state *bufq)
{

	KASSERT(bufq->bq_private != NULL);
	kmem_free(bufq->bq_private, sizeof(struct bufq_priocscan));
}

static void
bufq_priocscan_init(struct bufq_state *bufq)
{
	struct bufq_priocscan *q;
	const int sortby = bufq->bq_flags & BUFQ_SORT_MASK;
	unsigned int i;

	bufq->bq_get = bufq_priocscan_get;
	bufq->bq_put = bufq_priocscan_put;
	bufq->bq_cancel = bufq_priocscan_cancel;
	bufq->bq_fini = bufq_priocscan_fini;
	bufq->bq_private = kmem_zalloc(sizeof(struct bufq_priocscan), KM_SLEEP);

	q = bufq->bq_private;
	for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
		struct cscan_queue *cq = &q->bq_queue[i].q_queue;

		cscan_init(cq, sortby);
	}
}