/*	$NetBSD: bufq_priocscan.c,v 1.21 2017/05/04 11:03:27 kamil Exp $	*/

/*-
 * Copyright (c)2004,2005,2006,2008,2009,2011,2012 YAMAMOTO Takashi,
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: bufq_priocscan.c,v 1.21 2017/05/04 11:03:27 kamil Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/bufq_impl.h>
#include <sys/kmem.h>
#include <sys/rbtree.h>
#include <sys/module.h>

#undef	PRIOCSCAN_USE_GLOBAL_POSITION

/*
 * Cyclical scan (CSCAN)
 */
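/*
 * In this scheme, pending requests are kept sorted by disk position
 * and served in ascending order.  When no request lies at or beyond
 * the current head position, the scan wraps around to the lowest
 * pending position and sweeps forward again.  Unlike an elevator that
 * reverses direction, this bounds the wait of any request to roughly
 * one full sweep.
 */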

struct cscan_key {
	daddr_t	k_rawblkno;
	int k_cylinder;
};

struct cscan_queue {
	rb_tree_t cq_buffers;		/* ordered list of buffers */
#if !defined(PRIOCSCAN_USE_GLOBAL_POSITION)
	struct cscan_key cq_lastkey;	/* key of last request */
#endif /* !defined(PRIOCSCAN_USE_GLOBAL_POSITION) */
	int cq_sortby;			/* BUFQ_SORT_MASK */
	rb_tree_ops_t cq_ops;
};

static signed int
buf_cmp(const struct buf *b1, const struct buf *b2, int sortby)
{

	if (buf_inorder(b2, b1, sortby)) {
		return 1;	/* b1 > b2 */
	}
	if (buf_inorder(b1, b2, sortby)) {
		return -1;	/* b1 < b2 */
	}
	return 0;
}

/* return positive if n1 > n2 */
static signed int
cscan_tree_compare_nodes(void *context, const void *n1, const void *n2)
{
	const struct cscan_queue * const q = context;
	const struct buf * const b1 = n1;
	const struct buf * const b2 = n2;
	const int sortby = q->cq_sortby;
	const int diff = buf_cmp(b1, b2, sortby);

	if (diff != 0) {
		return diff;
	}

	/*
	 * XXX rawblkno/cylinder might not be unique.  eg. unbuffered i/o
	 */
	if (b1 > b2) {
		return 1;
	}
	if (b1 < b2) {
		return -1;
	}
	return 0;
}

/* return positive if n1 > k2 */
static signed int
cscan_tree_compare_key(void *context, const void *n1, const void *k2)
{
	const struct cscan_queue * const q = context;
	const struct buf * const b1 = n1;
	const struct cscan_key * const key = k2;
	const struct buf tmp = {
		.b_rawblkno = key->k_rawblkno,
		.b_cylinder = key->k_cylinder,
	};
	const struct buf *b2 = &tmp;
	const int sortby = q->cq_sortby;

	return buf_cmp(b1, b2, sortby);
}

static void __unused
cscan_dump(struct cscan_queue *cq)
{
	const int sortby = cq->cq_sortby;
	struct buf *bp;

	RB_TREE_FOREACH(bp, &cq->cq_buffers) {
		if (sortby == BUFQ_SORT_RAWBLOCK) {
			printf(" %jd", (intmax_t)bp->b_rawblkno);
		} else {
			printf(" %jd/%jd",
			    (intmax_t)bp->b_cylinder, (intmax_t)bp->b_rawblkno);
		}
	}
}

static inline bool
cscan_empty(struct cscan_queue *q)
{

	/* XXX this might do more work than necessary */
	return rb_tree_iterate(&q->cq_buffers, NULL, RB_DIR_LEFT) == NULL;
}

static void
cscan_put(struct cscan_queue *q, struct buf *bp)
{
	struct buf *obp __diagused;

	obp = rb_tree_insert_node(&q->cq_buffers, bp);
	KASSERT(obp == bp); /* see cscan_tree_compare_nodes */
}
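/*
 * Note: rb_tree_insert_node() returns the already-present node when an
 * equal key exists, instead of inserting a duplicate.  The buffer
 * address tiebreak in cscan_tree_compare_nodes() makes the ordering
 * total, so the insertion always succeeds and the KASSERT above holds.
 */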

static struct buf *
cscan_get(struct cscan_queue *q, int remove, struct cscan_key *key)
{
	struct buf *bp;

	bp = rb_tree_find_node_geq(&q->cq_buffers, key);
	KDASSERT(bp == NULL || cscan_tree_compare_key(q, bp, key) >= 0);
	if (bp == NULL) {
		bp = rb_tree_iterate(&q->cq_buffers, NULL, RB_DIR_LEFT);
		KDASSERT(cscan_tree_compare_key(q, bp, key) < 0);
	}
	if (bp != NULL && remove) {
#if defined(DEBUG)
		struct buf *nbp;
#endif /* defined(DEBUG) */

		rb_tree_remove_node(&q->cq_buffers, bp);
		/*
		 * remember the head position.
		 */
		key->k_cylinder = bp->b_cylinder;
		key->k_rawblkno = bp->b_rawblkno + (bp->b_bcount >> DEV_BSHIFT);
#if defined(DEBUG)
		nbp = rb_tree_find_node_geq(&q->cq_buffers, key);
		if (nbp != NULL && cscan_tree_compare_nodes(q, nbp, bp) < 0) {
			panic("%s: wrong order %p < %p\n", __func__,
			    nbp, bp);
		}
#endif /* defined(DEBUG) */
	}
	return bp;
}
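/*
 * A worked example (assuming BUFQ_SORT_RAWBLOCK, ignoring cylinders):
 * with pending blocks { 10, 20, 30 } and a last key of 25, the geq
 * lookup returns 30; with a last key of 35 it fails and the scan wraps
 * around to the leftmost node, 10.  On removal the key is advanced
 * past the end of the request, so the next get resumes the sweep from
 * there instead of revisiting the same position.
 */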

static void
cscan_init(struct cscan_queue *q, int sortby)
{
	static const rb_tree_ops_t cscan_tree_ops = {
		.rbto_compare_nodes = cscan_tree_compare_nodes,
		.rbto_compare_key = cscan_tree_compare_key,
		.rbto_node_offset = offsetof(struct buf, b_u.u_rbnode),
		.rbto_context = NULL,
	};

	q->cq_sortby = sortby;
	/*
	 * XXX copy the ops to work around an rbtree.h API limitation:
	 * the per-tree context pointer lives inside the (normally
	 * shared, static const) ops structure.
	 */
	q->cq_ops = cscan_tree_ops;
	q->cq_ops.rbto_context = q;
	rb_tree_init(&q->cq_buffers, &q->cq_ops);
}

/*
 * Per-priority CSCAN.
 *
 * XXX probably we should have a way to raise
 * priority of the on-queue requests.
 */
#define	PRIOCSCAN_NQUEUE	3

struct priocscan_queue {
	struct cscan_queue q_queue;
	unsigned int q_burst;
};

struct bufq_priocscan {
	struct priocscan_queue bq_queue[PRIOCSCAN_NQUEUE];

#if defined(PRIOCSCAN_USE_GLOBAL_POSITION)
	/*
	 * XXX using a "global" head position can reduce positioning time
	 * when switching between queues, although it might hurt fairness.
	 */
	struct cscan_key bq_lastkey;
#endif
};

/*
 * how many requests to serve from a queue while requests are pending
 * on the other queues.
 *
 * XXX tune
 * be careful: making these values larger likely increases the total
 * throughput, but it can also increase latencies for some workloads.
 */
const int priocscan_burst[] = {
	64, 16, 4
};
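/*
 * A rough worked example: while all three queues stay non-empty, each
 * cycle of burst resets serves 64 time-critical, 16 time-limited and
 * 4 time-noncritical requests, i.e. a 16:4:1 split by request count.
 */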

static void bufq_priocscan_init(struct bufq_state *);
static void bufq_priocscan_put(struct bufq_state *, struct buf *);
static struct buf *bufq_priocscan_get(struct bufq_state *, int);

BUFQ_DEFINE(priocscan, 40, bufq_priocscan_init);
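/*
 * A minimal usage sketch (driver side, not part of this file): a disk
 * driver would typically select this strategy by name, e.g.
 *
 *	struct bufq_state *bufq;
 *
 *	bufq_alloc(&bufq, "priocscan", BUFQ_SORT_RAWBLOCK);
 *	bufq_put(bufq, bp);
 *	...
 *	bp = bufq_get(bufq);
 *
 * where "priocscan" matches the name registered by BUFQ_DEFINE above.
 */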

static inline struct cscan_queue *bufq_priocscan_selectqueue(
    struct bufq_priocscan *, const struct buf *);

static inline struct cscan_queue *
bufq_priocscan_selectqueue(struct bufq_priocscan *q, const struct buf *bp)
{
	static const int priocscan_priomap[] = {
		[BPRIO_TIMENONCRITICAL] = 2,
		[BPRIO_TIMELIMITED] = 1,
		[BPRIO_TIMECRITICAL] = 0
	};

	return &q->bq_queue[priocscan_priomap[BIO_GETPRIO(bp)]].q_queue;
}
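/*
 * Note that the map inverts the BPRIO_* values: time-critical buffers
 * land in bq_queue[0], which is scanned first by bufq_priocscan_get()
 * and gets the largest burst count in priocscan_burst[].
 */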

static void
bufq_priocscan_put(struct bufq_state *bufq, struct buf *bp)
{
	struct bufq_priocscan *q = bufq_private(bufq);
	struct cscan_queue *cq;

	cq = bufq_priocscan_selectqueue(q, bp);
	cscan_put(cq, bp);
}

static struct buf *
bufq_priocscan_get(struct bufq_state *bufq, int remove)
{
	struct bufq_priocscan *q = bufq_private(bufq);
	struct priocscan_queue *pq, *npq;
	struct priocscan_queue *first; /* highest priority non-empty queue */
	const struct priocscan_queue *epq;
	struct buf *bp;
	bool single; /* true if there's only one non-empty queue */

	/*
	 * find the highest priority non-empty queue.
	 */
	pq = &q->bq_queue[0];
	epq = pq + PRIOCSCAN_NQUEUE;
	for (; pq < epq; pq++) {
		if (!cscan_empty(&pq->q_queue)) {
			break;
		}
	}
	if (pq == epq) {
		/*
		 * all our queues are empty.  there's nothing to serve.
		 */
		return NULL;
	}
	first = pq;

	/*
	 * scan the rest of the queues.
	 *
	 * if we have two or more non-empty queues, we serve the highest
	 * priority one with a non-zero burst count.
	 */
	single = true;
	for (npq = pq + 1; npq < epq; npq++) {
		if (!cscan_empty(&npq->q_queue)) {
			/*
			 * we found another non-empty queue.
			 * it means that a queue needs to consume its burst
			 * count to be served.
			 */
			single = false;

			/*
			 * check if our current candidate queue has already
			 * exhausted its burst count.
			 */
			if (pq->q_burst > 0) {
				break;
			}
			pq = npq;
		}
	}
	if (single) {
		/*
		 * there's only one non-empty queue.
		 * just serve it without consuming its burst count.
		 */
		KASSERT(pq == first);
	} else {
		/*
		 * there are two or more non-empty queues.
		 */
		if (pq->q_burst == 0) {
			/*
			 * no queue can be served because they have all
			 * exhausted their burst counts.
			 */
			unsigned int i;
#ifdef DEBUG
			for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
				pq = &q->bq_queue[i];
				if (!cscan_empty(&pq->q_queue) && pq->q_burst) {
					panic("%s: inconsistent burst count",
					    __func__);
				}
			}
#endif /* DEBUG */
			/*
			 * reset burst counts.
			 */
			if (remove) {
				for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
					pq = &q->bq_queue[i];
					pq->q_burst = priocscan_burst[i];
				}
			}

			/*
			 * serve the highest priority non-empty queue.
			 */
			pq = first;
		}
		/*
		 * consume the burst count.
		 *
		 * XXX account only by number of requests.  is it good enough?
		 */
		if (remove) {
			KASSERT(pq->q_burst > 0);
			pq->q_burst--;
		}
	}

	/*
	 * finally, get a request from the selected queue.
	 */
	KDASSERT(!cscan_empty(&pq->q_queue));
	bp = cscan_get(&pq->q_queue, remove,
#if defined(PRIOCSCAN_USE_GLOBAL_POSITION)
	    &q->bq_lastkey
#else /* defined(PRIOCSCAN_USE_GLOBAL_POSITION) */
	    &pq->q_queue.cq_lastkey
#endif /* defined(PRIOCSCAN_USE_GLOBAL_POSITION) */
	    );
	KDASSERT(bp != NULL);
	KDASSERT(&pq->q_queue == bufq_priocscan_selectqueue(q, bp));

	return bp;
}
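/*
 * Note that when "remove" is zero the above acts as a peek: a buffer
 * is returned but nothing is dequeued, burst counts are neither reset
 * nor consumed, and the per-queue head position is left untouched.
 */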

static struct buf *
bufq_priocscan_cancel(struct bufq_state *bufq, struct buf *bp)
{
	struct bufq_priocscan * const q = bufq_private(bufq);
	unsigned int i;

	for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
		struct cscan_queue * const cq = &q->bq_queue[i].q_queue;
		struct buf *it;

		/*
		 * XXX probably could be faster but the cancel functionality
		 * is not widely used anyway.
		 */
		RB_TREE_FOREACH(it, &cq->cq_buffers) {
			if (it == bp) {
				rb_tree_remove_node(&cq->cq_buffers, bp);
				return bp;
			}
		}
	}
	return NULL;
}

static void
bufq_priocscan_fini(struct bufq_state *bufq)
{

	KASSERT(bufq->bq_private != NULL);
	kmem_free(bufq->bq_private, sizeof(struct bufq_priocscan));
}

static void
bufq_priocscan_init(struct bufq_state *bufq)
{
	struct bufq_priocscan *q;
	const int sortby = bufq->bq_flags & BUFQ_SORT_MASK;
	unsigned int i;

	bufq->bq_get = bufq_priocscan_get;
	bufq->bq_put = bufq_priocscan_put;
	bufq->bq_cancel = bufq_priocscan_cancel;
	bufq->bq_fini = bufq_priocscan_fini;
	bufq->bq_private = kmem_zalloc(sizeof(struct bufq_priocscan), KM_SLEEP);

	q = bufq->bq_private;
	for (i = 0; i < PRIOCSCAN_NQUEUE; i++) {
		struct cscan_queue *cq = &q->bq_queue[i].q_queue;

		cscan_init(cq, sortby);
	}
}

MODULE(MODULE_CLASS_BUFQ, bufq_priocscan, NULL);

static int
bufq_priocscan_modcmd(modcmd_t cmd, void *opaque)
{

	switch (cmd) {
	case MODULE_CMD_INIT:
		return bufq_register(&bufq_strat_priocscan);
	case MODULE_CMD_FINI:
		return bufq_unregister(&bufq_strat_priocscan);
	default:
		return ENOTTY;
	}
}