/*	$NetBSD: rf_diskqueue.c,v 1.24 2003/12/29 03:33:47 oster Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Mark Holland
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/****************************************************************************
 *
 * rf_diskqueue.c -- higher-level disk queue code
 *
 * the routines here are a generic wrapper around the actual queueing
 * routines.  The code here implements thread scheduling, synchronization,
 * and locking ops (see below) on top of the lower-level queueing code.
 *
 * to support atomic RMW, we implement "locking operations".  When a
 * locking op is dispatched to the lower levels of the driver, the
 * queue is locked, and no further I/Os are dispatched until the queue
 * receives & completes a corresponding "unlocking operation".  This
 * code relies on the higher layers to guarantee that a locking op
 * will always be eventually followed by an unlocking op.  The model
 * is that the higher layers are structured so locking and unlocking
 * ops occur in pairs, i.e. an unlocking op cannot be generated until
 * after a locking op reports completion.  There is no good way to
 * check to see that an unlocking op "corresponds" to the op that
 * currently has the queue locked, so we make no such attempt.  Since
 * by definition there can be only one locking op outstanding on a
 * disk, this should not be a problem.
 *
 * In the kernel, we allow multiple I/Os to be concurrently dispatched
 * to the disk driver.  In order to support locking ops in this
 * environment, when we decide to do a locking op, we stop dispatching
 * new I/Os and wait until all dispatched I/Os have completed before
 * dispatching the locking op.
 *
 * Unfortunately, the code is different in the 3 different operating
 * states (user level, kernel, simulator).  In the kernel, I/O is
 * non-blocking, and we have no disk threads to dispatch for us.
 * Therefore, we have to dispatch new I/Os to the scsi driver at the
 * time of enqueue, and also at the time of completion.  At user
 * level, I/O is blocking, and so only the disk threads may dispatch
 * I/Os.  Thus at user level, all we can do at enqueue time is enqueue
 * and wake up the disk thread to do the dispatch.
 *
 ****************************************************************************/

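/*
 * Illustrative sketch only (not part of the original source): a caller
 * wanting an atomic read-modify-write of a sector range would issue the
 * read as a locking request and the subsequent write as the matching
 * unlocking request, e.g. (assuming the RF_LOCK_DISK_QUEUE and
 * RF_UNLOCK_DISK_QUEUE flag values from rf_diskqueue.h, and hypothetical
 * variable names):
 *
 *	rd = rf_CreateDiskQueueData(RF_IO_TYPE_READ, ssect, nsect, buf,
 *	    psid, which_ru, wakeF, arg, NULL, tracerec, raidPtr,
 *	    RF_LOCK_DISK_QUEUE, kb_proc);
 *	rf_DiskIOEnqueue(queue, rd, RF_IO_NORMAL_PRIORITY);
 *	... when the read completes, compute the new contents, then ...
 *	wr = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, ssect, nsect, buf,
 *	    psid, which_ru, wakeF, arg, NULL, tracerec, raidPtr,
 *	    RF_UNLOCK_DISK_QUEUE, kb_proc);
 *	rf_DiskIOEnqueue(queue, wr, RF_IO_NORMAL_PRIORITY);
 *
 * The queue stays locked between the two dispatches, so no other I/O can
 * slip in between the read and the write.
 */
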
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_diskqueue.c,v 1.24 2003/12/29 03:33:47 oster Exp $");

#include <dev/raidframe/raidframevar.h>

#include "rf_threadstuff.h"
#include "rf_raid.h"
#include "rf_diskqueue.h"
#include "rf_alloclist.h"
#include "rf_acctrace.h"
#include "rf_etimer.h"
#include "rf_general.h"
#include "rf_debugprint.h"
#include "rf_shutdown.h"
#include "rf_cvscan.h"
#include "rf_sstf.h"
#include "rf_fifo.h"
#include "rf_kintf.h"

static int init_dqd(RF_DiskQueueData_t *);
static void clean_dqd(RF_DiskQueueData_t *);
static void rf_ShutdownDiskQueueSystem(void *);

#ifndef RF_DEBUG_DISKQUEUE
#define RF_DEBUG_DISKQUEUE 0
#endif

#if RF_DEBUG_DISKQUEUE
#define Dprintf1(s,a) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf2(s,a,b) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf3(s,a,b,c) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
#else
#define Dprintf1(s,a)
#define Dprintf2(s,a,b)
#define Dprintf3(s,a,b,c)
#endif

/*****************************************************************************
 *
 * the disk queue switch defines all the functions used in the
 * different queueing disciplines: queue ID, init routine, enqueue
 * routine, dequeue routine
 *
 ****************************************************************************/

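/*
 * Each entry maps a queueing policy name to its create, enqueue, dequeue,
 * peek and promote routines.  The policy is selected at configuration time
 * by matching cfgPtr->diskQueueType against the name (see
 * rf_ConfigureDiskQueues below); unknown names fall back to the first
 * entry, "fifo".
 */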
static const RF_DiskQueueSW_t diskqueuesw[] = {
	{"fifo",		/* FIFO */
		rf_FifoCreate,
		rf_FifoEnqueue,
		rf_FifoDequeue,
		rf_FifoPeek,
		rf_FifoPromote},

	{"cvscan",		/* cvscan */
		rf_CvscanCreate,
		rf_CvscanEnqueue,
		rf_CvscanDequeue,
		rf_CvscanPeek,
		rf_CvscanPromote},

	{"sstf",		/* shortest seek time first */
		rf_SstfCreate,
		rf_SstfEnqueue,
		rf_SstfDequeue,
		rf_SstfPeek,
		rf_SstfPromote},

	{"scan",		/* SCAN (two-way elevator) */
		rf_ScanCreate,
		rf_SstfEnqueue,
		rf_ScanDequeue,
		rf_ScanPeek,
		rf_SstfPromote},

	{"cscan",		/* CSCAN (one-way elevator) */
		rf_CscanCreate,
		rf_SstfEnqueue,
		rf_CscanDequeue,
		rf_CscanPeek,
		rf_SstfPromote},

};
#define NUM_DISK_QUEUE_TYPES (sizeof(diskqueuesw)/sizeof(RF_DiskQueueSW_t))

static struct pool rf_dqd_pool;
#define RF_MAX_FREE_DQD 256
#define RF_DQD_INC 16
#define RF_DQD_INITIAL 64

#include <sys/buf.h>

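/*
 * Allocate and zero the struct buf used to ship this request to the kernel
 * disk driver; returns ENOMEM if no memory is available.
 */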
static int
init_dqd(dqd)
	RF_DiskQueueData_t *dqd;
{

	dqd->bp = (struct buf *) malloc(sizeof(struct buf),
					M_RAIDFRAME, M_NOWAIT);
	if (dqd->bp == NULL) {
		return (ENOMEM);
	}
	memset(dqd->bp, 0, sizeof(struct buf));	/* if you don't do it, nobody
						 * else will.. */
	return (0);
}

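/* Release the struct buf allocated by init_dqd(). */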
static void
clean_dqd(dqd)
	RF_DiskQueueData_t *dqd;
{
	free(dqd->bp, M_RAIDFRAME);
}

/* configures a single disk queue */
int
rf_ConfigureDiskQueue(
	RF_Raid_t * raidPtr,
	RF_DiskQueue_t * diskqueue,
	RF_RowCol_t c,
	const RF_DiskQueueSW_t * p,
	RF_SectorCount_t sectPerDisk,
	dev_t dev,
	int maxOutstanding,
	RF_ShutdownList_t ** listp,
	RF_AllocListElem_t * clList)
{
	int rc;

	diskqueue->col = c;
	diskqueue->qPtr = p;
	diskqueue->qHdr = (p->Create) (sectPerDisk, clList, listp);
	diskqueue->dev = dev;
	diskqueue->numOutstanding = 0;
	diskqueue->queueLength = 0;
	diskqueue->maxOutstanding = maxOutstanding;
	diskqueue->curPriority = RF_IO_NORMAL_PRIORITY;
	diskqueue->nextLockingOp = NULL;
	diskqueue->numWaiting = 0;
	diskqueue->flags = 0;
	diskqueue->raidPtr = raidPtr;
	diskqueue->rf_cinfo = &raidPtr->raid_cinfo[c];
	rc = rf_create_managed_mutex(listp, &diskqueue->mutex);
	if (rc) {
		rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc);
		return (rc);
	}
	rc = rf_create_managed_cond(listp, &diskqueue->cond);
	if (rc) {
		rf_print_unable_to_init_cond(__FILE__, __LINE__, rc);
		return (rc);
	}
	return (0);
}

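/* Shutdown hook: destroy the pool of RF_DiskQueueData_t structures. */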
static void
rf_ShutdownDiskQueueSystem(ignored)
	void *ignored;
{
	pool_destroy(&rf_dqd_pool);
}

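/*
 * Set up the pool from which RF_DiskQueueData_t structures are allocated,
 * prime it with RF_DQD_INITIAL entries, and register the shutdown hook
 * that tears it down again.
 */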
int
rf_ConfigureDiskQueueSystem(listp)
	RF_ShutdownList_t **listp;
{
	int rc;

	pool_init(&rf_dqd_pool, sizeof(RF_DiskQueueData_t), 0, 0, 0,
		  "rf_dqd_pl", NULL);
	pool_sethiwat(&rf_dqd_pool, RF_MAX_FREE_DQD);
	pool_prime(&rf_dqd_pool, RF_DQD_INITIAL);

	rc = rf_ShutdownCreate(listp, rf_ShutdownDiskQueueSystem, NULL);
	if (rc) {
		rf_print_unable_to_add_shutdown(__FILE__, __LINE__, rc);
		rf_ShutdownDiskQueueSystem(NULL);
		return (rc);
	}

	return (0);
}

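/*
 * Configure one queue per data column plus one per spare; select the
 * queueing discipline named by cfgPtr->diskQueueType, falling back to
 * the first entry in diskqueuesw if the name is not recognized.
 */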
int
rf_ConfigureDiskQueues(
	RF_ShutdownList_t ** listp,
	RF_Raid_t * raidPtr,
	RF_Config_t * cfgPtr)
{
	RF_DiskQueue_t *diskQueues, *spareQueues;
	const RF_DiskQueueSW_t *p;
	RF_RowCol_t r, c;
	int rc, i;

	raidPtr->maxQueueDepth = cfgPtr->maxOutstandingDiskReqs;

	for (p = NULL, i = 0; i < NUM_DISK_QUEUE_TYPES; i++) {
		if (!strcmp(diskqueuesw[i].queueType, cfgPtr->diskQueueType)) {
			p = &diskqueuesw[i];
			break;
		}
	}
	if (p == NULL) {
		RF_ERRORMSG2("Unknown queue type \"%s\". Using %s\n", cfgPtr->diskQueueType, diskqueuesw[0].queueType);
		p = &diskqueuesw[0];
	}
	raidPtr->qType = p;

	RF_MallocAndAdd(diskQueues,
			(raidPtr->numCol + RF_MAXSPARE) *
			sizeof(RF_DiskQueue_t), (RF_DiskQueue_t *),
			raidPtr->cleanupList);
	if (diskQueues == NULL)
		return (ENOMEM);
	raidPtr->Queues = diskQueues;

	for (c = 0; c < raidPtr->numCol; c++) {
		rc = rf_ConfigureDiskQueue(raidPtr, &diskQueues[c],
					   c, p,
					   raidPtr->sectorsPerDisk,
					   raidPtr->Disks[c].dev,
					   cfgPtr->maxOutstandingDiskReqs,
					   listp, raidPtr->cleanupList);
		if (rc)
			return (rc);
	}

	spareQueues = &raidPtr->Queues[raidPtr->numCol];
	for (r = 0; r < raidPtr->numSpare; r++) {
		rc = rf_ConfigureDiskQueue(raidPtr, &spareQueues[r],
					   raidPtr->numCol + r, p,
					   raidPtr->sectorsPerDisk,
					   raidPtr->Disks[raidPtr->numCol + r].dev,
					   cfgPtr->maxOutstandingDiskReqs, listp,
					   raidPtr->cleanupList);
		if (rc)
			return (rc);
	}
	return (0);
}

/* Enqueue a disk I/O
 *
 * Unfortunately, we have to do things differently in the different
 * environments (simulator, user-level, kernel).
 * At user level, all I/O is blocking, so we have 1 or more threads/disk
 * and the thread that enqueues is different from the thread that dequeues.
 * In the kernel, I/O is non-blocking and so we'd like to have multiple
 * I/Os outstanding on the physical disks when possible.
 *
 * when any request arrives at a queue, we have two choices:
 *    dispatch it to the lower levels
 *    queue it up
 *
 * kernel rules for when to do what:
 *    locking request:  queue empty => dispatch and lock queue,
 *                      else queue it
 *    unlocking req  :  always dispatch it
 *    normal req     :  queue empty => dispatch it & set priority
 *                      queue not full & priority is ok => dispatch it
 *                      else queue it
 *
 * user-level rules:
 *    always enqueue.  In the special case of an unlocking op, enqueue
 *    in a special way that will cause the unlocking op to be the next
 *    thing dequeued.
 *
 * simulator rules:
 *    Do the same as at user level, with the sleeps and wakeups suppressed.
 */
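/*
 * Illustrative walk-through of the kernel rules above (a rough sketch, not
 * from the original comments), assuming maxOutstanding is 6: with the queue
 * idle, a normal request is dispatched at once; further normal requests keep
 * being dispatched until six are outstanding, after which they are queued.
 * A locking request that arrives while I/Os are outstanding is queued as
 * well; once the outstanding I/Os drain and the locking request reaches the
 * head of the queue it is dispatched with the queue locked, and nothing else
 * is dispatched until the matching unlocking request completes.
 */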
void
rf_DiskIOEnqueue(queue, req, pri)
	RF_DiskQueue_t *queue;
	RF_DiskQueueData_t *req;
	int pri;
{
	RF_ETIMER_START(req->qtime);
	RF_ASSERT(req->type == RF_IO_TYPE_NOP || req->numSector);
	req->priority = pri;

#if RF_DEBUG_DISKQUEUE
	if (rf_queueDebug && (req->numSector == 0)) {
		printf("Warning: Enqueueing zero-sector access\n");
	}
#endif
	/*
	 * kernel
	 */
	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
	/* locking request */
	if (RF_LOCKING_REQ(req)) {
		if (RF_QUEUE_EMPTY(queue)) {
			Dprintf2("Dispatching pri %d locking op to c %d (queue empty)\n", pri, queue->col);
			RF_LOCK_QUEUE(queue);
			rf_DispatchKernelIO(queue, req);
		} else {
			queue->queueLength++;	/* increment count of number
						 * of requests waiting in this
						 * queue */
			Dprintf2("Enqueueing pri %d locking op to c %d (queue not empty)\n", pri, queue->col);
			req->queue = (void *) queue;
			(queue->qPtr->Enqueue) (queue->qHdr, req, pri);
		}
	}
	/* unlocking request */
	else if (RF_UNLOCKING_REQ(req)) {
		/* we'll do the actual unlock when this I/O completes */
		Dprintf2("Dispatching pri %d unlocking op to c %d\n", pri, queue->col);
		RF_ASSERT(RF_QUEUE_LOCKED(queue));
		rf_DispatchKernelIO(queue, req);
	}
	/* normal request */
	else if (RF_OK_TO_DISPATCH(queue, req)) {
		Dprintf2("Dispatching pri %d regular op to c %d (ok to dispatch)\n", pri, queue->col);
		rf_DispatchKernelIO(queue, req);
	} else {
		queue->queueLength++;	/* increment count of number of
					 * requests waiting in this queue */
		Dprintf2("Enqueueing pri %d regular op to c %d (not ok to dispatch)\n", pri, queue->col);
		req->queue = (void *) queue;
		(queue->qPtr->Enqueue) (queue->qHdr, req, pri);
	}
	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
}


/* get the next set of I/Os started, kernel version only */
void
rf_DiskIOComplete(queue, req, status)
	RF_DiskQueue_t *queue;
	RF_DiskQueueData_t *req;
	int status;
{
	int done = 0;

	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOComplete");

	/* unlock the queue: (1) after an unlocking req completes (2) after a
	 * locking req fails */
	if (RF_UNLOCKING_REQ(req) || (RF_LOCKING_REQ(req) && status)) {
		Dprintf1("DiskIOComplete: unlocking queue at c %d\n", queue->col);
		RF_ASSERT(RF_QUEUE_LOCKED(queue));
		RF_UNLOCK_QUEUE(queue);
	}
	queue->numOutstanding--;
	RF_ASSERT(queue->numOutstanding >= 0);

	/* dispatch requests to the disk until we find one that we can't. */
	/* no reason to continue once we've filled up the queue */
	/* no reason to even start if the queue is locked */

	while (!done && !RF_QUEUE_FULL(queue) && !RF_QUEUE_LOCKED(queue)) {
		if (queue->nextLockingOp) {
			req = queue->nextLockingOp;
			queue->nextLockingOp = NULL;
			Dprintf2("DiskIOComplete: a pri %d locking req was pending at c %d\n", req->priority, queue->col);
		} else {
			req = (queue->qPtr->Dequeue) (queue->qHdr);
			if (req != NULL) {
				Dprintf2("DiskIOComplete: extracting pri %d req from queue at c %d\n", req->priority, queue->col);
			} else {
				Dprintf1("DiskIOComplete: no more requests to extract.\n", "");
			}
		}
		if (req) {
			queue->queueLength--;	/* decrement count of number
						 * of requests waiting in this
						 * queue */
			RF_ASSERT(queue->queueLength >= 0);
		}
		if (!req)
			done = 1;
		else if (RF_LOCKING_REQ(req)) {
			if (RF_QUEUE_EMPTY(queue)) {	/* dispatch it */
				Dprintf2("DiskIOComplete: dispatching pri %d locking req to c %d (queue empty)\n", req->priority, queue->col);
				RF_LOCK_QUEUE(queue);
				rf_DispatchKernelIO(queue, req);
				done = 1;
			} else {	/* put it aside to wait for the queue
					 * to drain */
				Dprintf2("DiskIOComplete: postponing pri %d locking req to c %d\n", req->priority, queue->col);
				RF_ASSERT(queue->nextLockingOp == NULL);
				queue->nextLockingOp = req;
				done = 1;
			}
		} else if (RF_UNLOCKING_REQ(req)) {
			/* should not happen: unlocking ops should not get
			 * queued */
			RF_ASSERT(RF_QUEUE_LOCKED(queue));	/* support it anyway for
								 * the future */
			Dprintf2("DiskIOComplete: dispatching pri %d unl req to c %d (SHOULD NOT SEE THIS)\n", req->priority, queue->col);
			rf_DispatchKernelIO(queue, req);
			done = 1;
		} else if (RF_OK_TO_DISPATCH(queue, req)) {
			Dprintf2("DiskIOComplete: dispatching pri %d regular req to c %d (ok to dispatch)\n", req->priority, queue->col);
			rf_DispatchKernelIO(queue, req);
		} else {
			/* we can't dispatch it, so just re-enqueue it. */
			/* potential trouble here if disk queues batch reqs */
			Dprintf2("DiskIOComplete: re-enqueueing pri %d regular req to c %d\n", req->priority, queue->col);
			queue->queueLength++;
			(queue->qPtr->Enqueue) (queue->qHdr, req, req->priority);
			done = 1;
		}
	}

	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
}

/* promotes accesses tagged with the given parityStripeID from low priority
 * to normal priority.  This promotion is optional, meaning that a queue
 * need not implement it.  If there is no promotion routine associated with
 * a queue, this routine does nothing and returns -1.
 */
int
rf_DiskIOPromote(queue, parityStripeID, which_ru)
	RF_DiskQueue_t *queue;
	RF_StripeNum_t parityStripeID;
	RF_ReconUnitNum_t which_ru;
{
	int retval;

	if (!queue->qPtr->Promote)
		return (-1);
	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
	retval = (queue->qPtr->Promote) (queue->qHdr, parityStripeID, which_ru);
	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
	return (retval);
}

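/*
 * Allocate a request structure from the pool and fill it in.  The supplied
 * start sector is offset by rf_protectedSectors (the reserved region at the
 * start of each component).  Returns NULL if the struct buf for the request
 * cannot be allocated.
 */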
RF_DiskQueueData_t *
rf_CreateDiskQueueData(
	RF_IoType_t typ,
	RF_SectorNum_t ssect,
	RF_SectorCount_t nsect,
	caddr_t buf,
	RF_StripeNum_t parityStripeID,
	RF_ReconUnitNum_t which_ru,
	int (*wakeF) (void *, int),
	void *arg,
	RF_DiskQueueData_t * next,
	RF_AccTraceEntry_t * tracerec,
	void *raidPtr,
	RF_DiskQueueDataFlags_t flags,
	void *kb_proc)
{
	RF_DiskQueueData_t *p;

	p = pool_get(&rf_dqd_pool, PR_WAITOK);
	if (init_dqd(p)) {
		/* no memory for the buffer!?!? */
		pool_put(&rf_dqd_pool, p);
		return (NULL);
	}

	p->sectorOffset = ssect + rf_protectedSectors;
	p->numSector = nsect;
	p->type = typ;
	p->buf = buf;
	p->parityStripeID = parityStripeID;
	p->which_ru = which_ru;
	p->CompleteFunc = wakeF;
	p->argument = arg;
	p->next = next;
	p->tracerec = tracerec;
	p->priority = RF_IO_NORMAL_PRIORITY;
	p->raidPtr = raidPtr;
	p->flags = flags;
	p->b_proc = kb_proc;
	return (p);
}

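/* Return a request structure to the pool, releasing its struct buf first. */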
void
rf_FreeDiskQueueData(p)
	RF_DiskQueueData_t *p;
{
	clean_dqd(p);
	pool_put(&rf_dqd_pool, p);
}