/*	$NetBSD: rf_diskqueue.c,v 1.12 2000/03/04 03:27:13 oster Exp $	*/
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Mark Holland
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29 /****************************************************************************************
30 *
31 * rf_diskqueue.c -- higher-level disk queue code
32 *
33 * the routines here are a generic wrapper around the actual queueing
34 * routines. The code here implements thread scheduling, synchronization,
35 * and locking ops (see below) on top of the lower-level queueing code.
36 *
37 * to support atomic RMW, we implement "locking operations". When a locking op
38 * is dispatched to the lower levels of the driver, the queue is locked, and no further
39 * I/Os are dispatched until the queue receives & completes a corresponding "unlocking
40 * operation". This code relies on the higher layers to guarantee that a locking
41 * op will always be eventually followed by an unlocking op. The model is that
42 * the higher layers are structured so locking and unlocking ops occur in pairs, i.e.
43 * an unlocking op cannot be generated until after a locking op reports completion.
44 * There is no good way to check to see that an unlocking op "corresponds" to the
45 * op that currently has the queue locked, so we make no such attempt. Since by
46 * definition there can be only one locking op outstanding on a disk, this should
47 * not be a problem.
48 *
49 * In the kernel, we allow multiple I/Os to be concurrently dispatched to the disk
50 * driver. In order to support locking ops in this environment, when we decide to
51 * do a locking op, we stop dispatching new I/Os and wait until all dispatched I/Os
52 * have completed before dispatching the locking op.
53 *
54 * Unfortunately, the code is different in the 3 different operating states
55 * (user level, kernel, simulator). In the kernel, I/O is non-blocking, and
56 * we have no disk threads to dispatch for us. Therefore, we have to dispatch
57 * new I/Os to the scsi driver at the time of enqueue, and also at the time
58 * of completion. At user level, I/O is blocking, and so only the disk threads
59 * may dispatch I/Os. Thus at user level, all we can do at enqueue time is
60 * enqueue and wake up the disk thread to do the dispatch.
61 *
62 ***************************************************************************************/
63
64 #include "rf_types.h"
65 #include "rf_threadstuff.h"
66 #include "rf_raid.h"
67 #include "rf_diskqueue.h"
68 #include "rf_alloclist.h"
69 #include "rf_acctrace.h"
70 #include "rf_etimer.h"
71 #include "rf_configure.h"
72 #include "rf_general.h"
73 #include "rf_freelist.h"
74 #include "rf_debugprint.h"
75 #include "rf_shutdown.h"
76 #include "rf_cvscan.h"
77 #include "rf_sstf.h"
78 #include "rf_fifo.h"
79 #include "rf_kintf.h"
80
81 static int init_dqd(RF_DiskQueueData_t *);
82 static void clean_dqd(RF_DiskQueueData_t *);
83 static void rf_ShutdownDiskQueueSystem(void *);
84
/*
 * Debug tracing helpers.  When rf_queueDebug is non-zero they forward the
 * format string s plus one, two or three arguments to rf_debug_printf(),
 * filling the remaining argument slots with NULL.  Each argument is
 * converted to unsigned long and then to void *.
 *
 * The bodies are wrapped in do { } while (0) so that every macro expands
 * to exactly one statement.  A bare "if" here would silently capture the
 * "else" of an enclosing unbraced if/else at the call site.  Callers must
 * terminate the invocation with a semicolon (all existing callers do).
 */
#define Dprintf1(s,a) \
	do { \
		if (rf_queueDebug) \
			rf_debug_printf(s, (void *)((unsigned long)(a)), \
			    NULL, NULL, NULL, NULL, NULL, NULL, NULL); \
	} while (0)
#define Dprintf2(s,a,b) \
	do { \
		if (rf_queueDebug) \
			rf_debug_printf(s, (void *)((unsigned long)(a)), \
			    (void *)((unsigned long)(b)), \
			    NULL, NULL, NULL, NULL, NULL, NULL); \
	} while (0)
#define Dprintf3(s,a,b,c) \
	do { \
		if (rf_queueDebug) \
			rf_debug_printf(s, (void *)((unsigned long)(a)), \
			    (void *)((unsigned long)(b)), \
			    (void *)((unsigned long)(c)), \
			    NULL, NULL, NULL, NULL, NULL); \
	} while (0)
88
89 /*****************************************************************************************
90 *
91 * the disk queue switch defines all the functions used in the different queueing
92 * disciplines
93 * queue ID, init routine, enqueue routine, dequeue routine
94 *
95 ****************************************************************************************/
96
97 static RF_DiskQueueSW_t diskqueuesw[] = {
98 {"fifo", /* FIFO */
99 rf_FifoCreate,
100 rf_FifoEnqueue,
101 rf_FifoDequeue,
102 rf_FifoPeek,
103 rf_FifoPromote},
104
105 {"cvscan", /* cvscan */
106 rf_CvscanCreate,
107 rf_CvscanEnqueue,
108 rf_CvscanDequeue,
109 rf_CvscanPeek,
110 rf_CvscanPromote},
111
112 {"sstf", /* shortest seek time first */
113 rf_SstfCreate,
114 rf_SstfEnqueue,
115 rf_SstfDequeue,
116 rf_SstfPeek,
117 rf_SstfPromote},
118
119 {"scan", /* SCAN (two-way elevator) */
120 rf_ScanCreate,
121 rf_SstfEnqueue,
122 rf_ScanDequeue,
123 rf_ScanPeek,
124 rf_SstfPromote},
125
126 {"cscan", /* CSCAN (one-way elevator) */
127 rf_CscanCreate,
128 rf_SstfEnqueue,
129 rf_CscanDequeue,
130 rf_CscanPeek,
131 rf_SstfPromote},
132
133 };
134 #define NUM_DISK_QUEUE_TYPES (sizeof(diskqueuesw)/sizeof(RF_DiskQueueSW_t))
135
136 static RF_FreeList_t *rf_dqd_freelist;
137
138 #define RF_MAX_FREE_DQD 256
139 #define RF_DQD_INC 16
140 #define RF_DQD_INITIAL 64
141
142 #include <sys/buf.h>
143
144 static int
145 init_dqd(dqd)
146 RF_DiskQueueData_t *dqd;
147 {
148 /* XXX not sure if the following malloc is appropriate... probably not
149 * quite... */
150 dqd->bp = (struct buf *) malloc(sizeof(struct buf),
151 M_RAIDFRAME, M_NOWAIT);
152 if (dqd->bp == NULL) {
153 return (ENOMEM);
154 }
155 memset(dqd->bp, 0, sizeof(struct buf)); /* if you don't do it, nobody
156 * else will.. */
157 return (0);
158 }
159
160 static void
161 clean_dqd(dqd)
162 RF_DiskQueueData_t *dqd;
163 {
164 free(dqd->bp, M_RAIDFRAME);
165 }
166 /* configures a single disk queue */
167
168 int
169 rf_ConfigureDiskQueue(
170 RF_Raid_t * raidPtr,
171 RF_DiskQueue_t * diskqueue,
172 RF_RowCol_t r, /* row & col -- debug only. BZZT not any
173 * more... */
174 RF_RowCol_t c,
175 RF_DiskQueueSW_t * p,
176 RF_SectorCount_t sectPerDisk,
177 dev_t dev,
178 int maxOutstanding,
179 RF_ShutdownList_t ** listp,
180 RF_AllocListElem_t * clList)
181 {
182 int rc;
183
184 diskqueue->row = r;
185 diskqueue->col = c;
186 diskqueue->qPtr = p;
187 diskqueue->qHdr = (p->Create) (sectPerDisk, clList, listp);
188 diskqueue->dev = dev;
189 diskqueue->numOutstanding = 0;
190 diskqueue->queueLength = 0;
191 diskqueue->maxOutstanding = maxOutstanding;
192 diskqueue->curPriority = RF_IO_NORMAL_PRIORITY;
193 diskqueue->nextLockingOp = NULL;
194 diskqueue->unlockingOp = NULL;
195 diskqueue->numWaiting = 0;
196 diskqueue->flags = 0;
197 diskqueue->raidPtr = raidPtr;
198 diskqueue->rf_cinfo = &raidPtr->raid_cinfo[r][c];
199 rc = rf_create_managed_mutex(listp, &diskqueue->mutex);
200 if (rc) {
201 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
202 __LINE__, rc);
203 return (rc);
204 }
205 rc = rf_create_managed_cond(listp, &diskqueue->cond);
206 if (rc) {
207 RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__,
208 __LINE__, rc);
209 return (rc);
210 }
211 return (0);
212 }
213
214 static void
215 rf_ShutdownDiskQueueSystem(ignored)
216 void *ignored;
217 {
218 RF_FREELIST_DESTROY_CLEAN(rf_dqd_freelist, next, (RF_DiskQueueData_t *), clean_dqd);
219 }
220
221 int
222 rf_ConfigureDiskQueueSystem(listp)
223 RF_ShutdownList_t **listp;
224 {
225 int rc;
226
227 RF_FREELIST_CREATE(rf_dqd_freelist, RF_MAX_FREE_DQD,
228 RF_DQD_INC, sizeof(RF_DiskQueueData_t));
229 if (rf_dqd_freelist == NULL)
230 return (ENOMEM);
231 rc = rf_ShutdownCreate(listp, rf_ShutdownDiskQueueSystem, NULL);
232 if (rc) {
233 RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
234 __FILE__, __LINE__, rc);
235 rf_ShutdownDiskQueueSystem(NULL);
236 return (rc);
237 }
238 RF_FREELIST_PRIME_INIT(rf_dqd_freelist, RF_DQD_INITIAL, next,
239 (RF_DiskQueueData_t *), init_dqd);
240 return (0);
241 }
242
243 int
244 rf_ConfigureDiskQueues(
245 RF_ShutdownList_t ** listp,
246 RF_Raid_t * raidPtr,
247 RF_Config_t * cfgPtr)
248 {
249 RF_DiskQueue_t **diskQueues, *spareQueues;
250 RF_DiskQueueSW_t *p;
251 RF_RowCol_t r, c;
252 int rc, i;
253
254 raidPtr->maxQueueDepth = cfgPtr->maxOutstandingDiskReqs;
255
256 for (p = NULL, i = 0; i < NUM_DISK_QUEUE_TYPES; i++) {
257 if (!strcmp(diskqueuesw[i].queueType, cfgPtr->diskQueueType)) {
258 p = &diskqueuesw[i];
259 break;
260 }
261 }
262 if (p == NULL) {
263 RF_ERRORMSG2("Unknown queue type \"%s\". Using %s\n", cfgPtr->diskQueueType, diskqueuesw[0].queueType);
264 p = &diskqueuesw[0];
265 }
266 raidPtr->qType = p;
267 RF_CallocAndAdd(diskQueues, raidPtr->numRow, sizeof(RF_DiskQueue_t *), (RF_DiskQueue_t **), raidPtr->cleanupList);
268 if (diskQueues == NULL) {
269 return (ENOMEM);
270 }
271 raidPtr->Queues = diskQueues;
272 for (r = 0; r < raidPtr->numRow; r++) {
273 RF_CallocAndAdd(diskQueues[r], raidPtr->numCol +
274 ((r == 0) ? RF_MAXSPARE : 0),
275 sizeof(RF_DiskQueue_t), (RF_DiskQueue_t *),
276 raidPtr->cleanupList);
277 if (diskQueues[r] == NULL)
278 return (ENOMEM);
279 for (c = 0; c < raidPtr->numCol; c++) {
280 rc = rf_ConfigureDiskQueue(raidPtr, &diskQueues[r][c],
281 r, c, p,
282 raidPtr->sectorsPerDisk,
283 raidPtr->Disks[r][c].dev,
284 cfgPtr->maxOutstandingDiskReqs,
285 listp, raidPtr->cleanupList);
286 if (rc)
287 return (rc);
288 }
289 }
290
291 spareQueues = &raidPtr->Queues[0][raidPtr->numCol];
292 for (r = 0; r < raidPtr->numSpare; r++) {
293 rc = rf_ConfigureDiskQueue(raidPtr, &spareQueues[r],
294 0, raidPtr->numCol + r, p,
295 raidPtr->sectorsPerDisk,
296 raidPtr->Disks[0][raidPtr->numCol + r].dev,
297 cfgPtr->maxOutstandingDiskReqs, listp,
298 raidPtr->cleanupList);
299 if (rc)
300 return (rc);
301 }
302 return (0);
303 }
304 /* Enqueue a disk I/O
305 *
306 * Unfortunately, we have to do things differently in the different
307 * environments (simulator, user-level, kernel).
308 * At user level, all I/O is blocking, so we have 1 or more threads/disk
309 * and the thread that enqueues is different from the thread that dequeues.
310 * In the kernel, I/O is non-blocking and so we'd like to have multiple
311 * I/Os outstanding on the physical disks when possible.
312 *
313 * when any request arrives at a queue, we have two choices:
314 * dispatch it to the lower levels
315 * queue it up
316 *
317 * kernel rules for when to do what:
318 * locking request: queue empty => dispatch and lock queue,
319 * else queue it
320 * unlocking req : always dispatch it
321 * normal req : queue empty => dispatch it & set priority
322 * queue not full & priority is ok => dispatch it
323 * else queue it
324 *
325 * user-level rules:
326 * always enqueue. In the special case of an unlocking op, enqueue
327 * in a special way that will cause the unlocking op to be the next
328 * thing dequeued.
329 *
330 * simulator rules:
331 * Do the same as at user level, with the sleeps and wakeups suppressed.
332 */
333 void
334 rf_DiskIOEnqueue(queue, req, pri)
335 RF_DiskQueue_t *queue;
336 RF_DiskQueueData_t *req;
337 int pri;
338 {
339 RF_ETIMER_START(req->qtime);
340 RF_ASSERT(req->type == RF_IO_TYPE_NOP || req->numSector);
341 req->priority = pri;
342
343 if (rf_queueDebug && (req->numSector == 0)) {
344 printf("Warning: Enqueueing zero-sector access\n");
345 }
346 /*
347 * kernel
348 */
349 RF_LOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
350 /* locking request */
351 if (RF_LOCKING_REQ(req)) {
352 if (RF_QUEUE_EMPTY(queue)) {
353 Dprintf3("Dispatching pri %d locking op to r %d c %d (queue empty)\n", pri, queue->row, queue->col);
354 RF_LOCK_QUEUE(queue);
355 rf_DispatchKernelIO(queue, req);
356 } else {
357 queue->queueLength++; /* increment count of number
358 * of requests waiting in this
359 * queue */
360 Dprintf3("Enqueueing pri %d locking op to r %d c %d (queue not empty)\n", pri, queue->row, queue->col);
361 req->queue = (void *) queue;
362 (queue->qPtr->Enqueue) (queue->qHdr, req, pri);
363 }
364 }
365 /* unlocking request */
366 else
367 if (RF_UNLOCKING_REQ(req)) { /* we'll do the actual unlock
368 * when this I/O completes */
369 Dprintf3("Dispatching pri %d unlocking op to r %d c %d\n", pri, queue->row, queue->col);
370 RF_ASSERT(RF_QUEUE_LOCKED(queue));
371 rf_DispatchKernelIO(queue, req);
372 }
373 /* normal request */
374 else
375 if (RF_OK_TO_DISPATCH(queue, req)) {
376 Dprintf3("Dispatching pri %d regular op to r %d c %d (ok to dispatch)\n", pri, queue->row, queue->col);
377 rf_DispatchKernelIO(queue, req);
378 } else {
379 queue->queueLength++; /* increment count of
380 * number of requests
381 * waiting in this queue */
382 Dprintf3("Enqueueing pri %d regular op to r %d c %d (not ok to dispatch)\n", pri, queue->row, queue->col);
383 req->queue = (void *) queue;
384 (queue->qPtr->Enqueue) (queue->qHdr, req, pri);
385 }
386 RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
387 }
388
389
390 /* get the next set of I/Os started, kernel version only */
391 void
392 rf_DiskIOComplete(queue, req, status)
393 RF_DiskQueue_t *queue;
394 RF_DiskQueueData_t *req;
395 int status;
396 {
397 int done = 0;
398
399 RF_LOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
400
401 /* unlock the queue: (1) after an unlocking req completes (2) after a
402 * locking req fails */
403 if (RF_UNLOCKING_REQ(req) || (RF_LOCKING_REQ(req) && status)) {
404 Dprintf2("DiskIOComplete: unlocking queue at r %d c %d\n", queue->row, queue->col);
405 RF_ASSERT(RF_QUEUE_LOCKED(queue) && (queue->unlockingOp == NULL));
406 RF_UNLOCK_QUEUE(queue);
407 }
408 queue->numOutstanding--;
409 RF_ASSERT(queue->numOutstanding >= 0);
410
411 /* dispatch requests to the disk until we find one that we can't. */
412 /* no reason to continue once we've filled up the queue */
413 /* no reason to even start if the queue is locked */
414
415 while (!done && !RF_QUEUE_FULL(queue) && !RF_QUEUE_LOCKED(queue)) {
416 if (queue->nextLockingOp) {
417 req = queue->nextLockingOp;
418 queue->nextLockingOp = NULL;
419 Dprintf3("DiskIOComplete: a pri %d locking req was pending at r %d c %d\n", req->priority, queue->row, queue->col);
420 } else {
421 req = (queue->qPtr->Dequeue) (queue->qHdr);
422 if (req != NULL) {
423 Dprintf3("DiskIOComplete: extracting pri %d req from queue at r %d c %d\n", req->priority, queue->row, queue->col);
424 } else {
425 Dprintf1("DiskIOComplete: no more requests to extract.\n", "");
426 }
427 }
428 if (req) {
429 queue->queueLength--; /* decrement count of number
430 * of requests waiting in this
431 * queue */
432 RF_ASSERT(queue->queueLength >= 0);
433 }
434 if (!req)
435 done = 1;
436 else
437 if (RF_LOCKING_REQ(req)) {
438 if (RF_QUEUE_EMPTY(queue)) { /* dispatch it */
439 Dprintf3("DiskIOComplete: dispatching pri %d locking req to r %d c %d (queue empty)\n", req->priority, queue->row, queue->col);
440 RF_LOCK_QUEUE(queue);
441 rf_DispatchKernelIO(queue, req);
442 done = 1;
443 } else { /* put it aside to wait for
444 * the queue to drain */
445 Dprintf3("DiskIOComplete: postponing pri %d locking req to r %d c %d\n", req->priority, queue->row, queue->col);
446 RF_ASSERT(queue->nextLockingOp == NULL);
447 queue->nextLockingOp = req;
448 done = 1;
449 }
450 } else
451 if (RF_UNLOCKING_REQ(req)) { /* should not happen:
452 * unlocking ops should
453 * not get queued */
454 RF_ASSERT(RF_QUEUE_LOCKED(queue)); /* support it anyway for
455 * the future */
456 Dprintf3("DiskIOComplete: dispatching pri %d unl req to r %d c %d (SHOULD NOT SEE THIS)\n", req->priority, queue->row, queue->col);
457 rf_DispatchKernelIO(queue, req);
458 done = 1;
459 } else
460 if (RF_OK_TO_DISPATCH(queue, req)) {
461 Dprintf3("DiskIOComplete: dispatching pri %d regular req to r %d c %d (ok to dispatch)\n", req->priority, queue->row, queue->col);
462 rf_DispatchKernelIO(queue, req);
463 } else { /* we can't dispatch it,
464 * so just re-enqueue
465 * it. */
466 /* potential trouble here if
467 * disk queues batch reqs */
468 Dprintf3("DiskIOComplete: re-enqueueing pri %d regular req to r %d c %d\n", req->priority, queue->row, queue->col);
469 queue->queueLength++;
470 (queue->qPtr->Enqueue) (queue->qHdr, req, req->priority);
471 done = 1;
472 }
473 }
474
475 RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
476 }
477 /* promotes accesses tagged with the given parityStripeID from low priority
478 * to normal priority. This promotion is optional, meaning that a queue
479 * need not implement it. If there is no promotion routine associated with
480 * a queue, this routine does nothing and returns -1.
481 */
482 int
483 rf_DiskIOPromote(queue, parityStripeID, which_ru)
484 RF_DiskQueue_t *queue;
485 RF_StripeNum_t parityStripeID;
486 RF_ReconUnitNum_t which_ru;
487 {
488 int retval;
489
490 if (!queue->qPtr->Promote)
491 return (-1);
492 RF_LOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
493 retval = (queue->qPtr->Promote) (queue->qHdr, parityStripeID, which_ru);
494 RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
495 return (retval);
496 }
497
498 RF_DiskQueueData_t *
499 rf_CreateDiskQueueData(
500 RF_IoType_t typ,
501 RF_SectorNum_t ssect,
502 RF_SectorCount_t nsect,
503 caddr_t buf,
504 RF_StripeNum_t parityStripeID,
505 RF_ReconUnitNum_t which_ru,
506 int (*wakeF) (void *, int),
507 void *arg,
508 RF_DiskQueueData_t * next,
509 RF_AccTraceEntry_t * tracerec,
510 void *raidPtr,
511 RF_DiskQueueDataFlags_t flags,
512 void *kb_proc)
513 {
514 RF_DiskQueueData_t *p;
515
516 RF_FREELIST_GET_INIT(rf_dqd_freelist, p, next, (RF_DiskQueueData_t *), init_dqd);
517
518 p->sectorOffset = ssect + rf_protectedSectors;
519 p->numSector = nsect;
520 p->type = typ;
521 p->buf = buf;
522 p->parityStripeID = parityStripeID;
523 p->which_ru = which_ru;
524 p->CompleteFunc = wakeF;
525 p->argument = arg;
526 p->next = next;
527 p->tracerec = tracerec;
528 p->priority = RF_IO_NORMAL_PRIORITY;
529 p->AuxFunc = NULL;
530 p->buf2 = NULL;
531 p->raidPtr = raidPtr;
532 p->flags = flags;
533 p->b_proc = kb_proc;
534 return (p);
535 }
536
537 RF_DiskQueueData_t *
538 rf_CreateDiskQueueDataFull(
539 RF_IoType_t typ,
540 RF_SectorNum_t ssect,
541 RF_SectorCount_t nsect,
542 caddr_t buf,
543 RF_StripeNum_t parityStripeID,
544 RF_ReconUnitNum_t which_ru,
545 int (*wakeF) (void *, int),
546 void *arg,
547 RF_DiskQueueData_t * next,
548 RF_AccTraceEntry_t * tracerec,
549 int priority,
550 int (*AuxFunc) (void *,...),
551 caddr_t buf2,
552 void *raidPtr,
553 RF_DiskQueueDataFlags_t flags,
554 void *kb_proc)
555 {
556 RF_DiskQueueData_t *p;
557
558 RF_FREELIST_GET_INIT(rf_dqd_freelist, p, next, (RF_DiskQueueData_t *), init_dqd);
559
560 p->sectorOffset = ssect + rf_protectedSectors;
561 p->numSector = nsect;
562 p->type = typ;
563 p->buf = buf;
564 p->parityStripeID = parityStripeID;
565 p->which_ru = which_ru;
566 p->CompleteFunc = wakeF;
567 p->argument = arg;
568 p->next = next;
569 p->tracerec = tracerec;
570 p->priority = priority;
571 p->AuxFunc = AuxFunc;
572 p->buf2 = buf2;
573 p->raidPtr = raidPtr;
574 p->flags = flags;
575 p->b_proc = kb_proc;
576 return (p);
577 }
578
579 void
580 rf_FreeDiskQueueData(p)
581 RF_DiskQueueData_t *p;
582 {
583 RF_FREELIST_FREE_CLEAN(rf_dqd_freelist, p, next, clean_dqd);
584 }
585