/*	$NetBSD: rf_diskqueue.c,v 1.27 2003/12/30 21:59:03 oster Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Mark Holland
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/****************************************************************************
 *
 * rf_diskqueue.c -- higher-level disk queue code
 *
 * the routines here are a generic wrapper around the actual queueing
 * routines.  The code here implements thread scheduling, synchronization,
 * and locking ops (see below) on top of the lower-level queueing code.
 *
 * to support atomic RMW, we implement "locking operations".  When a
 * locking op is dispatched to the lower levels of the driver, the
 * queue is locked, and no further I/Os are dispatched until the queue
 * receives & completes a corresponding "unlocking operation".  This
 * code relies on the higher layers to guarantee that a locking op
 * will always be eventually followed by an unlocking op.  The model
 * is that the higher layers are structured so locking and unlocking
 * ops occur in pairs, i.e. an unlocking op cannot be generated until
 * after a locking op reports completion.  There is no good way to
 * check to see that an unlocking op "corresponds" to the op that
 * currently has the queue locked, so we make no such attempt.  Since
 * by definition there can be only one locking op outstanding on a
 * disk, this should not be a problem.
 *
 * In the kernel, we allow multiple I/Os to be concurrently dispatched
 * to the disk driver.  In order to support locking ops in this
 * environment, when we decide to do a locking op, we stop dispatching
 * new I/Os and wait until all dispatched I/Os have completed before
 * dispatching the locking op.
 *
 * Unfortunately, the code is different in the 3 different operating
 * environments (user level, kernel, simulator).  In the kernel, I/O is
 * non-blocking, and we have no disk threads to dispatch for us.
 * Therefore, we have to dispatch new I/Os to the scsi driver at the
 * time of enqueue, and also at the time of completion.  At user
 * level, I/O is blocking, and so only the disk threads may dispatch
 * I/Os.  Thus at user level, all we can do at enqueue time is enqueue
 * and wake up the disk thread to do the dispatch.
 *
 ****************************************************************************/
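
/*
 * Illustrative sketch only (not code from this driver): how a higher layer
 * might pair a locking read with an unlocking write to get an atomic RMW on
 * one disk.  It assumes the RF_LOCK_DISK_QUEUE / RF_UNLOCK_DISK_QUEUE request
 * flags from rf_diskqueue.h, and the callbacks and local variables
 * (read_done_cb, write_done_cb, col, psid, etc.) are hypothetical; the DAG
 * machinery elsewhere in RAIDframe is what really issues these requests.
 *
 *	RF_DiskQueueData_t *rd, *wr;
 *
 *	// locking read: the queue locks when this request is dispatched
 *	rd = rf_CreateDiskQueueData(RF_IO_TYPE_READ, sector, nsect, buf,
 *	    psid, which_ru, read_done_cb, arg, NULL, tracerec, raidPtr,
 *	    RF_LOCK_DISK_QUEUE, b_proc);
 *	rf_DiskIOEnqueue(&raidPtr->Queues[col], rd, RF_IO_NORMAL_PRIORITY);
 *
 *	// ...read_done_cb modifies buf and then issues the unlocking write,
 *	// which is dispatched immediately and unlocks the queue when it
 *	// completes
 *	wr = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, sector, nsect, buf,
 *	    psid, which_ru, write_done_cb, arg, NULL, tracerec, raidPtr,
 *	    RF_UNLOCK_DISK_QUEUE, b_proc);
 *	rf_DiskIOEnqueue(&raidPtr->Queues[col], wr, RF_IO_NORMAL_PRIORITY);
 */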

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_diskqueue.c,v 1.27 2003/12/30 21:59:03 oster Exp $");

#include <dev/raidframe/raidframevar.h>

#include "rf_threadstuff.h"
#include "rf_raid.h"
#include "rf_diskqueue.h"
#include "rf_alloclist.h"
#include "rf_acctrace.h"
#include "rf_etimer.h"
#include "rf_general.h"
#include "rf_debugprint.h"
#include "rf_shutdown.h"
#include "rf_cvscan.h"
#include "rf_sstf.h"
#include "rf_fifo.h"
#include "rf_kintf.h"

static int init_dqd(RF_DiskQueueData_t *);
static void clean_dqd(RF_DiskQueueData_t *);
static void rf_ShutdownDiskQueueSystem(void *);

#ifndef RF_DEBUG_DISKQUEUE
#define RF_DEBUG_DISKQUEUE 0
#endif

#if RF_DEBUG_DISKQUEUE
#define Dprintf1(s,a)       if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf2(s,a,b)     if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf3(s,a,b,c)   if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
#else
#define Dprintf1(s,a)
#define Dprintf2(s,a,b)
#define Dprintf3(s,a,b,c)
#endif

/*****************************************************************************
 *
 * the disk queue switch defines all the functions used in the
 * different queueing disciplines: queue ID, init routine, enqueue
 * routine, dequeue routine
 *
 ****************************************************************************/

static const RF_DiskQueueSW_t diskqueuesw[] = {
        {"fifo",                /* FIFO */
                rf_FifoCreate,
                rf_FifoEnqueue,
                rf_FifoDequeue,
                rf_FifoPeek,
                rf_FifoPromote},

        {"cvscan",              /* cvscan */
                rf_CvscanCreate,
                rf_CvscanEnqueue,
                rf_CvscanDequeue,
                rf_CvscanPeek,
                rf_CvscanPromote},

        {"sstf",                /* shortest seek time first */
                rf_SstfCreate,
                rf_SstfEnqueue,
                rf_SstfDequeue,
                rf_SstfPeek,
                rf_SstfPromote},

        {"scan",                /* SCAN (two-way elevator) */
                rf_ScanCreate,
                rf_SstfEnqueue,
                rf_ScanDequeue,
                rf_ScanPeek,
                rf_SstfPromote},

        {"cscan",               /* CSCAN (one-way elevator) */
                rf_CscanCreate,
                rf_SstfEnqueue,
                rf_CscanDequeue,
                rf_CscanPeek,
                rf_SstfPromote},

};
#define NUM_DISK_QUEUE_TYPES (sizeof(diskqueuesw)/sizeof(RF_DiskQueueSW_t))
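
/*
 * Hedged sketch of what a new queueing discipline would add to the switch
 * above.  The member order (queueType, Create, Enqueue, Dequeue, Peek,
 * Promote) follows the table as written; the rf_Myq* names are hypothetical
 * and the prototypes are inferred from how the existing ops are invoked in
 * this file, not copied from rf_diskqueue.h:
 *
 *	static void *rf_MyqCreate(RF_SectorCount_t sect_per_disk,
 *	    RF_AllocListElem_t *cl_list, RF_ShutdownList_t **listp);
 *	static void rf_MyqEnqueue(void *q_hdr, RF_DiskQueueData_t *req,
 *	    int priority);
 *	static RF_DiskQueueData_t *rf_MyqDequeue(void *q_hdr);
 *	static RF_DiskQueueData_t *rf_MyqPeek(void *q_hdr);
 *	static int rf_MyqPromote(void *q_hdr, RF_StripeNum_t psid,
 *	    RF_ReconUnitNum_t which_ru);
 *
 *	{"myq",
 *	        rf_MyqCreate,
 *	        rf_MyqEnqueue,
 *	        rf_MyqDequeue,
 *	        rf_MyqPeek,
 *	        rf_MyqPromote},
 */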

static struct pool rf_dqd_pool;
#define RF_MAX_FREE_DQD 256
#define RF_DQD_INC 16
#define RF_DQD_INITIAL 64

#include <sys/buf.h>

static int
init_dqd(RF_DiskQueueData_t *dqd)
{

        dqd->bp = (struct buf *) malloc(sizeof(struct buf),
                                        M_RAIDFRAME, M_NOWAIT);
        if (dqd->bp == NULL) {
                return (ENOMEM);
        }
        memset(dqd->bp, 0, sizeof(struct buf)); /* if you don't do it,
                                                 * nobody else will.. */
        return (0);
}

static void
clean_dqd(RF_DiskQueueData_t *dqd)
{
        free(dqd->bp, M_RAIDFRAME);
}
/* configures a single disk queue */

int
rf_ConfigureDiskQueue(RF_Raid_t *raidPtr, RF_DiskQueue_t *diskqueue,
                      RF_RowCol_t c, const RF_DiskQueueSW_t *p,
                      RF_SectorCount_t sectPerDisk, dev_t dev,
                      int maxOutstanding, RF_ShutdownList_t **listp,
                      RF_AllocListElem_t *clList)
{
        diskqueue->col = c;
        diskqueue->qPtr = p;
        diskqueue->qHdr = (p->Create) (sectPerDisk, clList, listp);
        diskqueue->dev = dev;
        diskqueue->numOutstanding = 0;
        diskqueue->queueLength = 0;
        diskqueue->maxOutstanding = maxOutstanding;
        diskqueue->curPriority = RF_IO_NORMAL_PRIORITY;
        diskqueue->nextLockingOp = NULL;
        diskqueue->numWaiting = 0;
        diskqueue->flags = 0;
        diskqueue->raidPtr = raidPtr;
        diskqueue->rf_cinfo = &raidPtr->raid_cinfo[c];
        rf_mutex_init(&diskqueue->mutex);
        diskqueue->cond = 0;
        return (0);
}

static void
rf_ShutdownDiskQueueSystem(void *ignored)
{
        pool_destroy(&rf_dqd_pool);
}

int
rf_ConfigureDiskQueueSystem(RF_ShutdownList_t **listp)
{
        int rc;

        pool_init(&rf_dqd_pool, sizeof(RF_DiskQueueData_t), 0, 0, 0,
                  "rf_dqd_pl", NULL);
        pool_sethiwat(&rf_dqd_pool, RF_MAX_FREE_DQD);
        pool_prime(&rf_dqd_pool, RF_DQD_INITIAL);

        rc = rf_ShutdownCreate(listp, rf_ShutdownDiskQueueSystem, NULL);
        if (rc) {
                rf_print_unable_to_add_shutdown(__FILE__, __LINE__, rc);
                rf_ShutdownDiskQueueSystem(NULL);
                return (rc);
        }

        return (0);
}

int
rf_ConfigureDiskQueues(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
                       RF_Config_t *cfgPtr)
{
        RF_DiskQueue_t *diskQueues, *spareQueues;
        const RF_DiskQueueSW_t *p;
        RF_RowCol_t r, c;
        int rc, i;

        raidPtr->maxQueueDepth = cfgPtr->maxOutstandingDiskReqs;

        for (p = NULL, i = 0; i < NUM_DISK_QUEUE_TYPES; i++) {
                if (!strcmp(diskqueuesw[i].queueType, cfgPtr->diskQueueType)) {
                        p = &diskqueuesw[i];
                        break;
                }
        }
        if (p == NULL) {
                RF_ERRORMSG2("Unknown queue type \"%s\".  Using %s\n",
                    cfgPtr->diskQueueType, diskqueuesw[0].queueType);
                p = &diskqueuesw[0];
        }
        raidPtr->qType = p;

        RF_MallocAndAdd(diskQueues,
            (raidPtr->numCol + RF_MAXSPARE) * sizeof(RF_DiskQueue_t),
            (RF_DiskQueue_t *), raidPtr->cleanupList);
        if (diskQueues == NULL)
                return (ENOMEM);
        raidPtr->Queues = diskQueues;

        for (c = 0; c < raidPtr->numCol; c++) {
                rc = rf_ConfigureDiskQueue(raidPtr, &diskQueues[c],
                    c, p,
                    raidPtr->sectorsPerDisk,
                    raidPtr->Disks[c].dev,
                    cfgPtr->maxOutstandingDiskReqs,
                    listp, raidPtr->cleanupList);
                if (rc)
                        return (rc);
        }

        spareQueues = &raidPtr->Queues[raidPtr->numCol];
        for (r = 0; r < raidPtr->numSpare; r++) {
                rc = rf_ConfigureDiskQueue(raidPtr, &spareQueues[r],
                    raidPtr->numCol + r, p,
                    raidPtr->sectorsPerDisk,
                    raidPtr->Disks[raidPtr->numCol + r].dev,
                    cfgPtr->maxOutstandingDiskReqs, listp,
                    raidPtr->cleanupList);
                if (rc)
                        return (rc);
        }
        return (0);
}
/* Enqueue a disk I/O
 *
 * Unfortunately, we have to do things differently in the different
 * environments (simulator, user-level, kernel).
 * At user level, all I/O is blocking, so we have 1 or more threads/disk
 * and the thread that enqueues is different from the thread that dequeues.
 * In the kernel, I/O is non-blocking and so we'd like to have multiple
 * I/Os outstanding on the physical disks when possible.
 *
 * when any request arrives at a queue, we have two choices:
 *    dispatch it to the lower levels
 *    queue it up
 *
 * kernel rules for when to do what:
 *    locking request:  queue empty => dispatch and lock queue,
 *                      else queue it
 *    unlocking req  :  always dispatch it
 *    normal req     :  queue empty => dispatch it & set priority
 *                      queue not full & priority is ok => dispatch it
 *                      else queue it
 *
 * user-level rules:
 *    always enqueue.  In the special case of an unlocking op, enqueue
 *    in a special way that will cause the unlocking op to be the next
 *    thing dequeued.
 *
 * simulator rules:
 *    Do the same as at user level, with the sleeps and wakeups suppressed.
 */
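
/*
 * For reference, the "normal req" rule above amounts to a predicate like the
 * one sketched here.  This is inferred from the comment, not copied from the
 * RF_OK_TO_DISPATCH() macro in rf_diskqueue.h (which is what the code below
 * actually uses), and it assumes larger numeric values mean higher priority:
 *
 *	static int
 *	ok_to_dispatch(RF_DiskQueue_t *q, RF_DiskQueueData_t *req)
 *	{
 *		if (RF_QUEUE_EMPTY(q))
 *			return (1);	// nothing pending: dispatch it
 *		if (!RF_QUEUE_FULL(q) && req->priority >= q->curPriority)
 *			return (1);	// room left and priority is ok
 *		return (0);		// otherwise leave it queued
 *	}
 */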
void
rf_DiskIOEnqueue(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int pri)
{
        RF_ETIMER_START(req->qtime);
        RF_ASSERT(req->type == RF_IO_TYPE_NOP || req->numSector);
        req->priority = pri;

#if RF_DEBUG_DISKQUEUE
        if (rf_queueDebug && (req->numSector == 0)) {
                printf("Warning: Enqueueing zero-sector access\n");
        }
#endif
        /*
         * kernel
         */
        RF_LOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
        if (RF_LOCKING_REQ(req)) {
                /* locking request */
                if (RF_QUEUE_EMPTY(queue)) {
                        Dprintf2("Dispatching pri %d locking op to c %d (queue empty)\n", pri, queue->col);
                        RF_LOCK_QUEUE(queue);
                        rf_DispatchKernelIO(queue, req);
                } else {
                        /* increment count of number of requests waiting
                         * in this queue */
                        queue->queueLength++;
                        Dprintf2("Enqueueing pri %d locking op to c %d (queue not empty)\n", pri, queue->col);
                        req->queue = (void *) queue;
                        (queue->qPtr->Enqueue) (queue->qHdr, req, pri);
                }
        } else if (RF_UNLOCKING_REQ(req)) {
                /* unlocking request -- we'll do the actual unlock when
                 * this I/O completes */
                Dprintf2("Dispatching pri %d unlocking op to c %d\n", pri, queue->col);
                RF_ASSERT(RF_QUEUE_LOCKED(queue));
                rf_DispatchKernelIO(queue, req);
        } else if (RF_OK_TO_DISPATCH(queue, req)) {
                /* normal request, ok to dispatch */
                Dprintf2("Dispatching pri %d regular op to c %d (ok to dispatch)\n", pri, queue->col);
                rf_DispatchKernelIO(queue, req);
        } else {
                /* normal request, not ok to dispatch: increment count of
                 * number of requests waiting in this queue */
                queue->queueLength++;
                Dprintf2("Enqueueing pri %d regular op to c %d (not ok to dispatch)\n", pri, queue->col);
                req->queue = (void *) queue;
                (queue->qPtr->Enqueue) (queue->qHdr, req, pri);
        }
        RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
}


/* get the next set of I/Os started, kernel version only */
void
rf_DiskIOComplete(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int status)
{
        int done = 0;

        RF_LOCK_QUEUE_MUTEX(queue, "DiskIOComplete");

        /* unlock the queue: (1) after an unlocking req completes, (2) after
         * a locking req fails */
        if (RF_UNLOCKING_REQ(req) || (RF_LOCKING_REQ(req) && status)) {
                Dprintf1("DiskIOComplete: unlocking queue at c %d\n", queue->col);
                RF_ASSERT(RF_QUEUE_LOCKED(queue));
                RF_UNLOCK_QUEUE(queue);
        }
        queue->numOutstanding--;
        RF_ASSERT(queue->numOutstanding >= 0);

        /* dispatch requests to the disk until we find one that we can't. */
        /* no reason to continue once we've filled up the queue */
        /* no reason to even start if the queue is locked */

        while (!done && !RF_QUEUE_FULL(queue) && !RF_QUEUE_LOCKED(queue)) {
                if (queue->nextLockingOp) {
                        req = queue->nextLockingOp;
                        queue->nextLockingOp = NULL;
                        Dprintf2("DiskIOComplete: a pri %d locking req was pending at c %d\n", req->priority, queue->col);
                } else {
                        req = (queue->qPtr->Dequeue) (queue->qHdr);
                        if (req != NULL) {
                                Dprintf2("DiskIOComplete: extracting pri %d req from queue at c %d\n", req->priority, queue->col);
                        } else {
                                Dprintf1("DiskIOComplete: no more requests to extract.\n", "");
                        }
                }
                if (req) {
                        /* decrement count of number of requests waiting in
                         * this queue */
                        queue->queueLength--;
                        RF_ASSERT(queue->queueLength >= 0);
                }
                if (!req) {
                        done = 1;
                } else if (RF_LOCKING_REQ(req)) {
                        if (RF_QUEUE_EMPTY(queue)) {
                                /* dispatch it */
                                Dprintf2("DiskIOComplete: dispatching pri %d locking req to c %d (queue empty)\n", req->priority, queue->col);
                                RF_LOCK_QUEUE(queue);
                                rf_DispatchKernelIO(queue, req);
                                done = 1;
                        } else {
                                /* put it aside to wait for the queue to
                                 * drain */
                                Dprintf2("DiskIOComplete: postponing pri %d locking req to c %d\n", req->priority, queue->col);
                                RF_ASSERT(queue->nextLockingOp == NULL);
                                queue->nextLockingOp = req;
                                done = 1;
                        }
                } else if (RF_UNLOCKING_REQ(req)) {
                        /* should not happen: unlocking ops should not get
                         * queued.  support it anyway for the future */
                        RF_ASSERT(RF_QUEUE_LOCKED(queue));
                        Dprintf2("DiskIOComplete: dispatching pri %d unl req to c %d (SHOULD NOT SEE THIS)\n", req->priority, queue->col);
                        rf_DispatchKernelIO(queue, req);
                        done = 1;
                } else if (RF_OK_TO_DISPATCH(queue, req)) {
                        Dprintf2("DiskIOComplete: dispatching pri %d regular req to c %d (ok to dispatch)\n", req->priority, queue->col);
                        rf_DispatchKernelIO(queue, req);
                } else {
                        /* we can't dispatch it, so just re-enqueue it.
                         * potential trouble here if disk queues batch reqs */
                        Dprintf2("DiskIOComplete: re-enqueueing pri %d regular req to c %d\n", req->priority, queue->col);
                        queue->queueLength++;
                        (queue->qPtr->Enqueue) (queue->qHdr, req, req->priority);
                        done = 1;
                }
        }

        RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
}
/* promotes accesses tagged with the given parityStripeID from low priority
 * to normal priority.  This promotion is optional, meaning that a queue
 * need not implement it.  If there is no promotion routine associated with
 * a queue, this routine does nothing and returns -1.
 */
int
rf_DiskIOPromote(RF_DiskQueue_t *queue, RF_StripeNum_t parityStripeID,
                 RF_ReconUnitNum_t which_ru)
{
        int retval;

        if (!queue->qPtr->Promote)
                return (-1);
        RF_LOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
        retval = (queue->qPtr->Promote) (queue->qHdr, parityStripeID, which_ru);
        RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
        return (retval);
}

RF_DiskQueueData_t *
rf_CreateDiskQueueData(RF_IoType_t typ, RF_SectorNum_t ssect,
                       RF_SectorCount_t nsect, caddr_t buf,
                       RF_StripeNum_t parityStripeID,
                       RF_ReconUnitNum_t which_ru,
                       int (*wakeF) (void *, int), void *arg,
                       RF_DiskQueueData_t *next,
                       RF_AccTraceEntry_t *tracerec, void *raidPtr,
                       RF_DiskQueueDataFlags_t flags, void *kb_proc)
{
        RF_DiskQueueData_t *p;

        p = pool_get(&rf_dqd_pool, PR_WAITOK);
        if (init_dqd(p)) {
                /* no memory for the buffer!?!? */
                pool_put(&rf_dqd_pool, p);
                return (NULL);
        }

        p->sectorOffset = ssect + rf_protectedSectors;
        p->numSector = nsect;
        p->type = typ;
        p->buf = buf;
        p->parityStripeID = parityStripeID;
        p->which_ru = which_ru;
        p->CompleteFunc = wakeF;
        p->argument = arg;
        p->next = next;
        p->tracerec = tracerec;
        p->priority = RF_IO_NORMAL_PRIORITY;
        p->raidPtr = raidPtr;
        p->flags = flags;
        p->b_proc = kb_proc;
        return (p);
}

void
rf_FreeDiskQueueData(RF_DiskQueueData_t *p)
{
        clean_dqd(p);
        pool_put(&rf_dqd_pool, p);
}
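
/*
 * Typical lifecycle of an RF_DiskQueueData_t, sketched for reference.  This
 * is a hedged example, not code from the driver; issue_done, my_arg, col,
 * start_sector and the surrounding context are hypothetical:
 *
 *	static int
 *	issue_done(void *my_arg, int status)
 *	{
 *		// invoked when the I/O finishes; status != 0 means failure
 *		...
 *		return (0);
 *	}
 *
 *	RF_DiskQueueData_t *dqd;
 *
 *	dqd = rf_CreateDiskQueueData(RF_IO_TYPE_READ, start_sector, nsect,
 *	    buf, psid, which_ru, issue_done, my_arg, NULL, tracerec,
 *	    raidPtr, 0, b_proc);
 *	if (dqd != NULL)
 *		rf_DiskIOEnqueue(&raidPtr->Queues[col], dqd,
 *		    RF_IO_NORMAL_PRIORITY);
 *
 *	// ...and once the completion callback has run:
 *	rf_FreeDiskQueueData(dqd);
 */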