/*	$NetBSD: rf_diskqueue.c,v 1.55.4.1 2021/10/19 10:55:15 martin Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Mark Holland
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/****************************************************************************
 *
 * rf_diskqueue.c -- higher-level disk queue code
 *
 * The routines here are a generic wrapper around the actual queueing
 * routines.  The code here implements thread scheduling, synchronization,
 * and locking ops (see below) on top of the lower-level queueing code.
 *
 * To support atomic RMW (read-modify-write), we implement "locking
 * operations".  When a locking op is dispatched to the lower levels of
 * the driver, the queue is locked, and no further I/Os are dispatched
 * until the queue receives and completes a corresponding "unlocking
 * operation".  This code relies on the higher layers to guarantee that
 * a locking op will always eventually be followed by an unlocking op.
 * The model is that the higher layers are structured so that locking
 * and unlocking ops occur in pairs, i.e., an unlocking op cannot be
 * generated until after a locking op reports completion.  There is no
 * good way to check that an unlocking op "corresponds" to the op that
 * currently has the queue locked, so we make no such attempt.  Since by
 * definition there can be only one locking op outstanding on a disk,
 * this should not be a problem.
 *
 * In the kernel, we allow multiple I/Os to be concurrently dispatched
 * to the disk driver.  In order to support locking ops in this
 * environment, when we decide to do a locking op, we stop dispatching
 * new I/Os and wait until all dispatched I/Os have completed before
 * dispatching the locking op.
 *
 * Unfortunately, the code differs across the three operating
 * environments (user level, kernel, simulator).  In the kernel, I/O is
 * non-blocking, and we have no disk threads to dispatch for us.
 * Therefore, we have to dispatch new I/Os to the SCSI driver at the
 * time of enqueue, and also at the time of completion.  At user level,
 * I/O is blocking, and so only the disk threads may dispatch I/Os.
 * Thus at user level, all we can do at enqueue time is enqueue and wake
 * up the disk thread to do the dispatch.
 *
 ****************************************************************************/
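
/*
 * A minimal sketch of the locking-op model described above (nothing
 * here is executed; it simply restates the comment as a sequence).
 * For an atomic read-modify-write on one disk:
 *
 *	1. A locking read of the old data/parity is submitted.  The queue
 *	   waits for all outstanding I/Os to drain, dispatches the read,
 *	   and marks itself locked; no further I/Os are dispatched.
 *	2. The locking read completes and the higher layer computes the
 *	   new contents.
 *	3. The corresponding unlocking write is submitted; it is always
 *	   dispatched, and its completion unlocks the queue so normal
 *	   dispatching resumes.
 */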

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_diskqueue.c,v 1.55.4.1 2021/10/19 10:55:15 martin Exp $");

#include <dev/raidframe/raidframevar.h>

#include "rf_threadstuff.h"
#include "rf_raid.h"
#include "rf_diskqueue.h"
#include "rf_alloclist.h"
#include "rf_acctrace.h"
#include "rf_etimer.h"
#include "rf_general.h"
#include "rf_debugprint.h"
#include "rf_shutdown.h"
#include "rf_cvscan.h"
#include "rf_sstf.h"
#include "rf_fifo.h"
#include "rf_kintf.h"

#include <sys/buf.h>

static void rf_ShutdownDiskQueueSystem(void *);

#ifndef RF_DEBUG_DISKQUEUE
#define RF_DEBUG_DISKQUEUE 0
#endif

#if RF_DEBUG_DISKQUEUE
#define Dprintf1(s,a)     if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf2(s,a,b)   if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf3(s,a,b,c) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
#else
#define Dprintf1(s,a)
#define Dprintf2(s,a,b)
#define Dprintf3(s,a,b,c)
#endif

/*****************************************************************************
 *
 * The disk queue switch defines everything a queueing discipline
 * provides: the queue type name and the create, enqueue, dequeue,
 * peek, and promote routines.
 *
 ****************************************************************************/

static const RF_DiskQueueSW_t diskqueuesw[] = {
	{"fifo",		/* FIFO */
	 rf_FifoCreate,
	 rf_FifoEnqueue,
	 rf_FifoDequeue,
	 rf_FifoPeek,
	 rf_FifoPromote},

	{"cvscan",		/* cvscan */
	 rf_CvscanCreate,
	 rf_CvscanEnqueue,
	 rf_CvscanDequeue,
	 rf_CvscanPeek,
	 rf_CvscanPromote},

	{"sstf",		/* shortest seek time first */
	 rf_SstfCreate,
	 rf_SstfEnqueue,
	 rf_SstfDequeue,
	 rf_SstfPeek,
	 rf_SstfPromote},

	{"scan",		/* SCAN (two-way elevator) */
	 rf_ScanCreate,
	 rf_SstfEnqueue,
	 rf_ScanDequeue,
	 rf_ScanPeek,
	 rf_SstfPromote},

	{"cscan",		/* CSCAN (one-way elevator) */
	 rf_CscanCreate,
	 rf_SstfEnqueue,
	 rf_CscanDequeue,
	 rf_CscanPeek,
	 rf_SstfPromote},

};
#define NUM_DISK_QUEUE_TYPES (sizeof(diskqueuesw)/sizeof(RF_DiskQueueSW_t))
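
/*
 * A new discipline would be registered by adding an entry to
 * diskqueuesw[] above; the string is what gets matched against
 * cfgPtr->diskQueueType in rf_ConfigureDiskQueues() below.  A sketch
 * only -- the rf_Mydisc* identifiers are hypothetical, not part of
 * the driver:
 *
 *	{"mydisc",
 *	 rf_MydiscCreate,
 *	 rf_MydiscEnqueue,
 *	 rf_MydiscDequeue,
 *	 rf_MydiscPeek,
 *	 rf_MydiscPromote},
 */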

#define RF_MAX_FREE_DQD 256
#define RF_MIN_FREE_DQD 64

/* XXX: scale these... */
#define RF_MAX_FREE_BUFIO 256
#define RF_MIN_FREE_BUFIO 64

static void
rf_ShutdownDiskQueue(void *arg)
{
	RF_DiskQueue_t *diskqueue = arg;

	rf_destroy_mutex2(diskqueue->mutex);
}

/* configure a single disk queue */
int
rf_ConfigureDiskQueue(RF_Raid_t *raidPtr, RF_DiskQueue_t *diskqueue,
		      RF_RowCol_t c, const RF_DiskQueueSW_t *p,
		      RF_SectorCount_t sectPerDisk, dev_t dev,
		      int maxOutstanding, RF_ShutdownList_t **listp,
		      RF_AllocListElem_t *clList)
{
	diskqueue->col = c;
	diskqueue->qPtr = p;
	diskqueue->qHdr = (p->Create) (sectPerDisk, clList, listp);
	diskqueue->dev = dev;
	diskqueue->numOutstanding = 0;
	diskqueue->queueLength = 0;
	diskqueue->maxOutstanding = maxOutstanding;
	diskqueue->curPriority = RF_IO_NORMAL_PRIORITY;
	diskqueue->flags = 0;
	diskqueue->raidPtr = raidPtr;
	diskqueue->rf_cinfo = &raidPtr->raid_cinfo[c];
	rf_init_mutex2(diskqueue->mutex, IPL_VM);
	rf_ShutdownCreate(listp, rf_ShutdownDiskQueue, diskqueue);
	return (0);
}

static void
rf_ShutdownDiskQueueSystem(void *ignored)
{
	pool_destroy(&rf_pools.dqd);
	pool_destroy(&rf_pools.bufio);
}

int
rf_ConfigureDiskQueueSystem(RF_ShutdownList_t **listp)
{

	rf_pool_init(&rf_pools.dqd, sizeof(RF_DiskQueueData_t),
		     "rf_dqd_pl", RF_MIN_FREE_DQD, RF_MAX_FREE_DQD);
	rf_pool_init(&rf_pools.bufio, sizeof(buf_t),
		     "rf_bufio_pl", RF_MIN_FREE_BUFIO, RF_MAX_FREE_BUFIO);
	rf_ShutdownCreate(listp, rf_ShutdownDiskQueueSystem, NULL);

	return (0);
}

int
rf_ConfigureDiskQueues(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
		       RF_Config_t *cfgPtr)
{
	RF_DiskQueue_t *diskQueues, *spareQueues;
	const RF_DiskQueueSW_t *p;
	RF_RowCol_t r,c;
	int rc, i;

	raidPtr->maxQueueDepth = cfgPtr->maxOutstandingDiskReqs;

	for (p = NULL, i = 0; i < NUM_DISK_QUEUE_TYPES; i++) {
		if (!strcmp(diskqueuesw[i].queueType, cfgPtr->diskQueueType)) {
			p = &diskqueuesw[i];
			break;
		}
	}
	if (p == NULL) {
		RF_ERRORMSG2("Unknown queue type \"%s\".  Using %s\n", cfgPtr->diskQueueType, diskqueuesw[0].queueType);
		p = &diskqueuesw[0];
	}
	raidPtr->qType = p;

	diskQueues = RF_MallocAndAdd(
	    (raidPtr->numCol + RF_MAXSPARE) * sizeof(*diskQueues),
	    raidPtr->cleanupList);
	if (diskQueues == NULL)
		return (ENOMEM);
	raidPtr->Queues = diskQueues;

	for (c = 0; c < raidPtr->numCol; c++) {
		rc = rf_ConfigureDiskQueue(raidPtr, &diskQueues[c],
					   c, p,
					   raidPtr->sectorsPerDisk,
					   raidPtr->Disks[c].dev,
					   cfgPtr->maxOutstandingDiskReqs,
					   listp, raidPtr->cleanupList);
		if (rc)
			return (rc);
	}

	spareQueues = &raidPtr->Queues[raidPtr->numCol];
	for (r = 0; r < raidPtr->numSpare; r++) {
		rc = rf_ConfigureDiskQueue(raidPtr, &spareQueues[r],
					   raidPtr->numCol + r, p,
					   raidPtr->sectorsPerDisk,
					   raidPtr->Disks[raidPtr->numCol + r].dev,
					   cfgPtr->maxOutstandingDiskReqs, listp,
					   raidPtr->cleanupList);
		if (rc)
			return (rc);
	}
	return (0);
}
/* Enqueue a disk I/O.
 *
 * In the kernel, I/O is non-blocking, so we would like to have multiple
 * I/Os outstanding on the physical disks whenever possible.
 *
 * When any request arrives at a queue, we have two choices:
 *    dispatch it to the lower levels
 *    queue it up
 *
 * Kernel rules for when to do what (sketched just below):
 *    unlocking req :  always dispatch it
 *    normal req    :  queue empty => dispatch it & set priority
 *                     queue not full & priority is ok => dispatch it
 *                     else queue it
 */
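
/*
 * A rough sketch of the decision made in rf_DiskIOEnqueue() below.
 * The real test is the RF_OK_TO_DISPATCH() macro from rf_diskqueue.h;
 * this is only an approximation of the rules listed above, not that
 * macro's actual definition:
 *
 *	if (queue->numOutstanding < queue->maxOutstanding &&
 *	    the request's priority is acceptable for this queue)
 *		rf_DispatchKernelIO(queue, req);	send it down now
 *	else
 *		(queue->qPtr->Enqueue)(queue->qHdr, req, pri);	hold it
 */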
void
rf_DiskIOEnqueue(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int pri)
{
	RF_ETIMER_START(req->qtime);
	RF_ASSERT(req->type == RF_IO_TYPE_NOP || req->numSector);
	req->priority = pri;

#if RF_DEBUG_DISKQUEUE
	if (rf_queueDebug && (req->numSector == 0)) {
		printf("Warning: Enqueueing zero-sector access\n");
	}
#endif
	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
	if (RF_OK_TO_DISPATCH(queue, req)) {
		Dprintf2("Dispatching pri %d regular op to c %d (ok to dispatch)\n", pri, queue->col);
		rf_DispatchKernelIO(queue, req);
	} else {
		queue->queueLength++;	/* increment count of number of requests waiting in this queue */
		Dprintf2("Enqueueing pri %d regular op to c %d (not ok to dispatch)\n", pri, queue->col);
		req->queue = (void *) queue;
		(queue->qPtr->Enqueue) (queue->qHdr, req, pri);
	}
	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
}


/* get the next set of I/Os started */
void
rf_DiskIOComplete(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int status)
{
	int done = 0;

	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
	queue->numOutstanding--;
	RF_ASSERT(queue->numOutstanding >= 0);

	/* dispatch requests to the disk until we find one that we can't. */
	/* no reason to continue once we've filled up the queue */
	/* no reason to even start if the queue is locked */

	while (!done && !RF_QUEUE_FULL(queue)) {
		req = (queue->qPtr->Dequeue) (queue->qHdr);
		if (req) {
			Dprintf2("DiskIOComplete: extracting pri %d req from queue at c %d\n", req->priority, queue->col);
			queue->queueLength--;	/* decrement count of number of requests waiting in this queue */
			RF_ASSERT(queue->queueLength >= 0);
			if (RF_OK_TO_DISPATCH(queue, req)) {
				Dprintf2("DiskIOComplete: dispatching pri %d regular req to c %d (ok to dispatch)\n", req->priority, queue->col);
				rf_DispatchKernelIO(queue, req);
			} else {
				/* we can't dispatch it, so just re-enqueue it.
				   potential trouble here if disk queues batch reqs */
				Dprintf2("DiskIOComplete: re-enqueueing pri %d regular req to c %d\n", req->priority, queue->col);
				queue->queueLength++;
				(queue->qPtr->Enqueue) (queue->qHdr, req, req->priority);
				done = 1;
			}
		} else {
			Dprintf1("DiskIOComplete: no more requests to extract.\n", "");
			done = 1;
		}
	}

	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
}

/* promotes accesses tagged with the given parityStripeID from low priority
 * to normal priority.  This promotion is optional, meaning that a queue
 * need not implement it.  If there is no promotion routine associated with
 * a queue, this routine does nothing and returns -1.
 */
int
rf_DiskIOPromote(RF_DiskQueue_t *queue, RF_StripeNum_t parityStripeID,
		 RF_ReconUnitNum_t which_ru)
{
	int retval;

	if (!queue->qPtr->Promote)
		return (-1);
	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
	retval = (queue->qPtr->Promote) (queue->qHdr, parityStripeID, which_ru);
	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
	return (retval);
}
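
/*
 * Sketch of a typical call (placeholder names; the real callers are in
 * the reconstruction code): a caller with accesses waiting at low
 * priority for a given parity stripe can bump any still queued with
 *
 *	rf_DiskIOPromote(&raidPtr->Queues[col], psid, which_ru);
 *
 * A return of -1 only means the selected discipline has no Promote
 * routine; otherwise the discipline's own return value (typically the
 * number of requests it promoted) is passed back.
 */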

RF_DiskQueueData_t *
rf_CreateDiskQueueData(RF_IoType_t typ, RF_SectorNum_t ssect,
		       RF_SectorCount_t nsect, void *bf,
		       RF_StripeNum_t parityStripeID,
		       RF_ReconUnitNum_t which_ru,
		       int (*wakeF) (void *, int), void *arg,
		       RF_AccTraceEntry_t *tracerec, RF_Raid_t *raidPtr,
		       RF_DiskQueueDataFlags_t flags, void *kb_proc,
		       int waitflag)
{
	RF_DiskQueueData_t *p;

	p = pool_get(&rf_pools.dqd, PR_WAITOK | PR_ZERO);
	KASSERT(p != NULL);

	/* Obtain a buffer from our own pool.  It is possible for the
	   regular getiobuf() to run out of memory and return NULL.
	   We need to guarantee that never happens, as RAIDframe
	   doesn't have a good way to recover if memory allocation
	   fails here.
	*/
	p->bp = pool_get(&rf_pools.bufio, PR_WAITOK | PR_ZERO);
	KASSERT(p->bp != NULL);

	buf_init(p->bp);

	SET(p->bp->b_cflags, BC_BUSY);	/* mark buffer busy */

	p->sectorOffset = ssect + rf_protectedSectors;
	p->numSector = nsect;
	p->type = typ;
	p->buf = bf;
	p->parityStripeID = parityStripeID;
	p->which_ru = which_ru;
	p->CompleteFunc = wakeF;
	p->argument = arg;
	p->next = NULL;
	p->tracerec = tracerec;
	p->priority = RF_IO_NORMAL_PRIORITY;
	p->raidPtr = raidPtr;
	p->flags = flags;
	p->b_proc = kb_proc;
	return (p);
}

void
rf_FreeDiskQueueData(RF_DiskQueueData_t *p)
{
	pool_put(&rf_pools.bufio, p->bp);
	pool_put(&rf_pools.dqd, p);
}
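
/*
 * A minimal sketch of a request's life cycle through this layer
 * (callback and argument names are placeholders; the real callers live
 * in the DAG execution and reconstruction code):
 *
 *	RF_DiskQueueData_t *req;
 *
 *	req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, startSector,
 *	    numSectors, databuf, psid, which_ru, my_complete_func, my_arg,
 *	    NULL, raidPtr, 0, b_proc, PR_WAITOK);
 *	rf_DiskIOEnqueue(&raidPtr->Queues[col], req, RF_IO_NORMAL_PRIORITY);
 *
 * When the I/O finishes, the completion path calls rf_DiskIOComplete()
 * to start the next queued I/O, and the owner of the request eventually
 * calls rf_FreeDiskQueueData(req) to release it.
 */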