rf_diskqueue.c revision 1.60 1 1.60 oster /* $NetBSD: rf_diskqueue.c,v 1.60 2021/07/23 00:54:45 oster Exp $ */
2 1.1 oster /*
3 1.1 oster * Copyright (c) 1995 Carnegie-Mellon University.
4 1.1 oster * All rights reserved.
5 1.1 oster *
6 1.1 oster * Author: Mark Holland
7 1.1 oster *
8 1.1 oster * Permission to use, copy, modify and distribute this software and
9 1.1 oster * its documentation is hereby granted, provided that both the copyright
10 1.1 oster * notice and this permission notice appear in all copies of the
11 1.1 oster * software, derivative works or modified versions, and any portions
12 1.1 oster * thereof, and that both notices appear in supporting documentation.
13 1.1 oster *
14 1.1 oster * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 1.1 oster * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 1.1 oster * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 1.1 oster *
18 1.1 oster * Carnegie Mellon requests users of this software to return to
19 1.1 oster *
20 1.1 oster * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 1.1 oster * School of Computer Science
22 1.1 oster * Carnegie Mellon University
23 1.1 oster * Pittsburgh PA 15213-3890
24 1.1 oster *
25 1.1 oster * any improvements or extensions that they make and grant Carnegie the
26 1.1 oster * rights to redistribute these changes.
27 1.1 oster */
28 1.1 oster
29 1.13 oster /****************************************************************************
30 1.1 oster *
31 1.1 oster * rf_diskqueue.c -- higher-level disk queue code
32 1.1 oster *
33 1.1 oster * the routines here are a generic wrapper around the actual queueing
34 1.6 oster * routines. The code here implements thread scheduling, synchronization,
35 1.1 oster * and locking ops (see below) on top of the lower-level queueing code.
36 1.1 oster *
37 1.13 oster * to support atomic RMW, we implement "locking operations". When a
38 1.13 oster * locking op is dispatched to the lower levels of the driver, the
39 1.13 oster * queue is locked, and no further I/Os are dispatched until the queue
40 1.13 oster * receives & completes a corresponding "unlocking operation". This
41 1.13 oster * code relies on the higher layers to guarantee that a locking op
42 1.13 oster * will always be eventually followed by an unlocking op. The model
43 1.13 oster * is that the higher layers are structured so locking and unlocking
44 1.13 oster * ops occur in pairs, i.e. an unlocking op cannot be generated until
45 1.13 oster * after a locking op reports completion. There is no good way to
46 1.13 oster * check to see that an unlocking op "corresponds" to the op that
47 1.13 oster * currently has the queue locked, so we make no such attempt. Since
48 1.13 oster * by definition there can be only one locking op outstanding on a
49 1.13 oster * disk, this should not be a problem.
50 1.13 oster *
51 1.13 oster * In the kernel, we allow multiple I/Os to be concurrently dispatched
52 1.13 oster * to the disk driver. In order to support locking ops in this
53 1.13 oster * environment, when we decide to do a locking op, we stop dispatching
54 1.13 oster * new I/Os and wait until all dispatched I/Os have completed before
55 1.13 oster * dispatching the locking op.
56 1.13 oster *
57 1.13 oster * Unfortunately, the code is different in the 3 different operating
58 1.13 oster * states (user level, kernel, simulator). In the kernel, I/O is
59 1.13 oster * non-blocking, and we have no disk threads to dispatch for us.
60 1.13 oster * Therefore, we have to dispatch new I/Os to the scsi driver at the
61 1.13 oster * time of enqueue, and also at the time of completion. At user
62 1.13 oster * level, I/O is blocking, and so only the disk threads may dispatch
63 1.13 oster * I/Os. Thus at user level, all we can do at enqueue time is enqueue
64 1.13 oster * and wake up the disk thread to do the dispatch.
65 1.1 oster *
66 1.13 oster ****************************************************************************/
67 1.15 lukem
68 1.15 lukem #include <sys/cdefs.h>
69 1.60 oster __KERNEL_RCSID(0, "$NetBSD: rf_diskqueue.c,v 1.60 2021/07/23 00:54:45 oster Exp $");
70 1.1 oster
71 1.14 oster #include <dev/raidframe/raidframevar.h>
72 1.14 oster
73 1.1 oster #include "rf_threadstuff.h"
74 1.1 oster #include "rf_raid.h"
75 1.1 oster #include "rf_diskqueue.h"
76 1.1 oster #include "rf_alloclist.h"
77 1.1 oster #include "rf_acctrace.h"
78 1.1 oster #include "rf_etimer.h"
79 1.1 oster #include "rf_general.h"
80 1.1 oster #include "rf_debugprint.h"
81 1.1 oster #include "rf_shutdown.h"
82 1.1 oster #include "rf_cvscan.h"
83 1.1 oster #include "rf_sstf.h"
84 1.1 oster #include "rf_fifo.h"
85 1.11 oster #include "rf_kintf.h"
86 1.1 oster
87 1.59 oster #include <sys/buf.h>
88 1.59 oster
89 1.1 oster static void rf_ShutdownDiskQueueSystem(void *);
90 1.1 oster
#ifndef RF_DEBUG_DISKQUEUE
#define RF_DEBUG_DISKQUEUE 0
#endif

/*
 * Queue tracing helpers.
 *
 * When RF_DEBUG_DISKQUEUE is non-zero, DprintfN(fmt, ...) forwards the
 * format string and N arguments to rf_debug_printf(), but only while
 * rf_queueDebug is set.  Each argument is cast through unsigned long to
 * void *, and the remaining rf_debug_printf() slots are filled with NULL.
 * When RF_DEBUG_DISKQUEUE is zero the macros expand to nothing.
 *
 * The debug expansions are wrapped in do { } while (0) so that a call
 * behaves as one statement: without it, "if (x) Dprintf1(...); else ..."
 * would silently attach the else to the macro's internal if.  Arguments
 * are parenthesized so that expressions survive the casts intact.
 */
#if RF_DEBUG_DISKQUEUE
#define Dprintf1(s,a) \
	do { \
		if (rf_queueDebug) \
			rf_debug_printf((s), \
			    (void *)((unsigned long)(a)), \
			    NULL, NULL, NULL, NULL, NULL, NULL, NULL); \
	} while (0)
#define Dprintf2(s,a,b) \
	do { \
		if (rf_queueDebug) \
			rf_debug_printf((s), \
			    (void *)((unsigned long)(a)), \
			    (void *)((unsigned long)(b)), \
			    NULL, NULL, NULL, NULL, NULL, NULL); \
	} while (0)
#define Dprintf3(s,a,b,c) \
	do { \
		if (rf_queueDebug) \
			rf_debug_printf((s), \
			    (void *)((unsigned long)(a)), \
			    (void *)((unsigned long)(b)), \
			    (void *)((unsigned long)(c)), \
			    NULL, NULL, NULL, NULL, NULL); \
	} while (0)
#else
#define Dprintf1(s,a)
#define Dprintf2(s,a,b)
#define Dprintf3(s,a,b,c)
#endif
104 1.1 oster
105 1.13 oster /*****************************************************************************
106 1.1 oster *
107 1.13 oster * the disk queue switch defines all the functions used in the
108 1.13 oster * different queueing disciplines queue ID, init routine, enqueue
109 1.13 oster * routine, dequeue routine
110 1.1 oster *
111 1.13 oster ****************************************************************************/
112 1.1 oster
113 1.22 jdolecek static const RF_DiskQueueSW_t diskqueuesw[] = {
114 1.6 oster {"fifo", /* FIFO */
115 1.6 oster rf_FifoCreate,
116 1.6 oster rf_FifoEnqueue,
117 1.6 oster rf_FifoDequeue,
118 1.6 oster rf_FifoPeek,
119 1.1 oster rf_FifoPromote},
120 1.1 oster
121 1.6 oster {"cvscan", /* cvscan */
122 1.6 oster rf_CvscanCreate,
123 1.6 oster rf_CvscanEnqueue,
124 1.6 oster rf_CvscanDequeue,
125 1.6 oster rf_CvscanPeek,
126 1.6 oster rf_CvscanPromote},
127 1.6 oster
128 1.6 oster {"sstf", /* shortest seek time first */
129 1.6 oster rf_SstfCreate,
130 1.6 oster rf_SstfEnqueue,
131 1.6 oster rf_SstfDequeue,
132 1.6 oster rf_SstfPeek,
133 1.1 oster rf_SstfPromote},
134 1.1 oster
135 1.6 oster {"scan", /* SCAN (two-way elevator) */
136 1.6 oster rf_ScanCreate,
137 1.6 oster rf_SstfEnqueue,
138 1.6 oster rf_ScanDequeue,
139 1.6 oster rf_ScanPeek,
140 1.1 oster rf_SstfPromote},
141 1.1 oster
142 1.6 oster {"cscan", /* CSCAN (one-way elevator) */
143 1.6 oster rf_CscanCreate,
144 1.6 oster rf_SstfEnqueue,
145 1.6 oster rf_CscanDequeue,
146 1.6 oster rf_CscanPeek,
147 1.1 oster rf_SstfPromote},
148 1.1 oster
149 1.1 oster };
150 1.1 oster #define NUM_DISK_QUEUE_TYPES (sizeof(diskqueuesw)/sizeof(RF_DiskQueueSW_t))
151 1.1 oster
152 1.59 oster
/*
 * Sizing limits handed to rf_pool_init() for the two per-array pools
 * set up in rf_ConfigureDiskQueueSystem(): the RF_DiskQueueData_t pool
 * ("dqd") and the buf_t pool ("bufio").
 */

/* disk queue data descriptors */
#define RF_MIN_FREE_DQD 64
#define RF_MAX_FREE_DQD 256

/* I/O buffers -- XXX: scale these... */
#define RF_MIN_FREE_BUFIO 64
#define RF_MAX_FREE_BUFIO 256
159 1.59 oster
160 1.59 oster
161 1.1 oster
162 1.6 oster /* configures a single disk queue */
163 1.9 oster
164 1.53 mrg static void
165 1.53 mrg rf_ShutdownDiskQueue(void *arg)
166 1.53 mrg {
167 1.53 mrg RF_DiskQueue_t *diskqueue = arg;
168 1.53 mrg
169 1.53 mrg rf_destroy_mutex2(diskqueue->mutex);
170 1.53 mrg }
171 1.53 mrg
172 1.40 perry int
173 1.27 oster rf_ConfigureDiskQueue(RF_Raid_t *raidPtr, RF_DiskQueue_t *diskqueue,
174 1.27 oster RF_RowCol_t c, const RF_DiskQueueSW_t *p,
175 1.27 oster RF_SectorCount_t sectPerDisk, dev_t dev,
176 1.27 oster int maxOutstanding, RF_ShutdownList_t **listp,
177 1.27 oster RF_AllocListElem_t *clList)
178 1.6 oster {
179 1.6 oster diskqueue->col = c;
180 1.6 oster diskqueue->qPtr = p;
181 1.6 oster diskqueue->qHdr = (p->Create) (sectPerDisk, clList, listp);
182 1.6 oster diskqueue->dev = dev;
183 1.6 oster diskqueue->numOutstanding = 0;
184 1.6 oster diskqueue->queueLength = 0;
185 1.6 oster diskqueue->maxOutstanding = maxOutstanding;
186 1.6 oster diskqueue->curPriority = RF_IO_NORMAL_PRIORITY;
187 1.6 oster diskqueue->flags = 0;
188 1.6 oster diskqueue->raidPtr = raidPtr;
189 1.23 oster diskqueue->rf_cinfo = &raidPtr->raid_cinfo[c];
190 1.53 mrg rf_init_mutex2(diskqueue->mutex, IPL_VM);
191 1.53 mrg rf_ShutdownCreate(listp, rf_ShutdownDiskQueue, diskqueue);
192 1.6 oster return (0);
193 1.1 oster }
194 1.1 oster
195 1.40 perry static void
196 1.60 oster rf_ShutdownDiskQueueSystem(void *arg)
197 1.6 oster {
198 1.60 oster RF_Raid_t *raidPtr;
199 1.60 oster
200 1.60 oster raidPtr = (RF_Raid_t *) arg;
201 1.60 oster
202 1.60 oster pool_destroy(&raidPtr->pools.dqd);
203 1.60 oster pool_destroy(&raidPtr->pools.bufio);
204 1.1 oster }
205 1.1 oster
206 1.30 oster int
207 1.60 oster rf_ConfigureDiskQueueSystem(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
208 1.60 oster RF_Config_t *cfgPtr)
209 1.60 oster
210 1.6 oster {
211 1.6 oster
212 1.60 oster rf_pool_init(raidPtr, raidPtr->poolNames.dqd, &raidPtr->pools.dqd, sizeof(RF_DiskQueueData_t),
213 1.60 oster "dqd", RF_MIN_FREE_DQD, RF_MAX_FREE_DQD);
214 1.60 oster rf_pool_init(raidPtr, raidPtr->poolNames.bufio, &raidPtr->pools.bufio, sizeof(buf_t),
215 1.60 oster "bufio", RF_MIN_FREE_BUFIO, RF_MAX_FREE_BUFIO);
216 1.60 oster rf_ShutdownCreate(listp, rf_ShutdownDiskQueueSystem, raidPtr);
217 1.24 oster
218 1.6 oster return (0);
219 1.6 oster }
220 1.6 oster
221 1.40 perry int
222 1.27 oster rf_ConfigureDiskQueues(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
223 1.27 oster RF_Config_t *cfgPtr)
224 1.6 oster {
225 1.23 oster RF_DiskQueue_t *diskQueues, *spareQueues;
226 1.22 jdolecek const RF_DiskQueueSW_t *p;
227 1.23 oster RF_RowCol_t r,c;
228 1.6 oster int rc, i;
229 1.6 oster
230 1.6 oster raidPtr->maxQueueDepth = cfgPtr->maxOutstandingDiskReqs;
231 1.6 oster
232 1.6 oster for (p = NULL, i = 0; i < NUM_DISK_QUEUE_TYPES; i++) {
233 1.6 oster if (!strcmp(diskqueuesw[i].queueType, cfgPtr->diskQueueType)) {
234 1.6 oster p = &diskqueuesw[i];
235 1.6 oster break;
236 1.6 oster }
237 1.6 oster }
238 1.6 oster if (p == NULL) {
239 1.6 oster RF_ERRORMSG2("Unknown queue type \"%s\". Using %s\n", cfgPtr->diskQueueType, diskqueuesw[0].queueType);
240 1.6 oster p = &diskqueuesw[0];
241 1.6 oster }
242 1.10 oster raidPtr->qType = p;
243 1.23 oster
244 1.54 christos diskQueues = RF_MallocAndAdd(
245 1.54 christos (raidPtr->numCol + RF_MAXSPARE) * sizeof(*diskQueues),
246 1.54 christos raidPtr->cleanupList);
247 1.23 oster if (diskQueues == NULL)
248 1.6 oster return (ENOMEM);
249 1.6 oster raidPtr->Queues = diskQueues;
250 1.23 oster
251 1.23 oster for (c = 0; c < raidPtr->numCol; c++) {
252 1.23 oster rc = rf_ConfigureDiskQueue(raidPtr, &diskQueues[c],
253 1.23 oster c, p,
254 1.40 perry raidPtr->sectorsPerDisk,
255 1.23 oster raidPtr->Disks[c].dev,
256 1.40 perry cfgPtr->maxOutstandingDiskReqs,
257 1.23 oster listp, raidPtr->cleanupList);
258 1.23 oster if (rc)
259 1.23 oster return (rc);
260 1.6 oster }
261 1.6 oster
262 1.23 oster spareQueues = &raidPtr->Queues[raidPtr->numCol];
263 1.6 oster for (r = 0; r < raidPtr->numSpare; r++) {
264 1.9 oster rc = rf_ConfigureDiskQueue(raidPtr, &spareQueues[r],
265 1.23 oster raidPtr->numCol + r, p,
266 1.23 oster raidPtr->sectorsPerDisk,
267 1.23 oster raidPtr->Disks[raidPtr->numCol + r].dev,
268 1.23 oster cfgPtr->maxOutstandingDiskReqs, listp,
269 1.23 oster raidPtr->cleanupList);
270 1.6 oster if (rc)
271 1.6 oster return (rc);
272 1.6 oster }
273 1.6 oster return (0);
274 1.6 oster }
275 1.1 oster /* Enqueue a disk I/O
276 1.1 oster *
277 1.1 oster * In the kernel, I/O is non-blocking and so we'd like to have multiple
278 1.1 oster * I/Os outstanding on the physical disks when possible.
279 1.1 oster *
280 1.1 oster * when any request arrives at a queue, we have two choices:
281 1.1 oster * dispatch it to the lower levels
282 1.1 oster * queue it up
283 1.1 oster *
284 1.1 oster * kernel rules for when to do what:
285 1.1 oster * unlocking req : always dispatch it
286 1.1 oster * normal req : queue empty => dispatch it & set priority
287 1.1 oster * queue not full & priority is ok => dispatch it
288 1.1 oster * else queue it
289 1.1 oster */
290 1.40 perry void
291 1.27 oster rf_DiskIOEnqueue(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int pri)
292 1.6 oster {
293 1.6 oster RF_ETIMER_START(req->qtime);
294 1.6 oster RF_ASSERT(req->type == RF_IO_TYPE_NOP || req->numSector);
295 1.6 oster req->priority = pri;
296 1.6 oster
297 1.21 oster #if RF_DEBUG_DISKQUEUE
298 1.6 oster if (rf_queueDebug && (req->numSector == 0)) {
299 1.6 oster printf("Warning: Enqueueing zero-sector access\n");
300 1.6 oster }
301 1.21 oster #endif
302 1.6 oster RF_LOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
303 1.52 oster if (RF_OK_TO_DISPATCH(queue, req)) {
304 1.52 oster Dprintf2("Dispatching pri %d regular op to c %d (ok to dispatch)\n", pri, queue->col);
305 1.52 oster rf_DispatchKernelIO(queue, req);
306 1.52 oster } else {
307 1.52 oster queue->queueLength++; /* increment count of number of requests waiting in this queue */
308 1.52 oster Dprintf2("Enqueueing pri %d regular op to c %d (not ok to dispatch)\n", pri, queue->col);
309 1.52 oster req->queue = (void *) queue;
310 1.52 oster (queue->qPtr->Enqueue) (queue->qHdr, req, pri);
311 1.6 oster }
312 1.6 oster RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
313 1.1 oster }
314 1.6 oster
315 1.1 oster
316 1.52 oster /* get the next set of I/Os started */
317 1.40 perry void
318 1.27 oster rf_DiskIOComplete(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int status)
319 1.6 oster {
320 1.6 oster int done = 0;
321 1.6 oster
322 1.6 oster RF_LOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
323 1.6 oster queue->numOutstanding--;
324 1.6 oster RF_ASSERT(queue->numOutstanding >= 0);
325 1.6 oster
326 1.6 oster /* dispatch requests to the disk until we find one that we can't. */
327 1.6 oster /* no reason to continue once we've filled up the queue */
328 1.6 oster /* no reason to even start if the queue is locked */
329 1.6 oster
330 1.52 oster while (!done && !RF_QUEUE_FULL(queue)) {
331 1.52 oster req = (queue->qPtr->Dequeue) (queue->qHdr);
332 1.6 oster if (req) {
333 1.52 oster Dprintf2("DiskIOComplete: extracting pri %d req from queue at c %d\n", req->priority, queue->col);
334 1.52 oster queue->queueLength--; /* decrement count of number of requests waiting in this queue */
335 1.6 oster RF_ASSERT(queue->queueLength >= 0);
336 1.52 oster if (RF_OK_TO_DISPATCH(queue, req)) {
337 1.52 oster Dprintf2("DiskIOComplete: dispatching pri %d regular req to c %d (ok to dispatch)\n", req->priority, queue->col);
338 1.52 oster rf_DispatchKernelIO(queue, req);
339 1.52 oster } else {
340 1.52 oster /* we can't dispatch it, so just re-enqueue it.
341 1.52 oster potential trouble here if disk queues batch reqs */
342 1.52 oster Dprintf2("DiskIOComplete: re-enqueueing pri %d regular req to c %d\n", req->priority, queue->col);
343 1.52 oster queue->queueLength++;
344 1.52 oster (queue->qPtr->Enqueue) (queue->qHdr, req, req->priority);
345 1.52 oster done = 1;
346 1.52 oster }
347 1.52 oster } else {
348 1.52 oster Dprintf1("DiskIOComplete: no more requests to extract.\n", "");
349 1.52 oster done = 1;
350 1.6 oster }
351 1.6 oster }
352 1.6 oster
353 1.6 oster RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
354 1.1 oster }
355 1.1 oster /* promotes accesses tagged with the given parityStripeID from low priority
356 1.1 oster * to normal priority. This promotion is optional, meaning that a queue
357 1.1 oster * need not implement it. If there is no promotion routine associated with
358 1.1 oster * a queue, this routine does nothing and returns -1.
359 1.1 oster */
360 1.40 perry int
361 1.27 oster rf_DiskIOPromote(RF_DiskQueue_t *queue, RF_StripeNum_t parityStripeID,
362 1.27 oster RF_ReconUnitNum_t which_ru)
363 1.6 oster {
364 1.6 oster int retval;
365 1.6 oster
366 1.6 oster if (!queue->qPtr->Promote)
367 1.6 oster return (-1);
368 1.6 oster RF_LOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
369 1.6 oster retval = (queue->qPtr->Promote) (queue->qHdr, parityStripeID, which_ru);
370 1.6 oster RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
371 1.6 oster return (retval);
372 1.6 oster }
373 1.6 oster
374 1.6 oster RF_DiskQueueData_t *
375 1.27 oster rf_CreateDiskQueueData(RF_IoType_t typ, RF_SectorNum_t ssect,
376 1.49 christos RF_SectorCount_t nsect, void *bf,
377 1.27 oster RF_StripeNum_t parityStripeID,
378 1.27 oster RF_ReconUnitNum_t which_ru,
379 1.56 christos void (*wakeF) (void *, int), void *arg,
380 1.37 oster RF_AccTraceEntry_t *tracerec, RF_Raid_t *raidPtr,
381 1.57 jdolecek RF_DiskQueueDataFlags_t flags, const struct buf *mbp,
382 1.38 oster int waitflag)
383 1.6 oster {
384 1.6 oster RF_DiskQueueData_t *p;
385 1.6 oster
386 1.60 oster p = pool_get(&raidPtr->pools.dqd, PR_WAITOK | PR_ZERO);
387 1.59 oster KASSERT(p != NULL);
388 1.38 oster
389 1.59 oster /* Obtain a buffer from our own pool. It is possible for the
390 1.59 oster regular getiobuf() to run out of memory and return NULL.
391 1.59 oster We need to guarantee that never happens, as RAIDframe
392 1.59 oster doesn't have a good way to recover if memory allocation
393 1.59 oster fails here.
394 1.59 oster */
395 1.60 oster p->bp = pool_get(&raidPtr->pools.bufio, PR_WAITOK | PR_ZERO);
396 1.59 oster KASSERT(p->bp != NULL);
397 1.59 oster
398 1.59 oster buf_init(p->bp);
399 1.59 oster
400 1.51 reinoud SET(p->bp->b_cflags, BC_BUSY); /* mark buffer busy */
401 1.57 jdolecek if (mbp) {
402 1.57 jdolecek SET(p->bp->b_flags, mbp->b_flags & rf_b_pass);
403 1.57 jdolecek p->bp->b_proc = mbp->b_proc;
404 1.57 jdolecek }
405 1.6 oster
406 1.6 oster p->sectorOffset = ssect + rf_protectedSectors;
407 1.6 oster p->numSector = nsect;
408 1.6 oster p->type = typ;
409 1.41 christos p->buf = bf;
410 1.6 oster p->parityStripeID = parityStripeID;
411 1.6 oster p->which_ru = which_ru;
412 1.6 oster p->CompleteFunc = wakeF;
413 1.6 oster p->argument = arg;
414 1.39 oster p->next = NULL;
415 1.6 oster p->tracerec = tracerec;
416 1.6 oster p->priority = RF_IO_NORMAL_PRIORITY;
417 1.6 oster p->raidPtr = raidPtr;
418 1.6 oster p->flags = flags;
419 1.6 oster return (p);
420 1.6 oster }
421 1.6 oster
422 1.40 perry void
423 1.27 oster rf_FreeDiskQueueData(RF_DiskQueueData_t *p)
424 1.1 oster {
425 1.60 oster pool_put(&p->raidPtr->pools.bufio, p->bp);
426 1.60 oster pool_put(&p->raidPtr->pools.dqd, p);
427 1.1 oster }
428