/*	$NetBSD: rf_paritylogging.c,v 1.12 2001/11/13 07:11:15 lukem Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: William V. Courtright II
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */
28
29
/*
 * Parity logging configuration, DAG selection, and mapping are
 * implemented here.
 */
33
34 #include <sys/cdefs.h>
35 __KERNEL_RCSID(0, "$NetBSD: rf_paritylogging.c,v 1.12 2001/11/13 07:11:15 lukem Exp $");
36
37 #include "rf_archs.h"
38
39 #if RF_INCLUDE_PARITYLOGGING > 0
40
41 #include <dev/raidframe/raidframevar.h>
42
43 #include "rf_raid.h"
44 #include "rf_dag.h"
45 #include "rf_dagutils.h"
46 #include "rf_dagfuncs.h"
47 #include "rf_dagffrd.h"
48 #include "rf_dagffwr.h"
49 #include "rf_dagdegrd.h"
50 #include "rf_dagdegwr.h"
51 #include "rf_paritylog.h"
52 #include "rf_paritylogDiskMgr.h"
53 #include "rf_paritylogging.h"
54 #include "rf_parityloggingdags.h"
55 #include "rf_general.h"
56 #include "rf_map.h"
57 #include "rf_utils.h"
58 #include "rf_shutdown.h"
59
60 typedef struct RF_ParityLoggingConfigInfo_s {
61 RF_RowCol_t **stripeIdentifier; /* filled in at config time & used by
62 * IdentifyStripe */
63 } RF_ParityLoggingConfigInfo_t;
64
65 static void FreeRegionInfo(RF_Raid_t * raidPtr, RF_RegionId_t regionID);
66 static void rf_ShutdownParityLogging(RF_ThreadArg_t arg);
67 static void rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg);
68 static void rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg);
69 static void rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg);
70 static void rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg);
71 static void rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg);
72
73 int
74 rf_ConfigureParityLogging(
75 RF_ShutdownList_t ** listp,
76 RF_Raid_t * raidPtr,
77 RF_Config_t * cfgPtr)
78 {
79 int i, j, startdisk, rc;
80 RF_SectorCount_t totalLogCapacity, fragmentation, lastRegionCapacity;
81 RF_SectorCount_t parityBufferCapacity, maxRegionParityRange;
82 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
83 RF_ParityLoggingConfigInfo_t *info;
84 RF_ParityLog_t *l = NULL, *next;
85 caddr_t lHeapPtr;
86
87 if (rf_numParityRegions <= 0)
88 return(EINVAL);
89
90 /*
91 * We create multiple entries on the shutdown list here, since
92 * this configuration routine is fairly complicated in and of
93 * itself, and this makes backing out of a failed configuration
94 * much simpler.
95 */
96
97 raidPtr->numSectorsPerLog = RF_DEFAULT_NUM_SECTORS_PER_LOG;
98
99 /* create a parity logging configuration structure */
100 RF_MallocAndAdd(info, sizeof(RF_ParityLoggingConfigInfo_t),
101 (RF_ParityLoggingConfigInfo_t *),
102 raidPtr->cleanupList);
103 if (info == NULL)
104 return (ENOMEM);
105 layoutPtr->layoutSpecificInfo = (void *) info;
106
107 RF_ASSERT(raidPtr->numRow == 1);
108
109 /* the stripe identifier must identify the disks in each stripe, IN
110 * THE ORDER THAT THEY APPEAR IN THE STRIPE. */
111 info->stripeIdentifier = rf_make_2d_array((raidPtr->numCol),
112 (raidPtr->numCol),
113 raidPtr->cleanupList);
114 if (info->stripeIdentifier == NULL)
115 return (ENOMEM);
116
117 startdisk = 0;
118 for (i = 0; i < (raidPtr->numCol); i++) {
119 for (j = 0; j < (raidPtr->numCol); j++) {
120 info->stripeIdentifier[i][j] = (startdisk + j) %
121 (raidPtr->numCol - 1);
122 }
123 if ((--startdisk) < 0)
124 startdisk = raidPtr->numCol - 1 - 1;
125 }
126
127 /* fill in the remaining layout parameters */
128 layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk;
129 layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit <<
130 raidPtr->logBytesPerSector;
131 layoutPtr->numParityCol = 1;
132 layoutPtr->numParityLogCol = 1;
133 layoutPtr->numDataCol = raidPtr->numCol - layoutPtr->numParityCol -
134 layoutPtr->numParityLogCol;
135 layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol *
136 layoutPtr->sectorsPerStripeUnit;
137 layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk;
138 raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk *
139 layoutPtr->sectorsPerStripeUnit;
140
141 raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk *
142 layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
143
144 /* configure parity log parameters
145 *
146 * parameter comment/constraints
147 * -------------------------------------------
148 * numParityRegions* all regions (except possibly last)
149 * of equal size
150 * totalInCoreLogCapacity* amount of memory in bytes available
151 * for in-core logs (default 1 MB)
152 * numSectorsPerLog# capacity of an in-core log in sectors
153 * (1 * disk track)
154 * numParityLogs total number of in-core logs,
155 * should be at least numParityRegions
156 * regionLogCapacity size of a region log (except possibly
157 * last one) in sectors
158 * totalLogCapacity total amount of log space in sectors
159 *
160 * where '*' denotes a user settable parameter.
161 * Note that logs are fixed to be the size of a disk track,
162 * value #defined in rf_paritylog.h
163 *
164 */
165
166 totalLogCapacity = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit * layoutPtr->numParityLogCol;
167 raidPtr->regionLogCapacity = totalLogCapacity / rf_numParityRegions;
168 if (rf_parityLogDebug)
169 printf("bytes per sector %d\n", raidPtr->bytesPerSector);
170
171 /* reduce fragmentation within a disk region by adjusting the number
172 * of regions in an attempt to allow an integral number of logs to fit
173 * into a disk region */
174 fragmentation = raidPtr->regionLogCapacity % raidPtr->numSectorsPerLog;
175 if (fragmentation > 0)
176 for (i = 1; i < (raidPtr->numSectorsPerLog / 2); i++) {
177 if (((totalLogCapacity / (rf_numParityRegions + i)) %
178 raidPtr->numSectorsPerLog) < fragmentation) {
179 rf_numParityRegions++;
180 raidPtr->regionLogCapacity = totalLogCapacity /
181 rf_numParityRegions;
182 fragmentation = raidPtr->regionLogCapacity %
183 raidPtr->numSectorsPerLog;
184 }
185 if (((totalLogCapacity / (rf_numParityRegions - i)) %
186 raidPtr->numSectorsPerLog) < fragmentation) {
187 rf_numParityRegions--;
188 raidPtr->regionLogCapacity = totalLogCapacity /
189 rf_numParityRegions;
190 fragmentation = raidPtr->regionLogCapacity %
191 raidPtr->numSectorsPerLog;
192 }
193 }
194 /* ensure integral number of regions per log */
195 raidPtr->regionLogCapacity = (raidPtr->regionLogCapacity /
196 raidPtr->numSectorsPerLog) *
197 raidPtr->numSectorsPerLog;
198
199 raidPtr->numParityLogs = rf_totalInCoreLogCapacity /
200 (raidPtr->bytesPerSector * raidPtr->numSectorsPerLog);
201 /* to avoid deadlock, must ensure that enough logs exist for each
202 * region to have one simultaneously */
203 if (raidPtr->numParityLogs < rf_numParityRegions)
204 raidPtr->numParityLogs = rf_numParityRegions;
205
206 /* create region information structs */
207 printf("Allocating %d bytes for in-core parity region info\n",
208 (int) (rf_numParityRegions * sizeof(RF_RegionInfo_t)));
209 RF_Malloc(raidPtr->regionInfo,
210 (rf_numParityRegions * sizeof(RF_RegionInfo_t)),
211 (RF_RegionInfo_t *));
212 if (raidPtr->regionInfo == NULL)
213 return (ENOMEM);
214
215 /* last region may not be full capacity */
216 lastRegionCapacity = raidPtr->regionLogCapacity;
217 while ((rf_numParityRegions - 1) * raidPtr->regionLogCapacity +
218 lastRegionCapacity > totalLogCapacity)
219 lastRegionCapacity = lastRegionCapacity -
220 raidPtr->numSectorsPerLog;
221
222 raidPtr->regionParityRange = raidPtr->sectorsPerDisk /
223 rf_numParityRegions;
224 maxRegionParityRange = raidPtr->regionParityRange;
225
226 /* i can't remember why this line is in the code -wvcii 6/30/95 */
227 /* if (raidPtr->sectorsPerDisk % rf_numParityRegions > 0)
228 regionParityRange++; */
229
230 /* build pool of unused parity logs */
231 printf("Allocating %d bytes for %d parity logs\n",
232 raidPtr->numParityLogs * raidPtr->numSectorsPerLog *
233 raidPtr->bytesPerSector,
234 raidPtr->numParityLogs);
235 RF_Malloc(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs *
236 raidPtr->numSectorsPerLog * raidPtr->bytesPerSector,
237 (caddr_t));
238 if (raidPtr->parityLogBufferHeap == NULL)
239 return (ENOMEM);
240 lHeapPtr = raidPtr->parityLogBufferHeap;
241 rc = rf_mutex_init(&raidPtr->parityLogPool.mutex);
242 if (rc) {
243 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n",
244 __FILE__, __LINE__, rc);
245 RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs *
246 raidPtr->numSectorsPerLog * raidPtr->bytesPerSector);
247 return (ENOMEM);
248 }
249 for (i = 0; i < raidPtr->numParityLogs; i++) {
250 if (i == 0) {
251 RF_Calloc(raidPtr->parityLogPool.parityLogs, 1,
252 sizeof(RF_ParityLog_t), (RF_ParityLog_t *));
253 if (raidPtr->parityLogPool.parityLogs == NULL) {
254 RF_Free(raidPtr->parityLogBufferHeap,
255 raidPtr->numParityLogs *
256 raidPtr->numSectorsPerLog *
257 raidPtr->bytesPerSector);
258 return (ENOMEM);
259 }
260 l = raidPtr->parityLogPool.parityLogs;
261 } else {
262 RF_Calloc(l->next, 1, sizeof(RF_ParityLog_t),
263 (RF_ParityLog_t *));
264 if (l->next == NULL) {
265 RF_Free(raidPtr->parityLogBufferHeap,
266 raidPtr->numParityLogs *
267 raidPtr->numSectorsPerLog *
268 raidPtr->bytesPerSector);
269 for (l = raidPtr->parityLogPool.parityLogs;
270 l;
271 l = next) {
272 next = l->next;
273 if (l->records)
274 RF_Free(l->records, (raidPtr->numSectorsPerLog * sizeof(RF_ParityLogRecord_t)));
275 RF_Free(l, sizeof(RF_ParityLog_t));
276 }
277 return (ENOMEM);
278 }
279 l = l->next;
280 }
281 l->bufPtr = lHeapPtr;
282 lHeapPtr += raidPtr->numSectorsPerLog *
283 raidPtr->bytesPerSector;
284 RF_Malloc(l->records, (raidPtr->numSectorsPerLog *
285 sizeof(RF_ParityLogRecord_t)),
286 (RF_ParityLogRecord_t *));
287 if (l->records == NULL) {
288 RF_Free(raidPtr->parityLogBufferHeap,
289 raidPtr->numParityLogs *
290 raidPtr->numSectorsPerLog *
291 raidPtr->bytesPerSector);
292 for (l = raidPtr->parityLogPool.parityLogs;
293 l;
294 l = next) {
295 next = l->next;
296 if (l->records)
297 RF_Free(l->records,
298 (raidPtr->numSectorsPerLog *
299 sizeof(RF_ParityLogRecord_t)));
300 RF_Free(l, sizeof(RF_ParityLog_t));
301 }
302 return (ENOMEM);
303 }
304 }
305 rc = rf_ShutdownCreate(listp, rf_ShutdownParityLoggingPool, raidPtr);
306 if (rc) {
307 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
308 __LINE__, rc);
309 rf_ShutdownParityLoggingPool(raidPtr);
310 return (rc);
311 }
312 /* build pool of region buffers */
313 rc = rf_mutex_init(&raidPtr->regionBufferPool.mutex);
314 if (rc) {
315 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n",
316 __FILE__, __LINE__, rc);
317 return (ENOMEM);
318 }
319 rc = rf_cond_init(&raidPtr->regionBufferPool.cond);
320 if (rc) {
321 RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n",
322 __FILE__, __LINE__, rc);
323 rf_mutex_destroy(&raidPtr->regionBufferPool.mutex);
324 return (ENOMEM);
325 }
326 raidPtr->regionBufferPool.bufferSize = raidPtr->regionLogCapacity *
327 raidPtr->bytesPerSector;
328 printf("regionBufferPool.bufferSize %d\n",
329 raidPtr->regionBufferPool.bufferSize);
330
331 /* for now, only one region at a time may be reintegrated */
332 raidPtr->regionBufferPool.totalBuffers = 1;
333
334 raidPtr->regionBufferPool.availableBuffers =
335 raidPtr->regionBufferPool.totalBuffers;
336 raidPtr->regionBufferPool.availBuffersIndex = 0;
337 raidPtr->regionBufferPool.emptyBuffersIndex = 0;
338 printf("Allocating %d bytes for regionBufferPool\n",
339 (int) (raidPtr->regionBufferPool.totalBuffers *
340 sizeof(caddr_t)));
341 RF_Malloc(raidPtr->regionBufferPool.buffers,
342 raidPtr->regionBufferPool.totalBuffers * sizeof(caddr_t),
343 (caddr_t *));
344 if (raidPtr->regionBufferPool.buffers == NULL) {
345 rf_mutex_destroy(&raidPtr->regionBufferPool.mutex);
346 rf_cond_destroy(&raidPtr->regionBufferPool.cond);
347 return (ENOMEM);
348 }
349 for (i = 0; i < raidPtr->regionBufferPool.totalBuffers; i++) {
350 printf("Allocating %d bytes for regionBufferPool#%d\n",
351 (int) (raidPtr->regionBufferPool.bufferSize *
352 sizeof(char)), i);
353 RF_Malloc(raidPtr->regionBufferPool.buffers[i],
354 raidPtr->regionBufferPool.bufferSize * sizeof(char),
355 (caddr_t));
356 if (raidPtr->regionBufferPool.buffers[i] == NULL) {
357 rf_mutex_destroy(&raidPtr->regionBufferPool.mutex);
358 rf_cond_destroy(&raidPtr->regionBufferPool.cond);
359 for (j = 0; j < i; j++) {
360 RF_Free(raidPtr->regionBufferPool.buffers[i],
361 raidPtr->regionBufferPool.bufferSize *
362 sizeof(char));
363 }
364 RF_Free(raidPtr->regionBufferPool.buffers,
365 raidPtr->regionBufferPool.totalBuffers *
366 sizeof(caddr_t));
367 return (ENOMEM);
368 }
369 printf("raidPtr->regionBufferPool.buffers[%d] = %lx\n", i,
370 (long) raidPtr->regionBufferPool.buffers[i]);
371 }
372 rc = rf_ShutdownCreate(listp,
373 rf_ShutdownParityLoggingRegionBufferPool,
374 raidPtr);
375 if (rc) {
376 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
377 __LINE__, rc);
378 rf_ShutdownParityLoggingRegionBufferPool(raidPtr);
379 return (rc);
380 }
381 /* build pool of parity buffers */
382 parityBufferCapacity = maxRegionParityRange;
383 rc = rf_mutex_init(&raidPtr->parityBufferPool.mutex);
384 if (rc) {
385 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n",
386 __FILE__, __LINE__, rc);
387 return (rc);
388 }
389 rc = rf_cond_init(&raidPtr->parityBufferPool.cond);
390 if (rc) {
391 RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n",
392 __FILE__, __LINE__, rc);
393 rf_mutex_destroy(&raidPtr->parityBufferPool.mutex);
394 return (ENOMEM);
395 }
396 raidPtr->parityBufferPool.bufferSize = parityBufferCapacity *
397 raidPtr->bytesPerSector;
398 printf("parityBufferPool.bufferSize %d\n",
399 raidPtr->parityBufferPool.bufferSize);
400
401 /* for now, only one region at a time may be reintegrated */
402 raidPtr->parityBufferPool.totalBuffers = 1;
403
404 raidPtr->parityBufferPool.availableBuffers =
405 raidPtr->parityBufferPool.totalBuffers;
406 raidPtr->parityBufferPool.availBuffersIndex = 0;
407 raidPtr->parityBufferPool.emptyBuffersIndex = 0;
408 printf("Allocating %d bytes for parityBufferPool of %d units\n",
409 (int) (raidPtr->parityBufferPool.totalBuffers *
410 sizeof(caddr_t)),
411 raidPtr->parityBufferPool.totalBuffers );
412 RF_Malloc(raidPtr->parityBufferPool.buffers,
413 raidPtr->parityBufferPool.totalBuffers * sizeof(caddr_t),
414 (caddr_t *));
415 if (raidPtr->parityBufferPool.buffers == NULL) {
416 rf_mutex_destroy(&raidPtr->parityBufferPool.mutex);
417 rf_cond_destroy(&raidPtr->parityBufferPool.cond);
418 return (ENOMEM);
419 }
420 for (i = 0; i < raidPtr->parityBufferPool.totalBuffers; i++) {
421 printf("Allocating %d bytes for parityBufferPool#%d\n",
422 (int) (raidPtr->parityBufferPool.bufferSize *
423 sizeof(char)),i);
424 RF_Malloc(raidPtr->parityBufferPool.buffers[i],
425 raidPtr->parityBufferPool.bufferSize * sizeof(char),
426 (caddr_t));
427 if (raidPtr->parityBufferPool.buffers == NULL) {
428 rf_mutex_destroy(&raidPtr->parityBufferPool.mutex);
429 rf_cond_destroy(&raidPtr->parityBufferPool.cond);
430 for (j = 0; j < i; j++) {
431 RF_Free(raidPtr->parityBufferPool.buffers[i],
432 raidPtr->regionBufferPool.bufferSize *
433 sizeof(char));
434 }
435 RF_Free(raidPtr->parityBufferPool.buffers,
436 raidPtr->regionBufferPool.totalBuffers *
437 sizeof(caddr_t));
438 return (ENOMEM);
439 }
440 printf("parityBufferPool.buffers[%d] = %lx\n", i,
441 (long) raidPtr->parityBufferPool.buffers[i]);
442 }
443 rc = rf_ShutdownCreate(listp,
444 rf_ShutdownParityLoggingParityBufferPool,
445 raidPtr);
446 if (rc) {
447 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
448 __LINE__, rc);
449 rf_ShutdownParityLoggingParityBufferPool(raidPtr);
450 return (rc);
451 }
452 /* initialize parityLogDiskQueue */
453 rc = rf_create_managed_mutex(listp,
454 &raidPtr->parityLogDiskQueue.mutex);
455 if (rc) {
456 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n",
457 __FILE__, __LINE__, rc);
458 return (rc);
459 }
460 rc = rf_create_managed_cond(listp, &raidPtr->parityLogDiskQueue.cond);
461 if (rc) {
462 RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n",
463 __FILE__, __LINE__, rc);
464 return (rc);
465 }
466 raidPtr->parityLogDiskQueue.flushQueue = NULL;
467 raidPtr->parityLogDiskQueue.reintQueue = NULL;
468 raidPtr->parityLogDiskQueue.bufHead = NULL;
469 raidPtr->parityLogDiskQueue.bufTail = NULL;
470 raidPtr->parityLogDiskQueue.reintHead = NULL;
471 raidPtr->parityLogDiskQueue.reintTail = NULL;
472 raidPtr->parityLogDiskQueue.logBlockHead = NULL;
473 raidPtr->parityLogDiskQueue.logBlockTail = NULL;
474 raidPtr->parityLogDiskQueue.reintBlockHead = NULL;
475 raidPtr->parityLogDiskQueue.reintBlockTail = NULL;
476 raidPtr->parityLogDiskQueue.freeDataList = NULL;
477 raidPtr->parityLogDiskQueue.freeCommonList = NULL;
478
479 rc = rf_ShutdownCreate(listp,
480 rf_ShutdownParityLoggingDiskQueue,
481 raidPtr);
482 if (rc) {
483 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
484 __LINE__, rc);
485 return (rc);
486 }
487 for (i = 0; i < rf_numParityRegions; i++) {
488 rc = rf_mutex_init(&raidPtr->regionInfo[i].mutex);
489 if (rc) {
490 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
491 __LINE__, rc);
492 for (j = 0; j < i; j++)
493 FreeRegionInfo(raidPtr, j);
494 RF_Free(raidPtr->regionInfo,
495 (rf_numParityRegions *
496 sizeof(RF_RegionInfo_t)));
497 return (ENOMEM);
498 }
499 rc = rf_mutex_init(&raidPtr->regionInfo[i].reintMutex);
500 if (rc) {
501 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
502 __LINE__, rc);
503 rf_mutex_destroy(&raidPtr->regionInfo[i].mutex);
504 for (j = 0; j < i; j++)
505 FreeRegionInfo(raidPtr, j);
506 RF_Free(raidPtr->regionInfo,
507 (rf_numParityRegions *
508 sizeof(RF_RegionInfo_t)));
509 return (ENOMEM);
510 }
511 raidPtr->regionInfo[i].reintInProgress = RF_FALSE;
512 raidPtr->regionInfo[i].regionStartAddr =
513 raidPtr->regionLogCapacity * i;
514 raidPtr->regionInfo[i].parityStartAddr =
515 raidPtr->regionParityRange * i;
516 if (i < rf_numParityRegions - 1) {
517 raidPtr->regionInfo[i].capacity =
518 raidPtr->regionLogCapacity;
519 raidPtr->regionInfo[i].numSectorsParity =
520 raidPtr->regionParityRange;
521 } else {
522 raidPtr->regionInfo[i].capacity =
523 lastRegionCapacity;
524 raidPtr->regionInfo[i].numSectorsParity =
525 raidPtr->sectorsPerDisk -
526 raidPtr->regionParityRange * i;
527 if (raidPtr->regionInfo[i].numSectorsParity >
528 maxRegionParityRange)
529 maxRegionParityRange =
530 raidPtr->regionInfo[i].numSectorsParity;
531 }
532 raidPtr->regionInfo[i].diskCount = 0;
533 RF_ASSERT(raidPtr->regionInfo[i].capacity +
534 raidPtr->regionInfo[i].regionStartAddr <=
535 totalLogCapacity);
536 RF_ASSERT(raidPtr->regionInfo[i].parityStartAddr +
537 raidPtr->regionInfo[i].numSectorsParity <=
538 raidPtr->sectorsPerDisk);
539 printf("Allocating %d bytes for region %d\n",
540 (int) (raidPtr->regionInfo[i].capacity *
541 sizeof(RF_DiskMap_t)), i);
542 RF_Malloc(raidPtr->regionInfo[i].diskMap,
543 (raidPtr->regionInfo[i].capacity *
544 sizeof(RF_DiskMap_t)),
545 (RF_DiskMap_t *));
546 if (raidPtr->regionInfo[i].diskMap == NULL) {
547 rf_mutex_destroy(&raidPtr->regionInfo[i].mutex);
548 rf_mutex_destroy(&raidPtr->regionInfo[i].reintMutex);
549 for (j = 0; j < i; j++)
550 FreeRegionInfo(raidPtr, j);
551 RF_Free(raidPtr->regionInfo,
552 (rf_numParityRegions *
553 sizeof(RF_RegionInfo_t)));
554 return (ENOMEM);
555 }
556 raidPtr->regionInfo[i].loggingEnabled = RF_FALSE;
557 raidPtr->regionInfo[i].coreLog = NULL;
558 }
559 rc = rf_ShutdownCreate(listp,
560 rf_ShutdownParityLoggingRegionInfo,
561 raidPtr);
562 if (rc) {
563 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
564 __LINE__, rc);
565 rf_ShutdownParityLoggingRegionInfo(raidPtr);
566 return (rc);
567 }
568 RF_ASSERT(raidPtr->parityLogDiskQueue.threadState == 0);
569 raidPtr->parityLogDiskQueue.threadState = RF_PLOG_CREATED;
570 rc = RF_CREATE_THREAD(raidPtr->pLogDiskThreadHandle,
571 rf_ParityLoggingDiskManager, raidPtr,"rf_log");
572 if (rc) {
573 raidPtr->parityLogDiskQueue.threadState = 0;
574 RF_ERRORMSG3("Unable to create parity logging disk thread file %s line %d rc=%d\n",
575 __FILE__, __LINE__, rc);
576 return (ENOMEM);
577 }
578 /* wait for thread to start */
579 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
580 while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_RUNNING)) {
581 RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond,
582 raidPtr->parityLogDiskQueue.mutex);
583 }
584 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
585
586 rc = rf_ShutdownCreate(listp, rf_ShutdownParityLogging, raidPtr);
587 if (rc) {
588 RF_ERRORMSG1("Got rc=%d adding parity logging shutdown event\n", rc);
589 rf_ShutdownParityLogging(raidPtr);
590 return (rc);
591 }
592 if (rf_parityLogDebug) {
593 printf(" size of disk log in sectors: %d\n",
594 (int) totalLogCapacity);
595 printf(" total number of parity regions is %d\n", (int) rf_numParityRegions);
596 printf(" nominal sectors of log per parity region is %d\n", (int) raidPtr->regionLogCapacity);
597 printf(" nominal region fragmentation is %d sectors\n", (int) fragmentation);
598 printf(" total number of parity logs is %d\n", raidPtr->numParityLogs);
599 printf(" parity log size is %d sectors\n", raidPtr->numSectorsPerLog);
600 printf(" total in-core log space is %d bytes\n", (int) rf_totalInCoreLogCapacity);
601 }
602 rf_EnableParityLogging(raidPtr);
603
604 return (0);
605 }
606
607 static void
608 FreeRegionInfo(
609 RF_Raid_t * raidPtr,
610 RF_RegionId_t regionID)
611 {
612 RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
613 RF_Free(raidPtr->regionInfo[regionID].diskMap,
614 (raidPtr->regionInfo[regionID].capacity *
615 sizeof(RF_DiskMap_t)));
616 if (!rf_forceParityLogReint && raidPtr->regionInfo[regionID].coreLog) {
617 rf_ReleaseParityLogs(raidPtr,
618 raidPtr->regionInfo[regionID].coreLog);
619 raidPtr->regionInfo[regionID].coreLog = NULL;
620 } else {
621 RF_ASSERT(raidPtr->regionInfo[regionID].coreLog == NULL);
622 RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == 0);
623 }
624 RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
625 rf_mutex_destroy(&raidPtr->regionInfo[regionID].mutex);
626 rf_mutex_destroy(&raidPtr->regionInfo[regionID].reintMutex);
627 }
628
629
630 static void
631 FreeParityLogQueue(
632 RF_Raid_t * raidPtr,
633 RF_ParityLogQueue_t * queue)
634 {
635 RF_ParityLog_t *l1, *l2;
636
637 RF_LOCK_MUTEX(queue->mutex);
638 l1 = queue->parityLogs;
639 while (l1) {
640 l2 = l1;
641 l1 = l2->next;
642 RF_Free(l2->records, (raidPtr->numSectorsPerLog *
643 sizeof(RF_ParityLogRecord_t)));
644 RF_Free(l2, sizeof(RF_ParityLog_t));
645 }
646 RF_UNLOCK_MUTEX(queue->mutex);
647 rf_mutex_destroy(&queue->mutex);
648 }
649
650
651 static void
652 FreeRegionBufferQueue(RF_RegionBufferQueue_t * queue)
653 {
654 int i;
655
656 RF_LOCK_MUTEX(queue->mutex);
657 if (queue->availableBuffers != queue->totalBuffers) {
658 printf("Attempt to free region queue which is still in use!\n");
659 RF_ASSERT(0);
660 }
661 for (i = 0; i < queue->totalBuffers; i++)
662 RF_Free(queue->buffers[i], queue->bufferSize);
663 RF_Free(queue->buffers, queue->totalBuffers * sizeof(caddr_t));
664 RF_UNLOCK_MUTEX(queue->mutex);
665 rf_mutex_destroy(&queue->mutex);
666 }
667
668 static void
669 rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg)
670 {
671 RF_Raid_t *raidPtr;
672 RF_RegionId_t i;
673
674 raidPtr = (RF_Raid_t *) arg;
675 if (rf_parityLogDebug) {
676 printf("raid%d: ShutdownParityLoggingRegionInfo\n",
677 raidPtr->raidid);
678 }
679 /* free region information structs */
680 for (i = 0; i < rf_numParityRegions; i++)
681 FreeRegionInfo(raidPtr, i);
682 RF_Free(raidPtr->regionInfo, (rf_numParityRegions *
683 sizeof(raidPtr->regionInfo)));
684 raidPtr->regionInfo = NULL;
685 }
686
687 static void
688 rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg)
689 {
690 RF_Raid_t *raidPtr;
691
692 raidPtr = (RF_Raid_t *) arg;
693 if (rf_parityLogDebug) {
694 printf("raid%d: ShutdownParityLoggingPool\n", raidPtr->raidid);
695 }
696 /* free contents of parityLogPool */
697 FreeParityLogQueue(raidPtr, &raidPtr->parityLogPool);
698 RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs *
699 raidPtr->numSectorsPerLog * raidPtr->bytesPerSector);
700 }
701
702 static void
703 rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg)
704 {
705 RF_Raid_t *raidPtr;
706
707 raidPtr = (RF_Raid_t *) arg;
708 if (rf_parityLogDebug) {
709 printf("raid%d: ShutdownParityLoggingRegionBufferPool\n",
710 raidPtr->raidid);
711 }
712 FreeRegionBufferQueue(&raidPtr->regionBufferPool);
713 }
714
715 static void
716 rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg)
717 {
718 RF_Raid_t *raidPtr;
719
720 raidPtr = (RF_Raid_t *) arg;
721 if (rf_parityLogDebug) {
722 printf("raid%d: ShutdownParityLoggingParityBufferPool\n",
723 raidPtr->raidid);
724 }
725 FreeRegionBufferQueue(&raidPtr->parityBufferPool);
726 }
727
728 static void
729 rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg)
730 {
731 RF_ParityLogData_t *d;
732 RF_CommonLogData_t *c;
733 RF_Raid_t *raidPtr;
734
735 raidPtr = (RF_Raid_t *) arg;
736 if (rf_parityLogDebug) {
737 printf("raid%d: ShutdownParityLoggingDiskQueue\n",
738 raidPtr->raidid);
739 }
740 /* free disk manager stuff */
741 RF_ASSERT(raidPtr->parityLogDiskQueue.bufHead == NULL);
742 RF_ASSERT(raidPtr->parityLogDiskQueue.bufTail == NULL);
743 RF_ASSERT(raidPtr->parityLogDiskQueue.reintHead == NULL);
744 RF_ASSERT(raidPtr->parityLogDiskQueue.reintTail == NULL);
745 while (raidPtr->parityLogDiskQueue.freeDataList) {
746 d = raidPtr->parityLogDiskQueue.freeDataList;
747 raidPtr->parityLogDiskQueue.freeDataList =
748 raidPtr->parityLogDiskQueue.freeDataList->next;
749 RF_Free(d, sizeof(RF_ParityLogData_t));
750 }
751 while (raidPtr->parityLogDiskQueue.freeCommonList) {
752 c = raidPtr->parityLogDiskQueue.freeCommonList;
753 rf_mutex_destroy(&c->mutex);
754 raidPtr->parityLogDiskQueue.freeCommonList =
755 raidPtr->parityLogDiskQueue.freeCommonList->next;
756 RF_Free(c, sizeof(RF_CommonLogData_t));
757 }
758 }
759
760 static void
761 rf_ShutdownParityLogging(RF_ThreadArg_t arg)
762 {
763 RF_Raid_t *raidPtr;
764
765 raidPtr = (RF_Raid_t *) arg;
766 if (rf_parityLogDebug) {
767 printf("raid%d: ShutdownParityLogging\n", raidPtr->raidid);
768 }
769 /* shutdown disk thread */
770 /* This has the desirable side-effect of forcing all regions to be
771 * reintegrated. This is necessary since all parity log maps are
772 * currently held in volatile memory. */
773
774 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
775 raidPtr->parityLogDiskQueue.threadState |= RF_PLOG_TERMINATE;
776 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
777 RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond);
778 /*
779 * pLogDiskThread will now terminate when queues are cleared
780 * now wait for it to be done
781 */
782 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
783 while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_SHUTDOWN)) {
784 RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond,
785 raidPtr->parityLogDiskQueue.mutex);
786 }
787 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
788 if (rf_parityLogDebug) {
789 printf("raid%d: ShutdownParityLogging done (thread completed)\n", raidPtr->raidid);
790 }
791 }
792
793 int
794 rf_GetDefaultNumFloatingReconBuffersParityLogging(RF_Raid_t * raidPtr)
795 {
796 return (20);
797 }
798
799 RF_HeadSepLimit_t
800 rf_GetDefaultHeadSepLimitParityLogging(RF_Raid_t * raidPtr)
801 {
802 return (10);
803 }
804 /* return the region ID for a given RAID address */
805 RF_RegionId_t
806 rf_MapRegionIDParityLogging(
807 RF_Raid_t * raidPtr,
808 RF_SectorNum_t address)
809 {
810 RF_RegionId_t regionID;
811
812 /* regionID = address / (raidPtr->regionParityRange * raidPtr->Layout.numDataCol); */
813 regionID = address / raidPtr->regionParityRange;
814 if (regionID == rf_numParityRegions) {
815 /* last region may be larger than other regions */
816 regionID--;
817 }
818 RF_ASSERT(address >= raidPtr->regionInfo[regionID].parityStartAddr);
819 RF_ASSERT(address < raidPtr->regionInfo[regionID].parityStartAddr +
820 raidPtr->regionInfo[regionID].numSectorsParity);
821 RF_ASSERT(regionID < rf_numParityRegions);
822 return (regionID);
823 }
824
825
826 /* given a logical RAID sector, determine physical disk address of data */
827 void
828 rf_MapSectorParityLogging(
829 RF_Raid_t * raidPtr,
830 RF_RaidAddr_t raidSector,
831 RF_RowCol_t * row,
832 RF_RowCol_t * col,
833 RF_SectorNum_t * diskSector,
834 int remap)
835 {
836 RF_StripeNum_t SUID = raidSector /
837 raidPtr->Layout.sectorsPerStripeUnit;
838 *row = 0;
839 /* *col = (SUID % (raidPtr->numCol -
840 * raidPtr->Layout.numParityLogCol)); */
841 *col = SUID % raidPtr->Layout.numDataCol;
842 *diskSector = (SUID / (raidPtr->Layout.numDataCol)) *
843 raidPtr->Layout.sectorsPerStripeUnit +
844 (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
845 }
846
847
848 /* given a logical RAID sector, determine physical disk address of parity */
849 void
850 rf_MapParityParityLogging(
851 RF_Raid_t * raidPtr,
852 RF_RaidAddr_t raidSector,
853 RF_RowCol_t * row,
854 RF_RowCol_t * col,
855 RF_SectorNum_t * diskSector,
856 int remap)
857 {
858 RF_StripeNum_t SUID = raidSector /
859 raidPtr->Layout.sectorsPerStripeUnit;
860
861 *row = 0;
862 /* *col =
863 * raidPtr->Layout.numDataCol-(SUID/raidPtr->Layout.numDataCol)%(raidPt
864 * r->numCol - raidPtr->Layout.numParityLogCol); */
865 *col = raidPtr->Layout.numDataCol;
866 *diskSector = (SUID / (raidPtr->Layout.numDataCol)) *
867 raidPtr->Layout.sectorsPerStripeUnit +
868 (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
869 }
870
871
872 /* given a regionID and sector offset, determine the physical disk address of the parity log */
873 void
874 rf_MapLogParityLogging(
875 RF_Raid_t * raidPtr,
876 RF_RegionId_t regionID,
877 RF_SectorNum_t regionOffset,
878 RF_RowCol_t * row,
879 RF_RowCol_t * col,
880 RF_SectorNum_t * startSector)
881 {
882 *row = 0;
883 *col = raidPtr->numCol - 1;
884 *startSector = raidPtr->regionInfo[regionID].regionStartAddr + regionOffset;
885 }
886
887
888 /* given a regionID, determine the physical disk address of the logged
889 parity for that region */
890 void
891 rf_MapRegionParity(
892 RF_Raid_t * raidPtr,
893 RF_RegionId_t regionID,
894 RF_RowCol_t * row,
895 RF_RowCol_t * col,
896 RF_SectorNum_t * startSector,
897 RF_SectorCount_t * numSector)
898 {
899 *row = 0;
900 *col = raidPtr->numCol - 2;
901 *startSector = raidPtr->regionInfo[regionID].parityStartAddr;
902 *numSector = raidPtr->regionInfo[regionID].numSectorsParity;
903 }
904
905
906 /* given a logical RAID address, determine the participating disks in
907 the stripe */
908 void
909 rf_IdentifyStripeParityLogging(
910 RF_Raid_t * raidPtr,
911 RF_RaidAddr_t addr,
912 RF_RowCol_t ** diskids,
913 RF_RowCol_t * outRow)
914 {
915 RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout,
916 addr);
917 RF_ParityLoggingConfigInfo_t *info = (RF_ParityLoggingConfigInfo_t *)
918 raidPtr->Layout.layoutSpecificInfo;
919 *outRow = 0;
920 *diskids = info->stripeIdentifier[stripeID % raidPtr->numCol];
921 }
922
923
924 void
925 rf_MapSIDToPSIDParityLogging(
926 RF_RaidLayout_t * layoutPtr,
927 RF_StripeNum_t stripeID,
928 RF_StripeNum_t * psID,
929 RF_ReconUnitNum_t * which_ru)
930 {
931 *which_ru = 0;
932 *psID = stripeID;
933 }
934
935
936 /* select an algorithm for performing an access. Returns two pointers,
937 * one to a function that will return information about the DAG, and
938 * another to a function that will create the dag.
939 */
940 void
941 rf_ParityLoggingDagSelect(
942 RF_Raid_t * raidPtr,
943 RF_IoType_t type,
944 RF_AccessStripeMap_t * asmp,
945 RF_VoidFuncPtr * createFunc)
946 {
947 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
948 RF_PhysDiskAddr_t *failedPDA = NULL;
949 RF_RowCol_t frow, fcol;
950 RF_RowStatus_t rstat;
951 int prior_recon;
952
953 RF_ASSERT(RF_IO_IS_R_OR_W(type));
954
955 if (asmp->numDataFailed + asmp->numParityFailed > 1) {
956 RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n");
957 /* *infoFunc = */ *createFunc = NULL;
958 return;
959 } else
960 if (asmp->numDataFailed + asmp->numParityFailed == 1) {
961
962 /* if under recon & already reconstructed, redirect
963 * the access to the spare drive and eliminate the
964 * failure indication */
965 failedPDA = asmp->failedPDAs[0];
966 frow = failedPDA->row;
967 fcol = failedPDA->col;
968 rstat = raidPtr->status[failedPDA->row];
969 prior_recon = (rstat == rf_rs_reconfigured) || (
970 (rstat == rf_rs_reconstructing) ?
971 rf_CheckRUReconstructed(raidPtr->reconControl[frow]->reconMap, failedPDA->startSector) : 0
972 );
973 if (prior_recon) {
974 RF_RowCol_t or = failedPDA->row, oc = failedPDA->col;
975 RF_SectorNum_t oo = failedPDA->startSector;
976 if (layoutPtr->map->flags &
977 RF_DISTRIBUTE_SPARE) {
978 /* redirect to dist spare space */
979
980 if (failedPDA == asmp->parityInfo) {
981
982 /* parity has failed */
983 (layoutPtr->map->MapParity) (raidPtr, failedPDA->raidAddress, &failedPDA->row,
984 &failedPDA->col, &failedPDA->startSector, RF_REMAP);
985
986 if (asmp->parityInfo->next) { /* redir 2nd component,
987 * if any */
988 RF_PhysDiskAddr_t *p = asmp->parityInfo->next;
989 RF_SectorNum_t SUoffs = p->startSector % layoutPtr->sectorsPerStripeUnit;
990 p->row = failedPDA->row;
991 p->col = failedPDA->col;
992 p->startSector = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->startSector) +
993 SUoffs; /* cheating:
994 * startSector is not
995 * really a RAID address */
996 }
997 } else
998 if (asmp->parityInfo->next && failedPDA == asmp->parityInfo->next) {
999 RF_ASSERT(0); /* should not ever
1000 * happen */
1001 } else {
1002
1003 /* data has failed */
1004 (layoutPtr->map->MapSector) (raidPtr, failedPDA->raidAddress, &failedPDA->row,
1005 &failedPDA->col, &failedPDA->startSector, RF_REMAP);
1006
1007 }
1008
1009 } else {
1010 /* redirect to dedicated spare space */
1011
1012 failedPDA->row = raidPtr->Disks[frow][fcol].spareRow;
1013 failedPDA->col = raidPtr->Disks[frow][fcol].spareCol;
1014
1015 /* the parity may have two distinct
1016 * components, both of which may need
1017 * to be redirected */
1018 if (asmp->parityInfo->next) {
1019 if (failedPDA == asmp->parityInfo) {
1020 failedPDA->next->row = failedPDA->row;
1021 failedPDA->next->col = failedPDA->col;
1022 } else
1023 if (failedPDA == asmp->parityInfo->next) { /* paranoid: should never occur */
1024 asmp->parityInfo->row = failedPDA->row;
1025 asmp->parityInfo->col = failedPDA->col;
1026 }
1027 }
1028 }
1029
1030 RF_ASSERT(failedPDA->col != -1);
1031
1032 if (rf_dagDebug || rf_mapDebug) {
1033 printf("raid%d: Redirected type '%c' r %d c %d o %ld -> r %d c %d o %ld\n",
1034 raidPtr->raidid, type, or, oc, (long) oo, failedPDA->row, failedPDA->col, (long) failedPDA->startSector);
1035 }
1036 asmp->numDataFailed = asmp->numParityFailed = 0;
1037 }
1038 }
1039 if (type == RF_IO_TYPE_READ) {
1040
1041 if (asmp->numDataFailed == 0)
1042 *createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;
1043 else
1044 *createFunc = (RF_VoidFuncPtr) rf_CreateRaidFiveDegradedReadDAG;
1045
1046 } else {
1047
1048
1049 /* if mirroring, always use large writes. If the access
1050 * requires two distinct parity updates, always do a small
1051 * write. If the stripe contains a failure but the access
1052 * does not, do a small write. The first conditional
1053 * (numStripeUnitsAccessed <= numDataCol/2) uses a
1054 * less-than-or-equal rather than just a less-than because
1055 * when G is 3 or 4, numDataCol/2 is 1, and I want
1056 * single-stripe-unit updates to use just one disk. */
1057 if ((asmp->numDataFailed + asmp->numParityFailed) == 0) {
1058 if (((asmp->numStripeUnitsAccessed <=
1059 (layoutPtr->numDataCol / 2)) &&
1060 (layoutPtr->numDataCol != 1)) ||
1061 (asmp->parityInfo->next != NULL) ||
1062 rf_CheckStripeForFailures(raidPtr, asmp)) {
1063 *createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingSmallWriteDAG;
1064 } else
1065 *createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingLargeWriteDAG;
1066 } else
1067 if (asmp->numParityFailed == 1)
1068 *createFunc = (RF_VoidFuncPtr) rf_CreateNonRedundantWriteDAG;
1069 else
1070 if (asmp->numStripeUnitsAccessed != 1 && failedPDA->numSector != layoutPtr->sectorsPerStripeUnit)
1071 *createFunc = NULL;
1072 else
1073 *createFunc = (RF_VoidFuncPtr) rf_CreateDegradedWriteDAG;
1074 }
1075 }
1076 #endif /* RF_INCLUDE_PARITYLOGGING > 0 */
1077