rf_paritylogging.c revision 1.10 1 /* $NetBSD: rf_paritylogging.c,v 1.10 2000/02/12 16:06:27 oster Exp $ */
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: William V. Courtright II
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29
/*
  parity logging configuration, dag selection, and mapping are implemented here
*/
33
34 #include "rf_archs.h"
35
36 #if RF_INCLUDE_PARITYLOGGING > 0
37
38 #include "rf_types.h"
39 #include "rf_raid.h"
40 #include "rf_dag.h"
41 #include "rf_dagutils.h"
42 #include "rf_dagfuncs.h"
43 #include "rf_dagffrd.h"
44 #include "rf_dagffwr.h"
45 #include "rf_dagdegrd.h"
46 #include "rf_dagdegwr.h"
47 #include "rf_paritylog.h"
48 #include "rf_paritylogDiskMgr.h"
49 #include "rf_paritylogging.h"
50 #include "rf_parityloggingdags.h"
51 #include "rf_general.h"
52 #include "rf_map.h"
53 #include "rf_utils.h"
54 #include "rf_shutdown.h"
55
56 typedef struct RF_ParityLoggingConfigInfo_s {
57 RF_RowCol_t **stripeIdentifier; /* filled in at config time & used by
58 * IdentifyStripe */
59 } RF_ParityLoggingConfigInfo_t;
60
61 static void FreeRegionInfo(RF_Raid_t * raidPtr, RF_RegionId_t regionID);
62 static void rf_ShutdownParityLogging(RF_ThreadArg_t arg);
63 static void rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg);
64 static void rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg);
65 static void rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg);
66 static void rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg);
67 static void rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg);
68
69 int
70 rf_ConfigureParityLogging(
71 RF_ShutdownList_t ** listp,
72 RF_Raid_t * raidPtr,
73 RF_Config_t * cfgPtr)
74 {
75 int i, j, startdisk, rc;
76 RF_SectorCount_t totalLogCapacity, fragmentation, lastRegionCapacity;
77 RF_SectorCount_t parityBufferCapacity, maxRegionParityRange;
78 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
79 RF_ParityLoggingConfigInfo_t *info;
80 RF_ParityLog_t *l = NULL, *next;
81 caddr_t lHeapPtr;
82
83 if (rf_numParityRegions <= 0)
84 return(EINVAL);
85
86 /*
87 * We create multiple entries on the shutdown list here, since
88 * this configuration routine is fairly complicated in and of
89 * itself, and this makes backing out of a failed configuration
90 * much simpler.
91 */
92
93 raidPtr->numSectorsPerLog = RF_DEFAULT_NUM_SECTORS_PER_LOG;
94
95 /* create a parity logging configuration structure */
96 RF_MallocAndAdd(info, sizeof(RF_ParityLoggingConfigInfo_t),
97 (RF_ParityLoggingConfigInfo_t *),
98 raidPtr->cleanupList);
99 if (info == NULL)
100 return (ENOMEM);
101 layoutPtr->layoutSpecificInfo = (void *) info;
102
103 RF_ASSERT(raidPtr->numRow == 1);
104
105 /* the stripe identifier must identify the disks in each stripe, IN
106 * THE ORDER THAT THEY APPEAR IN THE STRIPE. */
107 info->stripeIdentifier = rf_make_2d_array((raidPtr->numCol),
108 (raidPtr->numCol),
109 raidPtr->cleanupList);
110 if (info->stripeIdentifier == NULL)
111 return (ENOMEM);
112
113 startdisk = 0;
114 for (i = 0; i < (raidPtr->numCol); i++) {
115 for (j = 0; j < (raidPtr->numCol); j++) {
116 info->stripeIdentifier[i][j] = (startdisk + j) %
117 (raidPtr->numCol - 1);
118 }
119 if ((--startdisk) < 0)
120 startdisk = raidPtr->numCol - 1 - 1;
121 }
122
123 /* fill in the remaining layout parameters */
124 layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk;
125 layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit <<
126 raidPtr->logBytesPerSector;
127 layoutPtr->numParityCol = 1;
128 layoutPtr->numParityLogCol = 1;
129 layoutPtr->numDataCol = raidPtr->numCol - layoutPtr->numParityCol -
130 layoutPtr->numParityLogCol;
131 layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol *
132 layoutPtr->sectorsPerStripeUnit;
133 layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk;
134 raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk *
135 layoutPtr->sectorsPerStripeUnit;
136
137 raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk *
138 layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
139
140 /* configure parity log parameters
141 *
142 * parameter comment/constraints
143 * -------------------------------------------
144 * numParityRegions* all regions (except possibly last)
145 * of equal size
146 * totalInCoreLogCapacity* amount of memory in bytes available
147 * for in-core logs (default 1 MB)
148 * numSectorsPerLog# capacity of an in-core log in sectors
149 * (1 * disk track)
150 * numParityLogs total number of in-core logs,
151 * should be at least numParityRegions
152 * regionLogCapacity size of a region log (except possibly
153 * last one) in sectors
154 * totalLogCapacity total amount of log space in sectors
155 *
156 * where '*' denotes a user settable parameter.
157 * Note that logs are fixed to be the size of a disk track,
158 * value #defined in rf_paritylog.h
159 *
160 */
161
162 totalLogCapacity = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit * layoutPtr->numParityLogCol;
163 raidPtr->regionLogCapacity = totalLogCapacity / rf_numParityRegions;
164 if (rf_parityLogDebug)
165 printf("bytes per sector %d\n", raidPtr->bytesPerSector);
166
167 /* reduce fragmentation within a disk region by adjusting the number
168 * of regions in an attempt to allow an integral number of logs to fit
169 * into a disk region */
170 fragmentation = raidPtr->regionLogCapacity % raidPtr->numSectorsPerLog;
171 if (fragmentation > 0)
172 for (i = 1; i < (raidPtr->numSectorsPerLog / 2); i++) {
173 if (((totalLogCapacity / (rf_numParityRegions + i)) %
174 raidPtr->numSectorsPerLog) < fragmentation) {
175 rf_numParityRegions++;
176 raidPtr->regionLogCapacity = totalLogCapacity /
177 rf_numParityRegions;
178 fragmentation = raidPtr->regionLogCapacity %
179 raidPtr->numSectorsPerLog;
180 }
181 if (((totalLogCapacity / (rf_numParityRegions - i)) %
182 raidPtr->numSectorsPerLog) < fragmentation) {
183 rf_numParityRegions--;
184 raidPtr->regionLogCapacity = totalLogCapacity /
185 rf_numParityRegions;
186 fragmentation = raidPtr->regionLogCapacity %
187 raidPtr->numSectorsPerLog;
188 }
189 }
190 /* ensure integral number of regions per log */
191 raidPtr->regionLogCapacity = (raidPtr->regionLogCapacity /
192 raidPtr->numSectorsPerLog) *
193 raidPtr->numSectorsPerLog;
194
195 raidPtr->numParityLogs = rf_totalInCoreLogCapacity /
196 (raidPtr->bytesPerSector * raidPtr->numSectorsPerLog);
197 /* to avoid deadlock, must ensure that enough logs exist for each
198 * region to have one simultaneously */
199 if (raidPtr->numParityLogs < rf_numParityRegions)
200 raidPtr->numParityLogs = rf_numParityRegions;
201
202 /* create region information structs */
203 printf("Allocating %d bytes for in-core parity region info\n",
204 (int) (rf_numParityRegions * sizeof(RF_RegionInfo_t)));
205 RF_Malloc(raidPtr->regionInfo,
206 (rf_numParityRegions * sizeof(RF_RegionInfo_t)),
207 (RF_RegionInfo_t *));
208 if (raidPtr->regionInfo == NULL)
209 return (ENOMEM);
210
211 /* last region may not be full capacity */
212 lastRegionCapacity = raidPtr->regionLogCapacity;
213 while ((rf_numParityRegions - 1) * raidPtr->regionLogCapacity +
214 lastRegionCapacity > totalLogCapacity)
215 lastRegionCapacity = lastRegionCapacity -
216 raidPtr->numSectorsPerLog;
217
218 raidPtr->regionParityRange = raidPtr->sectorsPerDisk /
219 rf_numParityRegions;
220 maxRegionParityRange = raidPtr->regionParityRange;
221
222 /* i can't remember why this line is in the code -wvcii 6/30/95 */
223 /* if (raidPtr->sectorsPerDisk % rf_numParityRegions > 0)
224 regionParityRange++; */
225
226 /* build pool of unused parity logs */
227 printf("Allocating %d bytes for %d parity logs\n",
228 raidPtr->numParityLogs * raidPtr->numSectorsPerLog *
229 raidPtr->bytesPerSector,
230 raidPtr->numParityLogs);
231 RF_Malloc(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs *
232 raidPtr->numSectorsPerLog * raidPtr->bytesPerSector,
233 (caddr_t));
234 if (raidPtr->parityLogBufferHeap == NULL)
235 return (ENOMEM);
236 lHeapPtr = raidPtr->parityLogBufferHeap;
237 rc = rf_mutex_init(&raidPtr->parityLogPool.mutex);
238 if (rc) {
239 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n",
240 __FILE__, __LINE__, rc);
241 RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs *
242 raidPtr->numSectorsPerLog * raidPtr->bytesPerSector);
243 return (ENOMEM);
244 }
245 for (i = 0; i < raidPtr->numParityLogs; i++) {
246 if (i == 0) {
247 RF_Calloc(raidPtr->parityLogPool.parityLogs, 1,
248 sizeof(RF_ParityLog_t), (RF_ParityLog_t *));
249 if (raidPtr->parityLogPool.parityLogs == NULL) {
250 RF_Free(raidPtr->parityLogBufferHeap,
251 raidPtr->numParityLogs *
252 raidPtr->numSectorsPerLog *
253 raidPtr->bytesPerSector);
254 return (ENOMEM);
255 }
256 l = raidPtr->parityLogPool.parityLogs;
257 } else {
258 RF_Calloc(l->next, 1, sizeof(RF_ParityLog_t),
259 (RF_ParityLog_t *));
260 if (l->next == NULL) {
261 RF_Free(raidPtr->parityLogBufferHeap,
262 raidPtr->numParityLogs *
263 raidPtr->numSectorsPerLog *
264 raidPtr->bytesPerSector);
265 for (l = raidPtr->parityLogPool.parityLogs;
266 l;
267 l = next) {
268 next = l->next;
269 if (l->records)
270 RF_Free(l->records, (raidPtr->numSectorsPerLog * sizeof(RF_ParityLogRecord_t)));
271 RF_Free(l, sizeof(RF_ParityLog_t));
272 }
273 return (ENOMEM);
274 }
275 l = l->next;
276 }
277 l->bufPtr = lHeapPtr;
278 lHeapPtr += raidPtr->numSectorsPerLog *
279 raidPtr->bytesPerSector;
280 RF_Malloc(l->records, (raidPtr->numSectorsPerLog *
281 sizeof(RF_ParityLogRecord_t)),
282 (RF_ParityLogRecord_t *));
283 if (l->records == NULL) {
284 RF_Free(raidPtr->parityLogBufferHeap,
285 raidPtr->numParityLogs *
286 raidPtr->numSectorsPerLog *
287 raidPtr->bytesPerSector);
288 for (l = raidPtr->parityLogPool.parityLogs;
289 l;
290 l = next) {
291 next = l->next;
292 if (l->records)
293 RF_Free(l->records,
294 (raidPtr->numSectorsPerLog *
295 sizeof(RF_ParityLogRecord_t)));
296 RF_Free(l, sizeof(RF_ParityLog_t));
297 }
298 return (ENOMEM);
299 }
300 }
301 rc = rf_ShutdownCreate(listp, rf_ShutdownParityLoggingPool, raidPtr);
302 if (rc) {
303 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
304 __LINE__, rc);
305 rf_ShutdownParityLoggingPool(raidPtr);
306 return (rc);
307 }
308 /* build pool of region buffers */
309 rc = rf_mutex_init(&raidPtr->regionBufferPool.mutex);
310 if (rc) {
311 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n",
312 __FILE__, __LINE__, rc);
313 return (ENOMEM);
314 }
315 rc = rf_cond_init(&raidPtr->regionBufferPool.cond);
316 if (rc) {
317 RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n",
318 __FILE__, __LINE__, rc);
319 rf_mutex_destroy(&raidPtr->regionBufferPool.mutex);
320 return (ENOMEM);
321 }
322 raidPtr->regionBufferPool.bufferSize = raidPtr->regionLogCapacity *
323 raidPtr->bytesPerSector;
324 printf("regionBufferPool.bufferSize %d\n",
325 raidPtr->regionBufferPool.bufferSize);
326
327 /* for now, only one region at a time may be reintegrated */
328 raidPtr->regionBufferPool.totalBuffers = 1;
329
330 raidPtr->regionBufferPool.availableBuffers =
331 raidPtr->regionBufferPool.totalBuffers;
332 raidPtr->regionBufferPool.availBuffersIndex = 0;
333 raidPtr->regionBufferPool.emptyBuffersIndex = 0;
334 printf("Allocating %d bytes for regionBufferPool\n",
335 (int) (raidPtr->regionBufferPool.totalBuffers *
336 sizeof(caddr_t)));
337 RF_Malloc(raidPtr->regionBufferPool.buffers,
338 raidPtr->regionBufferPool.totalBuffers * sizeof(caddr_t),
339 (caddr_t *));
340 if (raidPtr->regionBufferPool.buffers == NULL) {
341 rf_mutex_destroy(&raidPtr->regionBufferPool.mutex);
342 rf_cond_destroy(&raidPtr->regionBufferPool.cond);
343 return (ENOMEM);
344 }
345 for (i = 0; i < raidPtr->regionBufferPool.totalBuffers; i++) {
346 printf("Allocating %d bytes for regionBufferPool#%d\n",
347 (int) (raidPtr->regionBufferPool.bufferSize *
348 sizeof(char)), i);
349 RF_Malloc(raidPtr->regionBufferPool.buffers[i],
350 raidPtr->regionBufferPool.bufferSize * sizeof(char),
351 (caddr_t));
352 if (raidPtr->regionBufferPool.buffers[i] == NULL) {
353 rf_mutex_destroy(&raidPtr->regionBufferPool.mutex);
354 rf_cond_destroy(&raidPtr->regionBufferPool.cond);
355 for (j = 0; j < i; j++) {
356 RF_Free(raidPtr->regionBufferPool.buffers[i],
357 raidPtr->regionBufferPool.bufferSize *
358 sizeof(char));
359 }
360 RF_Free(raidPtr->regionBufferPool.buffers,
361 raidPtr->regionBufferPool.totalBuffers *
362 sizeof(caddr_t));
363 return (ENOMEM);
364 }
365 printf("raidPtr->regionBufferPool.buffers[%d] = %lx\n", i,
366 (long) raidPtr->regionBufferPool.buffers[i]);
367 }
368 rc = rf_ShutdownCreate(listp,
369 rf_ShutdownParityLoggingRegionBufferPool,
370 raidPtr);
371 if (rc) {
372 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
373 __LINE__, rc);
374 rf_ShutdownParityLoggingRegionBufferPool(raidPtr);
375 return (rc);
376 }
377 /* build pool of parity buffers */
378 parityBufferCapacity = maxRegionParityRange;
379 rc = rf_mutex_init(&raidPtr->parityBufferPool.mutex);
380 if (rc) {
381 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n",
382 __FILE__, __LINE__, rc);
383 return (rc);
384 }
385 rc = rf_cond_init(&raidPtr->parityBufferPool.cond);
386 if (rc) {
387 RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n",
388 __FILE__, __LINE__, rc);
389 rf_mutex_destroy(&raidPtr->parityBufferPool.mutex);
390 return (ENOMEM);
391 }
392 raidPtr->parityBufferPool.bufferSize = parityBufferCapacity *
393 raidPtr->bytesPerSector;
394 printf("parityBufferPool.bufferSize %d\n",
395 raidPtr->parityBufferPool.bufferSize);
396
397 /* for now, only one region at a time may be reintegrated */
398 raidPtr->parityBufferPool.totalBuffers = 1;
399
400 raidPtr->parityBufferPool.availableBuffers =
401 raidPtr->parityBufferPool.totalBuffers;
402 raidPtr->parityBufferPool.availBuffersIndex = 0;
403 raidPtr->parityBufferPool.emptyBuffersIndex = 0;
404 printf("Allocating %d bytes for parityBufferPool of %d units\n",
405 (int) (raidPtr->parityBufferPool.totalBuffers *
406 sizeof(caddr_t)),
407 raidPtr->parityBufferPool.totalBuffers );
408 RF_Malloc(raidPtr->parityBufferPool.buffers,
409 raidPtr->parityBufferPool.totalBuffers * sizeof(caddr_t),
410 (caddr_t *));
411 if (raidPtr->parityBufferPool.buffers == NULL) {
412 rf_mutex_destroy(&raidPtr->parityBufferPool.mutex);
413 rf_cond_destroy(&raidPtr->parityBufferPool.cond);
414 return (ENOMEM);
415 }
416 for (i = 0; i < raidPtr->parityBufferPool.totalBuffers; i++) {
417 printf("Allocating %d bytes for parityBufferPool#%d\n",
418 (int) (raidPtr->parityBufferPool.bufferSize *
419 sizeof(char)),i);
420 RF_Malloc(raidPtr->parityBufferPool.buffers[i],
421 raidPtr->parityBufferPool.bufferSize * sizeof(char),
422 (caddr_t));
423 if (raidPtr->parityBufferPool.buffers == NULL) {
424 rf_mutex_destroy(&raidPtr->parityBufferPool.mutex);
425 rf_cond_destroy(&raidPtr->parityBufferPool.cond);
426 for (j = 0; j < i; j++) {
427 RF_Free(raidPtr->parityBufferPool.buffers[i],
428 raidPtr->regionBufferPool.bufferSize *
429 sizeof(char));
430 }
431 RF_Free(raidPtr->parityBufferPool.buffers,
432 raidPtr->regionBufferPool.totalBuffers *
433 sizeof(caddr_t));
434 return (ENOMEM);
435 }
436 printf("parityBufferPool.buffers[%d] = %lx\n", i,
437 (long) raidPtr->parityBufferPool.buffers[i]);
438 }
439 rc = rf_ShutdownCreate(listp,
440 rf_ShutdownParityLoggingParityBufferPool,
441 raidPtr);
442 if (rc) {
443 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
444 __LINE__, rc);
445 rf_ShutdownParityLoggingParityBufferPool(raidPtr);
446 return (rc);
447 }
448 /* initialize parityLogDiskQueue */
449 rc = rf_create_managed_mutex(listp,
450 &raidPtr->parityLogDiskQueue.mutex);
451 if (rc) {
452 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n",
453 __FILE__, __LINE__, rc);
454 return (rc);
455 }
456 rc = rf_create_managed_cond(listp, &raidPtr->parityLogDiskQueue.cond);
457 if (rc) {
458 RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n",
459 __FILE__, __LINE__, rc);
460 return (rc);
461 }
462 raidPtr->parityLogDiskQueue.flushQueue = NULL;
463 raidPtr->parityLogDiskQueue.reintQueue = NULL;
464 raidPtr->parityLogDiskQueue.bufHead = NULL;
465 raidPtr->parityLogDiskQueue.bufTail = NULL;
466 raidPtr->parityLogDiskQueue.reintHead = NULL;
467 raidPtr->parityLogDiskQueue.reintTail = NULL;
468 raidPtr->parityLogDiskQueue.logBlockHead = NULL;
469 raidPtr->parityLogDiskQueue.logBlockTail = NULL;
470 raidPtr->parityLogDiskQueue.reintBlockHead = NULL;
471 raidPtr->parityLogDiskQueue.reintBlockTail = NULL;
472 raidPtr->parityLogDiskQueue.freeDataList = NULL;
473 raidPtr->parityLogDiskQueue.freeCommonList = NULL;
474
475 rc = rf_ShutdownCreate(listp,
476 rf_ShutdownParityLoggingDiskQueue,
477 raidPtr);
478 if (rc) {
479 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
480 __LINE__, rc);
481 return (rc);
482 }
483 for (i = 0; i < rf_numParityRegions; i++) {
484 rc = rf_mutex_init(&raidPtr->regionInfo[i].mutex);
485 if (rc) {
486 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
487 __LINE__, rc);
488 for (j = 0; j < i; j++)
489 FreeRegionInfo(raidPtr, j);
490 RF_Free(raidPtr->regionInfo,
491 (rf_numParityRegions *
492 sizeof(RF_RegionInfo_t)));
493 return (ENOMEM);
494 }
495 rc = rf_mutex_init(&raidPtr->regionInfo[i].reintMutex);
496 if (rc) {
497 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
498 __LINE__, rc);
499 rf_mutex_destroy(&raidPtr->regionInfo[i].mutex);
500 for (j = 0; j < i; j++)
501 FreeRegionInfo(raidPtr, j);
502 RF_Free(raidPtr->regionInfo,
503 (rf_numParityRegions *
504 sizeof(RF_RegionInfo_t)));
505 return (ENOMEM);
506 }
507 raidPtr->regionInfo[i].reintInProgress = RF_FALSE;
508 raidPtr->regionInfo[i].regionStartAddr =
509 raidPtr->regionLogCapacity * i;
510 raidPtr->regionInfo[i].parityStartAddr =
511 raidPtr->regionParityRange * i;
512 if (i < rf_numParityRegions - 1) {
513 raidPtr->regionInfo[i].capacity =
514 raidPtr->regionLogCapacity;
515 raidPtr->regionInfo[i].numSectorsParity =
516 raidPtr->regionParityRange;
517 } else {
518 raidPtr->regionInfo[i].capacity =
519 lastRegionCapacity;
520 raidPtr->regionInfo[i].numSectorsParity =
521 raidPtr->sectorsPerDisk -
522 raidPtr->regionParityRange * i;
523 if (raidPtr->regionInfo[i].numSectorsParity >
524 maxRegionParityRange)
525 maxRegionParityRange =
526 raidPtr->regionInfo[i].numSectorsParity;
527 }
528 raidPtr->regionInfo[i].diskCount = 0;
529 RF_ASSERT(raidPtr->regionInfo[i].capacity +
530 raidPtr->regionInfo[i].regionStartAddr <=
531 totalLogCapacity);
532 RF_ASSERT(raidPtr->regionInfo[i].parityStartAddr +
533 raidPtr->regionInfo[i].numSectorsParity <=
534 raidPtr->sectorsPerDisk);
535 printf("Allocating %d bytes for region %d\n",
536 (int) (raidPtr->regionInfo[i].capacity *
537 sizeof(RF_DiskMap_t)), i);
538 RF_Malloc(raidPtr->regionInfo[i].diskMap,
539 (raidPtr->regionInfo[i].capacity *
540 sizeof(RF_DiskMap_t)),
541 (RF_DiskMap_t *));
542 if (raidPtr->regionInfo[i].diskMap == NULL) {
543 rf_mutex_destroy(&raidPtr->regionInfo[i].mutex);
544 rf_mutex_destroy(&raidPtr->regionInfo[i].reintMutex);
545 for (j = 0; j < i; j++)
546 FreeRegionInfo(raidPtr, j);
547 RF_Free(raidPtr->regionInfo,
548 (rf_numParityRegions *
549 sizeof(RF_RegionInfo_t)));
550 return (ENOMEM);
551 }
552 raidPtr->regionInfo[i].loggingEnabled = RF_FALSE;
553 raidPtr->regionInfo[i].coreLog = NULL;
554 }
555 rc = rf_ShutdownCreate(listp,
556 rf_ShutdownParityLoggingRegionInfo,
557 raidPtr);
558 if (rc) {
559 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
560 __LINE__, rc);
561 rf_ShutdownParityLoggingRegionInfo(raidPtr);
562 return (rc);
563 }
564 RF_ASSERT(raidPtr->parityLogDiskQueue.threadState == 0);
565 raidPtr->parityLogDiskQueue.threadState = RF_PLOG_CREATED;
566 rc = RF_CREATE_THREAD(raidPtr->pLogDiskThreadHandle,
567 rf_ParityLoggingDiskManager, raidPtr,"rf_log");
568 if (rc) {
569 raidPtr->parityLogDiskQueue.threadState = 0;
570 RF_ERRORMSG3("Unable to create parity logging disk thread file %s line %d rc=%d\n",
571 __FILE__, __LINE__, rc);
572 return (ENOMEM);
573 }
574 /* wait for thread to start */
575 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
576 while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_RUNNING)) {
577 RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond,
578 raidPtr->parityLogDiskQueue.mutex);
579 }
580 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
581
582 rc = rf_ShutdownCreate(listp, rf_ShutdownParityLogging, raidPtr);
583 if (rc) {
584 RF_ERRORMSG1("Got rc=%d adding parity logging shutdown event\n", rc);
585 rf_ShutdownParityLogging(raidPtr);
586 return (rc);
587 }
588 if (rf_parityLogDebug) {
589 printf(" size of disk log in sectors: %d\n",
590 (int) totalLogCapacity);
591 printf(" total number of parity regions is %d\n", (int) rf_numParityRegions);
592 printf(" nominal sectors of log per parity region is %d\n", (int) raidPtr->regionLogCapacity);
593 printf(" nominal region fragmentation is %d sectors\n", (int) fragmentation);
594 printf(" total number of parity logs is %d\n", raidPtr->numParityLogs);
595 printf(" parity log size is %d sectors\n", raidPtr->numSectorsPerLog);
596 printf(" total in-core log space is %d bytes\n", (int) rf_totalInCoreLogCapacity);
597 }
598 rf_EnableParityLogging(raidPtr);
599
600 return (0);
601 }
602
603 static void
604 FreeRegionInfo(
605 RF_Raid_t * raidPtr,
606 RF_RegionId_t regionID)
607 {
608 RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
609 RF_Free(raidPtr->regionInfo[regionID].diskMap,
610 (raidPtr->regionInfo[regionID].capacity *
611 sizeof(RF_DiskMap_t)));
612 if (!rf_forceParityLogReint && raidPtr->regionInfo[regionID].coreLog) {
613 rf_ReleaseParityLogs(raidPtr,
614 raidPtr->regionInfo[regionID].coreLog);
615 raidPtr->regionInfo[regionID].coreLog = NULL;
616 } else {
617 RF_ASSERT(raidPtr->regionInfo[regionID].coreLog == NULL);
618 RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == 0);
619 }
620 RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
621 rf_mutex_destroy(&raidPtr->regionInfo[regionID].mutex);
622 rf_mutex_destroy(&raidPtr->regionInfo[regionID].reintMutex);
623 }
624
625
626 static void
627 FreeParityLogQueue(
628 RF_Raid_t * raidPtr,
629 RF_ParityLogQueue_t * queue)
630 {
631 RF_ParityLog_t *l1, *l2;
632
633 RF_LOCK_MUTEX(queue->mutex);
634 l1 = queue->parityLogs;
635 while (l1) {
636 l2 = l1;
637 l1 = l2->next;
638 RF_Free(l2->records, (raidPtr->numSectorsPerLog *
639 sizeof(RF_ParityLogRecord_t)));
640 RF_Free(l2, sizeof(RF_ParityLog_t));
641 }
642 RF_UNLOCK_MUTEX(queue->mutex);
643 rf_mutex_destroy(&queue->mutex);
644 }
645
646
647 static void
648 FreeRegionBufferQueue(RF_RegionBufferQueue_t * queue)
649 {
650 int i;
651
652 RF_LOCK_MUTEX(queue->mutex);
653 if (queue->availableBuffers != queue->totalBuffers) {
654 printf("Attempt to free region queue which is still in use!\n");
655 RF_ASSERT(0);
656 }
657 for (i = 0; i < queue->totalBuffers; i++)
658 RF_Free(queue->buffers[i], queue->bufferSize);
659 RF_Free(queue->buffers, queue->totalBuffers * sizeof(caddr_t));
660 RF_UNLOCK_MUTEX(queue->mutex);
661 rf_mutex_destroy(&queue->mutex);
662 }
663
664 static void
665 rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg)
666 {
667 RF_Raid_t *raidPtr;
668 RF_RegionId_t i;
669
670 raidPtr = (RF_Raid_t *) arg;
671 if (rf_parityLogDebug) {
672 printf("raid%d: ShutdownParityLoggingRegionInfo\n",
673 raidPtr->raidid);
674 }
675 /* free region information structs */
676 for (i = 0; i < rf_numParityRegions; i++)
677 FreeRegionInfo(raidPtr, i);
678 RF_Free(raidPtr->regionInfo, (rf_numParityRegions *
679 sizeof(raidPtr->regionInfo)));
680 raidPtr->regionInfo = NULL;
681 }
682
683 static void
684 rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg)
685 {
686 RF_Raid_t *raidPtr;
687
688 raidPtr = (RF_Raid_t *) arg;
689 if (rf_parityLogDebug) {
690 printf("raid%d: ShutdownParityLoggingPool\n", raidPtr->raidid);
691 }
692 /* free contents of parityLogPool */
693 FreeParityLogQueue(raidPtr, &raidPtr->parityLogPool);
694 RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs *
695 raidPtr->numSectorsPerLog * raidPtr->bytesPerSector);
696 }
697
698 static void
699 rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg)
700 {
701 RF_Raid_t *raidPtr;
702
703 raidPtr = (RF_Raid_t *) arg;
704 if (rf_parityLogDebug) {
705 printf("raid%d: ShutdownParityLoggingRegionBufferPool\n",
706 raidPtr->raidid);
707 }
708 FreeRegionBufferQueue(&raidPtr->regionBufferPool);
709 }
710
711 static void
712 rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg)
713 {
714 RF_Raid_t *raidPtr;
715
716 raidPtr = (RF_Raid_t *) arg;
717 if (rf_parityLogDebug) {
718 printf("raid%d: ShutdownParityLoggingParityBufferPool\n",
719 raidPtr->raidid);
720 }
721 FreeRegionBufferQueue(&raidPtr->parityBufferPool);
722 }
723
724 static void
725 rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg)
726 {
727 RF_ParityLogData_t *d;
728 RF_CommonLogData_t *c;
729 RF_Raid_t *raidPtr;
730
731 raidPtr = (RF_Raid_t *) arg;
732 if (rf_parityLogDebug) {
733 printf("raid%d: ShutdownParityLoggingDiskQueue\n",
734 raidPtr->raidid);
735 }
736 /* free disk manager stuff */
737 RF_ASSERT(raidPtr->parityLogDiskQueue.bufHead == NULL);
738 RF_ASSERT(raidPtr->parityLogDiskQueue.bufTail == NULL);
739 RF_ASSERT(raidPtr->parityLogDiskQueue.reintHead == NULL);
740 RF_ASSERT(raidPtr->parityLogDiskQueue.reintTail == NULL);
741 while (raidPtr->parityLogDiskQueue.freeDataList) {
742 d = raidPtr->parityLogDiskQueue.freeDataList;
743 raidPtr->parityLogDiskQueue.freeDataList =
744 raidPtr->parityLogDiskQueue.freeDataList->next;
745 RF_Free(d, sizeof(RF_ParityLogData_t));
746 }
747 while (raidPtr->parityLogDiskQueue.freeCommonList) {
748 c = raidPtr->parityLogDiskQueue.freeCommonList;
749 rf_mutex_destroy(&c->mutex);
750 raidPtr->parityLogDiskQueue.freeCommonList =
751 raidPtr->parityLogDiskQueue.freeCommonList->next;
752 RF_Free(c, sizeof(RF_CommonLogData_t));
753 }
754 }
755
756 static void
757 rf_ShutdownParityLogging(RF_ThreadArg_t arg)
758 {
759 RF_Raid_t *raidPtr;
760
761 raidPtr = (RF_Raid_t *) arg;
762 if (rf_parityLogDebug) {
763 printf("raid%d: ShutdownParityLogging\n", raidPtr->raidid);
764 }
765 /* shutdown disk thread */
766 /* This has the desirable side-effect of forcing all regions to be
767 * reintegrated. This is necessary since all parity log maps are
768 * currently held in volatile memory. */
769
770 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
771 raidPtr->parityLogDiskQueue.threadState |= RF_PLOG_TERMINATE;
772 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
773 RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond);
774 /*
775 * pLogDiskThread will now terminate when queues are cleared
776 * now wait for it to be done
777 */
778 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
779 while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_SHUTDOWN)) {
780 RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond,
781 raidPtr->parityLogDiskQueue.mutex);
782 }
783 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
784 if (rf_parityLogDebug) {
785 printf("raid%d: ShutdownParityLogging done (thread completed)\n", raidPtr->raidid);
786 }
787 }
788
789 int
790 rf_GetDefaultNumFloatingReconBuffersParityLogging(RF_Raid_t * raidPtr)
791 {
792 return (20);
793 }
794
795 RF_HeadSepLimit_t
796 rf_GetDefaultHeadSepLimitParityLogging(RF_Raid_t * raidPtr)
797 {
798 return (10);
799 }
800 /* return the region ID for a given RAID address */
801 RF_RegionId_t
802 rf_MapRegionIDParityLogging(
803 RF_Raid_t * raidPtr,
804 RF_SectorNum_t address)
805 {
806 RF_RegionId_t regionID;
807
808 /* regionID = address / (raidPtr->regionParityRange * raidPtr->Layout.numDataCol); */
809 regionID = address / raidPtr->regionParityRange;
810 if (regionID == rf_numParityRegions) {
811 /* last region may be larger than other regions */
812 regionID--;
813 }
814 RF_ASSERT(address >= raidPtr->regionInfo[regionID].parityStartAddr);
815 RF_ASSERT(address < raidPtr->regionInfo[regionID].parityStartAddr +
816 raidPtr->regionInfo[regionID].numSectorsParity);
817 RF_ASSERT(regionID < rf_numParityRegions);
818 return (regionID);
819 }
820
821
822 /* given a logical RAID sector, determine physical disk address of data */
823 void
824 rf_MapSectorParityLogging(
825 RF_Raid_t * raidPtr,
826 RF_RaidAddr_t raidSector,
827 RF_RowCol_t * row,
828 RF_RowCol_t * col,
829 RF_SectorNum_t * diskSector,
830 int remap)
831 {
832 RF_StripeNum_t SUID = raidSector /
833 raidPtr->Layout.sectorsPerStripeUnit;
834 *row = 0;
835 /* *col = (SUID % (raidPtr->numCol -
836 * raidPtr->Layout.numParityLogCol)); */
837 *col = SUID % raidPtr->Layout.numDataCol;
838 *diskSector = (SUID / (raidPtr->Layout.numDataCol)) *
839 raidPtr->Layout.sectorsPerStripeUnit +
840 (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
841 }
842
843
844 /* given a logical RAID sector, determine physical disk address of parity */
845 void
846 rf_MapParityParityLogging(
847 RF_Raid_t * raidPtr,
848 RF_RaidAddr_t raidSector,
849 RF_RowCol_t * row,
850 RF_RowCol_t * col,
851 RF_SectorNum_t * diskSector,
852 int remap)
853 {
854 RF_StripeNum_t SUID = raidSector /
855 raidPtr->Layout.sectorsPerStripeUnit;
856
857 *row = 0;
858 /* *col =
859 * raidPtr->Layout.numDataCol-(SUID/raidPtr->Layout.numDataCol)%(raidPt
860 * r->numCol - raidPtr->Layout.numParityLogCol); */
861 *col = raidPtr->Layout.numDataCol;
862 *diskSector = (SUID / (raidPtr->Layout.numDataCol)) *
863 raidPtr->Layout.sectorsPerStripeUnit +
864 (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
865 }
866
867
868 /* given a regionID and sector offset, determine the physical disk address of the parity log */
869 void
870 rf_MapLogParityLogging(
871 RF_Raid_t * raidPtr,
872 RF_RegionId_t regionID,
873 RF_SectorNum_t regionOffset,
874 RF_RowCol_t * row,
875 RF_RowCol_t * col,
876 RF_SectorNum_t * startSector)
877 {
878 *row = 0;
879 *col = raidPtr->numCol - 1;
880 *startSector = raidPtr->regionInfo[regionID].regionStartAddr + regionOffset;
881 }
882
883
884 /* given a regionID, determine the physical disk address of the logged
885 parity for that region */
886 void
887 rf_MapRegionParity(
888 RF_Raid_t * raidPtr,
889 RF_RegionId_t regionID,
890 RF_RowCol_t * row,
891 RF_RowCol_t * col,
892 RF_SectorNum_t * startSector,
893 RF_SectorCount_t * numSector)
894 {
895 *row = 0;
896 *col = raidPtr->numCol - 2;
897 *startSector = raidPtr->regionInfo[regionID].parityStartAddr;
898 *numSector = raidPtr->regionInfo[regionID].numSectorsParity;
899 }
900
901
902 /* given a logical RAID address, determine the participating disks in
903 the stripe */
904 void
905 rf_IdentifyStripeParityLogging(
906 RF_Raid_t * raidPtr,
907 RF_RaidAddr_t addr,
908 RF_RowCol_t ** diskids,
909 RF_RowCol_t * outRow)
910 {
911 RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout,
912 addr);
913 RF_ParityLoggingConfigInfo_t *info = (RF_ParityLoggingConfigInfo_t *)
914 raidPtr->Layout.layoutSpecificInfo;
915 *outRow = 0;
916 *diskids = info->stripeIdentifier[stripeID % raidPtr->numCol];
917 }
918
919
920 void
921 rf_MapSIDToPSIDParityLogging(
922 RF_RaidLayout_t * layoutPtr,
923 RF_StripeNum_t stripeID,
924 RF_StripeNum_t * psID,
925 RF_ReconUnitNum_t * which_ru)
926 {
927 *which_ru = 0;
928 *psID = stripeID;
929 }
930
931
932 /* select an algorithm for performing an access. Returns two pointers,
933 * one to a function that will return information about the DAG, and
934 * another to a function that will create the dag.
935 */
936 void
937 rf_ParityLoggingDagSelect(
938 RF_Raid_t * raidPtr,
939 RF_IoType_t type,
940 RF_AccessStripeMap_t * asmp,
941 RF_VoidFuncPtr * createFunc)
942 {
943 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
944 RF_PhysDiskAddr_t *failedPDA = NULL;
945 RF_RowCol_t frow, fcol;
946 RF_RowStatus_t rstat;
947 int prior_recon;
948
949 RF_ASSERT(RF_IO_IS_R_OR_W(type));
950
951 if (asmp->numDataFailed + asmp->numParityFailed > 1) {
952 RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n");
953 /* *infoFunc = */ *createFunc = NULL;
954 return;
955 } else
956 if (asmp->numDataFailed + asmp->numParityFailed == 1) {
957
958 /* if under recon & already reconstructed, redirect
959 * the access to the spare drive and eliminate the
960 * failure indication */
961 failedPDA = asmp->failedPDAs[0];
962 frow = failedPDA->row;
963 fcol = failedPDA->col;
964 rstat = raidPtr->status[failedPDA->row];
965 prior_recon = (rstat == rf_rs_reconfigured) || (
966 (rstat == rf_rs_reconstructing) ?
967 rf_CheckRUReconstructed(raidPtr->reconControl[frow]->reconMap, failedPDA->startSector) : 0
968 );
969 if (prior_recon) {
970 RF_RowCol_t or = failedPDA->row, oc = failedPDA->col;
971 RF_SectorNum_t oo = failedPDA->startSector;
972 if (layoutPtr->map->flags &
973 RF_DISTRIBUTE_SPARE) {
974 /* redirect to dist spare space */
975
976 if (failedPDA == asmp->parityInfo) {
977
978 /* parity has failed */
979 (layoutPtr->map->MapParity) (raidPtr, failedPDA->raidAddress, &failedPDA->row,
980 &failedPDA->col, &failedPDA->startSector, RF_REMAP);
981
982 if (asmp->parityInfo->next) { /* redir 2nd component,
983 * if any */
984 RF_PhysDiskAddr_t *p = asmp->parityInfo->next;
985 RF_SectorNum_t SUoffs = p->startSector % layoutPtr->sectorsPerStripeUnit;
986 p->row = failedPDA->row;
987 p->col = failedPDA->col;
988 p->startSector = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->startSector) +
989 SUoffs; /* cheating:
990 * startSector is not
991 * really a RAID address */
992 }
993 } else
994 if (asmp->parityInfo->next && failedPDA == asmp->parityInfo->next) {
995 RF_ASSERT(0); /* should not ever
996 * happen */
997 } else {
998
999 /* data has failed */
1000 (layoutPtr->map->MapSector) (raidPtr, failedPDA->raidAddress, &failedPDA->row,
1001 &failedPDA->col, &failedPDA->startSector, RF_REMAP);
1002
1003 }
1004
1005 } else {
1006 /* redirect to dedicated spare space */
1007
1008 failedPDA->row = raidPtr->Disks[frow][fcol].spareRow;
1009 failedPDA->col = raidPtr->Disks[frow][fcol].spareCol;
1010
1011 /* the parity may have two distinct
1012 * components, both of which may need
1013 * to be redirected */
1014 if (asmp->parityInfo->next) {
1015 if (failedPDA == asmp->parityInfo) {
1016 failedPDA->next->row = failedPDA->row;
1017 failedPDA->next->col = failedPDA->col;
1018 } else
1019 if (failedPDA == asmp->parityInfo->next) { /* paranoid: should never occur */
1020 asmp->parityInfo->row = failedPDA->row;
1021 asmp->parityInfo->col = failedPDA->col;
1022 }
1023 }
1024 }
1025
1026 RF_ASSERT(failedPDA->col != -1);
1027
1028 if (rf_dagDebug || rf_mapDebug) {
1029 printf("raid%d: Redirected type '%c' r %d c %d o %ld -> r %d c %d o %ld\n",
1030 raidPtr->raidid, type, or, oc, (long) oo, failedPDA->row, failedPDA->col, (long) failedPDA->startSector);
1031 }
1032 asmp->numDataFailed = asmp->numParityFailed = 0;
1033 }
1034 }
1035 if (type == RF_IO_TYPE_READ) {
1036
1037 if (asmp->numDataFailed == 0)
1038 *createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;
1039 else
1040 *createFunc = (RF_VoidFuncPtr) rf_CreateRaidFiveDegradedReadDAG;
1041
1042 } else {
1043
1044
1045 /* if mirroring, always use large writes. If the access
1046 * requires two distinct parity updates, always do a small
1047 * write. If the stripe contains a failure but the access
1048 * does not, do a small write. The first conditional
1049 * (numStripeUnitsAccessed <= numDataCol/2) uses a
1050 * less-than-or-equal rather than just a less-than because
1051 * when G is 3 or 4, numDataCol/2 is 1, and I want
1052 * single-stripe-unit updates to use just one disk. */
1053 if ((asmp->numDataFailed + asmp->numParityFailed) == 0) {
1054 if (((asmp->numStripeUnitsAccessed <=
1055 (layoutPtr->numDataCol / 2)) &&
1056 (layoutPtr->numDataCol != 1)) ||
1057 (asmp->parityInfo->next != NULL) ||
1058 rf_CheckStripeForFailures(raidPtr, asmp)) {
1059 *createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingSmallWriteDAG;
1060 } else
1061 *createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingLargeWriteDAG;
1062 } else
1063 if (asmp->numParityFailed == 1)
1064 *createFunc = (RF_VoidFuncPtr) rf_CreateNonRedundantWriteDAG;
1065 else
1066 if (asmp->numStripeUnitsAccessed != 1 && failedPDA->numSector != layoutPtr->sectorsPerStripeUnit)
1067 *createFunc = NULL;
1068 else
1069 *createFunc = (RF_VoidFuncPtr) rf_CreateDegradedWriteDAG;
1070 }
1071 }
1072 #endif /* RF_INCLUDE_PARITYLOGGING > 0 */
1073