/*	$NetBSD: rf_paritylogging.c,v 1.8 2000/01/09 04:35:13 oster Exp $	*/
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: William V. Courtright II
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29
/*
 * Parity logging configuration, DAG selection, and mapping are
 * implemented here.
 */
33
34 #include "rf_archs.h"
35
36 #if RF_INCLUDE_PARITYLOGGING > 0
37
38 #include "rf_types.h"
39 #include "rf_raid.h"
40 #include "rf_dag.h"
41 #include "rf_dagutils.h"
42 #include "rf_dagfuncs.h"
43 #include "rf_dagffrd.h"
44 #include "rf_dagffwr.h"
45 #include "rf_dagdegrd.h"
46 #include "rf_dagdegwr.h"
47 #include "rf_paritylog.h"
48 #include "rf_paritylogDiskMgr.h"
49 #include "rf_paritylogging.h"
50 #include "rf_parityloggingdags.h"
51 #include "rf_general.h"
52 #include "rf_map.h"
53 #include "rf_utils.h"
54 #include "rf_shutdown.h"
55
56 typedef struct RF_ParityLoggingConfigInfo_s {
57 RF_RowCol_t **stripeIdentifier; /* filled in at config time & used by
58 * IdentifyStripe */
59 } RF_ParityLoggingConfigInfo_t;
60
61 static void FreeRegionInfo(RF_Raid_t * raidPtr, RF_RegionId_t regionID);
62 static void rf_ShutdownParityLogging(RF_ThreadArg_t arg);
63 static void rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg);
64 static void rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg);
65 static void rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg);
66 static void rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg);
67 static void rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg);
68
69 int
70 rf_ConfigureParityLogging(
71 RF_ShutdownList_t ** listp,
72 RF_Raid_t * raidPtr,
73 RF_Config_t * cfgPtr)
74 {
75 int i, j, startdisk, rc;
76 RF_SectorCount_t totalLogCapacity, fragmentation, lastRegionCapacity;
77 RF_SectorCount_t parityBufferCapacity, maxRegionParityRange;
78 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
79 RF_ParityLoggingConfigInfo_t *info;
80 RF_ParityLog_t *l = NULL, *next;
81 caddr_t lHeapPtr;
82
83 if (rf_numParityRegions <= 0)
84 return(EINVAL);
85
86 /*
87 * We create multiple entries on the shutdown list here, since
88 * this configuration routine is fairly complicated in and of
89 * itself, and this makes backing out of a failed configuration
90 * much simpler.
91 */
92
93 raidPtr->numSectorsPerLog = RF_DEFAULT_NUM_SECTORS_PER_LOG;
94
95 /* create a parity logging configuration structure */
96 RF_MallocAndAdd(info, sizeof(RF_ParityLoggingConfigInfo_t),
97 (RF_ParityLoggingConfigInfo_t *),
98 raidPtr->cleanupList);
99 if (info == NULL)
100 return (ENOMEM);
101 layoutPtr->layoutSpecificInfo = (void *) info;
102
103 RF_ASSERT(raidPtr->numRow == 1);
104
105 /* the stripe identifier must identify the disks in each stripe, IN
106 * THE ORDER THAT THEY APPEAR IN THE STRIPE. */
107 info->stripeIdentifier = rf_make_2d_array((raidPtr->numCol),
108 (raidPtr->numCol),
109 raidPtr->cleanupList);
110 if (info->stripeIdentifier == NULL)
111 return (ENOMEM);
112
113 startdisk = 0;
114 for (i = 0; i < (raidPtr->numCol); i++) {
115 for (j = 0; j < (raidPtr->numCol); j++) {
116 info->stripeIdentifier[i][j] = (startdisk + j) %
117 (raidPtr->numCol - 1);
118 }
119 if ((--startdisk) < 0)
120 startdisk = raidPtr->numCol - 1 - 1;
121 }
122
123 /* fill in the remaining layout parameters */
124 layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk;
125 layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit <<
126 raidPtr->logBytesPerSector;
127 layoutPtr->numParityCol = 1;
128 layoutPtr->numParityLogCol = 1;
129 layoutPtr->numDataCol = raidPtr->numCol - layoutPtr->numParityCol -
130 layoutPtr->numParityLogCol;
131 layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol *
132 layoutPtr->sectorsPerStripeUnit;
133 layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk;
134 raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk *
135 layoutPtr->sectorsPerStripeUnit;
136
137 raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk *
138 layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
139
140 /* configure parity log parameters
141 *
142 * parameter comment/constraints
143 * -------------------------------------------
144 * numParityRegions* all regions (except possibly last)
145 * of equal size
146 * totalInCoreLogCapacity* amount of memory in bytes available
147 * for in-core logs (default 1 MB)
148 * numSectorsPerLog# capacity of an in-core log in sectors
149 * (1 * disk track)
150 * numParityLogs total number of in-core logs,
151 * should be at least numParityRegions
152 * regionLogCapacity size of a region log (except possibly
153 * last one) in sectors
154 * totalLogCapacity total amount of log space in sectors
155 *
156 * where '*' denotes a user settable parameter.
157 * Note that logs are fixed to be the size of a disk track,
158 * value #defined in rf_paritylog.h
159 *
160 */
161
162 totalLogCapacity = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit * layoutPtr->numParityLogCol;
163 raidPtr->regionLogCapacity = totalLogCapacity / rf_numParityRegions;
164 if (rf_parityLogDebug)
165 printf("bytes per sector %d\n", raidPtr->bytesPerSector);
166
167 /* reduce fragmentation within a disk region by adjusting the number
168 * of regions in an attempt to allow an integral number of logs to fit
169 * into a disk region */
170 fragmentation = raidPtr->regionLogCapacity % raidPtr->numSectorsPerLog;
171 if (fragmentation > 0)
172 for (i = 1; i < (raidPtr->numSectorsPerLog / 2); i++) {
173 if (((totalLogCapacity / (rf_numParityRegions + i)) %
174 raidPtr->numSectorsPerLog) < fragmentation) {
175 rf_numParityRegions++;
176 raidPtr->regionLogCapacity = totalLogCapacity /
177 rf_numParityRegions;
178 fragmentation = raidPtr->regionLogCapacity %
179 raidPtr->numSectorsPerLog;
180 }
181 if (((totalLogCapacity / (rf_numParityRegions - i)) %
182 raidPtr->numSectorsPerLog) < fragmentation) {
183 rf_numParityRegions--;
184 raidPtr->regionLogCapacity = totalLogCapacity /
185 rf_numParityRegions;
186 fragmentation = raidPtr->regionLogCapacity %
187 raidPtr->numSectorsPerLog;
188 }
189 }
190 /* ensure integral number of regions per log */
191 raidPtr->regionLogCapacity = (raidPtr->regionLogCapacity /
192 raidPtr->numSectorsPerLog) *
193 raidPtr->numSectorsPerLog;
194
195 raidPtr->numParityLogs = rf_totalInCoreLogCapacity /
196 (raidPtr->bytesPerSector * raidPtr->numSectorsPerLog);
197 /* to avoid deadlock, must ensure that enough logs exist for each
198 * region to have one simultaneously */
199 if (raidPtr->numParityLogs < rf_numParityRegions)
200 raidPtr->numParityLogs = rf_numParityRegions;
201
202 /* create region information structs */
203 RF_Malloc(raidPtr->regionInfo,
204 (rf_numParityRegions * sizeof(RF_RegionInfo_t)),
205 (RF_RegionInfo_t *));
206 if (raidPtr->regionInfo == NULL)
207 return (ENOMEM);
208
209 /* last region may not be full capacity */
210 lastRegionCapacity = raidPtr->regionLogCapacity;
211 while ((rf_numParityRegions - 1) * raidPtr->regionLogCapacity +
212 lastRegionCapacity > totalLogCapacity)
213 lastRegionCapacity = lastRegionCapacity -
214 raidPtr->numSectorsPerLog;
215
216 raidPtr->regionParityRange = raidPtr->sectorsPerDisk /
217 rf_numParityRegions;
218 maxRegionParityRange = raidPtr->regionParityRange;
219
220 /* i can't remember why this line is in the code -wvcii 6/30/95 */
221 /* if (raidPtr->sectorsPerDisk % rf_numParityRegions > 0)
222 regionParityRange++; */
223
224 /* build pool of unused parity logs */
225 RF_Malloc(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs *
226 raidPtr->numSectorsPerLog * raidPtr->bytesPerSector,
227 (caddr_t));
228 if (raidPtr->parityLogBufferHeap == NULL)
229 return (ENOMEM);
230 lHeapPtr = raidPtr->parityLogBufferHeap;
231 rc = rf_mutex_init(&raidPtr->parityLogPool.mutex);
232 if (rc) {
233 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n",
234 __FILE__, __LINE__, rc);
235 RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs *
236 raidPtr->numSectorsPerLog * raidPtr->bytesPerSector);
237 return (ENOMEM);
238 }
239 for (i = 0; i < raidPtr->numParityLogs; i++) {
240 if (i == 0) {
241 RF_Calloc(raidPtr->parityLogPool.parityLogs, 1,
242 sizeof(RF_ParityLog_t), (RF_ParityLog_t *));
243 if (raidPtr->parityLogPool.parityLogs == NULL) {
244 RF_Free(raidPtr->parityLogBufferHeap,
245 raidPtr->numParityLogs *
246 raidPtr->numSectorsPerLog *
247 raidPtr->bytesPerSector);
248 return (ENOMEM);
249 }
250 l = raidPtr->parityLogPool.parityLogs;
251 } else {
252 RF_Calloc(l->next, 1, sizeof(RF_ParityLog_t),
253 (RF_ParityLog_t *));
254 if (l->next == NULL) {
255 RF_Free(raidPtr->parityLogBufferHeap,
256 raidPtr->numParityLogs *
257 raidPtr->numSectorsPerLog *
258 raidPtr->bytesPerSector);
259 for (l = raidPtr->parityLogPool.parityLogs;
260 l;
261 l = next) {
262 next = l->next;
263 if (l->records)
264 RF_Free(l->records, (raidPtr->numSectorsPerLog * sizeof(RF_ParityLogRecord_t)));
265 RF_Free(l, sizeof(RF_ParityLog_t));
266 }
267 return (ENOMEM);
268 }
269 l = l->next;
270 }
271 l->bufPtr = lHeapPtr;
272 lHeapPtr += raidPtr->numSectorsPerLog *
273 raidPtr->bytesPerSector;
274 RF_Malloc(l->records, (raidPtr->numSectorsPerLog *
275 sizeof(RF_ParityLogRecord_t)),
276 (RF_ParityLogRecord_t *));
277 if (l->records == NULL) {
278 RF_Free(raidPtr->parityLogBufferHeap,
279 raidPtr->numParityLogs *
280 raidPtr->numSectorsPerLog *
281 raidPtr->bytesPerSector);
282 for (l = raidPtr->parityLogPool.parityLogs;
283 l;
284 l = next) {
285 next = l->next;
286 if (l->records)
287 RF_Free(l->records,
288 (raidPtr->numSectorsPerLog *
289 sizeof(RF_ParityLogRecord_t)));
290 RF_Free(l, sizeof(RF_ParityLog_t));
291 }
292 return (ENOMEM);
293 }
294 }
295 rc = rf_ShutdownCreate(listp, rf_ShutdownParityLoggingPool, raidPtr);
296 if (rc) {
297 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
298 __LINE__, rc);
299 rf_ShutdownParityLoggingPool(raidPtr);
300 return (rc);
301 }
302 /* build pool of region buffers */
303 rc = rf_mutex_init(&raidPtr->regionBufferPool.mutex);
304 if (rc) {
305 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n",
306 __FILE__, __LINE__, rc);
307 return (ENOMEM);
308 }
309 rc = rf_cond_init(&raidPtr->regionBufferPool.cond);
310 if (rc) {
311 RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n",
312 __FILE__, __LINE__, rc);
313 rf_mutex_destroy(&raidPtr->regionBufferPool.mutex);
314 return (ENOMEM);
315 }
316 raidPtr->regionBufferPool.bufferSize = raidPtr->regionLogCapacity *
317 raidPtr->bytesPerSector;
318 printf("regionBufferPool.bufferSize %d\n",
319 raidPtr->regionBufferPool.bufferSize);
320
321 /* for now, only one region at a time may be reintegrated */
322 raidPtr->regionBufferPool.totalBuffers = 1;
323
324 raidPtr->regionBufferPool.availableBuffers =
325 raidPtr->regionBufferPool.totalBuffers;
326 raidPtr->regionBufferPool.availBuffersIndex = 0;
327 raidPtr->regionBufferPool.emptyBuffersIndex = 0;
328 RF_Malloc(raidPtr->regionBufferPool.buffers,
329 raidPtr->regionBufferPool.totalBuffers * sizeof(caddr_t),
330 (caddr_t *));
331 if (raidPtr->regionBufferPool.buffers == NULL) {
332 rf_mutex_destroy(&raidPtr->regionBufferPool.mutex);
333 rf_cond_destroy(&raidPtr->regionBufferPool.cond);
334 return (ENOMEM);
335 }
336 for (i = 0; i < raidPtr->regionBufferPool.totalBuffers; i++) {
337 RF_Malloc(raidPtr->regionBufferPool.buffers[i],
338 raidPtr->regionBufferPool.bufferSize * sizeof(char),
339 (caddr_t));
340 if (raidPtr->regionBufferPool.buffers[i] == NULL) {
341 rf_mutex_destroy(&raidPtr->regionBufferPool.mutex);
342 rf_cond_destroy(&raidPtr->regionBufferPool.cond);
343 for (j = 0; j < i; j++) {
344 RF_Free(raidPtr->regionBufferPool.buffers[i],
345 raidPtr->regionBufferPool.bufferSize *
346 sizeof(char));
347 }
348 RF_Free(raidPtr->regionBufferPool.buffers,
349 raidPtr->regionBufferPool.totalBuffers *
350 sizeof(caddr_t));
351 return (ENOMEM);
352 }
353 printf("raidPtr->regionBufferPool.buffers[%d] = %lx\n", i,
354 (long) raidPtr->regionBufferPool.buffers[i]);
355 }
356 rc = rf_ShutdownCreate(listp,
357 rf_ShutdownParityLoggingRegionBufferPool,
358 raidPtr);
359 if (rc) {
360 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
361 __LINE__, rc);
362 rf_ShutdownParityLoggingRegionBufferPool(raidPtr);
363 return (rc);
364 }
365 /* build pool of parity buffers */
366 parityBufferCapacity = maxRegionParityRange;
367 rc = rf_mutex_init(&raidPtr->parityBufferPool.mutex);
368 if (rc) {
369 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n",
370 __FILE__, __LINE__, rc);
371 return (rc);
372 }
373 rc = rf_cond_init(&raidPtr->parityBufferPool.cond);
374 if (rc) {
375 RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n",
376 __FILE__, __LINE__, rc);
377 rf_mutex_destroy(&raidPtr->parityBufferPool.mutex);
378 return (ENOMEM);
379 }
380 raidPtr->parityBufferPool.bufferSize = parityBufferCapacity *
381 raidPtr->bytesPerSector;
382 printf("parityBufferPool.bufferSize %d\n",
383 raidPtr->parityBufferPool.bufferSize);
384
385 /* for now, only one region at a time may be reintegrated */
386 raidPtr->parityBufferPool.totalBuffers = 1;
387
388 raidPtr->parityBufferPool.availableBuffers =
389 raidPtr->parityBufferPool.totalBuffers;
390 raidPtr->parityBufferPool.availBuffersIndex = 0;
391 raidPtr->parityBufferPool.emptyBuffersIndex = 0;
392 RF_Malloc(raidPtr->parityBufferPool.buffers,
393 raidPtr->parityBufferPool.totalBuffers * sizeof(caddr_t),
394 (caddr_t *));
395 if (raidPtr->parityBufferPool.buffers == NULL) {
396 rf_mutex_destroy(&raidPtr->parityBufferPool.mutex);
397 rf_cond_destroy(&raidPtr->parityBufferPool.cond);
398 return (ENOMEM);
399 }
400 for (i = 0; i < raidPtr->parityBufferPool.totalBuffers; i++) {
401 RF_Malloc(raidPtr->parityBufferPool.buffers[i],
402 raidPtr->parityBufferPool.bufferSize * sizeof(char),
403 (caddr_t));
404 if (raidPtr->parityBufferPool.buffers == NULL) {
405 rf_mutex_destroy(&raidPtr->parityBufferPool.mutex);
406 rf_cond_destroy(&raidPtr->parityBufferPool.cond);
407 for (j = 0; j < i; j++) {
408 RF_Free(raidPtr->parityBufferPool.buffers[i],
409 raidPtr->regionBufferPool.bufferSize *
410 sizeof(char));
411 }
412 RF_Free(raidPtr->parityBufferPool.buffers,
413 raidPtr->regionBufferPool.totalBuffers *
414 sizeof(caddr_t));
415 return (ENOMEM);
416 }
417 printf("parityBufferPool.buffers[%d] = %lx\n", i,
418 (long) raidPtr->parityBufferPool.buffers[i]);
419 }
420 rc = rf_ShutdownCreate(listp,
421 rf_ShutdownParityLoggingParityBufferPool,
422 raidPtr);
423 if (rc) {
424 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
425 __LINE__, rc);
426 rf_ShutdownParityLoggingParityBufferPool(raidPtr);
427 return (rc);
428 }
429 /* initialize parityLogDiskQueue */
430 rc = rf_create_managed_mutex(listp,
431 &raidPtr->parityLogDiskQueue.mutex);
432 if (rc) {
433 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n",
434 __FILE__, __LINE__, rc);
435 return (rc);
436 }
437 rc = rf_create_managed_cond(listp, &raidPtr->parityLogDiskQueue.cond);
438 if (rc) {
439 RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n",
440 __FILE__, __LINE__, rc);
441 return (rc);
442 }
443 raidPtr->parityLogDiskQueue.flushQueue = NULL;
444 raidPtr->parityLogDiskQueue.reintQueue = NULL;
445 raidPtr->parityLogDiskQueue.bufHead = NULL;
446 raidPtr->parityLogDiskQueue.bufTail = NULL;
447 raidPtr->parityLogDiskQueue.reintHead = NULL;
448 raidPtr->parityLogDiskQueue.reintTail = NULL;
449 raidPtr->parityLogDiskQueue.logBlockHead = NULL;
450 raidPtr->parityLogDiskQueue.logBlockTail = NULL;
451 raidPtr->parityLogDiskQueue.reintBlockHead = NULL;
452 raidPtr->parityLogDiskQueue.reintBlockTail = NULL;
453 raidPtr->parityLogDiskQueue.freeDataList = NULL;
454 raidPtr->parityLogDiskQueue.freeCommonList = NULL;
455
456 rc = rf_ShutdownCreate(listp,
457 rf_ShutdownParityLoggingDiskQueue,
458 raidPtr);
459 if (rc) {
460 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
461 __LINE__, rc);
462 return (rc);
463 }
464 for (i = 0; i < rf_numParityRegions; i++) {
465 rc = rf_mutex_init(&raidPtr->regionInfo[i].mutex);
466 if (rc) {
467 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
468 __LINE__, rc);
469 for (j = 0; j < i; j++)
470 FreeRegionInfo(raidPtr, j);
471 RF_Free(raidPtr->regionInfo,
472 (rf_numParityRegions *
473 sizeof(RF_RegionInfo_t)));
474 return (ENOMEM);
475 }
476 rc = rf_mutex_init(&raidPtr->regionInfo[i].reintMutex);
477 if (rc) {
478 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
479 __LINE__, rc);
480 rf_mutex_destroy(&raidPtr->regionInfo[i].mutex);
481 for (j = 0; j < i; j++)
482 FreeRegionInfo(raidPtr, j);
483 RF_Free(raidPtr->regionInfo,
484 (rf_numParityRegions *
485 sizeof(RF_RegionInfo_t)));
486 return (ENOMEM);
487 }
488 raidPtr->regionInfo[i].reintInProgress = RF_FALSE;
489 raidPtr->regionInfo[i].regionStartAddr =
490 raidPtr->regionLogCapacity * i;
491 raidPtr->regionInfo[i].parityStartAddr =
492 raidPtr->regionParityRange * i;
493 if (i < rf_numParityRegions - 1) {
494 raidPtr->regionInfo[i].capacity =
495 raidPtr->regionLogCapacity;
496 raidPtr->regionInfo[i].numSectorsParity =
497 raidPtr->regionParityRange;
498 } else {
499 raidPtr->regionInfo[i].capacity =
500 lastRegionCapacity;
501 raidPtr->regionInfo[i].numSectorsParity =
502 raidPtr->sectorsPerDisk -
503 raidPtr->regionParityRange * i;
504 if (raidPtr->regionInfo[i].numSectorsParity >
505 maxRegionParityRange)
506 maxRegionParityRange =
507 raidPtr->regionInfo[i].numSectorsParity;
508 }
509 raidPtr->regionInfo[i].diskCount = 0;
510 RF_ASSERT(raidPtr->regionInfo[i].capacity +
511 raidPtr->regionInfo[i].regionStartAddr <=
512 totalLogCapacity);
513 RF_ASSERT(raidPtr->regionInfo[i].parityStartAddr +
514 raidPtr->regionInfo[i].numSectorsParity <=
515 raidPtr->sectorsPerDisk);
516 RF_Malloc(raidPtr->regionInfo[i].diskMap,
517 (raidPtr->regionInfo[i].capacity *
518 sizeof(RF_DiskMap_t)),
519 (RF_DiskMap_t *));
520 if (raidPtr->regionInfo[i].diskMap == NULL) {
521 rf_mutex_destroy(&raidPtr->regionInfo[i].mutex);
522 rf_mutex_destroy(&raidPtr->regionInfo[i].reintMutex);
523 for (j = 0; j < i; j++)
524 FreeRegionInfo(raidPtr, j);
525 RF_Free(raidPtr->regionInfo,
526 (rf_numParityRegions *
527 sizeof(RF_RegionInfo_t)));
528 return (ENOMEM);
529 }
530 raidPtr->regionInfo[i].loggingEnabled = RF_FALSE;
531 raidPtr->regionInfo[i].coreLog = NULL;
532 }
533 rc = rf_ShutdownCreate(listp,
534 rf_ShutdownParityLoggingRegionInfo,
535 raidPtr);
536 if (rc) {
537 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
538 __LINE__, rc);
539 rf_ShutdownParityLoggingRegionInfo(raidPtr);
540 return (rc);
541 }
542 RF_ASSERT(raidPtr->parityLogDiskQueue.threadState == 0);
543 raidPtr->parityLogDiskQueue.threadState = RF_PLOG_CREATED;
544 rc = RF_CREATE_THREAD(raidPtr->pLogDiskThreadHandle,
545 rf_ParityLoggingDiskManager, raidPtr,"rf_log");
546 if (rc) {
547 raidPtr->parityLogDiskQueue.threadState = 0;
548 RF_ERRORMSG3("Unable to create parity logging disk thread file %s line %d rc=%d\n",
549 __FILE__, __LINE__, rc);
550 return (ENOMEM);
551 }
552 /* wait for thread to start */
553 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
554 while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_RUNNING)) {
555 RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond,
556 raidPtr->parityLogDiskQueue.mutex);
557 }
558 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
559
560 rc = rf_ShutdownCreate(listp, rf_ShutdownParityLogging, raidPtr);
561 if (rc) {
562 RF_ERRORMSG1("Got rc=%d adding parity logging shutdown event\n", rc);
563 rf_ShutdownParityLogging(raidPtr);
564 return (rc);
565 }
566 if (rf_parityLogDebug) {
567 printf(" size of disk log in sectors: %d\n",
568 (int) totalLogCapacity);
569 printf(" total number of parity regions is %d\n", (int) rf_numParityRegions);
570 printf(" nominal sectors of log per parity region is %d\n", (int) raidPtr->regionLogCapacity);
571 printf(" nominal region fragmentation is %d sectors\n", (int) fragmentation);
572 printf(" total number of parity logs is %d\n", raidPtr->numParityLogs);
573 printf(" parity log size is %d sectors\n", raidPtr->numSectorsPerLog);
574 printf(" total in-core log space is %d bytes\n", (int) rf_totalInCoreLogCapacity);
575 }
576 rf_EnableParityLogging(raidPtr);
577
578 return (0);
579 }
580
581 static void
582 FreeRegionInfo(
583 RF_Raid_t * raidPtr,
584 RF_RegionId_t regionID)
585 {
586 RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
587 RF_Free(raidPtr->regionInfo[regionID].diskMap,
588 (raidPtr->regionInfo[regionID].capacity *
589 sizeof(RF_DiskMap_t)));
590 if (!rf_forceParityLogReint && raidPtr->regionInfo[regionID].coreLog) {
591 rf_ReleaseParityLogs(raidPtr,
592 raidPtr->regionInfo[regionID].coreLog);
593 raidPtr->regionInfo[regionID].coreLog = NULL;
594 } else {
595 RF_ASSERT(raidPtr->regionInfo[regionID].coreLog == NULL);
596 RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == 0);
597 }
598 RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
599 rf_mutex_destroy(&raidPtr->regionInfo[regionID].mutex);
600 rf_mutex_destroy(&raidPtr->regionInfo[regionID].reintMutex);
601 }
602
603
604 static void
605 FreeParityLogQueue(
606 RF_Raid_t * raidPtr,
607 RF_ParityLogQueue_t * queue)
608 {
609 RF_ParityLog_t *l1, *l2;
610
611 RF_LOCK_MUTEX(queue->mutex);
612 l1 = queue->parityLogs;
613 while (l1) {
614 l2 = l1;
615 l1 = l2->next;
616 RF_Free(l2->records, (raidPtr->numSectorsPerLog *
617 sizeof(RF_ParityLogRecord_t)));
618 RF_Free(l2, sizeof(RF_ParityLog_t));
619 }
620 RF_UNLOCK_MUTEX(queue->mutex);
621 rf_mutex_destroy(&queue->mutex);
622 }
623
624
625 static void
626 FreeRegionBufferQueue(RF_RegionBufferQueue_t * queue)
627 {
628 int i;
629
630 RF_LOCK_MUTEX(queue->mutex);
631 if (queue->availableBuffers != queue->totalBuffers) {
632 printf("Attempt to free region queue which is still in use!\n");
633 RF_ASSERT(0);
634 }
635 for (i = 0; i < queue->totalBuffers; i++)
636 RF_Free(queue->buffers[i], queue->bufferSize);
637 RF_Free(queue->buffers, queue->totalBuffers * sizeof(caddr_t));
638 RF_UNLOCK_MUTEX(queue->mutex);
639 rf_mutex_destroy(&queue->mutex);
640 }
641
642 static void
643 rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg)
644 {
645 RF_Raid_t *raidPtr;
646 RF_RegionId_t i;
647
648 raidPtr = (RF_Raid_t *) arg;
649 if (rf_parityLogDebug) {
650 printf("raid%d: ShutdownParityLoggingRegionInfo\n",
651 raidPtr->raidid);
652 }
653 /* free region information structs */
654 for (i = 0; i < rf_numParityRegions; i++)
655 FreeRegionInfo(raidPtr, i);
656 RF_Free(raidPtr->regionInfo, (rf_numParityRegions *
657 sizeof(raidPtr->regionInfo)));
658 raidPtr->regionInfo = NULL;
659 }
660
661 static void
662 rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg)
663 {
664 RF_Raid_t *raidPtr;
665
666 raidPtr = (RF_Raid_t *) arg;
667 if (rf_parityLogDebug) {
668 printf("raid%d: ShutdownParityLoggingPool\n", raidPtr->raidid);
669 }
670 /* free contents of parityLogPool */
671 FreeParityLogQueue(raidPtr, &raidPtr->parityLogPool);
672 RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs *
673 raidPtr->numSectorsPerLog * raidPtr->bytesPerSector);
674 }
675
676 static void
677 rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg)
678 {
679 RF_Raid_t *raidPtr;
680
681 raidPtr = (RF_Raid_t *) arg;
682 if (rf_parityLogDebug) {
683 printf("raid%d: ShutdownParityLoggingRegionBufferPool\n",
684 raidPtr->raidid);
685 }
686 FreeRegionBufferQueue(&raidPtr->regionBufferPool);
687 }
688
689 static void
690 rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg)
691 {
692 RF_Raid_t *raidPtr;
693
694 raidPtr = (RF_Raid_t *) arg;
695 if (rf_parityLogDebug) {
696 printf("raid%d: ShutdownParityLoggingParityBufferPool\n",
697 raidPtr->raidid);
698 }
699 FreeRegionBufferQueue(&raidPtr->parityBufferPool);
700 }
701
702 static void
703 rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg)
704 {
705 RF_ParityLogData_t *d;
706 RF_CommonLogData_t *c;
707 RF_Raid_t *raidPtr;
708
709 raidPtr = (RF_Raid_t *) arg;
710 if (rf_parityLogDebug) {
711 printf("raid%d: ShutdownParityLoggingDiskQueue\n",
712 raidPtr->raidid);
713 }
714 /* free disk manager stuff */
715 RF_ASSERT(raidPtr->parityLogDiskQueue.bufHead == NULL);
716 RF_ASSERT(raidPtr->parityLogDiskQueue.bufTail == NULL);
717 RF_ASSERT(raidPtr->parityLogDiskQueue.reintHead == NULL);
718 RF_ASSERT(raidPtr->parityLogDiskQueue.reintTail == NULL);
719 while (raidPtr->parityLogDiskQueue.freeDataList) {
720 d = raidPtr->parityLogDiskQueue.freeDataList;
721 raidPtr->parityLogDiskQueue.freeDataList =
722 raidPtr->parityLogDiskQueue.freeDataList->next;
723 RF_Free(d, sizeof(RF_ParityLogData_t));
724 }
725 while (raidPtr->parityLogDiskQueue.freeCommonList) {
726 c = raidPtr->parityLogDiskQueue.freeCommonList;
727 rf_mutex_destroy(&c->mutex);
728 raidPtr->parityLogDiskQueue.freeCommonList =
729 raidPtr->parityLogDiskQueue.freeCommonList->next;
730 RF_Free(c, sizeof(RF_CommonLogData_t));
731 }
732 }
733
734 static void
735 rf_ShutdownParityLogging(RF_ThreadArg_t arg)
736 {
737 RF_Raid_t *raidPtr;
738
739 raidPtr = (RF_Raid_t *) arg;
740 if (rf_parityLogDebug) {
741 printf("raid%d: ShutdownParityLogging\n", raidPtr->raidid);
742 }
743 /* shutdown disk thread */
744 /* This has the desirable side-effect of forcing all regions to be
745 * reintegrated. This is necessary since all parity log maps are
746 * currently held in volatile memory. */
747
748 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
749 raidPtr->parityLogDiskQueue.threadState |= RF_PLOG_TERMINATE;
750 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
751 RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond);
752 /*
753 * pLogDiskThread will now terminate when queues are cleared
754 * now wait for it to be done
755 */
756 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
757 while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_SHUTDOWN)) {
758 RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond,
759 raidPtr->parityLogDiskQueue.mutex);
760 }
761 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
762 if (rf_parityLogDebug) {
763 printf("raid%d: ShutdownParityLogging done (thread completed)\n", raidPtr->raidid);
764 }
765 }
766
767 int
768 rf_GetDefaultNumFloatingReconBuffersParityLogging(RF_Raid_t * raidPtr)
769 {
770 return (20);
771 }
772
773 RF_HeadSepLimit_t
774 rf_GetDefaultHeadSepLimitParityLogging(RF_Raid_t * raidPtr)
775 {
776 return (10);
777 }
778 /* return the region ID for a given RAID address */
779 RF_RegionId_t
780 rf_MapRegionIDParityLogging(
781 RF_Raid_t * raidPtr,
782 RF_SectorNum_t address)
783 {
784 RF_RegionId_t regionID;
785
786 /* regionID = address / (raidPtr->regionParityRange * raidPtr->Layout.numDataCol); */
787 regionID = address / raidPtr->regionParityRange;
788 if (regionID == rf_numParityRegions) {
789 /* last region may be larger than other regions */
790 regionID--;
791 }
792 RF_ASSERT(address >= raidPtr->regionInfo[regionID].parityStartAddr);
793 RF_ASSERT(address < raidPtr->regionInfo[regionID].parityStartAddr +
794 raidPtr->regionInfo[regionID].numSectorsParity);
795 RF_ASSERT(regionID < rf_numParityRegions);
796 return (regionID);
797 }
798
799
800 /* given a logical RAID sector, determine physical disk address of data */
801 void
802 rf_MapSectorParityLogging(
803 RF_Raid_t * raidPtr,
804 RF_RaidAddr_t raidSector,
805 RF_RowCol_t * row,
806 RF_RowCol_t * col,
807 RF_SectorNum_t * diskSector,
808 int remap)
809 {
810 RF_StripeNum_t SUID = raidSector /
811 raidPtr->Layout.sectorsPerStripeUnit;
812 *row = 0;
813 /* *col = (SUID % (raidPtr->numCol -
814 * raidPtr->Layout.numParityLogCol)); */
815 *col = SUID % raidPtr->Layout.numDataCol;
816 *diskSector = (SUID / (raidPtr->Layout.numDataCol)) *
817 raidPtr->Layout.sectorsPerStripeUnit +
818 (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
819 }
820
821
822 /* given a logical RAID sector, determine physical disk address of parity */
823 void
824 rf_MapParityParityLogging(
825 RF_Raid_t * raidPtr,
826 RF_RaidAddr_t raidSector,
827 RF_RowCol_t * row,
828 RF_RowCol_t * col,
829 RF_SectorNum_t * diskSector,
830 int remap)
831 {
832 RF_StripeNum_t SUID = raidSector /
833 raidPtr->Layout.sectorsPerStripeUnit;
834
835 *row = 0;
836 /* *col =
837 * raidPtr->Layout.numDataCol-(SUID/raidPtr->Layout.numDataCol)%(raidPt
838 * r->numCol - raidPtr->Layout.numParityLogCol); */
839 *col = raidPtr->Layout.numDataCol;
840 *diskSector = (SUID / (raidPtr->Layout.numDataCol)) *
841 raidPtr->Layout.sectorsPerStripeUnit +
842 (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
843 }
844
845
846 /* given a regionID and sector offset, determine the physical disk address of the parity log */
847 void
848 rf_MapLogParityLogging(
849 RF_Raid_t * raidPtr,
850 RF_RegionId_t regionID,
851 RF_SectorNum_t regionOffset,
852 RF_RowCol_t * row,
853 RF_RowCol_t * col,
854 RF_SectorNum_t * startSector)
855 {
856 *row = 0;
857 *col = raidPtr->numCol - 1;
858 *startSector = raidPtr->regionInfo[regionID].regionStartAddr + regionOffset;
859 }
860
861
862 /* given a regionID, determine the physical disk address of the logged
863 parity for that region */
864 void
865 rf_MapRegionParity(
866 RF_Raid_t * raidPtr,
867 RF_RegionId_t regionID,
868 RF_RowCol_t * row,
869 RF_RowCol_t * col,
870 RF_SectorNum_t * startSector,
871 RF_SectorCount_t * numSector)
872 {
873 *row = 0;
874 *col = raidPtr->numCol - 2;
875 *startSector = raidPtr->regionInfo[regionID].parityStartAddr;
876 *numSector = raidPtr->regionInfo[regionID].numSectorsParity;
877 }
878
879
880 /* given a logical RAID address, determine the participating disks in
881 the stripe */
882 void
883 rf_IdentifyStripeParityLogging(
884 RF_Raid_t * raidPtr,
885 RF_RaidAddr_t addr,
886 RF_RowCol_t ** diskids,
887 RF_RowCol_t * outRow)
888 {
889 RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout,
890 addr);
891 RF_ParityLoggingConfigInfo_t *info = (RF_ParityLoggingConfigInfo_t *)
892 raidPtr->Layout.layoutSpecificInfo;
893 *outRow = 0;
894 *diskids = info->stripeIdentifier[stripeID % raidPtr->numCol];
895 }
896
897
898 void
899 rf_MapSIDToPSIDParityLogging(
900 RF_RaidLayout_t * layoutPtr,
901 RF_StripeNum_t stripeID,
902 RF_StripeNum_t * psID,
903 RF_ReconUnitNum_t * which_ru)
904 {
905 *which_ru = 0;
906 *psID = stripeID;
907 }
908
909
910 /* select an algorithm for performing an access. Returns two pointers,
911 * one to a function that will return information about the DAG, and
912 * another to a function that will create the dag.
913 */
914 void
915 rf_ParityLoggingDagSelect(
916 RF_Raid_t * raidPtr,
917 RF_IoType_t type,
918 RF_AccessStripeMap_t * asmp,
919 RF_VoidFuncPtr * createFunc)
920 {
921 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
922 RF_PhysDiskAddr_t *failedPDA = NULL;
923 RF_RowCol_t frow, fcol;
924 RF_RowStatus_t rstat;
925 int prior_recon;
926
927 RF_ASSERT(RF_IO_IS_R_OR_W(type));
928
929 if (asmp->numDataFailed + asmp->numParityFailed > 1) {
930 RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n");
931 /* *infoFunc = */ *createFunc = NULL;
932 return;
933 } else
934 if (asmp->numDataFailed + asmp->numParityFailed == 1) {
935
936 /* if under recon & already reconstructed, redirect
937 * the access to the spare drive and eliminate the
938 * failure indication */
939 failedPDA = asmp->failedPDAs[0];
940 frow = failedPDA->row;
941 fcol = failedPDA->col;
942 rstat = raidPtr->status[failedPDA->row];
943 prior_recon = (rstat == rf_rs_reconfigured) || (
944 (rstat == rf_rs_reconstructing) ?
945 rf_CheckRUReconstructed(raidPtr->reconControl[frow]->reconMap, failedPDA->startSector) : 0
946 );
947 if (prior_recon) {
948 RF_RowCol_t or = failedPDA->row, oc = failedPDA->col;
949 RF_SectorNum_t oo = failedPDA->startSector;
950 if (layoutPtr->map->flags &
951 RF_DISTRIBUTE_SPARE) {
952 /* redirect to dist spare space */
953
954 if (failedPDA == asmp->parityInfo) {
955
956 /* parity has failed */
957 (layoutPtr->map->MapParity) (raidPtr, failedPDA->raidAddress, &failedPDA->row,
958 &failedPDA->col, &failedPDA->startSector, RF_REMAP);
959
960 if (asmp->parityInfo->next) { /* redir 2nd component,
961 * if any */
962 RF_PhysDiskAddr_t *p = asmp->parityInfo->next;
963 RF_SectorNum_t SUoffs = p->startSector % layoutPtr->sectorsPerStripeUnit;
964 p->row = failedPDA->row;
965 p->col = failedPDA->col;
966 p->startSector = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->startSector) +
967 SUoffs; /* cheating:
968 * startSector is not
969 * really a RAID address */
970 }
971 } else
972 if (asmp->parityInfo->next && failedPDA == asmp->parityInfo->next) {
973 RF_ASSERT(0); /* should not ever
974 * happen */
975 } else {
976
977 /* data has failed */
978 (layoutPtr->map->MapSector) (raidPtr, failedPDA->raidAddress, &failedPDA->row,
979 &failedPDA->col, &failedPDA->startSector, RF_REMAP);
980
981 }
982
983 } else {
984 /* redirect to dedicated spare space */
985
986 failedPDA->row = raidPtr->Disks[frow][fcol].spareRow;
987 failedPDA->col = raidPtr->Disks[frow][fcol].spareCol;
988
989 /* the parity may have two distinct
990 * components, both of which may need
991 * to be redirected */
992 if (asmp->parityInfo->next) {
993 if (failedPDA == asmp->parityInfo) {
994 failedPDA->next->row = failedPDA->row;
995 failedPDA->next->col = failedPDA->col;
996 } else
997 if (failedPDA == asmp->parityInfo->next) { /* paranoid: should never occur */
998 asmp->parityInfo->row = failedPDA->row;
999 asmp->parityInfo->col = failedPDA->col;
1000 }
1001 }
1002 }
1003
1004 RF_ASSERT(failedPDA->col != -1);
1005
1006 if (rf_dagDebug || rf_mapDebug) {
1007 printf("raid%d: Redirected type '%c' r %d c %d o %ld -> r %d c %d o %ld\n",
1008 raidPtr->raidid, type, or, oc, (long) oo, failedPDA->row, failedPDA->col, (long) failedPDA->startSector);
1009 }
1010 asmp->numDataFailed = asmp->numParityFailed = 0;
1011 }
1012 }
1013 if (type == RF_IO_TYPE_READ) {
1014
1015 if (asmp->numDataFailed == 0)
1016 *createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;
1017 else
1018 *createFunc = (RF_VoidFuncPtr) rf_CreateRaidFiveDegradedReadDAG;
1019
1020 } else {
1021
1022
1023 /* if mirroring, always use large writes. If the access
1024 * requires two distinct parity updates, always do a small
1025 * write. If the stripe contains a failure but the access
1026 * does not, do a small write. The first conditional
1027 * (numStripeUnitsAccessed <= numDataCol/2) uses a
1028 * less-than-or-equal rather than just a less-than because
1029 * when G is 3 or 4, numDataCol/2 is 1, and I want
1030 * single-stripe-unit updates to use just one disk. */
1031 if ((asmp->numDataFailed + asmp->numParityFailed) == 0) {
1032 if (((asmp->numStripeUnitsAccessed <=
1033 (layoutPtr->numDataCol / 2)) &&
1034 (layoutPtr->numDataCol != 1)) ||
1035 (asmp->parityInfo->next != NULL) ||
1036 rf_CheckStripeForFailures(raidPtr, asmp)) {
1037 *createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingSmallWriteDAG;
1038 } else
1039 *createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingLargeWriteDAG;
1040 } else
1041 if (asmp->numParityFailed == 1)
1042 *createFunc = (RF_VoidFuncPtr) rf_CreateNonRedundantWriteDAG;
1043 else
1044 if (asmp->numStripeUnitsAccessed != 1 && failedPDA->numSector != layoutPtr->sectorsPerStripeUnit)
1045 *createFunc = NULL;
1046 else
1047 *createFunc = (RF_VoidFuncPtr) rf_CreateDegradedWriteDAG;
1048 }
1049 }
1050 #endif /* RF_INCLUDE_PARITYLOGGING > 0 */
1051