/*	$NetBSD: rf_paritylogging.c,v 1.19 2003/12/29 05:22:16 oster Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: William V. Courtright II
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */


/*
 * Parity logging configuration, DAG selection, and mapping are
 * implemented here.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_paritylogging.c,v 1.19 2003/12/29 05:22:16 oster Exp $");

#include "rf_archs.h"

#if RF_INCLUDE_PARITYLOGGING > 0

#include <dev/raidframe/raidframevar.h>

#include "rf_raid.h"
#include "rf_dag.h"
#include "rf_dagutils.h"
#include "rf_dagfuncs.h"
#include "rf_dagffrd.h"
#include "rf_dagffwr.h"
#include "rf_dagdegrd.h"
#include "rf_dagdegwr.h"
#include "rf_paritylog.h"
#include "rf_paritylogDiskMgr.h"
#include "rf_paritylogging.h"
#include "rf_parityloggingdags.h"
#include "rf_general.h"
#include "rf_map.h"
#include "rf_utils.h"
#include "rf_shutdown.h"

typedef struct RF_ParityLoggingConfigInfo_s {
	RF_RowCol_t **stripeIdentifier;	/* filled in at config time & used by
					 * IdentifyStripe */
} RF_ParityLoggingConfigInfo_t;

static void FreeRegionInfo(RF_Raid_t * raidPtr, RF_RegionId_t regionID);
static void rf_ShutdownParityLogging(RF_ThreadArg_t arg);
static void rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg);
static void rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg);
static void rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg);
static void rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg);
static void rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg);

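/* configure a parity logging array: fill in the layout parameters, carve
   the log column into parity regions, build the in-core pools of parity
   logs and region/parity buffers, and start the parity log disk manager
   thread */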
int
rf_ConfigureParityLogging(
	RF_ShutdownList_t ** listp,
	RF_Raid_t * raidPtr,
	RF_Config_t * cfgPtr)
{
	int i, j, startdisk, rc;
	RF_SectorCount_t totalLogCapacity, fragmentation, lastRegionCapacity;
	RF_SectorCount_t parityBufferCapacity, maxRegionParityRange;
	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
	RF_ParityLoggingConfigInfo_t *info;
	RF_ParityLog_t *l = NULL, *next;
	caddr_t lHeapPtr;

	if (rf_numParityRegions <= 0)
		return(EINVAL);

	/*
	 * We create multiple entries on the shutdown list here, since
	 * this configuration routine is fairly complicated in and of
	 * itself, and this makes backing out of a failed configuration
	 * much simpler.
	 */

	raidPtr->numSectorsPerLog = RF_DEFAULT_NUM_SECTORS_PER_LOG;

	/* create a parity logging configuration structure */
	RF_MallocAndAdd(info, sizeof(RF_ParityLoggingConfigInfo_t),
	    (RF_ParityLoggingConfigInfo_t *),
	    raidPtr->cleanupList);
	if (info == NULL)
		return (ENOMEM);
	layoutPtr->layoutSpecificInfo = (void *) info;

	/* the stripe identifier must identify the disks in each stripe, IN
	 * THE ORDER THAT THEY APPEAR IN THE STRIPE. */
	info->stripeIdentifier = rf_make_2d_array((raidPtr->numCol),
	    (raidPtr->numCol),
	    raidPtr->cleanupList);
	if (info->stripeIdentifier == NULL)
		return (ENOMEM);

	startdisk = 0;
	for (i = 0; i < (raidPtr->numCol); i++) {
		for (j = 0; j < (raidPtr->numCol); j++) {
			info->stripeIdentifier[i][j] = (startdisk + j) %
			    (raidPtr->numCol - 1);
		}
		if ((--startdisk) < 0)
			startdisk = raidPtr->numCol - 1 - 1;
	}

	/* fill in the remaining layout parameters */
	layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk;
	layoutPtr->numParityCol = 1;
	layoutPtr->numParityLogCol = 1;
	layoutPtr->numDataCol = raidPtr->numCol - layoutPtr->numParityCol -
	    layoutPtr->numParityLogCol;
	layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol *
	    layoutPtr->sectorsPerStripeUnit;
	layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk;
	raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk *
	    layoutPtr->sectorsPerStripeUnit;

	raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk *
	    layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;

	/* configure parity log parameters
	 *
	 * parameter		comment/constraints
	 * -------------------------------------------
	 * numParityRegions*	all regions (except possibly the last)
	 *			are of equal size
	 * totalInCoreLogCapacity* amount of memory in bytes available
	 *			for in-core logs (default 1 MB)
	 * numSectorsPerLog#	capacity of an in-core log in sectors
	 *			(1 * disk track)
	 * numParityLogs	total number of in-core logs,
	 *			should be at least numParityRegions
	 * regionLogCapacity	size of a region log (except possibly
	 *			the last one) in sectors
	 * totalLogCapacity	total amount of log space in sectors
	 *
	 * where '*' denotes a user-settable parameter and '#' a value
	 * fixed at compile time.  Note that logs are fixed to the size
	 * of a disk track, a value #defined in rf_paritylog.h.
	 */

	totalLogCapacity = layoutPtr->stripeUnitsPerDisk *
	    layoutPtr->sectorsPerStripeUnit * layoutPtr->numParityLogCol;
	raidPtr->regionLogCapacity = totalLogCapacity / rf_numParityRegions;
	if (rf_parityLogDebug)
		printf("bytes per sector %d\n", raidPtr->bytesPerSector);

	/* reduce fragmentation within a disk region by adjusting the number
	 * of regions in an attempt to allow an integral number of logs to fit
	 * into a disk region */
	fragmentation = raidPtr->regionLogCapacity % raidPtr->numSectorsPerLog;
	if (fragmentation > 0)
		for (i = 1; i < (raidPtr->numSectorsPerLog / 2); i++) {
			if (((totalLogCapacity / (rf_numParityRegions + i)) %
			    raidPtr->numSectorsPerLog) < fragmentation) {
				rf_numParityRegions++;
				raidPtr->regionLogCapacity = totalLogCapacity /
				    rf_numParityRegions;
				fragmentation = raidPtr->regionLogCapacity %
				    raidPtr->numSectorsPerLog;
			}
			if (((totalLogCapacity / (rf_numParityRegions - i)) %
			    raidPtr->numSectorsPerLog) < fragmentation) {
				rf_numParityRegions--;
				raidPtr->regionLogCapacity = totalLogCapacity /
				    rf_numParityRegions;
				fragmentation = raidPtr->regionLogCapacity %
				    raidPtr->numSectorsPerLog;
			}
		}
	/* ensure an integral number of logs fits in each region */
	raidPtr->regionLogCapacity = (raidPtr->regionLogCapacity /
	    raidPtr->numSectorsPerLog) *
	    raidPtr->numSectorsPerLog;
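	/*
	 * Example with hypothetical numbers: with 64-sector logs, a
	 * regionLogCapacity of 250 sectors is rounded down here to 192
	 * sectors, i.e. exactly three in-core logs' worth of space per
	 * region; the leftover 58 sectors per region are what the
	 * adjustment loop above tries to minimize.
	 */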

	raidPtr->numParityLogs = rf_totalInCoreLogCapacity /
	    (raidPtr->bytesPerSector * raidPtr->numSectorsPerLog);
	/* to avoid deadlock, must ensure that enough logs exist for each
	 * region to have one simultaneously */
	if (raidPtr->numParityLogs < rf_numParityRegions)
		raidPtr->numParityLogs = rf_numParityRegions;

	/* create region information structs */
	printf("Allocating %d bytes for in-core parity region info\n",
	    (int) (rf_numParityRegions * sizeof(RF_RegionInfo_t)));
	RF_Malloc(raidPtr->regionInfo,
	    (rf_numParityRegions * sizeof(RF_RegionInfo_t)),
	    (RF_RegionInfo_t *));
	if (raidPtr->regionInfo == NULL)
		return (ENOMEM);

	/* last region may not be full capacity */
	lastRegionCapacity = raidPtr->regionLogCapacity;
	while ((rf_numParityRegions - 1) * raidPtr->regionLogCapacity +
	    lastRegionCapacity > totalLogCapacity)
		lastRegionCapacity = lastRegionCapacity -
		    raidPtr->numSectorsPerLog;

	raidPtr->regionParityRange = raidPtr->sectorsPerDisk /
	    rf_numParityRegions;
	maxRegionParityRange = raidPtr->regionParityRange;

	/* i can't remember why this line is in the code -wvcii 6/30/95 */
	/* if (raidPtr->sectorsPerDisk % rf_numParityRegions > 0)
		regionParityRange++; */

	/* build pool of unused parity logs */
	printf("Allocating %d bytes for %d parity logs\n",
	    raidPtr->numParityLogs * raidPtr->numSectorsPerLog *
	    raidPtr->bytesPerSector,
	    raidPtr->numParityLogs);
	RF_Malloc(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs *
	    raidPtr->numSectorsPerLog * raidPtr->bytesPerSector,
	    (caddr_t));
	if (raidPtr->parityLogBufferHeap == NULL)
		return (ENOMEM);
	lHeapPtr = raidPtr->parityLogBufferHeap;
	rc = rf_mutex_init(&raidPtr->parityLogPool.mutex);
	if (rc) {
		rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc);
		RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs *
		    raidPtr->numSectorsPerLog * raidPtr->bytesPerSector);
		return (ENOMEM);
	}
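	/*
	 * Carve the contiguous parityLogBufferHeap into one buffer per
	 * in-core log; each RF_ParityLog_t gets numSectorsPerLog sectors
	 * of buffer space and a matching array of log records.
	 */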
	for (i = 0; i < raidPtr->numParityLogs; i++) {
		if (i == 0) {
			RF_Malloc(raidPtr->parityLogPool.parityLogs,
			    sizeof(RF_ParityLog_t), (RF_ParityLog_t *));
			if (raidPtr->parityLogPool.parityLogs == NULL) {
				RF_Free(raidPtr->parityLogBufferHeap,
				    raidPtr->numParityLogs *
				    raidPtr->numSectorsPerLog *
				    raidPtr->bytesPerSector);
				return (ENOMEM);
			}
			l = raidPtr->parityLogPool.parityLogs;
		} else {
			RF_Malloc(l->next, sizeof(RF_ParityLog_t),
			    (RF_ParityLog_t *));
			if (l->next == NULL) {
				RF_Free(raidPtr->parityLogBufferHeap,
				    raidPtr->numParityLogs *
				    raidPtr->numSectorsPerLog *
				    raidPtr->bytesPerSector);
				for (l = raidPtr->parityLogPool.parityLogs;
				     l;
				     l = next) {
					next = l->next;
					if (l->records)
						RF_Free(l->records, (raidPtr->numSectorsPerLog * sizeof(RF_ParityLogRecord_t)));
					RF_Free(l, sizeof(RF_ParityLog_t));
				}
				return (ENOMEM);
			}
			l = l->next;
		}
		l->bufPtr = lHeapPtr;
		lHeapPtr += raidPtr->numSectorsPerLog *
		    raidPtr->bytesPerSector;
		RF_Malloc(l->records, (raidPtr->numSectorsPerLog *
		    sizeof(RF_ParityLogRecord_t)),
		    (RF_ParityLogRecord_t *));
		if (l->records == NULL) {
			RF_Free(raidPtr->parityLogBufferHeap,
			    raidPtr->numParityLogs *
			    raidPtr->numSectorsPerLog *
			    raidPtr->bytesPerSector);
			for (l = raidPtr->parityLogPool.parityLogs;
			     l;
			     l = next) {
				next = l->next;
				if (l->records)
					RF_Free(l->records,
					    (raidPtr->numSectorsPerLog *
					    sizeof(RF_ParityLogRecord_t)));
				RF_Free(l, sizeof(RF_ParityLog_t));
			}
			return (ENOMEM);
		}
	}
	rc = rf_ShutdownCreate(listp, rf_ShutdownParityLoggingPool, raidPtr);
	if (rc) {
		RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
		    __LINE__, rc);
		rf_ShutdownParityLoggingPool(raidPtr);
		return (rc);
	}
	/* build pool of region buffers */
	rc = rf_mutex_init(&raidPtr->regionBufferPool.mutex);
	if (rc) {
		rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc);
		return (ENOMEM);
	}
	rc = rf_cond_init(&raidPtr->regionBufferPool.cond);
	if (rc) {
		rf_print_unable_to_init_cond(__FILE__, __LINE__, rc);
		return (ENOMEM);
	}
	raidPtr->regionBufferPool.bufferSize = raidPtr->regionLogCapacity *
	    raidPtr->bytesPerSector;
	printf("regionBufferPool.bufferSize %d\n",
	    raidPtr->regionBufferPool.bufferSize);

	/* for now, only one region at a time may be reintegrated */
	raidPtr->regionBufferPool.totalBuffers = 1;

	raidPtr->regionBufferPool.availableBuffers =
	    raidPtr->regionBufferPool.totalBuffers;
	raidPtr->regionBufferPool.availBuffersIndex = 0;
	raidPtr->regionBufferPool.emptyBuffersIndex = 0;
	printf("Allocating %d bytes for regionBufferPool\n",
	    (int) (raidPtr->regionBufferPool.totalBuffers *
	    sizeof(caddr_t)));
	RF_Malloc(raidPtr->regionBufferPool.buffers,
	    raidPtr->regionBufferPool.totalBuffers * sizeof(caddr_t),
	    (caddr_t *));
	if (raidPtr->regionBufferPool.buffers == NULL) {
		rf_cond_destroy(&raidPtr->regionBufferPool.cond);
		return (ENOMEM);
	}
	for (i = 0; i < raidPtr->regionBufferPool.totalBuffers; i++) {
		printf("Allocating %d bytes for regionBufferPool#%d\n",
		    (int) (raidPtr->regionBufferPool.bufferSize *
		    sizeof(char)), i);
		RF_Malloc(raidPtr->regionBufferPool.buffers[i],
		    raidPtr->regionBufferPool.bufferSize * sizeof(char),
		    (caddr_t));
		if (raidPtr->regionBufferPool.buffers[i] == NULL) {
			rf_cond_destroy(&raidPtr->regionBufferPool.cond);
			for (j = 0; j < i; j++) {
				RF_Free(raidPtr->regionBufferPool.buffers[j],
				    raidPtr->regionBufferPool.bufferSize *
				    sizeof(char));
			}
			RF_Free(raidPtr->regionBufferPool.buffers,
			    raidPtr->regionBufferPool.totalBuffers *
			    sizeof(caddr_t));
			return (ENOMEM);
		}
		printf("raidPtr->regionBufferPool.buffers[%d] = %lx\n", i,
		    (long) raidPtr->regionBufferPool.buffers[i]);
	}
	rc = rf_ShutdownCreate(listp,
	    rf_ShutdownParityLoggingRegionBufferPool,
	    raidPtr);
	if (rc) {
		RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
		    __LINE__, rc);
		rf_ShutdownParityLoggingRegionBufferPool(raidPtr);
		return (rc);
	}
	/* build pool of parity buffers */
	parityBufferCapacity = maxRegionParityRange;
	rc = rf_mutex_init(&raidPtr->parityBufferPool.mutex);
	if (rc) {
		rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc);
		return (rc);
	}
	rc = rf_cond_init(&raidPtr->parityBufferPool.cond);
	if (rc) {
		rf_print_unable_to_init_cond(__FILE__, __LINE__, rc);
		return (ENOMEM);
	}
	raidPtr->parityBufferPool.bufferSize = parityBufferCapacity *
	    raidPtr->bytesPerSector;
	printf("parityBufferPool.bufferSize %d\n",
	    raidPtr->parityBufferPool.bufferSize);

	/* for now, only one region at a time may be reintegrated */
	raidPtr->parityBufferPool.totalBuffers = 1;

	raidPtr->parityBufferPool.availableBuffers =
	    raidPtr->parityBufferPool.totalBuffers;
	raidPtr->parityBufferPool.availBuffersIndex = 0;
	raidPtr->parityBufferPool.emptyBuffersIndex = 0;
	printf("Allocating %d bytes for parityBufferPool of %d units\n",
	    (int) (raidPtr->parityBufferPool.totalBuffers *
	    sizeof(caddr_t)),
	    raidPtr->parityBufferPool.totalBuffers);
	RF_Malloc(raidPtr->parityBufferPool.buffers,
	    raidPtr->parityBufferPool.totalBuffers * sizeof(caddr_t),
	    (caddr_t *));
	if (raidPtr->parityBufferPool.buffers == NULL) {
		rf_cond_destroy(&raidPtr->parityBufferPool.cond);
		return (ENOMEM);
	}
	for (i = 0; i < raidPtr->parityBufferPool.totalBuffers; i++) {
		printf("Allocating %d bytes for parityBufferPool#%d\n",
		    (int) (raidPtr->parityBufferPool.bufferSize *
		    sizeof(char)), i);
		RF_Malloc(raidPtr->parityBufferPool.buffers[i],
		    raidPtr->parityBufferPool.bufferSize * sizeof(char),
		    (caddr_t));
		if (raidPtr->parityBufferPool.buffers[i] == NULL) {
			rf_cond_destroy(&raidPtr->parityBufferPool.cond);
			for (j = 0; j < i; j++) {
				RF_Free(raidPtr->parityBufferPool.buffers[j],
				    raidPtr->parityBufferPool.bufferSize *
				    sizeof(char));
			}
			RF_Free(raidPtr->parityBufferPool.buffers,
			    raidPtr->parityBufferPool.totalBuffers *
			    sizeof(caddr_t));
			return (ENOMEM);
		}
		printf("parityBufferPool.buffers[%d] = %lx\n", i,
		    (long) raidPtr->parityBufferPool.buffers[i]);
	}
	rc = rf_ShutdownCreate(listp,
	    rf_ShutdownParityLoggingParityBufferPool,
	    raidPtr);
	if (rc) {
		RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
		    __LINE__, rc);
		rf_ShutdownParityLoggingParityBufferPool(raidPtr);
		return (rc);
	}
	/* initialize parityLogDiskQueue */
	rf_mutex_init(&raidPtr->parityLogDiskQueue.mutex);
	rc = rf_create_managed_cond(listp, &raidPtr->parityLogDiskQueue.cond);
	if (rc) {
		rf_print_unable_to_init_cond(__FILE__, __LINE__, rc);
		return (rc);
	}
	raidPtr->parityLogDiskQueue.flushQueue = NULL;
	raidPtr->parityLogDiskQueue.reintQueue = NULL;
	raidPtr->parityLogDiskQueue.bufHead = NULL;
	raidPtr->parityLogDiskQueue.bufTail = NULL;
	raidPtr->parityLogDiskQueue.reintHead = NULL;
	raidPtr->parityLogDiskQueue.reintTail = NULL;
	raidPtr->parityLogDiskQueue.logBlockHead = NULL;
	raidPtr->parityLogDiskQueue.logBlockTail = NULL;
	raidPtr->parityLogDiskQueue.reintBlockHead = NULL;
	raidPtr->parityLogDiskQueue.reintBlockTail = NULL;
	raidPtr->parityLogDiskQueue.freeDataList = NULL;
	raidPtr->parityLogDiskQueue.freeCommonList = NULL;

	rc = rf_ShutdownCreate(listp,
	    rf_ShutdownParityLoggingDiskQueue,
	    raidPtr);
	if (rc) {
		RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
		    __LINE__, rc);
		return (rc);
	}
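	/*
	 * Initialize the per-region bookkeeping: each region gets its log
	 * and parity extents, a disk map sized to the region's log
	 * capacity, and the locks protecting reintegration state.
	 */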
	for (i = 0; i < rf_numParityRegions; i++) {
		rc = rf_mutex_init(&raidPtr->regionInfo[i].mutex);
		if (rc) {
			rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc);
			for (j = 0; j < i; j++)
				FreeRegionInfo(raidPtr, j);
			RF_Free(raidPtr->regionInfo,
			    (rf_numParityRegions *
			    sizeof(RF_RegionInfo_t)));
			return (ENOMEM);
		}
		rc = rf_mutex_init(&raidPtr->regionInfo[i].reintMutex);
		if (rc) {
			rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc);
			for (j = 0; j < i; j++)
				FreeRegionInfo(raidPtr, j);
			RF_Free(raidPtr->regionInfo,
			    (rf_numParityRegions *
			    sizeof(RF_RegionInfo_t)));
			return (ENOMEM);
		}
		raidPtr->regionInfo[i].reintInProgress = RF_FALSE;
		raidPtr->regionInfo[i].regionStartAddr =
		    raidPtr->regionLogCapacity * i;
		raidPtr->regionInfo[i].parityStartAddr =
		    raidPtr->regionParityRange * i;
		if (i < rf_numParityRegions - 1) {
			raidPtr->regionInfo[i].capacity =
			    raidPtr->regionLogCapacity;
			raidPtr->regionInfo[i].numSectorsParity =
			    raidPtr->regionParityRange;
		} else {
			raidPtr->regionInfo[i].capacity =
			    lastRegionCapacity;
			raidPtr->regionInfo[i].numSectorsParity =
			    raidPtr->sectorsPerDisk -
			    raidPtr->regionParityRange * i;
			if (raidPtr->regionInfo[i].numSectorsParity >
			    maxRegionParityRange)
				maxRegionParityRange =
				    raidPtr->regionInfo[i].numSectorsParity;
		}
		raidPtr->regionInfo[i].diskCount = 0;
		RF_ASSERT(raidPtr->regionInfo[i].capacity +
		    raidPtr->regionInfo[i].regionStartAddr <=
		    totalLogCapacity);
		RF_ASSERT(raidPtr->regionInfo[i].parityStartAddr +
		    raidPtr->regionInfo[i].numSectorsParity <=
		    raidPtr->sectorsPerDisk);
		printf("Allocating %d bytes for region %d\n",
		    (int) (raidPtr->regionInfo[i].capacity *
		    sizeof(RF_DiskMap_t)), i);
		RF_Malloc(raidPtr->regionInfo[i].diskMap,
		    (raidPtr->regionInfo[i].capacity *
		    sizeof(RF_DiskMap_t)),
		    (RF_DiskMap_t *));
		if (raidPtr->regionInfo[i].diskMap == NULL) {
			for (j = 0; j < i; j++)
				FreeRegionInfo(raidPtr, j);
			RF_Free(raidPtr->regionInfo,
			    (rf_numParityRegions *
			    sizeof(RF_RegionInfo_t)));
			return (ENOMEM);
		}
		raidPtr->regionInfo[i].loggingEnabled = RF_FALSE;
		raidPtr->regionInfo[i].coreLog = NULL;
	}
	rc = rf_ShutdownCreate(listp,
	    rf_ShutdownParityLoggingRegionInfo,
	    raidPtr);
	if (rc) {
		RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
		    __LINE__, rc);
		rf_ShutdownParityLoggingRegionInfo(raidPtr);
		return (rc);
	}
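	/* start the parity log disk manager thread; below we wait until it
	 * reports that it is running */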
	RF_ASSERT(raidPtr->parityLogDiskQueue.threadState == 0);
	raidPtr->parityLogDiskQueue.threadState = RF_PLOG_CREATED;
	rc = RF_CREATE_THREAD(raidPtr->pLogDiskThreadHandle,
	    rf_ParityLoggingDiskManager, raidPtr, "rf_log");
	if (rc) {
		raidPtr->parityLogDiskQueue.threadState = 0;
		RF_ERRORMSG3("Unable to create parity logging disk thread file %s line %d rc=%d\n",
		    __FILE__, __LINE__, rc);
		return (ENOMEM);
	}
	/* wait for thread to start */
	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_RUNNING)) {
		RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond,
		    raidPtr->parityLogDiskQueue.mutex);
	}
	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);

	rc = rf_ShutdownCreate(listp, rf_ShutdownParityLogging, raidPtr);
	if (rc) {
		RF_ERRORMSG1("Got rc=%d adding parity logging shutdown event\n", rc);
		rf_ShutdownParityLogging(raidPtr);
		return (rc);
	}
	if (rf_parityLogDebug) {
		printf(" size of disk log in sectors: %d\n",
		    (int) totalLogCapacity);
		printf(" total number of parity regions is %d\n", (int) rf_numParityRegions);
		printf(" nominal sectors of log per parity region is %d\n", (int) raidPtr->regionLogCapacity);
		printf(" nominal region fragmentation is %d sectors\n", (int) fragmentation);
		printf(" total number of parity logs is %d\n", raidPtr->numParityLogs);
		printf(" parity log size is %d sectors\n", raidPtr->numSectorsPerLog);
		printf(" total in-core log space is %d bytes\n", (int) rf_totalInCoreLogCapacity);
	}
	rf_EnableParityLogging(raidPtr);

	return (0);
}

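/* free the disk map of a parity region and return any in-core log still
   attached to it to the free pool */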
static void
FreeRegionInfo(
	RF_Raid_t * raidPtr,
	RF_RegionId_t regionID)
{
	RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
	RF_Free(raidPtr->regionInfo[regionID].diskMap,
	    (raidPtr->regionInfo[regionID].capacity *
	    sizeof(RF_DiskMap_t)));
	if (!rf_forceParityLogReint && raidPtr->regionInfo[regionID].coreLog) {
		rf_ReleaseParityLogs(raidPtr,
		    raidPtr->regionInfo[regionID].coreLog);
		raidPtr->regionInfo[regionID].coreLog = NULL;
	} else {
		RF_ASSERT(raidPtr->regionInfo[regionID].coreLog == NULL);
		RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == 0);
	}
	RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
}


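/* free every in-core parity log (and its record array) held on a queue */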
static void
FreeParityLogQueue(
	RF_Raid_t * raidPtr,
	RF_ParityLogQueue_t * queue)
{
	RF_ParityLog_t *l1, *l2;

	RF_LOCK_MUTEX(queue->mutex);
	l1 = queue->parityLogs;
	while (l1) {
		l2 = l1;
		l1 = l2->next;
		RF_Free(l2->records, (raidPtr->numSectorsPerLog *
		    sizeof(RF_ParityLogRecord_t)));
		RF_Free(l2, sizeof(RF_ParityLog_t));
	}
	RF_UNLOCK_MUTEX(queue->mutex);
}


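/* free the buffers of a region/parity buffer pool; all buffers must have
   been returned to the pool before this is called */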
static void
FreeRegionBufferQueue(RF_RegionBufferQueue_t * queue)
{
	int i;

	RF_LOCK_MUTEX(queue->mutex);
	if (queue->availableBuffers != queue->totalBuffers) {
		printf("Attempt to free region queue which is still in use!\n");
		RF_ASSERT(0);
	}
	for (i = 0; i < queue->totalBuffers; i++)
		RF_Free(queue->buffers[i], queue->bufferSize);
	RF_Free(queue->buffers, queue->totalBuffers * sizeof(caddr_t));
	RF_UNLOCK_MUTEX(queue->mutex);
}

static void
rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg)
{
	RF_Raid_t *raidPtr;
	RF_RegionId_t i;

	raidPtr = (RF_Raid_t *) arg;
	if (rf_parityLogDebug) {
		printf("raid%d: ShutdownParityLoggingRegionInfo\n",
		    raidPtr->raidid);
	}
	/* free region information structs */
	for (i = 0; i < rf_numParityRegions; i++)
		FreeRegionInfo(raidPtr, i);
	RF_Free(raidPtr->regionInfo, (rf_numParityRegions *
	    sizeof(RF_RegionInfo_t)));
	raidPtr->regionInfo = NULL;
}

static void
rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg)
{
	RF_Raid_t *raidPtr;

	raidPtr = (RF_Raid_t *) arg;
	if (rf_parityLogDebug) {
		printf("raid%d: ShutdownParityLoggingPool\n", raidPtr->raidid);
	}
	/* free contents of parityLogPool */
	FreeParityLogQueue(raidPtr, &raidPtr->parityLogPool);
	RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs *
	    raidPtr->numSectorsPerLog * raidPtr->bytesPerSector);
}

static void
rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg)
{
	RF_Raid_t *raidPtr;

	raidPtr = (RF_Raid_t *) arg;
	if (rf_parityLogDebug) {
		printf("raid%d: ShutdownParityLoggingRegionBufferPool\n",
		    raidPtr->raidid);
	}
	FreeRegionBufferQueue(&raidPtr->regionBufferPool);
}

static void
rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg)
{
	RF_Raid_t *raidPtr;

	raidPtr = (RF_Raid_t *) arg;
	if (rf_parityLogDebug) {
		printf("raid%d: ShutdownParityLoggingParityBufferPool\n",
		    raidPtr->raidid);
	}
	FreeRegionBufferQueue(&raidPtr->parityBufferPool);
}

static void
rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg)
{
	RF_ParityLogData_t *d;
	RF_CommonLogData_t *c;
	RF_Raid_t *raidPtr;

	raidPtr = (RF_Raid_t *) arg;
	if (rf_parityLogDebug) {
		printf("raid%d: ShutdownParityLoggingDiskQueue\n",
		    raidPtr->raidid);
	}
	/* free disk manager stuff */
	RF_ASSERT(raidPtr->parityLogDiskQueue.bufHead == NULL);
	RF_ASSERT(raidPtr->parityLogDiskQueue.bufTail == NULL);
	RF_ASSERT(raidPtr->parityLogDiskQueue.reintHead == NULL);
	RF_ASSERT(raidPtr->parityLogDiskQueue.reintTail == NULL);
	while (raidPtr->parityLogDiskQueue.freeDataList) {
		d = raidPtr->parityLogDiskQueue.freeDataList;
		raidPtr->parityLogDiskQueue.freeDataList =
		    raidPtr->parityLogDiskQueue.freeDataList->next;
		RF_Free(d, sizeof(RF_ParityLogData_t));
	}
	while (raidPtr->parityLogDiskQueue.freeCommonList) {
		c = raidPtr->parityLogDiskQueue.freeCommonList;
		raidPtr->parityLogDiskQueue.freeCommonList =
		    raidPtr->parityLogDiskQueue.freeCommonList->next;
		RF_Free(c, sizeof(RF_CommonLogData_t));
	}
}

static void
rf_ShutdownParityLogging(RF_ThreadArg_t arg)
{
	RF_Raid_t *raidPtr;

	raidPtr = (RF_Raid_t *) arg;
	if (rf_parityLogDebug) {
		printf("raid%d: ShutdownParityLogging\n", raidPtr->raidid);
	}
	/* shutdown disk thread */
	/* This has the desirable side-effect of forcing all regions to be
	 * reintegrated.  This is necessary since all parity log maps are
	 * currently held in volatile memory. */

	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	raidPtr->parityLogDiskQueue.threadState |= RF_PLOG_TERMINATE;
	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond);
	/*
	 * pLogDiskThread will now terminate when queues are cleared
	 * now wait for it to be done
	 */
	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_SHUTDOWN)) {
		RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond,
		    raidPtr->parityLogDiskQueue.mutex);
	}
	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
	if (rf_parityLogDebug) {
		printf("raid%d: ShutdownParityLogging done (thread completed)\n", raidPtr->raidid);
	}
}

int
rf_GetDefaultNumFloatingReconBuffersParityLogging(RF_Raid_t * raidPtr)
{
	return (20);
}

RF_HeadSepLimit_t
rf_GetDefaultHeadSepLimitParityLogging(RF_Raid_t * raidPtr)
{
	return (10);
}
/* return the region ID for a given RAID address */
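/* (example with a hypothetical regionParityRange of 1000 sectors: parity
   address 2500 maps to region 2; an address just past the nominal end of
   the last region is clamped into it, since that region may be larger
   than the others) */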
RF_RegionId_t
rf_MapRegionIDParityLogging(
	RF_Raid_t * raidPtr,
	RF_SectorNum_t address)
{
	RF_RegionId_t regionID;

	/* regionID = address / (raidPtr->regionParityRange *
	   raidPtr->Layout.numDataCol); */
	regionID = address / raidPtr->regionParityRange;
	if (regionID == rf_numParityRegions) {
		/* last region may be larger than other regions */
		regionID--;
	}
	RF_ASSERT(address >= raidPtr->regionInfo[regionID].parityStartAddr);
	RF_ASSERT(address < raidPtr->regionInfo[regionID].parityStartAddr +
	    raidPtr->regionInfo[regionID].numSectorsParity);
	RF_ASSERT(regionID < rf_numParityRegions);
	return (regionID);
}


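/* in this layout the first numDataCol columns hold data, column numDataCol
   holds the parity, and the last column (numCol - 1) holds the parity log;
   the mapping routines below encode that arrangement */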
/* given a logical RAID sector, determine physical disk address of data */
void
rf_MapSectorParityLogging(
	RF_Raid_t * raidPtr,
	RF_RaidAddr_t raidSector,
	RF_RowCol_t * col,
	RF_SectorNum_t * diskSector,
	int remap)
{
	RF_StripeNum_t SUID = raidSector /
	    raidPtr->Layout.sectorsPerStripeUnit;
	/* *col = (SUID % (raidPtr->numCol -
	 * raidPtr->Layout.numParityLogCol)); */
	*col = SUID % raidPtr->Layout.numDataCol;
	*diskSector = (SUID / (raidPtr->Layout.numDataCol)) *
	    raidPtr->Layout.sectorsPerStripeUnit +
	    (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
}


/* given a logical RAID sector, determine physical disk address of parity */
void
rf_MapParityParityLogging(
	RF_Raid_t * raidPtr,
	RF_RaidAddr_t raidSector,
	RF_RowCol_t * col,
	RF_SectorNum_t * diskSector,
	int remap)
{
	RF_StripeNum_t SUID = raidSector /
	    raidPtr->Layout.sectorsPerStripeUnit;

	/* *col = raidPtr->Layout.numDataCol -
	 * (SUID / raidPtr->Layout.numDataCol) %
	 * (raidPtr->numCol - raidPtr->Layout.numParityLogCol); */
	*col = raidPtr->Layout.numDataCol;
	*diskSector = (SUID / (raidPtr->Layout.numDataCol)) *
	    raidPtr->Layout.sectorsPerStripeUnit +
	    (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
}


/* given a regionID and sector offset, determine the physical disk address
   of the parity log */
void
rf_MapLogParityLogging(
	RF_Raid_t * raidPtr,
	RF_RegionId_t regionID,
	RF_SectorNum_t regionOffset,
	RF_RowCol_t * col,
	RF_SectorNum_t * startSector)
{
	*col = raidPtr->numCol - 1;
	*startSector = raidPtr->regionInfo[regionID].regionStartAddr +
	    regionOffset;
}


/* given a regionID, determine the physical disk address of the logged
   parity for that region */
void
rf_MapRegionParity(
	RF_Raid_t * raidPtr,
	RF_RegionId_t regionID,
	RF_RowCol_t * col,
	RF_SectorNum_t * startSector,
	RF_SectorCount_t * numSector)
{
	*col = raidPtr->numCol - 2;
	*startSector = raidPtr->regionInfo[regionID].parityStartAddr;
	*numSector = raidPtr->regionInfo[regionID].numSectorsParity;
}


/* given a logical RAID address, determine the participating disks in
   the stripe */
void
rf_IdentifyStripeParityLogging(
	RF_Raid_t * raidPtr,
	RF_RaidAddr_t addr,
	RF_RowCol_t ** diskids)
{
	RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout,
	    addr);
	RF_ParityLoggingConfigInfo_t *info = (RF_ParityLoggingConfigInfo_t *)
	    raidPtr->Layout.layoutSpecificInfo;
	*diskids = info->stripeIdentifier[stripeID % raidPtr->numCol];
}


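/* stripe IDs map directly onto parity stripe IDs; the reconstruction unit
   is always 0 */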
void
rf_MapSIDToPSIDParityLogging(
	RF_RaidLayout_t * layoutPtr,
	RF_StripeNum_t stripeID,
	RF_StripeNum_t * psID,
	RF_ReconUnitNum_t * which_ru)
{
	*which_ru = 0;
	*psID = stripeID;
}


/* select an algorithm for performing an access.  Returns, through
 * createFunc, a pointer to a function that will create the DAG for the
 * access, or NULL if the access cannot be serviced.
 */
void
rf_ParityLoggingDagSelect(
	RF_Raid_t * raidPtr,
	RF_IoType_t type,
	RF_AccessStripeMap_t * asmp,
	RF_VoidFuncPtr * createFunc)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_PhysDiskAddr_t *failedPDA = NULL;
	RF_RowCol_t fcol;
	RF_RowStatus_t rstat;
	int prior_recon;

	RF_ASSERT(RF_IO_IS_R_OR_W(type));

	if (asmp->numDataFailed + asmp->numParityFailed > 1) {
		RF_ERRORMSG("Multiple disks failed in a single group!  Aborting I/O operation.\n");
		*createFunc = NULL;
		return;
	} else
		if (asmp->numDataFailed + asmp->numParityFailed == 1) {

			/* if under recon & already reconstructed, redirect
			 * the access to the spare drive and eliminate the
			 * failure indication */
			failedPDA = asmp->failedPDAs[0];
			fcol = failedPDA->col;
			rstat = raidPtr->status;
			prior_recon = (rstat == rf_rs_reconfigured) || (
			    (rstat == rf_rs_reconstructing) ?
			    rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, failedPDA->startSector) : 0
			    );
			if (prior_recon) {
				RF_RowCol_t oc = failedPDA->col;
				RF_SectorNum_t oo = failedPDA->startSector;
				if (layoutPtr->map->flags &
				    RF_DISTRIBUTE_SPARE) {
					/* redirect to dist spare space */

					if (failedPDA == asmp->parityInfo) {

						/* parity has failed */
						(layoutPtr->map->MapParity) (raidPtr, failedPDA->raidAddress,
						    &failedPDA->col, &failedPDA->startSector, RF_REMAP);

						if (asmp->parityInfo->next) {	/* redir 2nd component,
										 * if any */
							RF_PhysDiskAddr_t *p = asmp->parityInfo->next;
							RF_SectorNum_t SUoffs = p->startSector % layoutPtr->sectorsPerStripeUnit;
							p->col = failedPDA->col;
							p->startSector = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->startSector) +
							    SUoffs;	/* cheating:
									 * startSector is not
									 * really a RAID address */
						}
					} else
						if (asmp->parityInfo->next && failedPDA == asmp->parityInfo->next) {
							RF_ASSERT(0);	/* should not ever
									 * happen */
						} else {

							/* data has failed */
							(layoutPtr->map->MapSector) (raidPtr, failedPDA->raidAddress,
							    &failedPDA->col, &failedPDA->startSector, RF_REMAP);

						}

				} else {
					/* redirect to dedicated spare space */

					failedPDA->col = raidPtr->Disks[fcol].spareCol;

					/* the parity may have two distinct
					 * components, both of which may need
					 * to be redirected */
					if (asmp->parityInfo->next) {
						if (failedPDA == asmp->parityInfo) {
							failedPDA->next->col = failedPDA->col;
						} else
							if (failedPDA == asmp->parityInfo->next) {	/* paranoid: should never occur */
								asmp->parityInfo->col = failedPDA->col;
							}
					}
				}

				RF_ASSERT(failedPDA->col != -1);

				if (rf_dagDebug || rf_mapDebug) {
					printf("raid%d: Redirected type '%c' c %d o %ld -> c %d o %ld\n",
					    raidPtr->raidid, type, oc, (long) oo, failedPDA->col, (long) failedPDA->startSector);
				}
				asmp->numDataFailed = asmp->numParityFailed = 0;
			}
		}
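	/* reads use either the fault-free or the degraded-read DAG,
	 * depending on whether the data being read has failed */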
	if (type == RF_IO_TYPE_READ) {

		if (asmp->numDataFailed == 0)
			*createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;
		else
			*createFunc = (RF_VoidFuncPtr) rf_CreateRaidFiveDegradedReadDAG;

	} else {


		/* if mirroring, always use large writes.  If the access
		 * requires two distinct parity updates, always do a small
		 * write.  If the stripe contains a failure but the access
		 * does not, do a small write.  The first conditional
		 * (numStripeUnitsAccessed <= numDataCol/2) uses a
		 * less-than-or-equal rather than just a less-than because
		 * when G is 3 or 4, numDataCol/2 is 1, and I want
		 * single-stripe-unit updates to use just one disk. */
		if ((asmp->numDataFailed + asmp->numParityFailed) == 0) {
			if (((asmp->numStripeUnitsAccessed <=
			    (layoutPtr->numDataCol / 2)) &&
			    (layoutPtr->numDataCol != 1)) ||
			    (asmp->parityInfo->next != NULL) ||
			    rf_CheckStripeForFailures(raidPtr, asmp)) {
				*createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingSmallWriteDAG;
			} else
				*createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingLargeWriteDAG;
		} else
			if (asmp->numParityFailed == 1)
				*createFunc = (RF_VoidFuncPtr) rf_CreateNonRedundantWriteDAG;
			else
				if (asmp->numStripeUnitsAccessed != 1 &&
				    failedPDA->numSector != layoutPtr->sectorsPerStripeUnit)
					*createFunc = NULL;
				else
					*createFunc = (RF_VoidFuncPtr) rf_CreateDegradedWriteDAG;
	}
}
#endif				/* RF_INCLUDE_PARITYLOGGING > 0 */