rf_paritylogging.c revision 1.7 1 /* $NetBSD: rf_paritylogging.c,v 1.7 2000/01/09 03:28:11 oster Exp $ */
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: William V. Courtright II
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29
30 /*
31 parity logging configuration, DAG selection, and mapping are implemented here
32 */
33
34 #include "rf_archs.h"
35
36 #if RF_INCLUDE_PARITYLOGGING > 0
37
38 #include "rf_types.h"
39 #include "rf_raid.h"
40 #include "rf_dag.h"
41 #include "rf_dagutils.h"
42 #include "rf_dagfuncs.h"
43 #include "rf_dagffrd.h"
44 #include "rf_dagffwr.h"
45 #include "rf_dagdegrd.h"
46 #include "rf_dagdegwr.h"
47 #include "rf_paritylog.h"
48 #include "rf_paritylogDiskMgr.h"
49 #include "rf_paritylogging.h"
50 #include "rf_parityloggingdags.h"
51 #include "rf_general.h"
52 #include "rf_map.h"
53 #include "rf_utils.h"
54 #include "rf_shutdown.h"
55
56 typedef struct RF_ParityLoggingConfigInfo_s {
57 RF_RowCol_t **stripeIdentifier; /* filled in at config time & used by
58 * IdentifyStripe */
59 } RF_ParityLoggingConfigInfo_t;
60
61 static void FreeRegionInfo(RF_Raid_t * raidPtr, RF_RegionId_t regionID);
62 static void rf_ShutdownParityLogging(RF_ThreadArg_t arg);
63 static void rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg);
64 static void rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg);
65 static void rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg);
66 static void rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg);
67 static void rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg);
68
69 int
70 rf_ConfigureParityLogging(
71 RF_ShutdownList_t ** listp,
72 RF_Raid_t * raidPtr,
73 RF_Config_t * cfgPtr)
74 {
75 int i, j, startdisk, rc;
76 RF_SectorCount_t totalLogCapacity, fragmentation, lastRegionCapacity;
77 RF_SectorCount_t parityBufferCapacity, maxRegionParityRange;
78 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
79 RF_ParityLoggingConfigInfo_t *info;
80 RF_ParityLog_t *l = NULL, *next;
81 caddr_t lHeapPtr;
82
83 if (rf_numParityRegions <= 0)
84 return(EINVAL);
85
86 /*
87 * We create multiple entries on the shutdown list here, since
88 * this configuration routine is fairly complicated in and of
89 * itself, and this makes backing out of a failed configuration
90 * much simpler.
91 */
92
93 raidPtr->numSectorsPerLog = RF_DEFAULT_NUM_SECTORS_PER_LOG;
94
95 /* create a parity logging configuration structure */
96 RF_MallocAndAdd(info, sizeof(RF_ParityLoggingConfigInfo_t), (RF_ParityLoggingConfigInfo_t *), raidPtr->cleanupList);
97 if (info == NULL)
98 return (ENOMEM);
99 layoutPtr->layoutSpecificInfo = (void *) info;
100
101 RF_ASSERT(raidPtr->numRow == 1);
102
103 /* the stripe identifier must identify the disks in each stripe, IN
104 * THE ORDER THAT THEY APPEAR IN THE STRIPE. */
105 info->stripeIdentifier = rf_make_2d_array((raidPtr->numCol), (raidPtr->numCol), raidPtr->cleanupList);
106 if (info->stripeIdentifier == NULL)
107 return (ENOMEM);
108
109 startdisk = 0;
110 for (i = 0; i < (raidPtr->numCol); i++) {
111 for (j = 0; j < (raidPtr->numCol); j++) {
112 info->stripeIdentifier[i][j] = (startdisk + j) % (raidPtr->numCol - 1);
113 }
114 if ((--startdisk) < 0)
115 startdisk = raidPtr->numCol - 1 - 1;
116 }
117
118 /* fill in the remaining layout parameters */
119 layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk;
120 layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
121 layoutPtr->numParityCol = 1;
122 layoutPtr->numParityLogCol = 1;
123 layoutPtr->numDataCol = raidPtr->numCol - layoutPtr->numParityCol - layoutPtr->numParityLogCol;
124 layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
125 layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk;
126 raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;
127
128 raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
129
130 /* configure parity log parameters
131 *
132 * parameter comment/constraints
133 * -------------------------------------------
134 * numParityRegions* all regions (except possibly last)
135 * of equal size
136 * totalInCoreLogCapacity* amount of memory in bytes available
137 * for in-core logs (default 1 MB)
138 * numSectorsPerLog# capacity of an in-core log in sectors
139 * (1 * disk track)
140 * numParityLogs total number of in-core logs,
141 * should be at least numParityRegions
142 * regionLogCapacity size of a region log (except possibly
143 * last one) in sectors
144 * totalLogCapacity total amount of log space in sectors
145 *
146 * where '*' denotes a user settable parameter.
147 * Note that logs are fixed to be the size of a disk track,
148 * value #defined in rf_paritylog.h
149 *
150 */
151
152 totalLogCapacity = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit * layoutPtr->numParityLogCol;
153 raidPtr->regionLogCapacity = totalLogCapacity / rf_numParityRegions;
154 if (rf_parityLogDebug)
155 printf("bytes per sector %d\n", raidPtr->bytesPerSector);
156
157 /* reduce fragmentation within a disk region by adjusting the number
158 * of regions in an attempt to allow an integral number of logs to fit
159 * into a disk region */
160 fragmentation = raidPtr->regionLogCapacity % raidPtr->numSectorsPerLog;
161 if (fragmentation > 0)
162 for (i = 1; i < (raidPtr->numSectorsPerLog / 2); i++) {
163 if (((totalLogCapacity / (rf_numParityRegions + i)) % raidPtr->numSectorsPerLog) < fragmentation) {
164 rf_numParityRegions++;
165 raidPtr->regionLogCapacity = totalLogCapacity / rf_numParityRegions;
166 fragmentation = raidPtr->regionLogCapacity % raidPtr->numSectorsPerLog;
167 }
168 if (((totalLogCapacity / (rf_numParityRegions - i)) % raidPtr->numSectorsPerLog) < fragmentation) {
169 rf_numParityRegions--;
170 raidPtr->regionLogCapacity = totalLogCapacity / rf_numParityRegions;
171 fragmentation = raidPtr->regionLogCapacity % raidPtr->numSectorsPerLog;
172 }
173 }
174 /* ensure integral number of regions per log */
175 raidPtr->regionLogCapacity = (raidPtr->regionLogCapacity / raidPtr->numSectorsPerLog) * raidPtr->numSectorsPerLog;
176
177 raidPtr->numParityLogs = rf_totalInCoreLogCapacity / (raidPtr->bytesPerSector * raidPtr->numSectorsPerLog);
178 /* to avoid deadlock, must ensure that enough logs exist for each
179 * region to have one simultaneously */
180 if (raidPtr->numParityLogs < rf_numParityRegions)
181 raidPtr->numParityLogs = rf_numParityRegions;
182
183 /* create region information structs */
184 RF_Malloc(raidPtr->regionInfo, (rf_numParityRegions * sizeof(RF_RegionInfo_t)), (RF_RegionInfo_t *));
185 if (raidPtr->regionInfo == NULL)
186 return (ENOMEM);
187
188 /* last region may not be full capacity */
189 lastRegionCapacity = raidPtr->regionLogCapacity;
190 while ((rf_numParityRegions - 1) * raidPtr->regionLogCapacity + lastRegionCapacity > totalLogCapacity)
191 lastRegionCapacity = lastRegionCapacity - raidPtr->numSectorsPerLog;
192
193 raidPtr->regionParityRange = raidPtr->sectorsPerDisk / rf_numParityRegions;
194 maxRegionParityRange = raidPtr->regionParityRange;
195
196 /* i can't remember why this line is in the code -wvcii 6/30/95 */
197 /* if (raidPtr->sectorsPerDisk % rf_numParityRegions > 0)
198 regionParityRange++; */
199
200 /* build pool of unused parity logs */
201 RF_Malloc(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * raidPtr->numSectorsPerLog * raidPtr->bytesPerSector, (caddr_t));
202 if (raidPtr->parityLogBufferHeap == NULL)
203 return (ENOMEM);
204 lHeapPtr = raidPtr->parityLogBufferHeap;
205 rc = rf_mutex_init(&raidPtr->parityLogPool.mutex);
206 if (rc) {
207 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
208 __LINE__, rc);
209 RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * raidPtr->numSectorsPerLog * raidPtr->bytesPerSector);
210 return (ENOMEM);
211 }
212 for (i = 0; i < raidPtr->numParityLogs; i++) {
213 if (i == 0) {
214 RF_Calloc(raidPtr->parityLogPool.parityLogs, 1, sizeof(RF_ParityLog_t), (RF_ParityLog_t *));
215 if (raidPtr->parityLogPool.parityLogs == NULL) {
216 RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * raidPtr->numSectorsPerLog * raidPtr->bytesPerSector);
217 return (ENOMEM);
218 }
219 l = raidPtr->parityLogPool.parityLogs;
220 } else {
221 RF_Calloc(l->next, 1, sizeof(RF_ParityLog_t), (RF_ParityLog_t *));
222 if (l->next == NULL) {
223 RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * raidPtr->numSectorsPerLog * raidPtr->bytesPerSector);
224 for (l = raidPtr->parityLogPool.parityLogs; l; l = next) {
225 next = l->next;
226 if (l->records)
227 RF_Free(l->records, (raidPtr->numSectorsPerLog * sizeof(RF_ParityLogRecord_t)));
228 RF_Free(l, sizeof(RF_ParityLog_t));
229 }
230 return (ENOMEM);
231 }
232 l = l->next;
233 }
234 l->bufPtr = lHeapPtr;
235 lHeapPtr += raidPtr->numSectorsPerLog * raidPtr->bytesPerSector;
236 RF_Malloc(l->records, (raidPtr->numSectorsPerLog * sizeof(RF_ParityLogRecord_t)), (RF_ParityLogRecord_t *));
237 if (l->records == NULL) {
238 RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * raidPtr->numSectorsPerLog * raidPtr->bytesPerSector);
239 for (l = raidPtr->parityLogPool.parityLogs; l; l = next) {
240 next = l->next;
241 if (l->records)
242 RF_Free(l->records, (raidPtr->numSectorsPerLog * sizeof(RF_ParityLogRecord_t)));
243 RF_Free(l, sizeof(RF_ParityLog_t));
244 }
245 return (ENOMEM);
246 }
247 }
248 rc = rf_ShutdownCreate(listp, rf_ShutdownParityLoggingPool, raidPtr);
249 if (rc) {
250 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
251 __LINE__, rc);
252 rf_ShutdownParityLoggingPool(raidPtr);
253 return (rc);
254 }
255 /* build pool of region buffers */
256 rc = rf_mutex_init(&raidPtr->regionBufferPool.mutex);
257 if (rc) {
258 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
259 __LINE__, rc);
260 return (ENOMEM);
261 }
262 rc = rf_cond_init(&raidPtr->regionBufferPool.cond);
263 if (rc) {
264 RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__,
265 __LINE__, rc);
266 rf_mutex_destroy(&raidPtr->regionBufferPool.mutex);
267 return (ENOMEM);
268 }
269 raidPtr->regionBufferPool.bufferSize = raidPtr->regionLogCapacity * raidPtr->bytesPerSector;
270 printf("regionBufferPool.bufferSize %d\n", raidPtr->regionBufferPool.bufferSize);
271 raidPtr->regionBufferPool.totalBuffers = 1; /* for now, only one
272 * region at a time may
273 * be reintegrated */
274 raidPtr->regionBufferPool.availableBuffers = raidPtr->regionBufferPool.totalBuffers;
275 raidPtr->regionBufferPool.availBuffersIndex = 0;
276 raidPtr->regionBufferPool.emptyBuffersIndex = 0;
277 RF_Malloc(raidPtr->regionBufferPool.buffers, raidPtr->regionBufferPool.totalBuffers * sizeof(caddr_t), (caddr_t *));
278 if (raidPtr->regionBufferPool.buffers == NULL) {
279 rf_mutex_destroy(&raidPtr->regionBufferPool.mutex);
280 rf_cond_destroy(&raidPtr->regionBufferPool.cond);
281 return (ENOMEM);
282 }
283 for (i = 0; i < raidPtr->regionBufferPool.totalBuffers; i++) {
284 RF_Malloc(raidPtr->regionBufferPool.buffers[i], raidPtr->regionBufferPool.bufferSize * sizeof(char), (caddr_t));
285 if (raidPtr->regionBufferPool.buffers[i] == NULL) {
286 rf_mutex_destroy(&raidPtr->regionBufferPool.mutex);
287 rf_cond_destroy(&raidPtr->regionBufferPool.cond);
288 for (j = 0; j < i; j++) {
289 RF_Free(raidPtr->regionBufferPool.buffers[i], raidPtr->regionBufferPool.bufferSize * sizeof(char));
290 }
291 RF_Free(raidPtr->regionBufferPool.buffers, raidPtr->regionBufferPool.totalBuffers * sizeof(caddr_t));
292 return (ENOMEM);
293 }
294 printf("raidPtr->regionBufferPool.buffers[%d] = %lx\n", i,
295 (long) raidPtr->regionBufferPool.buffers[i]);
296 }
297 rc = rf_ShutdownCreate(listp, rf_ShutdownParityLoggingRegionBufferPool, raidPtr);
298 if (rc) {
299 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
300 __LINE__, rc);
301 rf_ShutdownParityLoggingRegionBufferPool(raidPtr);
302 return (rc);
303 }
304 /* build pool of parity buffers */
305 parityBufferCapacity = maxRegionParityRange;
306 rc = rf_mutex_init(&raidPtr->parityBufferPool.mutex);
307 if (rc) {
308 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
309 __LINE__, rc);
310 return (rc);
311 }
312 rc = rf_cond_init(&raidPtr->parityBufferPool.cond);
313 if (rc) {
314 RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__,
315 __LINE__, rc);
316 rf_mutex_destroy(&raidPtr->parityBufferPool.mutex);
317 return (ENOMEM);
318 }
319 raidPtr->parityBufferPool.bufferSize = parityBufferCapacity * raidPtr->bytesPerSector;
320 printf("parityBufferPool.bufferSize %d\n", raidPtr->parityBufferPool.bufferSize);
321 raidPtr->parityBufferPool.totalBuffers = 1; /* for now, only one
322 * region at a time may
323 * be reintegrated */
324 raidPtr->parityBufferPool.availableBuffers = raidPtr->parityBufferPool.totalBuffers;
325 raidPtr->parityBufferPool.availBuffersIndex = 0;
326 raidPtr->parityBufferPool.emptyBuffersIndex = 0;
327 RF_Malloc(raidPtr->parityBufferPool.buffers, raidPtr->parityBufferPool.totalBuffers * sizeof(caddr_t), (caddr_t *));
328 if (raidPtr->parityBufferPool.buffers == NULL) {
329 rf_mutex_destroy(&raidPtr->parityBufferPool.mutex);
330 rf_cond_destroy(&raidPtr->parityBufferPool.cond);
331 return (ENOMEM);
332 }
333 for (i = 0; i < raidPtr->parityBufferPool.totalBuffers; i++) {
334 RF_Malloc(raidPtr->parityBufferPool.buffers[i], raidPtr->parityBufferPool.bufferSize * sizeof(char), (caddr_t));
335 if (raidPtr->parityBufferPool.buffers == NULL) {
336 rf_mutex_destroy(&raidPtr->parityBufferPool.mutex);
337 rf_cond_destroy(&raidPtr->parityBufferPool.cond);
338 for (j = 0; j < i; j++) {
339 RF_Free(raidPtr->parityBufferPool.buffers[i], raidPtr->regionBufferPool.bufferSize * sizeof(char));
340 }
341 RF_Free(raidPtr->parityBufferPool.buffers, raidPtr->regionBufferPool.totalBuffers * sizeof(caddr_t));
342 return (ENOMEM);
343 }
344 printf("parityBufferPool.buffers[%d] = %lx\n", i,
345 (long) raidPtr->parityBufferPool.buffers[i]);
346 }
347 rc = rf_ShutdownCreate(listp, rf_ShutdownParityLoggingParityBufferPool, raidPtr);
348 if (rc) {
349 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
350 __LINE__, rc);
351 rf_ShutdownParityLoggingParityBufferPool(raidPtr);
352 return (rc);
353 }
354 /* initialize parityLogDiskQueue */
355 rc = rf_create_managed_mutex(listp, &raidPtr->parityLogDiskQueue.mutex);
356 if (rc) {
357 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
358 __LINE__, rc);
359 return (rc);
360 }
361 rc = rf_create_managed_cond(listp, &raidPtr->parityLogDiskQueue.cond);
362 if (rc) {
363 RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__,
364 __LINE__, rc);
365 return (rc);
366 }
367 raidPtr->parityLogDiskQueue.flushQueue = NULL;
368 raidPtr->parityLogDiskQueue.reintQueue = NULL;
369 raidPtr->parityLogDiskQueue.bufHead = NULL;
370 raidPtr->parityLogDiskQueue.bufTail = NULL;
371 raidPtr->parityLogDiskQueue.reintHead = NULL;
372 raidPtr->parityLogDiskQueue.reintTail = NULL;
373 raidPtr->parityLogDiskQueue.logBlockHead = NULL;
374 raidPtr->parityLogDiskQueue.logBlockTail = NULL;
375 raidPtr->parityLogDiskQueue.reintBlockHead = NULL;
376 raidPtr->parityLogDiskQueue.reintBlockTail = NULL;
377 raidPtr->parityLogDiskQueue.freeDataList = NULL;
378 raidPtr->parityLogDiskQueue.freeCommonList = NULL;
379
380 rc = rf_ShutdownCreate(listp, rf_ShutdownParityLoggingDiskQueue, raidPtr);
381 if (rc) {
382 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
383 __LINE__, rc);
384 return (rc);
385 }
386 for (i = 0; i < rf_numParityRegions; i++) {
387 rc = rf_mutex_init(&raidPtr->regionInfo[i].mutex);
388 if (rc) {
389 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
390 __LINE__, rc);
391 for (j = 0; j < i; j++)
392 FreeRegionInfo(raidPtr, j);
393 RF_Free(raidPtr->regionInfo, (rf_numParityRegions * sizeof(RF_RegionInfo_t)));
394 return (ENOMEM);
395 }
396 rc = rf_mutex_init(&raidPtr->regionInfo[i].reintMutex);
397 if (rc) {
398 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
399 __LINE__, rc);
400 rf_mutex_destroy(&raidPtr->regionInfo[i].mutex);
401 for (j = 0; j < i; j++)
402 FreeRegionInfo(raidPtr, j);
403 RF_Free(raidPtr->regionInfo, (rf_numParityRegions * sizeof(RF_RegionInfo_t)));
404 return (ENOMEM);
405 }
406 raidPtr->regionInfo[i].reintInProgress = RF_FALSE;
407 raidPtr->regionInfo[i].regionStartAddr = raidPtr->regionLogCapacity * i;
408 raidPtr->regionInfo[i].parityStartAddr = raidPtr->regionParityRange * i;
409 if (i < rf_numParityRegions - 1) {
410 raidPtr->regionInfo[i].capacity = raidPtr->regionLogCapacity;
411 raidPtr->regionInfo[i].numSectorsParity = raidPtr->regionParityRange;
412 } else {
413 raidPtr->regionInfo[i].capacity = lastRegionCapacity;
414 raidPtr->regionInfo[i].numSectorsParity = raidPtr->sectorsPerDisk - raidPtr->regionParityRange * i;
415 if (raidPtr->regionInfo[i].numSectorsParity > maxRegionParityRange)
416 maxRegionParityRange = raidPtr->regionInfo[i].numSectorsParity;
417 }
418 raidPtr->regionInfo[i].diskCount = 0;
419 RF_ASSERT(raidPtr->regionInfo[i].capacity + raidPtr->regionInfo[i].regionStartAddr <= totalLogCapacity);
420 RF_ASSERT(raidPtr->regionInfo[i].parityStartAddr + raidPtr->regionInfo[i].numSectorsParity <= raidPtr->sectorsPerDisk);
421 RF_Malloc(raidPtr->regionInfo[i].diskMap, (raidPtr->regionInfo[i].capacity * sizeof(RF_DiskMap_t)), (RF_DiskMap_t *));
422 if (raidPtr->regionInfo[i].diskMap == NULL) {
423 rf_mutex_destroy(&raidPtr->regionInfo[i].mutex);
424 rf_mutex_destroy(&raidPtr->regionInfo[i].reintMutex);
425 for (j = 0; j < i; j++)
426 FreeRegionInfo(raidPtr, j);
427 RF_Free(raidPtr->regionInfo, (rf_numParityRegions * sizeof(RF_RegionInfo_t)));
428 return (ENOMEM);
429 }
430 raidPtr->regionInfo[i].loggingEnabled = RF_FALSE;
431 raidPtr->regionInfo[i].coreLog = NULL;
432 }
433 rc = rf_ShutdownCreate(listp, rf_ShutdownParityLoggingRegionInfo, raidPtr);
434 if (rc) {
435 RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
436 __LINE__, rc);
437 rf_ShutdownParityLoggingRegionInfo(raidPtr);
438 return (rc);
439 }
440 RF_ASSERT(raidPtr->parityLogDiskQueue.threadState == 0);
441 raidPtr->parityLogDiskQueue.threadState = RF_PLOG_CREATED;
442 rc = RF_CREATE_THREAD(raidPtr->pLogDiskThreadHandle, rf_ParityLoggingDiskManager, raidPtr,"rf_log");
443 if (rc) {
444 raidPtr->parityLogDiskQueue.threadState = 0;
445 RF_ERRORMSG3("Unable to create parity logging disk thread file %s line %d rc=%d\n",
446 __FILE__, __LINE__, rc);
447 return (ENOMEM);
448 }
449 /* wait for thread to start */
450 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
451 while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_RUNNING)) {
452 RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond, raidPtr->parityLogDiskQueue.mutex);
453 }
454 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
455
456 rc = rf_ShutdownCreate(listp, rf_ShutdownParityLogging, raidPtr);
457 if (rc) {
458 RF_ERRORMSG1("Got rc=%d adding parity logging shutdown event\n", rc);
459 rf_ShutdownParityLogging(raidPtr);
460 return (rc);
461 }
462 if (rf_parityLogDebug) {
463 printf(" size of disk log in sectors: %d\n",
464 (int) totalLogCapacity);
465 printf(" total number of parity regions is %d\n", (int) rf_numParityRegions);
466 printf(" nominal sectors of log per parity region is %d\n", (int) raidPtr->regionLogCapacity);
467 printf(" nominal region fragmentation is %d sectors\n", (int) fragmentation);
468 printf(" total number of parity logs is %d\n", raidPtr->numParityLogs);
469 printf(" parity log size is %d sectors\n", raidPtr->numSectorsPerLog);
470 printf(" total in-core log space is %d bytes\n", (int) rf_totalInCoreLogCapacity);
471 }
472 rf_EnableParityLogging(raidPtr);
473
474 return (0);
475 }
476
477 static void
478 FreeRegionInfo(
479 RF_Raid_t * raidPtr,
480 RF_RegionId_t regionID)
481 {
482 RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
483 RF_Free(raidPtr->regionInfo[regionID].diskMap, (raidPtr->regionInfo[regionID].capacity * sizeof(RF_DiskMap_t)));
484 if (!rf_forceParityLogReint && raidPtr->regionInfo[regionID].coreLog) {
485 rf_ReleaseParityLogs(raidPtr, raidPtr->regionInfo[regionID].coreLog);
486 raidPtr->regionInfo[regionID].coreLog = NULL;
487 } else {
488 RF_ASSERT(raidPtr->regionInfo[regionID].coreLog == NULL);
489 RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == 0);
490 }
491 RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
492 rf_mutex_destroy(&raidPtr->regionInfo[regionID].mutex);
493 rf_mutex_destroy(&raidPtr->regionInfo[regionID].reintMutex);
494 }
495
496
497 static void
498 FreeParityLogQueue(
499 RF_Raid_t * raidPtr,
500 RF_ParityLogQueue_t * queue)
501 {
502 RF_ParityLog_t *l1, *l2;
503
504 RF_LOCK_MUTEX(queue->mutex);
505 l1 = queue->parityLogs;
506 while (l1) {
507 l2 = l1;
508 l1 = l2->next;
509 RF_Free(l2->records, (raidPtr->numSectorsPerLog * sizeof(RF_ParityLogRecord_t)));
510 RF_Free(l2, sizeof(RF_ParityLog_t));
511 }
512 RF_UNLOCK_MUTEX(queue->mutex);
513 rf_mutex_destroy(&queue->mutex);
514 }
515
516
517 static void
518 FreeRegionBufferQueue(RF_RegionBufferQueue_t * queue)
519 {
520 int i;
521
522 RF_LOCK_MUTEX(queue->mutex);
523 if (queue->availableBuffers != queue->totalBuffers) {
524 printf("Attempt to free region queue which is still in use!\n");
525 RF_ASSERT(0);
526 }
527 for (i = 0; i < queue->totalBuffers; i++)
528 RF_Free(queue->buffers[i], queue->bufferSize);
529 RF_Free(queue->buffers, queue->totalBuffers * sizeof(caddr_t));
530 RF_UNLOCK_MUTEX(queue->mutex);
531 rf_mutex_destroy(&queue->mutex);
532 }
533
534 static void
535 rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg)
536 {
537 RF_Raid_t *raidPtr;
538 RF_RegionId_t i;
539
540 raidPtr = (RF_Raid_t *) arg;
541 if (rf_parityLogDebug) {
542 printf("raid%d: ShutdownParityLoggingRegionInfo\n",
543 raidPtr->raidid);
544 }
545 /* free region information structs */
546 for (i = 0; i < rf_numParityRegions; i++)
547 FreeRegionInfo(raidPtr, i);
548 RF_Free(raidPtr->regionInfo, (rf_numParityRegions * sizeof(raidPtr->regionInfo)));
549 raidPtr->regionInfo = NULL;
550 }
551
552 static void
553 rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg)
554 {
555 RF_Raid_t *raidPtr;
556
557 raidPtr = (RF_Raid_t *) arg;
558 if (rf_parityLogDebug) {
559 printf("raid%d: ShutdownParityLoggingPool\n", raidPtr->raidid);
560 }
561 /* free contents of parityLogPool */
562 FreeParityLogQueue(raidPtr, &raidPtr->parityLogPool);
563 RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * raidPtr->numSectorsPerLog * raidPtr->bytesPerSector);
564 }
565
566 static void
567 rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg)
568 {
569 RF_Raid_t *raidPtr;
570
571 raidPtr = (RF_Raid_t *) arg;
572 if (rf_parityLogDebug) {
573 printf("raid%d: ShutdownParityLoggingRegionBufferPool\n",
574 raidPtr->raidid);
575 }
576 FreeRegionBufferQueue(&raidPtr->regionBufferPool);
577 }
578
579 static void
580 rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg)
581 {
582 RF_Raid_t *raidPtr;
583
584 raidPtr = (RF_Raid_t *) arg;
585 if (rf_parityLogDebug) {
586 printf("raid%d: ShutdownParityLoggingParityBufferPool\n",
587 raidPtr->raidid);
588 }
589 FreeRegionBufferQueue(&raidPtr->parityBufferPool);
590 }
591
592 static void
593 rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg)
594 {
595 RF_ParityLogData_t *d;
596 RF_CommonLogData_t *c;
597 RF_Raid_t *raidPtr;
598
599 raidPtr = (RF_Raid_t *) arg;
600 if (rf_parityLogDebug) {
601 printf("raid%d: ShutdownParityLoggingDiskQueue\n",
602 raidPtr->raidid);
603 }
604 /* free disk manager stuff */
605 RF_ASSERT(raidPtr->parityLogDiskQueue.bufHead == NULL);
606 RF_ASSERT(raidPtr->parityLogDiskQueue.bufTail == NULL);
607 RF_ASSERT(raidPtr->parityLogDiskQueue.reintHead == NULL);
608 RF_ASSERT(raidPtr->parityLogDiskQueue.reintTail == NULL);
609 while (raidPtr->parityLogDiskQueue.freeDataList) {
610 d = raidPtr->parityLogDiskQueue.freeDataList;
611 raidPtr->parityLogDiskQueue.freeDataList = raidPtr->parityLogDiskQueue.freeDataList->next;
612 RF_Free(d, sizeof(RF_ParityLogData_t));
613 }
614 while (raidPtr->parityLogDiskQueue.freeCommonList) {
615 c = raidPtr->parityLogDiskQueue.freeCommonList;
616 rf_mutex_destroy(&c->mutex);
617 raidPtr->parityLogDiskQueue.freeCommonList = raidPtr->parityLogDiskQueue.freeCommonList->next;
618 RF_Free(c, sizeof(RF_CommonLogData_t));
619 }
620 }
621
622 static void
623 rf_ShutdownParityLogging(RF_ThreadArg_t arg)
624 {
625 RF_Raid_t *raidPtr;
626
627 raidPtr = (RF_Raid_t *) arg;
628 if (rf_parityLogDebug) {
629 printf("raid%d: ShutdownParityLogging\n", raidPtr->raidid);
630 }
631 /* shutdown disk thread */
632 /* This has the desirable side-effect of forcing all regions to be
633 * reintegrated. This is necessary since all parity log maps are
634 * currently held in volatile memory. */
635
636 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
637 raidPtr->parityLogDiskQueue.threadState |= RF_PLOG_TERMINATE;
638 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
639 RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond);
640 /*
641 * pLogDiskThread will now terminate when queues are cleared
642 * now wait for it to be done
643 */
644 RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
645 while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_SHUTDOWN)) {
646 RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond, raidPtr->parityLogDiskQueue.mutex);
647 }
648 RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
649 if (rf_parityLogDebug) {
650 printf("raid%d: ShutdownParityLogging done (thread completed)\n", raidPtr->raidid);
651 }
652 }
653
654 int
655 rf_GetDefaultNumFloatingReconBuffersParityLogging(RF_Raid_t * raidPtr)
656 {
657 return (20);
658 }
659
660 RF_HeadSepLimit_t
661 rf_GetDefaultHeadSepLimitParityLogging(RF_Raid_t * raidPtr)
662 {
663 return (10);
664 }
665 /* return the region ID for a given RAID address */
666 RF_RegionId_t
667 rf_MapRegionIDParityLogging(
668 RF_Raid_t * raidPtr,
669 RF_SectorNum_t address)
670 {
671 RF_RegionId_t regionID;
672
673 /* regionID = address / (raidPtr->regionParityRange * raidPtr->Layout.numDataCol); */
674 regionID = address / raidPtr->regionParityRange;
675 if (regionID == rf_numParityRegions) {
676 /* last region may be larger than other regions */
677 regionID--;
678 }
679 RF_ASSERT(address >= raidPtr->regionInfo[regionID].parityStartAddr);
680 RF_ASSERT(address < raidPtr->regionInfo[regionID].parityStartAddr + raidPtr->regionInfo[regionID].numSectorsParity);
681 RF_ASSERT(regionID < rf_numParityRegions);
682 return (regionID);
683 }
684
685
686 /* given a logical RAID sector, determine physical disk address of data */
687 void
688 rf_MapSectorParityLogging(
689 RF_Raid_t * raidPtr,
690 RF_RaidAddr_t raidSector,
691 RF_RowCol_t * row,
692 RF_RowCol_t * col,
693 RF_SectorNum_t * diskSector,
694 int remap)
695 {
696 RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
697 *row = 0;
698 /* *col = (SUID % (raidPtr->numCol -
699 * raidPtr->Layout.numParityLogCol)); */
700 *col = SUID % raidPtr->Layout.numDataCol;
701 *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit +
702 (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
703 }
704
705
706 /* given a logical RAID sector, determine physical disk address of parity */
707 void
708 rf_MapParityParityLogging(
709 RF_Raid_t * raidPtr,
710 RF_RaidAddr_t raidSector,
711 RF_RowCol_t * row,
712 RF_RowCol_t * col,
713 RF_SectorNum_t * diskSector,
714 int remap)
715 {
716 RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
717
718 *row = 0;
719 /* *col =
720 * raidPtr->Layout.numDataCol-(SUID/raidPtr->Layout.numDataCol)%(raidPt
721 * r->numCol - raidPtr->Layout.numParityLogCol); */
722 *col = raidPtr->Layout.numDataCol;
723 *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit +
724 (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
725 }
726
727
728 /* given a regionID and sector offset, determine the physical disk address of the parity log */
729 void
730 rf_MapLogParityLogging(
731 RF_Raid_t * raidPtr,
732 RF_RegionId_t regionID,
733 RF_SectorNum_t regionOffset,
734 RF_RowCol_t * row,
735 RF_RowCol_t * col,
736 RF_SectorNum_t * startSector)
737 {
738 *row = 0;
739 *col = raidPtr->numCol - 1;
740 *startSector = raidPtr->regionInfo[regionID].regionStartAddr + regionOffset;
741 }
742
743
744 /* given a regionID, determine the physical disk address of the logged parity for that region */
745 void
746 rf_MapRegionParity(
747 RF_Raid_t * raidPtr,
748 RF_RegionId_t regionID,
749 RF_RowCol_t * row,
750 RF_RowCol_t * col,
751 RF_SectorNum_t * startSector,
752 RF_SectorCount_t * numSector)
753 {
754 *row = 0;
755 *col = raidPtr->numCol - 2;
756 *startSector = raidPtr->regionInfo[regionID].parityStartAddr;
757 *numSector = raidPtr->regionInfo[regionID].numSectorsParity;
758 }
759
760
761 /* given a logical RAID address, determine the participating disks in the stripe */
762 void
763 rf_IdentifyStripeParityLogging(
764 RF_Raid_t * raidPtr,
765 RF_RaidAddr_t addr,
766 RF_RowCol_t ** diskids,
767 RF_RowCol_t * outRow)
768 {
769 RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, addr);
770 RF_ParityLoggingConfigInfo_t *info = (RF_ParityLoggingConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
771 *outRow = 0;
772 *diskids = info->stripeIdentifier[stripeID % raidPtr->numCol];
773 }
774
775
776 void
777 rf_MapSIDToPSIDParityLogging(
778 RF_RaidLayout_t * layoutPtr,
779 RF_StripeNum_t stripeID,
780 RF_StripeNum_t * psID,
781 RF_ReconUnitNum_t * which_ru)
782 {
783 *which_ru = 0;
784 *psID = stripeID;
785 }
786
787
788 /* select an algorithm for performing an access. Returns two pointers,
789 * one to a function that will return information about the DAG, and
790 * another to a function that will create the dag.
791 */
792 void
793 rf_ParityLoggingDagSelect(
794 RF_Raid_t * raidPtr,
795 RF_IoType_t type,
796 RF_AccessStripeMap_t * asmp,
797 RF_VoidFuncPtr * createFunc)
798 {
799 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
800 RF_PhysDiskAddr_t *failedPDA = NULL;
801 RF_RowCol_t frow, fcol;
802 RF_RowStatus_t rstat;
803 int prior_recon;
804
805 RF_ASSERT(RF_IO_IS_R_OR_W(type));
806
807 if (asmp->numDataFailed + asmp->numParityFailed > 1) {
808 RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n");
809 /* *infoFunc = */ *createFunc = NULL;
810 return;
811 } else
812 if (asmp->numDataFailed + asmp->numParityFailed == 1) {
813
814 /* if under recon & already reconstructed, redirect
815 * the access to the spare drive and eliminate the
816 * failure indication */
817 failedPDA = asmp->failedPDAs[0];
818 frow = failedPDA->row;
819 fcol = failedPDA->col;
820 rstat = raidPtr->status[failedPDA->row];
821 prior_recon = (rstat == rf_rs_reconfigured) || (
822 (rstat == rf_rs_reconstructing) ?
823 rf_CheckRUReconstructed(raidPtr->reconControl[frow]->reconMap, failedPDA->startSector) : 0
824 );
825 if (prior_recon) {
826 RF_RowCol_t or = failedPDA->row, oc = failedPDA->col;
827 RF_SectorNum_t oo = failedPDA->startSector;
828 if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) { /* redirect to dist
829 * spare space */
830
831 if (failedPDA == asmp->parityInfo) {
832
833 /* parity has failed */
834 (layoutPtr->map->MapParity) (raidPtr, failedPDA->raidAddress, &failedPDA->row,
835 &failedPDA->col, &failedPDA->startSector, RF_REMAP);
836
837 if (asmp->parityInfo->next) { /* redir 2nd component,
838 * if any */
839 RF_PhysDiskAddr_t *p = asmp->parityInfo->next;
840 RF_SectorNum_t SUoffs = p->startSector % layoutPtr->sectorsPerStripeUnit;
841 p->row = failedPDA->row;
842 p->col = failedPDA->col;
843 p->startSector = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->startSector) +
844 SUoffs; /* cheating:
845 * startSector is not
846 * really a RAID address */
847 }
848 } else
849 if (asmp->parityInfo->next && failedPDA == asmp->parityInfo->next) {
850 RF_ASSERT(0); /* should not ever
851 * happen */
852 } else {
853
854 /* data has failed */
855 (layoutPtr->map->MapSector) (raidPtr, failedPDA->raidAddress, &failedPDA->row,
856 &failedPDA->col, &failedPDA->startSector, RF_REMAP);
857
858 }
859
860 } else { /* redirect to dedicated spare
861 * space */
862
863 failedPDA->row = raidPtr->Disks[frow][fcol].spareRow;
864 failedPDA->col = raidPtr->Disks[frow][fcol].spareCol;
865
866 /* the parity may have two distinct
867 * components, both of which may need
868 * to be redirected */
869 if (asmp->parityInfo->next) {
870 if (failedPDA == asmp->parityInfo) {
871 failedPDA->next->row = failedPDA->row;
872 failedPDA->next->col = failedPDA->col;
873 } else
874 if (failedPDA == asmp->parityInfo->next) { /* paranoid: should
875 * never occur */
876 asmp->parityInfo->row = failedPDA->row;
877 asmp->parityInfo->col = failedPDA->col;
878 }
879 }
880 }
881
882 RF_ASSERT(failedPDA->col != -1);
883
884 if (rf_dagDebug || rf_mapDebug) {
885 printf("raid%d: Redirected type '%c' r %d c %d o %ld -> r %d c %d o %ld\n",
886 raidPtr->raidid, type, or, oc, (long) oo, failedPDA->row, failedPDA->col, (long) failedPDA->startSector);
887 }
888 asmp->numDataFailed = asmp->numParityFailed = 0;
889 }
890 }
891 if (type == RF_IO_TYPE_READ) {
892
893 if (asmp->numDataFailed == 0)
894 *createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;
895 else
896 *createFunc = (RF_VoidFuncPtr) rf_CreateRaidFiveDegradedReadDAG;
897
898 } else {
899
900
901 /* if mirroring, always use large writes. If the access
902 * requires two distinct parity updates, always do a small
903 * write. If the stripe contains a failure but the access
904 * does not, do a small write. The first conditional
905 * (numStripeUnitsAccessed <= numDataCol/2) uses a
906 * less-than-or-equal rather than just a less-than because
907 * when G is 3 or 4, numDataCol/2 is 1, and I want
908 * single-stripe-unit updates to use just one disk. */
909 if ((asmp->numDataFailed + asmp->numParityFailed) == 0) {
910 if (((asmp->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) ||
911 (asmp->parityInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmp)) {
912 *createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingSmallWriteDAG;
913 } else
914 *createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingLargeWriteDAG;
915 } else
916 if (asmp->numParityFailed == 1)
917 *createFunc = (RF_VoidFuncPtr) rf_CreateNonRedundantWriteDAG;
918 else
919 if (asmp->numStripeUnitsAccessed != 1 && failedPDA->numSector != layoutPtr->sectorsPerStripeUnit)
920 *createFunc = NULL;
921 else
922 *createFunc = (RF_VoidFuncPtr) rf_CreateDegradedWriteDAG;
923 }
924 }
925 #endif /* RF_INCLUDE_PARITYLOGGING > 0 */
926