rf_map.c revision 1.1 1 /* $NetBSD: rf_map.c,v 1.1 1998/11/13 04:20:31 oster Exp $ */
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Mark Holland
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29 /**************************************************************************
30 *
31 * map.c -- main code for mapping RAID addresses to physical disk addresses
32 *
33 **************************************************************************/
34
35 /*
36 * :
37 * Log: rf_map.c,v
38 * Revision 1.53 1996/11/05 21:10:40 jimz
39 * failed pda generalization
40 *
41 * Revision 1.52 1996/08/20 19:58:39 jimz
42 * initialize numParityFailed and numQFailed to 0 in MarkFailuresInASMList
43 *
44 * Revision 1.51 1996/08/19 22:26:31 jimz
45 * add Chang's bugfixes for double-disk failures in MarkFailuresInASMList
46 *
47 * Revision 1.50 1996/08/19 21:38:06 jimz
48 * stripeOffset was uninitialized in CheckStripeForFailures
49 *
50 * Revision 1.49 1996/07/31 15:34:56 jimz
51 * evenodd changes; bugfixes for double-degraded archs, generalize
52 * some formerly PQ-only functions
53 *
54 * Revision 1.48 1996/07/27 23:36:08 jimz
55 * Solaris port of simulator
56 *
57 * Revision 1.47 1996/07/22 19:52:16 jimz
58 * switched node params to RF_DagParam_t, a union of
59 * a 64-bit int and a void *, for better portability
60 * attempted hpux port, but failed partway through for
61 * lack of a single C compiler capable of compiling all
62 * source files
63 *
64 * Revision 1.46 1996/06/10 12:50:57 jimz
65 * Add counters to freelists to track number of allocations, frees,
66 * grows, max size, etc. Adjust a couple sets of PRIME params based
67 * on the results.
68 *
69 * Revision 1.45 1996/06/10 11:55:47 jimz
70 * Straightened out some per-array/not-per-array distinctions, fixed
71 * a couple bugs related to confusion. Added shutdown lists. Removed
72 * layout shutdown function (now subsumed by shutdown lists).
73 *
74 * Revision 1.44 1996/06/09 02:36:46 jimz
75 * lots of little crufty cleanup- fixup whitespace
76 * issues, comment #ifdefs, improve typing in some
77 * places (esp size-related)
78 *
79 * Revision 1.43 1996/06/07 21:33:04 jimz
80 * begin using consistent types for sector numbers,
81 * stripe numbers, row+col numbers, recon unit numbers
82 *
83 * Revision 1.42 1996/06/05 18:06:02 jimz
84 * Major code cleanup. The Great Renaming is now done.
85 * Better modularity. Better typing. Fixed a bunch of
86 * synchronization bugs. Made a lot of global stuff
87 * per-desc or per-array. Removed dead code.
88 *
89 * Revision 1.41 1996/06/03 23:28:26 jimz
90 * more bugfixes
91 * check in tree to sync for IPDS runs with current bugfixes
92 * there still may be a problem with threads in the script test
93 * getting I/Os stuck- not trivially reproducible (runs ~50 times
94 * in a row without getting stuck)
95 *
96 * Revision 1.40 1996/05/31 22:26:54 jimz
97 * fix a lot of mapping problems, memory allocation problems
98 * found some weird lock issues, fixed 'em
99 * more code cleanup
100 *
101 * Revision 1.39 1996/05/30 23:22:16 jimz
102 * bugfixes of serialization, timing problems
103 * more cleanup
104 *
105 * Revision 1.38 1996/05/30 11:29:41 jimz
106 * Numerous bug fixes. Stripe lock release code disagreed with the taking code
107 * about when stripes should be locked (I made it consistent: no parity, no lock)
108 * There was a lot of extra serialization of I/Os which I've removed- a lot of
109 * it was to calculate values for the cache code, which is no longer with us.
110 * More types, function, macro cleanup. Added code to properly quiesce the array
111 * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
112 * before. Fixed memory allocation, freeing bugs.
113 *
114 * Revision 1.37 1996/05/27 18:56:37 jimz
115 * more code cleanup
116 * better typing
117 * compiles in all 3 environments
118 *
119 * Revision 1.36 1996/05/23 21:46:35 jimz
120 * checkpoint in code cleanup (release prep)
121 * lots of types, function names have been fixed
122 *
123 * Revision 1.35 1996/05/23 00:33:23 jimz
124 * code cleanup: move all debug decls to rf_options.c, all extern
125 * debug decls to rf_options.h, all debug vars preceded by rf_
126 *
127 * Revision 1.34 1996/05/20 16:14:45 jimz
128 * switch to rf_{mutex,cond}_{init,destroy}
129 *
130 * Revision 1.33 1996/05/18 19:51:34 jimz
131 * major code cleanup- fix syntax, make some types consistent,
132 * add prototypes, clean out dead code, et cetera
133 *
134 * Revision 1.32 1996/05/17 00:51:47 jimz
135 * reformat for readability
136 *
137 * Revision 1.31 1996/05/16 23:06:26 jimz
138 * convert asmhdr to use RF_FREELIST stuff
139 *
140 * Revision 1.30 1996/05/16 19:09:42 jimz
141 * grow init asm freelist to 32
142 *
143 * Revision 1.29 1996/05/16 15:27:55 jimz
144 * prime freelist pumps for asm and pda lists
145 *
146 * Revision 1.28 1996/05/02 14:58:35 jimz
147 * legibility cleanup
148 *
149 * Revision 1.27 1995/12/12 18:10:06 jimz
150 * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
151 * fix 80-column brain damage in comments
152 *
153 * Revision 1.26 1995/12/01 19:25:06 root
154 * added copyright info
155 *
156 * Revision 1.25 1995/11/17 19:01:57 wvcii
157 * added call to MapQ in two fault tolerant case
158 *
159 * Revision 1.24 1995/11/17 15:10:53 wvcii
160 * fixed bug in ASMCheckStatus - ASSERT was using disk sector addresses
161 * rather than raidAddress
162 *
163 * Revision 1.23 1995/07/26 03:26:51 robby
164 * map the allocation and freeing routines for some stuff non-static
165 *
166 * Revision 1.22 1995/06/28 09:33:45 holland
167 * bug fixes related to dist sparing and multiple-row arrays
168 *
169 * Revision 1.21 1995/06/28 04:51:08 holland
170 * added some asserts against zero-length accesses
171 *
172 * Revision 1.20 1995/06/23 13:40:06 robby
173 * updated to prototypes in rf_layout.h
174 *
175 */
176
177 #include "rf_types.h"
178 #include "rf_threadstuff.h"
179 #include "rf_raid.h"
180 #include "rf_general.h"
181 #include "rf_map.h"
182 #include "rf_freelist.h"
183 #include "rf_shutdown.h"
184 #include "rf_sys.h"
185
186 static void rf_FreePDAList(RF_PhysDiskAddr_t *start, RF_PhysDiskAddr_t *end, int count);
187 static void rf_FreeASMList(RF_AccessStripeMap_t *start, RF_AccessStripeMap_t *end,
188 int count);
189
190 /*****************************************************************************************
191 *
192 * MapAccess -- main 1st order mapping routine.
193 *
194 * Maps an access in the RAID address space to the corresponding set of physical disk
195 * addresses. The result is returned as a list of AccessStripeMap structures, one per
196 * stripe accessed. Each ASM structure contains a pointer to a list of PhysDiskAddr
197 * structures, which describe the physical locations touched by the user access. Note
198 * that this routine returns only static mapping information, i.e. the list of physical
199 * addresses returned does not necessarily identify the set of physical locations that
200 * will actually be read or written.
201 *
202 * The routine also maps the parity. The physical disk location returned always
203 * indicates the entire parity unit, even when only a subset of it is being accessed.
204 * This is because an access that is not stripe unit aligned but that spans a stripe
205 * unit boundary may require access two distinct portions of the parity unit, and we
206 * can't yet tell which portion(s) we'll actually need. We leave it up to the algorithm
207 * selection code to decide what subset of the parity unit to access.
208 *
209 * Note that addresses in the RAID address space must always be maintained as
210 * longs, instead of ints.
211 *
212 * This routine returns NULL if numBlocks is 0
213 *
214 ****************************************************************************************/
215
216 RF_AccessStripeMapHeader_t *rf_MapAccess(raidPtr, raidAddress, numBlocks, buffer, remap)
217 RF_Raid_t *raidPtr;
218 RF_RaidAddr_t raidAddress; /* starting address in RAID address space */
219 RF_SectorCount_t numBlocks; /* number of blocks in RAID address space to access */
220 caddr_t buffer; /* buffer to supply/receive data */
221 int remap; /* 1 => remap addresses to spare space */
222 {
223 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
224 RF_AccessStripeMapHeader_t *asm_hdr = NULL;
225 RF_AccessStripeMap_t *asm_list = NULL, *asm_p = NULL;
226 int faultsTolerated = layoutPtr->map->faultsTolerated;
227 RF_RaidAddr_t startAddress = raidAddress; /* we'll change raidAddress along the way */
228 RF_RaidAddr_t endAddress = raidAddress + numBlocks;
229 RF_RaidDisk_t **disks = raidPtr->Disks;
230
231 RF_PhysDiskAddr_t *pda_p, *pda_q;
232 RF_StripeCount_t numStripes = 0;
233 RF_RaidAddr_t stripeRealEndAddress, stripeEndAddress, nextStripeUnitAddress;
234 RF_RaidAddr_t startAddrWithinStripe, lastRaidAddr;
235 RF_StripeCount_t totStripes;
236 RF_StripeNum_t stripeID, lastSID, SUID, lastSUID;
237 RF_AccessStripeMap_t *asmList, *t_asm;
238 RF_PhysDiskAddr_t *pdaList, *t_pda;
239
240 /* allocate all the ASMs and PDAs up front */
241 lastRaidAddr = raidAddress + numBlocks - 1 ;
242 stripeID = rf_RaidAddressToStripeID(layoutPtr, raidAddress);
243 lastSID = rf_RaidAddressToStripeID(layoutPtr, lastRaidAddr);
244 totStripes = lastSID - stripeID + 1;
245 SUID = rf_RaidAddressToStripeUnitID(layoutPtr, raidAddress);
246 lastSUID = rf_RaidAddressToStripeUnitID(layoutPtr, lastRaidAddr);
247
248 asmList = rf_AllocASMList(totStripes);
249 pdaList = rf_AllocPDAList(lastSUID - SUID + 1 + faultsTolerated * totStripes); /* may also need pda(s) per stripe for parity */
250
251 if (raidAddress+numBlocks > raidPtr->totalSectors) {
252 RF_ERRORMSG1("Unable to map access because offset (%d) was invalid\n",
253 (int)raidAddress);
254 return(NULL);
255 }
256
257 if (rf_mapDebug)
258 rf_PrintRaidAddressInfo(raidPtr, raidAddress, numBlocks);
259 for (; raidAddress < endAddress; ) {
260 /* make the next stripe structure */
261 RF_ASSERT(asmList);
262 t_asm = asmList;
263 asmList = asmList->next;
264 bzero((char *)t_asm, sizeof(RF_AccessStripeMap_t));
265 if (!asm_p)
266 asm_list = asm_p = t_asm;
267 else {
268 asm_p->next = t_asm;
269 asm_p = asm_p->next;
270 }
271 numStripes++;
272
273 /* map SUs from current location to the end of the stripe */
274 asm_p->stripeID = /*rf_RaidAddressToStripeID(layoutPtr, raidAddress)*/ stripeID++;
275 stripeRealEndAddress = rf_RaidAddressOfNextStripeBoundary(layoutPtr, raidAddress);
276 stripeEndAddress = RF_MIN(endAddress,stripeRealEndAddress );
277 asm_p->raidAddress = raidAddress;
278 asm_p->endRaidAddress = stripeEndAddress;
279
280 /* map each stripe unit in the stripe */
281 pda_p = NULL;
282 startAddrWithinStripe = raidAddress; /* Raid addr of start of portion of access that is within this stripe */
283 for (; raidAddress < stripeEndAddress; ) {
284 RF_ASSERT(pdaList);
285 t_pda = pdaList;
286 pdaList = pdaList->next;
287 bzero((char *)t_pda, sizeof(RF_PhysDiskAddr_t));
288 if (!pda_p)
289 asm_p->physInfo = pda_p = t_pda;
290 else {
291 pda_p->next = t_pda;
292 pda_p = pda_p->next;
293 }
294
295 pda_p->type = RF_PDA_TYPE_DATA;
296 (layoutPtr->map->MapSector)(raidPtr, raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), remap);
297
298 /* mark any failures we find. failedPDA is don't-care if there is more than one failure */
299 pda_p->raidAddress = raidAddress; /* the RAID address corresponding to this physical disk address */
300 nextStripeUnitAddress = rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, raidAddress);
301 pda_p->numSector = RF_MIN(endAddress, nextStripeUnitAddress) - raidAddress;
302 RF_ASSERT(pda_p->numSector != 0);
303 rf_ASMCheckStatus(raidPtr,pda_p,asm_p,disks,0);
304 pda_p->bufPtr = buffer + rf_RaidAddressToByte(raidPtr, (raidAddress - startAddress));
305 asm_p->totalSectorsAccessed += pda_p->numSector;
306 asm_p->numStripeUnitsAccessed++;
307 asm_p->origRow = pda_p->row; /* redundant but harmless to do this in every loop iteration */
308
309 raidAddress = RF_MIN(endAddress, nextStripeUnitAddress);
310 }
311
312 /* Map the parity. At this stage, the startSector and numSector fields
313 * for the parity unit are always set to indicate the entire parity unit.
314 * We may modify this after mapping the data portion.
315 */
316 switch (faultsTolerated)
317 {
318 case 0:
319 break;
320 case 1: /* single fault tolerant */
321 RF_ASSERT(pdaList);
322 t_pda = pdaList;
323 pdaList = pdaList->next;
324 bzero((char *)t_pda, sizeof(RF_PhysDiskAddr_t));
325 pda_p = asm_p->parityInfo = t_pda;
326 pda_p->type = RF_PDA_TYPE_PARITY;
327 (layoutPtr->map->MapParity)(raidPtr, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe),
328 &(pda_p->row), &(pda_p->col), &(pda_p->startSector), remap);
329 pda_p->numSector = layoutPtr->sectorsPerStripeUnit;
330 /* raidAddr may be needed to find unit to redirect to */
331 pda_p->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe);
332 rf_ASMCheckStatus(raidPtr,pda_p,asm_p,disks,1);
333 rf_ASMParityAdjust(asm_p->parityInfo,startAddrWithinStripe,endAddress,layoutPtr,asm_p);
334
335 break;
336 case 2: /* two fault tolerant */
337 RF_ASSERT(pdaList && pdaList->next);
338 t_pda = pdaList;
339 pdaList = pdaList->next;
340 bzero((char *)t_pda, sizeof(RF_PhysDiskAddr_t));
341 pda_p = asm_p->parityInfo = t_pda;
342 pda_p->type = RF_PDA_TYPE_PARITY;
343 t_pda = pdaList;
344 pdaList = pdaList->next;
345 bzero((char *)t_pda, sizeof(RF_PhysDiskAddr_t));
346 pda_q = asm_p->qInfo = t_pda;
347 pda_q->type = RF_PDA_TYPE_Q;
348 (layoutPtr->map->MapParity)(raidPtr, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe),
349 &(pda_p->row), &(pda_p->col), &(pda_p->startSector), remap);
350 (layoutPtr->map->MapQ)(raidPtr, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe),
351 &(pda_q->row), &(pda_q->col), &(pda_q->startSector), remap);
352 pda_q->numSector = pda_p->numSector = layoutPtr->sectorsPerStripeUnit;
353 /* raidAddr may be needed to find unit to redirect to */
354 pda_p->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe);
355 pda_q->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe);
356 /* failure mode stuff */
357 rf_ASMCheckStatus(raidPtr,pda_p,asm_p,disks,1);
358 rf_ASMCheckStatus(raidPtr,pda_q,asm_p,disks,1);
359 rf_ASMParityAdjust(asm_p->parityInfo,startAddrWithinStripe,endAddress,layoutPtr,asm_p);
360 rf_ASMParityAdjust(asm_p->qInfo,startAddrWithinStripe,endAddress,layoutPtr,asm_p);
361 break;
362 }
363 }
364 RF_ASSERT(asmList == NULL && pdaList == NULL);
365 /* make the header structure */
366 asm_hdr = rf_AllocAccessStripeMapHeader();
367 RF_ASSERT(numStripes == totStripes);
368 asm_hdr->numStripes = numStripes;
369 asm_hdr->stripeMap = asm_list;
370
371 if (rf_mapDebug)
372 rf_PrintAccessStripeMap(asm_hdr);
373 return(asm_hdr);
374 }
375
376 /*****************************************************************************************
377 * This routine walks through an ASM list and marks the PDAs that have failed.
378 * It's called only when a disk failure causes an in-flight DAG to fail.
379 * The parity may consist of two components, but we want to use only one failedPDA
380 * pointer. Thus we set failedPDA to point to the first parity component, and rely
381 * on the rest of the code to do the right thing with this.
382 ****************************************************************************************/
383
384 void rf_MarkFailuresInASMList(raidPtr, asm_h)
385 RF_Raid_t *raidPtr;
386 RF_AccessStripeMapHeader_t *asm_h;
387 {
388 RF_RaidDisk_t **disks = raidPtr->Disks;
389 RF_AccessStripeMap_t *asmap;
390 RF_PhysDiskAddr_t *pda;
391
392 for (asmap = asm_h->stripeMap; asmap; asmap = asmap->next) {
393 asmap->numDataFailed = asmap->numParityFailed = asmap->numQFailed = 0;
394 asmap->numFailedPDAs = 0;
395 bzero((char *)asmap->failedPDAs,
396 RF_MAX_FAILED_PDA*sizeof(RF_PhysDiskAddr_t *));
397 for (pda = asmap->physInfo; pda; pda=pda->next) {
398 if (RF_DEAD_DISK(disks[pda->row][pda->col].status)) {
399 printf("DEAD DISK BOGUSLY DETECTED!!\n");
400 asmap->numDataFailed++;
401 asmap->failedPDAs[asmap->numFailedPDAs] = pda;
402 asmap->numFailedPDAs++;
403 }
404 }
405 pda = asmap->parityInfo;
406 if (pda && RF_DEAD_DISK(disks[pda->row][pda->col].status)) {
407 asmap->numParityFailed++;
408 asmap->failedPDAs[asmap->numFailedPDAs] = pda;
409 asmap->numFailedPDAs++;
410 }
411 pda = asmap->qInfo;
412 if (pda && RF_DEAD_DISK(disks[pda->row][pda->col].status)) {
413 asmap->numQFailed++;
414 asmap->failedPDAs[asmap->numFailedPDAs] = pda;
415 asmap->numFailedPDAs++;
416 }
417 }
418 }
419
420 /*****************************************************************************************
421 *
422 * DuplicateASM -- duplicates an ASM and returns the new one
423 *
424 ****************************************************************************************/
425 RF_AccessStripeMap_t *rf_DuplicateASM(asmap)
426 RF_AccessStripeMap_t *asmap;
427 {
428 RF_AccessStripeMap_t *new_asm;
429 RF_PhysDiskAddr_t *pda, *new_pda, *t_pda;
430
431 new_pda = NULL;
432 new_asm = rf_AllocAccessStripeMapComponent();
433 bcopy((char *)asmap, (char *)new_asm, sizeof(RF_AccessStripeMap_t));
434 new_asm->numFailedPDAs = 0; /* ??? */
435 new_asm->failedPDAs[0] = NULL;
436 new_asm->physInfo = NULL;
437 new_asm->parityInfo = NULL;
438 new_asm->next = NULL;
439
440 for (pda = asmap->physInfo; pda; pda=pda->next) { /* copy the physInfo list */
441 t_pda = rf_AllocPhysDiskAddr();
442 bcopy((char *)pda, (char *)t_pda, sizeof(RF_PhysDiskAddr_t));
443 t_pda->next = NULL;
444 if (!new_asm->physInfo) {new_asm->physInfo = t_pda; new_pda = t_pda;}
445 else {new_pda->next = t_pda; new_pda = new_pda->next;}
446 if (pda == asmap->failedPDAs[0])
447 new_asm->failedPDAs[0] = t_pda;
448 }
449 for (pda = asmap->parityInfo; pda; pda=pda->next) { /* copy the parityInfo list */
450 t_pda = rf_AllocPhysDiskAddr();
451 bcopy((char *)pda, (char *)t_pda, sizeof(RF_PhysDiskAddr_t));
452 t_pda->next = NULL;
453 if (!new_asm->parityInfo) {new_asm->parityInfo = t_pda; new_pda = t_pda;}
454 else {new_pda->next = t_pda; new_pda = new_pda->next;}
455 if (pda == asmap->failedPDAs[0])
456 new_asm->failedPDAs[0] = t_pda;
457 }
458 return(new_asm);
459 }
460
461 /*****************************************************************************************
462 *
463 * DuplicatePDA -- duplicates a PDA and returns the new one
464 *
465 ****************************************************************************************/
466 RF_PhysDiskAddr_t *rf_DuplicatePDA(pda)
467 RF_PhysDiskAddr_t *pda;
468 {
469 RF_PhysDiskAddr_t *new;
470
471 new = rf_AllocPhysDiskAddr();
472 bcopy((char *)pda, (char *)new, sizeof(RF_PhysDiskAddr_t));
473 return(new);
474 }
475
476 /*****************************************************************************************
477 *
478 * routines to allocate and free list elements. The single-element allocation routines
479 * zero the structure before returning it; the list allocators contain no explicit zeroing.
480 *
481 * FreePDAList and FreeASMList are static. They should never be called directly, because
482 * FreeAccessStripeMap takes care of freeing the PhysDiskAddr and AccessStripeMap lists.
483 *
484 ****************************************************************************************/
485
486 static RF_FreeList_t *rf_asmhdr_freelist;
487 #define RF_MAX_FREE_ASMHDR 128
488 #define RF_ASMHDR_INC 16
489 #define RF_ASMHDR_INITIAL 32
490
491 static RF_FreeList_t *rf_asm_freelist;
492 #define RF_MAX_FREE_ASM 192
493 #define RF_ASM_INC 24
494 #define RF_ASM_INITIAL 64
495
496 static RF_FreeList_t *rf_pda_freelist;
497 #define RF_MAX_FREE_PDA 192
498 #define RF_PDA_INC 24
499 #define RF_PDA_INITIAL 64
500
501 /* called at shutdown time. So far, all that is necessary is to release all the free lists */
502 static void rf_ShutdownMapModule(void *);
503 static void rf_ShutdownMapModule(ignored)
504 void *ignored;
505 {
506 RF_FREELIST_DESTROY(rf_asmhdr_freelist,next,(RF_AccessStripeMapHeader_t *));
507 RF_FREELIST_DESTROY(rf_pda_freelist,next,(RF_PhysDiskAddr_t *));
508 RF_FREELIST_DESTROY(rf_asm_freelist,next,(RF_AccessStripeMap_t *));
509 }
510
511 int rf_ConfigureMapModule(listp)
512 RF_ShutdownList_t **listp;
513 {
514 int rc;
515
516 RF_FREELIST_CREATE(rf_asmhdr_freelist, RF_MAX_FREE_ASMHDR,
517 RF_ASMHDR_INC, sizeof(RF_AccessStripeMapHeader_t));
518 if (rf_asmhdr_freelist == NULL) {
519 return(ENOMEM);
520 }
521 RF_FREELIST_CREATE(rf_asm_freelist, RF_MAX_FREE_ASM,
522 RF_ASM_INC, sizeof(RF_AccessStripeMap_t));
523 if (rf_asm_freelist == NULL) {
524 RF_FREELIST_DESTROY(rf_asmhdr_freelist,next,(RF_AccessStripeMapHeader_t *));
525 return(ENOMEM);
526 }
527 RF_FREELIST_CREATE(rf_pda_freelist, RF_MAX_FREE_PDA,
528 RF_PDA_INC, sizeof(RF_PhysDiskAddr_t));
529 if (rf_pda_freelist == NULL) {
530 RF_FREELIST_DESTROY(rf_asmhdr_freelist,next,(RF_AccessStripeMapHeader_t *));
531 RF_FREELIST_DESTROY(rf_pda_freelist,next,(RF_PhysDiskAddr_t *));
532 return(ENOMEM);
533 }
534
535 rc = rf_ShutdownCreate(listp, rf_ShutdownMapModule, NULL);
536 if (rc) {
537 RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__,
538 __LINE__, rc);
539 rf_ShutdownMapModule(NULL);
540 return(rc);
541 }
542
543 RF_FREELIST_PRIME(rf_asmhdr_freelist, RF_ASMHDR_INITIAL,next,
544 (RF_AccessStripeMapHeader_t *));
545 RF_FREELIST_PRIME(rf_asm_freelist, RF_ASM_INITIAL,next,
546 (RF_AccessStripeMap_t *));
547 RF_FREELIST_PRIME(rf_pda_freelist, RF_PDA_INITIAL,next,
548 (RF_PhysDiskAddr_t *));
549
550 return(0);
551 }
552
553 RF_AccessStripeMapHeader_t *rf_AllocAccessStripeMapHeader()
554 {
555 RF_AccessStripeMapHeader_t *p;
556
557 RF_FREELIST_GET(rf_asmhdr_freelist,p,next,(RF_AccessStripeMapHeader_t *));
558 bzero((char *)p, sizeof(RF_AccessStripeMapHeader_t));
559
560 return(p);
561 }
562
563
564 void rf_FreeAccessStripeMapHeader(p)
565 RF_AccessStripeMapHeader_t *p;
566 {
567 RF_FREELIST_FREE(rf_asmhdr_freelist,p,next);
568 }
569
570 RF_PhysDiskAddr_t *rf_AllocPhysDiskAddr()
571 {
572 RF_PhysDiskAddr_t *p;
573
574 RF_FREELIST_GET(rf_pda_freelist,p,next,(RF_PhysDiskAddr_t *));
575 bzero((char *)p, sizeof(RF_PhysDiskAddr_t));
576
577 return(p);
578 }
579
580 /* allocates a list of PDAs, locking the free list only once
581 * when we have to call calloc, we do it one component at a time to simplify
582 * the process of freeing the list at program shutdown. This should not be
583 * much of a performance hit, because it should be very infrequently executed.
584 */
585 RF_PhysDiskAddr_t *rf_AllocPDAList(count)
586 int count;
587 {
588 RF_PhysDiskAddr_t *p = NULL;
589
590 RF_FREELIST_GET_N(rf_pda_freelist,p,next,(RF_PhysDiskAddr_t *),count);
591 return(p);
592 }
593
594 void rf_FreePhysDiskAddr(p)
595 RF_PhysDiskAddr_t *p;
596 {
597 RF_FREELIST_FREE(rf_pda_freelist,p,next);
598 }
599
600 static void rf_FreePDAList(l_start, l_end, count)
601 RF_PhysDiskAddr_t *l_start, *l_end; /* pointers to start and end of list */
602 int count; /* number of elements in list */
603 {
604 RF_FREELIST_FREE_N(rf_pda_freelist,l_start,next,(RF_PhysDiskAddr_t *),count);
605 }
606
607 RF_AccessStripeMap_t *rf_AllocAccessStripeMapComponent()
608 {
609 RF_AccessStripeMap_t *p;
610
611 RF_FREELIST_GET(rf_asm_freelist,p,next,(RF_AccessStripeMap_t *));
612 bzero((char *)p, sizeof(RF_AccessStripeMap_t));
613
614 return(p);
615 }
616
617 /* this is essentially identical to AllocPDAList. I should combine the two.
618 * when we have to call calloc, we do it one component at a time to simplify
619 * the process of freeing the list at program shutdown. This should not be
620 * much of a performance hit, because it should be very infrequently executed.
621 */
622 RF_AccessStripeMap_t *rf_AllocASMList(count)
623 int count;
624 {
625 RF_AccessStripeMap_t *p = NULL;
626
627 RF_FREELIST_GET_N(rf_asm_freelist,p,next,(RF_AccessStripeMap_t *),count);
628 return(p);
629 }
630
631 void rf_FreeAccessStripeMapComponent(p)
632 RF_AccessStripeMap_t *p;
633 {
634 RF_FREELIST_FREE(rf_asm_freelist,p,next);
635 }
636
637 static void rf_FreeASMList(l_start, l_end, count)
638 RF_AccessStripeMap_t *l_start, *l_end;
639 int count;
640 {
641 RF_FREELIST_FREE_N(rf_asm_freelist,l_start,next,(RF_AccessStripeMap_t *),count);
642 }
643
644 void rf_FreeAccessStripeMap(hdr)
645 RF_AccessStripeMapHeader_t *hdr;
646 {
647 RF_AccessStripeMap_t *p, *pt = NULL;
648 RF_PhysDiskAddr_t *pdp, *trailer, *pdaList = NULL, *pdaEnd = NULL;
649 int count = 0, t, asm_count = 0;
650
651 for (p = hdr->stripeMap; p; p=p->next) {
652
653 /* link the 3 pda lists into the accumulating pda list */
654
655 if (!pdaList) pdaList = p->qInfo; else pdaEnd->next = p->qInfo;
656 for (trailer=NULL,pdp=p->qInfo; pdp; ) {trailer = pdp; pdp=pdp->next; count++;}
657 if (trailer) pdaEnd = trailer;
658
659 if (!pdaList) pdaList = p->parityInfo; else pdaEnd->next = p->parityInfo;
660 for (trailer=NULL,pdp=p->parityInfo; pdp; ) {trailer = pdp; pdp=pdp->next; count++;}
661 if (trailer) pdaEnd = trailer;
662
663 if (!pdaList) pdaList = p->physInfo; else pdaEnd->next = p->physInfo;
664 for (trailer=NULL,pdp=p->physInfo; pdp; ) {trailer = pdp; pdp=pdp->next; count++;}
665 if (trailer) pdaEnd = trailer;
666
667 pt = p;
668 asm_count++;
669 }
670
671 /* debug only */
672 for (t=0,pdp=pdaList; pdp; pdp=pdp->next)
673 t++;
674 RF_ASSERT(t == count);
675
676 if (pdaList)
677 rf_FreePDAList(pdaList, pdaEnd, count);
678 rf_FreeASMList(hdr->stripeMap, pt, asm_count);
679 rf_FreeAccessStripeMapHeader(hdr);
680 }
681
682 /* We can't use the large write optimization if there are any failures in the stripe.
683 * In the declustered layout, there is no way to immediately determine what disks
684 * constitute a stripe, so we actually have to hunt through the stripe looking for failures.
685 * The reason we map the parity instead of just using asm->parityInfo->col is because
686 * the latter may have been already redirected to a spare drive, which would
687 * mess up the computation of the stripe offset.
688 *
689 * ASSUMES AT MOST ONE FAILURE IN THE STRIPE.
690 */
691 int rf_CheckStripeForFailures(raidPtr, asmap)
692 RF_Raid_t *raidPtr;
693 RF_AccessStripeMap_t *asmap;
694 {
695 RF_RowCol_t trow, tcol, prow, pcol, *diskids, row, i;
696 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
697 RF_StripeCount_t stripeOffset;
698 int numFailures;
699 RF_RaidAddr_t sosAddr;
700 RF_SectorNum_t diskOffset, poffset;
701 RF_RowCol_t testrow;
702
703 /* quick out in the fault-free case. */
704 RF_LOCK_MUTEX(raidPtr->mutex);
705 numFailures = raidPtr->numFailures;
706 RF_UNLOCK_MUTEX(raidPtr->mutex);
707 if (numFailures == 0) return(0);
708
709 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
710 row = asmap->physInfo->row;
711 (layoutPtr->map->IdentifyStripe)(raidPtr, asmap->raidAddress, &diskids, &testrow);
712 (layoutPtr->map->MapParity)(raidPtr, asmap->raidAddress, &prow, &pcol, &poffset, 0); /* get pcol */
713
714 /* this need not be true if we've redirected the access to a spare in another row
715 RF_ASSERT(row == testrow);
716 */
717 stripeOffset = 0;
718 for (i=0; i<layoutPtr->numDataCol+layoutPtr->numParityCol; i++) {
719 if (diskids[i] != pcol) {
720 if (RF_DEAD_DISK(raidPtr->Disks[testrow][diskids[i]].status)) {
721 if (raidPtr->status[testrow] != rf_rs_reconstructing)
722 return(1);
723 RF_ASSERT(raidPtr->reconControl[testrow]->fcol == diskids[i]);
724 layoutPtr->map->MapSector(raidPtr,
725 sosAddr + stripeOffset * layoutPtr->sectorsPerStripeUnit,
726 &trow, &tcol, &diskOffset, 0);
727 RF_ASSERT( (trow == testrow) && (tcol == diskids[i]) );
728 if (!rf_CheckRUReconstructed(raidPtr->reconControl[testrow]->reconMap, diskOffset))
729 return(1);
730 asmap->flags |= RF_ASM_REDIR_LARGE_WRITE;
731 return(0);
732 }
733 stripeOffset++;
734 }
735 }
736 return(0);
737 }
738
739 /*
740 return the number of failed data units in the stripe.
741 */
742
743 int rf_NumFailedDataUnitsInStripe(raidPtr, asmap)
744 RF_Raid_t *raidPtr;
745 RF_AccessStripeMap_t *asmap;
746 {
747 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
748 RF_RowCol_t trow, tcol, row, i;
749 RF_SectorNum_t diskOffset;
750 RF_RaidAddr_t sosAddr;
751 int numFailures;
752
753 /* quick out in the fault-free case. */
754 RF_LOCK_MUTEX(raidPtr->mutex);
755 numFailures = raidPtr->numFailures;
756 RF_UNLOCK_MUTEX(raidPtr->mutex);
757 if (numFailures == 0) return(0);
758 numFailures = 0;
759
760 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
761 row = asmap->physInfo->row;
762 for (i=0; i<layoutPtr->numDataCol; i++)
763 {
764 (layoutPtr->map->MapSector)(raidPtr, sosAddr + i * layoutPtr->sectorsPerStripeUnit,
765 &trow, &tcol, &diskOffset, 0);
766 if (RF_DEAD_DISK(raidPtr->Disks[trow][tcol].status))
767 numFailures++;
768 }
769
770 return numFailures;
771 }
772
773
774 /*****************************************************************************************
775 *
776 * debug routines
777 *
778 ****************************************************************************************/
779
780 void rf_PrintAccessStripeMap(asm_h)
781 RF_AccessStripeMapHeader_t *asm_h;
782 {
783 rf_PrintFullAccessStripeMap(asm_h, 0);
784 }
785
786 void rf_PrintFullAccessStripeMap(asm_h, prbuf)
787 RF_AccessStripeMapHeader_t *asm_h;
788 int prbuf; /* flag to print buffer pointers */
789 {
790 int i;
791 RF_AccessStripeMap_t *asmap = asm_h->stripeMap;
792 RF_PhysDiskAddr_t *p;
793 printf("%d stripes total\n", (int)asm_h->numStripes);
794 for (; asmap; asmap = asmap->next) {
795 /* printf("Num failures: %d\n",asmap->numDataFailed); */
796 /* printf("Num sectors: %d\n",(int)asmap->totalSectorsAccessed); */
797 printf("Stripe %d (%d sectors), failures: %d data, %d parity: ",
798 (int) asmap->stripeID,
799 (int) asmap->totalSectorsAccessed,
800 (int) asmap->numDataFailed,
801 (int) asmap->numParityFailed);
802 if (asmap->parityInfo) {
803 printf("Parity [r%d c%d s%d-%d", asmap->parityInfo->row, asmap->parityInfo->col,
804 (int)asmap->parityInfo->startSector,
805 (int)(asmap->parityInfo->startSector +
806 asmap->parityInfo->numSector - 1));
807 if (prbuf) printf(" b0x%lx",(unsigned long) asmap->parityInfo->bufPtr);
808 if (asmap->parityInfo->next) {
809 printf(", r%d c%d s%d-%d", asmap->parityInfo->next->row,
810 asmap->parityInfo->next->col,
811 (int) asmap->parityInfo->next->startSector,
812 (int)(asmap->parityInfo->next->startSector +
813 asmap->parityInfo->next->numSector - 1));
814 if (prbuf) printf(" b0x%lx",(unsigned long) asmap->parityInfo->next->bufPtr);
815 RF_ASSERT(asmap->parityInfo->next->next == NULL);
816 }
817 printf("]\n\t");
818 }
819 for (i=0,p=asmap->physInfo; p; p=p->next,i++) {
820 printf("SU r%d c%d s%d-%d ", p->row, p->col, (int)p->startSector,
821 (int)(p->startSector + p->numSector - 1));
822 if (prbuf) printf("b0x%lx ", (unsigned long) p->bufPtr);
823 if (i && !(i&1)) printf("\n\t");
824 }
825 printf("\n");
826 p = asm_h->stripeMap->failedPDAs[0];
827 if (asm_h->stripeMap->numDataFailed + asm_h->stripeMap->numParityFailed > 1) printf("[multiple failures]\n");
828 else if (asm_h->stripeMap->numDataFailed + asm_h->stripeMap->numParityFailed > 0)
829 printf("\t[Failed PDA: r%d c%d s%d-%d]\n",p->row, p->col,
830 (int)p->startSector, (int)(p->startSector + p->numSector-1));
831 }
832 }
833
834 void rf_PrintRaidAddressInfo(raidPtr, raidAddr, numBlocks)
835 RF_Raid_t *raidPtr;
836 RF_RaidAddr_t raidAddr;
837 RF_SectorCount_t numBlocks;
838 {
839 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
840 RF_RaidAddr_t ra, sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, raidAddr);
841
842 printf("Raid addrs of SU boundaries from start of stripe to end of access:\n\t");
843 for (ra = sosAddr; ra <= raidAddr + numBlocks; ra += layoutPtr->sectorsPerStripeUnit) {
844 printf("%d (0x%x), ",(int)ra, (int)ra);
845 }
846 printf("\n");
847 printf("Offset into stripe unit: %d (0x%x)\n",
848 (int)(raidAddr % layoutPtr->sectorsPerStripeUnit),
849 (int)(raidAddr % layoutPtr->sectorsPerStripeUnit));
850 }
851
852 /*
853 given a parity descriptor and the starting address within a stripe,
854 range restrict the parity descriptor to touch only the correct stuff.
855 */
856 void rf_ASMParityAdjust(
857 RF_PhysDiskAddr_t *toAdjust,
858 RF_StripeNum_t startAddrWithinStripe,
859 RF_SectorNum_t endAddress,
860 RF_RaidLayout_t *layoutPtr,
861 RF_AccessStripeMap_t *asm_p)
862 {
863 RF_PhysDiskAddr_t *new_pda;
864
865 /* when we're accessing only a portion of one stripe unit, we want the parity descriptor
866 * to identify only the chunk of parity associated with the data. When the access spans
867 * exactly one stripe unit boundary and is less than a stripe unit in size, it uses two disjoint
868 * regions of the parity unit. When an access spans more than one stripe unit boundary, it
869 * uses all of the parity unit.
870 *
871 * To better handle the case where stripe units are small, we may eventually want to change
872 * the 2nd case so that if the SU size is below some threshold, we just read/write the whole
873 * thing instead of breaking it up into two accesses.
874 */
875 if (asm_p->numStripeUnitsAccessed == 1)
876 {
877 int x = (startAddrWithinStripe % layoutPtr->sectorsPerStripeUnit);
878 toAdjust->startSector += x;
879 toAdjust->raidAddress += x;
880 toAdjust->numSector = asm_p->physInfo->numSector;
881 RF_ASSERT(toAdjust->numSector != 0);
882 }
883 else
884 if (asm_p->numStripeUnitsAccessed == 2 && asm_p->totalSectorsAccessed < layoutPtr->sectorsPerStripeUnit)
885 {
886 int x = (startAddrWithinStripe % layoutPtr->sectorsPerStripeUnit);
887
888 /* create a second pda and copy the parity map info into it */
889 RF_ASSERT(toAdjust->next == NULL);
890 new_pda = toAdjust->next = rf_AllocPhysDiskAddr();
891 *new_pda = *toAdjust; /* structure assignment */
892 new_pda->next = NULL;
893
894 /* adjust the start sector & number of blocks for the first parity pda */
895 toAdjust->startSector += x;
896 toAdjust->raidAddress += x;
897 toAdjust->numSector = rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, startAddrWithinStripe) - startAddrWithinStripe;
898 RF_ASSERT(toAdjust->numSector != 0);
899
900 /* adjust the second pda */
901 new_pda->numSector = endAddress - rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, endAddress);
902 /*new_pda->raidAddress = rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, toAdjust->raidAddress);*/
903 RF_ASSERT(new_pda->numSector != 0);
904 }
905 }
906
907 /*
908 Check if a disk has been spared or failed. If spared,
909 redirect the I/O.
910 If it has been failed, record it in the asm pointer.
911 Fourth arg is whether data or parity.
912 */
913 void rf_ASMCheckStatus(
914 RF_Raid_t *raidPtr,
915 RF_PhysDiskAddr_t *pda_p,
916 RF_AccessStripeMap_t *asm_p,
917 RF_RaidDisk_t **disks,
918 int parity)
919 {
920 RF_DiskStatus_t dstatus;
921 RF_RowCol_t frow, fcol;
922
923 dstatus = disks[pda_p->row][pda_p->col].status;
924
925 if (dstatus == rf_ds_spared) {
926 /* if the disk has been spared, redirect access to the spare */
927 frow = pda_p->row; fcol = pda_p->col;
928 pda_p->row = disks[frow][fcol].spareRow;
929 pda_p->col = disks[frow][fcol].spareCol;
930 }
931 else if (dstatus == rf_ds_dist_spared) {
932 /* ditto if disk has been spared to dist spare space */
933 RF_RowCol_t or = pda_p->row, oc=pda_p->col;
934 RF_SectorNum_t oo = pda_p->startSector;
935
936 if (pda_p -> type == RF_PDA_TYPE_DATA)
937 raidPtr->Layout.map->MapSector(raidPtr, pda_p->raidAddress, &pda_p->row, &pda_p->col, &pda_p->startSector, RF_REMAP);
938 else
939 raidPtr->Layout.map->MapParity(raidPtr, pda_p->raidAddress, &pda_p->row, &pda_p->col, &pda_p->startSector, RF_REMAP);
940
941 if (rf_mapDebug) {
942 printf("Redirected r %d c %d o %d -> r%d c %d o %d\n",or,oc,(int)oo,
943 pda_p->row,pda_p->col,(int)pda_p->startSector);
944 }
945 } else if (RF_DEAD_DISK(dstatus)) {
946 /* if the disk is inaccessible, mark the failure */
947 if (parity)
948 asm_p->numParityFailed++;
949 else {
950 asm_p->numDataFailed++;
951 #if 0
952 /* XXX Do we really want this spewing out on the console? GO */
953 printf("DATA_FAILED!\n");
954 #endif
955 }
956 asm_p->failedPDAs[asm_p->numFailedPDAs] = pda_p;
957 asm_p->numFailedPDAs++;
958 #if 0
959 switch (asm_p->numParityFailed + asm_p->numDataFailed)
960 {
961 case 1:
962 asm_p->failedPDAs[0] = pda_p;
963 break;
964 case 2:
965 asm_p->failedPDAs[1] = pda_p;
966 default:
967 break;
968 }
969 #endif
970 }
971 /* the redirected access should never span a stripe unit boundary */
972 RF_ASSERT(rf_RaidAddressToStripeUnitID(&raidPtr->Layout,pda_p->raidAddress) ==
973 rf_RaidAddressToStripeUnitID(&raidPtr->Layout,pda_p->raidAddress + pda_p->numSector -1));
974 RF_ASSERT(pda_p->col != -1);
975 }
976