rf_raid5.c revision 1.1 1 /* $NetBSD: rf_raid5.c,v 1.1 1998/11/13 04:20:33 oster Exp $ */
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Mark Holland
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29 /******************************************************************************
30 *
31 * rf_raid5.c -- implements RAID Level 5
32 *
33 *****************************************************************************/
34
35 /*
36 * :
37 * Log: rf_raid5.c,v
38 * Revision 1.26 1996/11/05 21:10:40 jimz
39 * failed pda generalization
40 *
41 * Revision 1.25 1996/07/31 16:56:18 jimz
42 * dataBytesPerStripe, sectorsPerDisk init arch-indep.
43 *
44 * Revision 1.24 1996/07/18 22:57:14 jimz
45 * port simulator to AIX
46 *
47 * Revision 1.23 1996/07/13 00:00:59 jimz
48 * sanitized generalized reconstruction architecture
49 * cleaned up head sep, rbuf problems
50 *
51 * Revision 1.22 1996/06/11 08:54:27 jimz
52 * improved error-checking at configuration time
53 *
54 * Revision 1.21 1996/06/10 11:55:47 jimz
55 * Straightened out some per-array/not-per-array distinctions, fixed
56 * a couple bugs related to confusion. Added shutdown lists. Removed
57 * layout shutdown function (now subsumed by shutdown lists).
58 *
59 * Revision 1.20 1996/06/07 22:26:27 jimz
60 * type-ify which_ru (RF_ReconUnitNum_t)
61 *
62 * Revision 1.19 1996/06/07 21:33:04 jimz
63 * begin using consistent types for sector numbers,
64 * stripe numbers, row+col numbers, recon unit numbers
65 *
66 * Revision 1.18 1996/06/05 18:06:02 jimz
67 * Major code cleanup. The Great Renaming is now done.
68 * Better modularity. Better typing. Fixed a bunch of
69 * synchronization bugs. Made a lot of global stuff
70 * per-desc or per-array. Removed dead code.
71 *
72 * Revision 1.17 1996/06/03 23:28:26 jimz
73 * more bugfixes
74 * check in tree to sync for IPDS runs with current bugfixes
75 * there still may be a problem with threads in the script test
76 * getting I/Os stuck- not trivially reproducible (runs ~50 times
77 * in a row without getting stuck)
78 *
79 * Revision 1.16 1996/06/02 17:31:48 jimz
80 * Moved a lot of global stuff into array structure, where it belongs.
81 * Fixed up paritylogging, pss modules in this manner. Some general
82 * code cleanup. Removed lots of dead code, some dead files.
83 *
84 * Revision 1.15 1996/05/31 22:26:54 jimz
85 * fix a lot of mapping problems, memory allocation problems
86 * found some weird lock issues, fixed 'em
87 * more code cleanup
88 *
89 * Revision 1.14 1996/05/30 23:22:16 jimz
90 * bugfixes of serialization, timing problems
91 * more cleanup
92 *
93 * Revision 1.13 1996/05/27 18:56:37 jimz
94 * more code cleanup
95 * better typing
96 * compiles in all 3 environments
97 *
98 * Revision 1.12 1996/05/24 22:17:04 jimz
99 * continue code + namespace cleanup
100 * typed a bunch of flags
101 *
102 * Revision 1.11 1996/05/24 01:59:45 jimz
103 * another checkpoint in code cleanup for release
104 * time to sync kernel tree
105 *
106 * Revision 1.10 1996/05/23 00:33:23 jimz
107 * code cleanup: move all debug decls to rf_options.c, all extern
108 * debug decls to rf_options.h, all debug vars preceded by rf_
109 *
110 * Revision 1.9 1996/05/18 19:51:34 jimz
111 * major code cleanup- fix syntax, make some types consistent,
112 * add prototypes, clean out dead code, et cetera
113 *
114 * Revision 1.8 1996/05/03 19:38:58 wvcii
115 * moved dag creation routines to dag library
116 *
117 * Revision 1.7 1995/12/12 18:10:06 jimz
118 * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
119 * fix 80-column brain damage in comments
120 *
121 * Revision 1.6 1995/12/06 15:04:28 root
122 * added copyright info
123 *
124 * Revision 1.5 1995/11/17 18:59:41 wvcii
125 * added prototyping to MapParity
126 *
127 * Revision 1.4 1995/06/23 13:38:21 robby
 * updated to prototypes in rf_layout.h
129 *
130 */
131
132 #include "rf_types.h"
133 #include "rf_raid.h"
134 #include "rf_raid5.h"
135 #include "rf_dag.h"
136 #include "rf_dagffrd.h"
137 #include "rf_dagffwr.h"
138 #include "rf_dagdegrd.h"
139 #include "rf_dagdegwr.h"
140 #include "rf_dagutils.h"
141 #include "rf_threadid.h"
142 #include "rf_general.h"
143 #include "rf_map.h"
144 #include "rf_utils.h"
145
146 typedef struct RF_Raid5ConfigInfo_s {
147 RF_RowCol_t **stripeIdentifier; /* filled in at config time and used by IdentifyStripe */
148 } RF_Raid5ConfigInfo_t;
149
150 int rf_ConfigureRAID5(
151 RF_ShutdownList_t **listp,
152 RF_Raid_t *raidPtr,
153 RF_Config_t *cfgPtr)
154 {
155 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
156 RF_Raid5ConfigInfo_t *info;
157 RF_RowCol_t i, j, startdisk;
158
159 /* create a RAID level 5 configuration structure */
160 RF_MallocAndAdd(info, sizeof(RF_Raid5ConfigInfo_t), (RF_Raid5ConfigInfo_t *), raidPtr->cleanupList);
161 if (info == NULL)
162 return(ENOMEM);
163 layoutPtr->layoutSpecificInfo = (void *) info;
164
165 RF_ASSERT(raidPtr->numRow == 1);
166
167 /* the stripe identifier must identify the disks in each stripe,
168 * IN THE ORDER THAT THEY APPEAR IN THE STRIPE.
169 */
170 info->stripeIdentifier = rf_make_2d_array(raidPtr->numCol, raidPtr->numCol, raidPtr->cleanupList);
171 if (info->stripeIdentifier == NULL)
172 return(ENOMEM);
173 startdisk = 0;
174 for (i=0; i<raidPtr->numCol; i++) {
175 for (j=0; j<raidPtr->numCol; j++) {
176 info->stripeIdentifier[i][j] = (startdisk + j) % raidPtr->numCol;
177 }
178 if ((--startdisk) < 0) startdisk = raidPtr->numCol-1;
179 }
180
181 /* fill in the remaining layout parameters */
182 layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk;
183 layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
184 layoutPtr->numDataCol = raidPtr->numCol-1;
185 layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
186 layoutPtr->numParityCol = 1;
187 layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk;
188
189 raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
190
191 return(0);
192 }
193
194 int rf_GetDefaultNumFloatingReconBuffersRAID5(RF_Raid_t *raidPtr)
195 {
196 return(20);
197 }
198
199 RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitRAID5(RF_Raid_t *raidPtr)
200 {
201 return(10);
202 }
203
204 #if !defined(__NetBSD__) && !defined(_KERNEL)
205 /* not currently used */
206 int rf_ShutdownRAID5(RF_Raid_t *raidPtr)
207 {
208 return(0);
209 }
210 #endif
211
212 void rf_MapSectorRAID5(
213 RF_Raid_t *raidPtr,
214 RF_RaidAddr_t raidSector,
215 RF_RowCol_t *row,
216 RF_RowCol_t *col,
217 RF_SectorNum_t *diskSector,
218 int remap)
219 {
220 RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
221 *row = 0;
222 *col = (SUID % raidPtr->numCol);
223 *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit +
224 (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
225 }
226
227 void rf_MapParityRAID5(
228 RF_Raid_t *raidPtr,
229 RF_RaidAddr_t raidSector,
230 RF_RowCol_t *row,
231 RF_RowCol_t *col,
232 RF_SectorNum_t *diskSector,
233 int remap)
234 {
235 RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
236
237 *row = 0;
238 *col = raidPtr->Layout.numDataCol-(SUID/raidPtr->Layout.numDataCol)%raidPtr->numCol;
239 *diskSector =(SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit +
240 (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
241 }
242
243 void rf_IdentifyStripeRAID5(
244 RF_Raid_t *raidPtr,
245 RF_RaidAddr_t addr,
246 RF_RowCol_t **diskids,
247 RF_RowCol_t *outRow)
248 {
249 RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, addr);
250 RF_Raid5ConfigInfo_t *info = (RF_Raid5ConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
251
252 *outRow = 0;
253 *diskids = info->stripeIdentifier[ stripeID % raidPtr->numCol ];
254 }
255
256 void rf_MapSIDToPSIDRAID5(
257 RF_RaidLayout_t *layoutPtr,
258 RF_StripeNum_t stripeID,
259 RF_StripeNum_t *psID,
260 RF_ReconUnitNum_t *which_ru)
261 {
262 *which_ru = 0;
263 *psID = stripeID;
264 }
265
266 /* select an algorithm for performing an access. Returns two pointers,
267 * one to a function that will return information about the DAG, and
268 * another to a function that will create the dag.
269 */
270 void rf_RaidFiveDagSelect(
271 RF_Raid_t *raidPtr,
272 RF_IoType_t type,
273 RF_AccessStripeMap_t *asmap,
274 RF_VoidFuncPtr *createFunc)
275 {
276 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
277 RF_PhysDiskAddr_t *failedPDA=NULL;
278 RF_RowCol_t frow, fcol;
279 RF_RowStatus_t rstat;
280 int prior_recon;
281 int tid;
282
283 RF_ASSERT(RF_IO_IS_R_OR_W(type));
284
285 if (asmap->numDataFailed + asmap->numParityFailed > 1) {
286 RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n");
287 /* *infoFunc = */ *createFunc = NULL;
288 return;
289 } else if (asmap->numDataFailed + asmap->numParityFailed == 1) {
290
291 /* if under recon & already reconstructed, redirect the access to the spare drive
292 * and eliminate the failure indication
293 */
294 failedPDA = asmap->failedPDAs[0];
295 frow = failedPDA->row; fcol = failedPDA->col;
296 rstat = raidPtr->status[failedPDA->row];
297 prior_recon = (rstat == rf_rs_reconfigured) || (
298 (rstat == rf_rs_reconstructing) ?
299 rf_CheckRUReconstructed(raidPtr->reconControl[frow]->reconMap, failedPDA->startSector) : 0
300 );
301 if (prior_recon) {
302 RF_RowCol_t or = failedPDA->row,oc=failedPDA->col;
303 RF_SectorNum_t oo=failedPDA->startSector;
304
305 if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) { /* redirect to dist spare space */
306
307 if (failedPDA == asmap->parityInfo) {
308
309 /* parity has failed */
310 (layoutPtr->map->MapParity)(raidPtr, failedPDA->raidAddress, &failedPDA->row,
311 &failedPDA->col, &failedPDA->startSector, RF_REMAP);
312
313 if (asmap->parityInfo->next) { /* redir 2nd component, if any */
314 RF_PhysDiskAddr_t *p = asmap->parityInfo->next;
315 RF_SectorNum_t SUoffs = p->startSector % layoutPtr->sectorsPerStripeUnit;
316 p->row = failedPDA->row;
317 p->col = failedPDA->col;
318 p->startSector = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->startSector) +
319 SUoffs; /* cheating: startSector is not really a RAID address */
320 }
321
322 } else if (asmap->parityInfo->next && failedPDA == asmap->parityInfo->next) {
323 RF_ASSERT(0); /* should not ever happen */
324 } else {
325
326 /* data has failed */
327 (layoutPtr->map->MapSector)(raidPtr, failedPDA->raidAddress, &failedPDA->row,
328 &failedPDA->col, &failedPDA->startSector, RF_REMAP);
329
330 }
331
332 } else { /* redirect to dedicated spare space */
333
334 failedPDA->row = raidPtr->Disks[frow][fcol].spareRow;
335 failedPDA->col = raidPtr->Disks[frow][fcol].spareCol;
336
337 /* the parity may have two distinct components, both of which may need to be redirected */
338 if (asmap->parityInfo->next) {
339 if (failedPDA == asmap->parityInfo) {
340 failedPDA->next->row = failedPDA->row;
341 failedPDA->next->col = failedPDA->col;
342 } else if (failedPDA == asmap->parityInfo->next) { /* paranoid: should never occur */
343 asmap->parityInfo->row = failedPDA->row;
344 asmap->parityInfo->col = failedPDA->col;
345 }
346 }
347 }
348
349 RF_ASSERT(failedPDA->col != -1);
350
351 if (rf_dagDebug || rf_mapDebug) {
352 rf_get_threadid(tid);
353 printf("[%d] Redirected type '%c' r %d c %d o %ld -> r %d c %d o %ld\n",
354 tid,type,or,oc,(long)oo,failedPDA->row,failedPDA->col,
355 (long)failedPDA->startSector);
356 }
357
358 asmap->numDataFailed = asmap->numParityFailed = 0;
359 }
360
361 }
362
363 /* all dags begin/end with block/unblock node
364 * therefore, hdrSucc & termAnt counts should always be 1
365 * also, these counts should not be visible outside dag creation routines -
366 * manipulating the counts here should be removed */
367 if (type == RF_IO_TYPE_READ) {
368 if (asmap->numDataFailed == 0)
369 *createFunc = (RF_VoidFuncPtr)rf_CreateFaultFreeReadDAG;
370 else
371 *createFunc = (RF_VoidFuncPtr)rf_CreateRaidFiveDegradedReadDAG;
372 } else {
373
374
375 /* if mirroring, always use large writes. If the access requires two
376 * distinct parity updates, always do a small write. If the stripe
377 * contains a failure but the access does not, do a small write.
378 * The first conditional (numStripeUnitsAccessed <= numDataCol/2) uses a
379 * less-than-or-equal rather than just a less-than because when G is 3
380 * or 4, numDataCol/2 is 1, and I want single-stripe-unit updates to use
381 * just one disk.
382 */
383 if ( (asmap->numDataFailed + asmap->numParityFailed) == 0) {
384 if (rf_suppressLocksAndLargeWrites ||
385 (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol!=1)) ||
386 (asmap->parityInfo->next!=NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
387 *createFunc = (RF_VoidFuncPtr)rf_CreateSmallWriteDAG;
388 }
389 else
390 *createFunc = (RF_VoidFuncPtr)rf_CreateLargeWriteDAG;
391 }
392 else {
393 if (asmap->numParityFailed == 1)
394 *createFunc = (RF_VoidFuncPtr)rf_CreateNonRedundantWriteDAG;
395 else
396 if (asmap->numStripeUnitsAccessed != 1 && failedPDA->numSector != layoutPtr->sectorsPerStripeUnit)
397 *createFunc = NULL;
398 else
399 *createFunc = (RF_VoidFuncPtr)rf_CreateDegradedWriteDAG;
400 }
401 }
402 }
403