rf_raid5.c revision 1.2 1 1.2 oster /* $NetBSD: rf_raid5.c,v 1.2 1999/01/26 02:34:01 oster Exp $ */
2 1.1 oster /*
3 1.1 oster * Copyright (c) 1995 Carnegie-Mellon University.
4 1.1 oster * All rights reserved.
5 1.1 oster *
6 1.1 oster * Author: Mark Holland
7 1.1 oster *
8 1.1 oster * Permission to use, copy, modify and distribute this software and
9 1.1 oster * its documentation is hereby granted, provided that both the copyright
10 1.1 oster * notice and this permission notice appear in all copies of the
11 1.1 oster * software, derivative works or modified versions, and any portions
12 1.1 oster * thereof, and that both notices appear in supporting documentation.
13 1.1 oster *
14 1.1 oster * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 1.1 oster * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 1.1 oster * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 1.1 oster *
18 1.1 oster * Carnegie Mellon requests users of this software to return to
19 1.1 oster *
20 1.1 oster * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
21 1.1 oster * School of Computer Science
22 1.1 oster * Carnegie Mellon University
23 1.1 oster * Pittsburgh PA 15213-3890
24 1.1 oster *
25 1.1 oster * any improvements or extensions that they make and grant Carnegie the
26 1.1 oster * rights to redistribute these changes.
27 1.1 oster */
28 1.1 oster
29 1.1 oster /******************************************************************************
30 1.1 oster *
31 1.1 oster * rf_raid5.c -- implements RAID Level 5
32 1.1 oster *
33 1.1 oster *****************************************************************************/
34 1.1 oster
35 1.1 oster #include "rf_types.h"
36 1.1 oster #include "rf_raid.h"
37 1.1 oster #include "rf_raid5.h"
38 1.1 oster #include "rf_dag.h"
39 1.1 oster #include "rf_dagffrd.h"
40 1.1 oster #include "rf_dagffwr.h"
41 1.1 oster #include "rf_dagdegrd.h"
42 1.1 oster #include "rf_dagdegwr.h"
43 1.1 oster #include "rf_dagutils.h"
44 1.1 oster #include "rf_threadid.h"
45 1.1 oster #include "rf_general.h"
46 1.1 oster #include "rf_map.h"
47 1.1 oster #include "rf_utils.h"
48 1.1 oster
49 1.1 oster typedef struct RF_Raid5ConfigInfo_s {
50 1.1 oster RF_RowCol_t **stripeIdentifier; /* filled in at config time and used by IdentifyStripe */
51 1.1 oster } RF_Raid5ConfigInfo_t;
52 1.1 oster
53 1.1 oster int rf_ConfigureRAID5(
54 1.1 oster RF_ShutdownList_t **listp,
55 1.1 oster RF_Raid_t *raidPtr,
56 1.1 oster RF_Config_t *cfgPtr)
57 1.1 oster {
58 1.1 oster RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
59 1.1 oster RF_Raid5ConfigInfo_t *info;
60 1.1 oster RF_RowCol_t i, j, startdisk;
61 1.1 oster
62 1.1 oster /* create a RAID level 5 configuration structure */
63 1.1 oster RF_MallocAndAdd(info, sizeof(RF_Raid5ConfigInfo_t), (RF_Raid5ConfigInfo_t *), raidPtr->cleanupList);
64 1.1 oster if (info == NULL)
65 1.1 oster return(ENOMEM);
66 1.1 oster layoutPtr->layoutSpecificInfo = (void *) info;
67 1.1 oster
68 1.1 oster RF_ASSERT(raidPtr->numRow == 1);
69 1.1 oster
70 1.1 oster /* the stripe identifier must identify the disks in each stripe,
71 1.1 oster * IN THE ORDER THAT THEY APPEAR IN THE STRIPE.
72 1.1 oster */
73 1.1 oster info->stripeIdentifier = rf_make_2d_array(raidPtr->numCol, raidPtr->numCol, raidPtr->cleanupList);
74 1.1 oster if (info->stripeIdentifier == NULL)
75 1.1 oster return(ENOMEM);
76 1.1 oster startdisk = 0;
77 1.1 oster for (i=0; i<raidPtr->numCol; i++) {
78 1.1 oster for (j=0; j<raidPtr->numCol; j++) {
79 1.1 oster info->stripeIdentifier[i][j] = (startdisk + j) % raidPtr->numCol;
80 1.1 oster }
81 1.1 oster if ((--startdisk) < 0) startdisk = raidPtr->numCol-1;
82 1.1 oster }
83 1.1 oster
84 1.1 oster /* fill in the remaining layout parameters */
85 1.1 oster layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk;
86 1.1 oster layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
87 1.1 oster layoutPtr->numDataCol = raidPtr->numCol-1;
88 1.1 oster layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
89 1.1 oster layoutPtr->numParityCol = 1;
90 1.1 oster layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk;
91 1.1 oster
92 1.1 oster raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
93 1.1 oster
94 1.1 oster return(0);
95 1.1 oster }
96 1.1 oster
97 1.1 oster int rf_GetDefaultNumFloatingReconBuffersRAID5(RF_Raid_t *raidPtr)
98 1.1 oster {
99 1.1 oster return(20);
100 1.1 oster }
101 1.1 oster
102 1.1 oster RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitRAID5(RF_Raid_t *raidPtr)
103 1.1 oster {
104 1.1 oster return(10);
105 1.1 oster }
106 1.1 oster
107 1.1 oster #if !defined(__NetBSD__) && !defined(_KERNEL)
108 1.1 oster /* not currently used */
109 1.1 oster int rf_ShutdownRAID5(RF_Raid_t *raidPtr)
110 1.1 oster {
111 1.1 oster return(0);
112 1.1 oster }
113 1.1 oster #endif
114 1.1 oster
115 1.1 oster void rf_MapSectorRAID5(
116 1.1 oster RF_Raid_t *raidPtr,
117 1.1 oster RF_RaidAddr_t raidSector,
118 1.1 oster RF_RowCol_t *row,
119 1.1 oster RF_RowCol_t *col,
120 1.1 oster RF_SectorNum_t *diskSector,
121 1.1 oster int remap)
122 1.1 oster {
123 1.1 oster RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
124 1.1 oster *row = 0;
125 1.1 oster *col = (SUID % raidPtr->numCol);
126 1.1 oster *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit +
127 1.1 oster (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
128 1.1 oster }
129 1.1 oster
130 1.1 oster void rf_MapParityRAID5(
131 1.1 oster RF_Raid_t *raidPtr,
132 1.1 oster RF_RaidAddr_t raidSector,
133 1.1 oster RF_RowCol_t *row,
134 1.1 oster RF_RowCol_t *col,
135 1.1 oster RF_SectorNum_t *diskSector,
136 1.1 oster int remap)
137 1.1 oster {
138 1.1 oster RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
139 1.1 oster
140 1.1 oster *row = 0;
141 1.1 oster *col = raidPtr->Layout.numDataCol-(SUID/raidPtr->Layout.numDataCol)%raidPtr->numCol;
142 1.1 oster *diskSector =(SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit +
143 1.1 oster (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
144 1.1 oster }
145 1.1 oster
146 1.1 oster void rf_IdentifyStripeRAID5(
147 1.1 oster RF_Raid_t *raidPtr,
148 1.1 oster RF_RaidAddr_t addr,
149 1.1 oster RF_RowCol_t **diskids,
150 1.1 oster RF_RowCol_t *outRow)
151 1.1 oster {
152 1.1 oster RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, addr);
153 1.1 oster RF_Raid5ConfigInfo_t *info = (RF_Raid5ConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
154 1.1 oster
155 1.1 oster *outRow = 0;
156 1.1 oster *diskids = info->stripeIdentifier[ stripeID % raidPtr->numCol ];
157 1.1 oster }
158 1.1 oster
159 1.1 oster void rf_MapSIDToPSIDRAID5(
160 1.1 oster RF_RaidLayout_t *layoutPtr,
161 1.1 oster RF_StripeNum_t stripeID,
162 1.1 oster RF_StripeNum_t *psID,
163 1.1 oster RF_ReconUnitNum_t *which_ru)
164 1.1 oster {
165 1.1 oster *which_ru = 0;
166 1.1 oster *psID = stripeID;
167 1.1 oster }
168 1.1 oster
169 1.1 oster /* select an algorithm for performing an access. Returns two pointers,
170 1.1 oster * one to a function that will return information about the DAG, and
171 1.1 oster * another to a function that will create the dag.
172 1.1 oster */
173 1.1 oster void rf_RaidFiveDagSelect(
174 1.1 oster RF_Raid_t *raidPtr,
175 1.1 oster RF_IoType_t type,
176 1.1 oster RF_AccessStripeMap_t *asmap,
177 1.1 oster RF_VoidFuncPtr *createFunc)
178 1.1 oster {
179 1.1 oster RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
180 1.1 oster RF_PhysDiskAddr_t *failedPDA=NULL;
181 1.1 oster RF_RowCol_t frow, fcol;
182 1.1 oster RF_RowStatus_t rstat;
183 1.1 oster int prior_recon;
184 1.1 oster int tid;
185 1.1 oster
186 1.1 oster RF_ASSERT(RF_IO_IS_R_OR_W(type));
187 1.1 oster
188 1.1 oster if (asmap->numDataFailed + asmap->numParityFailed > 1) {
189 1.1 oster RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n");
190 1.1 oster /* *infoFunc = */ *createFunc = NULL;
191 1.1 oster return;
192 1.1 oster } else if (asmap->numDataFailed + asmap->numParityFailed == 1) {
193 1.1 oster
194 1.1 oster /* if under recon & already reconstructed, redirect the access to the spare drive
195 1.1 oster * and eliminate the failure indication
196 1.1 oster */
197 1.1 oster failedPDA = asmap->failedPDAs[0];
198 1.1 oster frow = failedPDA->row; fcol = failedPDA->col;
199 1.1 oster rstat = raidPtr->status[failedPDA->row];
200 1.1 oster prior_recon = (rstat == rf_rs_reconfigured) || (
201 1.1 oster (rstat == rf_rs_reconstructing) ?
202 1.1 oster rf_CheckRUReconstructed(raidPtr->reconControl[frow]->reconMap, failedPDA->startSector) : 0
203 1.1 oster );
204 1.1 oster if (prior_recon) {
205 1.1 oster RF_RowCol_t or = failedPDA->row,oc=failedPDA->col;
206 1.1 oster RF_SectorNum_t oo=failedPDA->startSector;
207 1.1 oster
208 1.1 oster if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) { /* redirect to dist spare space */
209 1.1 oster
210 1.1 oster if (failedPDA == asmap->parityInfo) {
211 1.1 oster
212 1.1 oster /* parity has failed */
213 1.1 oster (layoutPtr->map->MapParity)(raidPtr, failedPDA->raidAddress, &failedPDA->row,
214 1.1 oster &failedPDA->col, &failedPDA->startSector, RF_REMAP);
215 1.1 oster
216 1.1 oster if (asmap->parityInfo->next) { /* redir 2nd component, if any */
217 1.1 oster RF_PhysDiskAddr_t *p = asmap->parityInfo->next;
218 1.1 oster RF_SectorNum_t SUoffs = p->startSector % layoutPtr->sectorsPerStripeUnit;
219 1.1 oster p->row = failedPDA->row;
220 1.1 oster p->col = failedPDA->col;
221 1.1 oster p->startSector = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->startSector) +
222 1.1 oster SUoffs; /* cheating: startSector is not really a RAID address */
223 1.1 oster }
224 1.1 oster
225 1.1 oster } else if (asmap->parityInfo->next && failedPDA == asmap->parityInfo->next) {
226 1.1 oster RF_ASSERT(0); /* should not ever happen */
227 1.1 oster } else {
228 1.1 oster
229 1.1 oster /* data has failed */
230 1.1 oster (layoutPtr->map->MapSector)(raidPtr, failedPDA->raidAddress, &failedPDA->row,
231 1.1 oster &failedPDA->col, &failedPDA->startSector, RF_REMAP);
232 1.1 oster
233 1.1 oster }
234 1.1 oster
235 1.1 oster } else { /* redirect to dedicated spare space */
236 1.1 oster
237 1.1 oster failedPDA->row = raidPtr->Disks[frow][fcol].spareRow;
238 1.1 oster failedPDA->col = raidPtr->Disks[frow][fcol].spareCol;
239 1.1 oster
240 1.1 oster /* the parity may have two distinct components, both of which may need to be redirected */
241 1.1 oster if (asmap->parityInfo->next) {
242 1.1 oster if (failedPDA == asmap->parityInfo) {
243 1.1 oster failedPDA->next->row = failedPDA->row;
244 1.1 oster failedPDA->next->col = failedPDA->col;
245 1.1 oster } else if (failedPDA == asmap->parityInfo->next) { /* paranoid: should never occur */
246 1.1 oster asmap->parityInfo->row = failedPDA->row;
247 1.1 oster asmap->parityInfo->col = failedPDA->col;
248 1.1 oster }
249 1.1 oster }
250 1.1 oster }
251 1.1 oster
252 1.1 oster RF_ASSERT(failedPDA->col != -1);
253 1.1 oster
254 1.1 oster if (rf_dagDebug || rf_mapDebug) {
255 1.1 oster rf_get_threadid(tid);
256 1.1 oster printf("[%d] Redirected type '%c' r %d c %d o %ld -> r %d c %d o %ld\n",
257 1.1 oster tid,type,or,oc,(long)oo,failedPDA->row,failedPDA->col,
258 1.1 oster (long)failedPDA->startSector);
259 1.1 oster }
260 1.1 oster
261 1.1 oster asmap->numDataFailed = asmap->numParityFailed = 0;
262 1.1 oster }
263 1.1 oster
264 1.1 oster }
265 1.1 oster
266 1.1 oster /* all dags begin/end with block/unblock node
267 1.1 oster * therefore, hdrSucc & termAnt counts should always be 1
268 1.1 oster * also, these counts should not be visible outside dag creation routines -
269 1.1 oster * manipulating the counts here should be removed */
270 1.1 oster if (type == RF_IO_TYPE_READ) {
271 1.1 oster if (asmap->numDataFailed == 0)
272 1.1 oster *createFunc = (RF_VoidFuncPtr)rf_CreateFaultFreeReadDAG;
273 1.1 oster else
274 1.1 oster *createFunc = (RF_VoidFuncPtr)rf_CreateRaidFiveDegradedReadDAG;
275 1.1 oster } else {
276 1.1 oster
277 1.1 oster
278 1.1 oster /* if mirroring, always use large writes. If the access requires two
279 1.1 oster * distinct parity updates, always do a small write. If the stripe
280 1.1 oster * contains a failure but the access does not, do a small write.
281 1.1 oster * The first conditional (numStripeUnitsAccessed <= numDataCol/2) uses a
282 1.1 oster * less-than-or-equal rather than just a less-than because when G is 3
283 1.1 oster * or 4, numDataCol/2 is 1, and I want single-stripe-unit updates to use
284 1.1 oster * just one disk.
285 1.1 oster */
286 1.1 oster if ( (asmap->numDataFailed + asmap->numParityFailed) == 0) {
287 1.1 oster if (rf_suppressLocksAndLargeWrites ||
288 1.1 oster (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol!=1)) ||
289 1.1 oster (asmap->parityInfo->next!=NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
290 1.1 oster *createFunc = (RF_VoidFuncPtr)rf_CreateSmallWriteDAG;
291 1.1 oster }
292 1.1 oster else
293 1.1 oster *createFunc = (RF_VoidFuncPtr)rf_CreateLargeWriteDAG;
294 1.1 oster }
295 1.1 oster else {
296 1.1 oster if (asmap->numParityFailed == 1)
297 1.1 oster *createFunc = (RF_VoidFuncPtr)rf_CreateNonRedundantWriteDAG;
298 1.1 oster else
299 1.1 oster if (asmap->numStripeUnitsAccessed != 1 && failedPDA->numSector != layoutPtr->sectorsPerStripeUnit)
300 1.1 oster *createFunc = NULL;
301 1.1 oster else
302 1.1 oster *createFunc = (RF_VoidFuncPtr)rf_CreateDegradedWriteDAG;
303 1.1 oster }
304 1.1 oster }
305 1.1 oster }
306