/* $NetBSD: rf_pqdegdags.c,v 1.2 1999/01/26 02:34:00 oster Exp $ */
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Daniel Stodolsky
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * rf_pqdegdags.c
 * Degraded mode dags for double fault cases.
 */

#include "rf_archs.h"

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)

#include "rf_types.h"
#include "rf_raid.h"
#include "rf_dag.h"
#include "rf_dagfuncs.h"
#include "rf_dagutils.h"
#include "rf_etimer.h"
#include "rf_acctrace.h"
#include "rf_general.h"
#include "rf_pqdegdags.h"
#include "rf_pq.h"
#include "rf_sys.h"

static void applyPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda,
    RF_PhysDiskAddr_t *ppda, RF_PhysDiskAddr_t *qpda, void *bp);

/*
   Two data drives have failed, and we are doing a read that covers one of them.
   We may also be reading some of the surviving drives.


 *****************************************************************************************
 *
 * creates a DAG to perform a degraded-mode read of data within one stripe.
 * This DAG is as follows:
 *
 *                      Hdr
 *                       |
 *                     Block
 *             / /       \  \  \  \
 *           Rud ... Rud Rrd ... Rrd Rp Rq
 *           | \     | \ | \     | \ | \ | \
 *           |                             |
 *         Unblock                         X
 *            \                           /
 *             ---------- T --------------
 *
 * Each R node is a successor of the Block node.
 * One successor arc from each R node goes to the Unblock node, and the other to X.
 * There is one Rud for each chunk of surviving user data requested by the user,
 * and one Rrd for each chunk of surviving user data _not_ being read by the user.
 * R = read, ud = user data, rd = recovery (surviving) data, p = P data, q = Q data,
 * X = pq recovery node, T = terminate.
 *
 * The block & unblock nodes are leftovers from a previous version.  They
 * do nothing, but I haven't deleted them because it would be a tremendous
 * effort to put them back in.
 *
 * Note: The target buffer for the recovery node is set to the actual user buffer
 * where the failed data is supposed to end up.  This buffer is zero'd by the code
 * here.  Thus, if you create a degraded read dag, use it, and then re-use it, you
 * have to be sure to zero the target buffer prior to the re-use.
 *
 * Every buffer read is passed to the pq recovery node, whose job it is to sort
 * out what's needed and what's not.
 ****************************************************************************************/
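/*
 * For reference, the algebra the X (recovery) node implements: P is the
 * XOR of all data units, and Q accumulates each data unit scaled by a
 * per-column coefficient in GF(2^8).  After the surviving data has been
 * folded back into the P and Q buffers (see applyPDA() below), the
 * partials P' and Q' satisfy
 *
 *      P' = D_a ^ D_b
 *      Q' = c_a * D_a  ^  c_b * D_b        (GF(2^8) arithmetic)
 *
 * so the two missing units fall out directly.  A minimal byte-at-a-time
 * sketch follows; it is illustrative only.  gf_mul() and gf_div() are
 * hypothetical stand-ins for the table-driven GF(2^8) routines in
 * rf_pq.c, and the real solve is done a machine word at a time by
 * rf_PQ_recover().
 */
#if 0
static void
pq_solve_two_erasures(const unsigned char *pbar, const unsigned char *qbar,
    unsigned char *da, unsigned char *db, int len,
    unsigned char ca, unsigned char cb)
{
	int i;
	unsigned char t;

	for (i = 0; i < len; i++) {
		/* Q' ^ c_b*P' == (c_a ^ c_b) * D_a, so divide that back out */
		t = qbar[i] ^ gf_mul(cb, pbar[i]);
		da[i] = gf_div(t, ca ^ cb);
		/* the second unit is just the remaining XOR */
		db[i] = pbar[i] ^ da[i];
	}
}
#endif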
/* init a disk node with 2 successors and one predecessor */
#define INIT_DISK_NODE(node,name) \
rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \
(node)->succedents[0] = unblockNode; \
(node)->succedents[1] = recoveryNode; \
(node)->antecedents[0] = blockNode; \
(node)->antType[0] = rf_control

#define DISK_NODE_PARAMS(_node_,_p_) \
  (_node_).params[0].p = _p_ ; \
  (_node_).params[1].p = (_p_)->bufPtr; \
  (_node_).params[2].v = parityStripeID; \
  (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru)

#define DISK_NODE_PDA(node)  ((node)->params[0].p)

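/*
 * For illustration, roughly how the two macros above combine when a read
 * DAG is assembled (the real wiring lives in rf_DoubleDegRead(); the node
 * variable and slot here are hypothetical):
 */
#if 0
	RF_DagNode_t *rud;

	rud = &readNodes[n];		/* hypothetical node slot */
	INIT_DISK_NODE(rud, "Rud");	/* succ 0 -> Unblock, succ 1 -> X */
	DISK_NODE_PARAMS(*rud, pda);	/* disk read of pda into pda->bufPtr */
#endif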
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
{
	rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
	    "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
}

static void
applyPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda, RF_PhysDiskAddr_t *ppda,
    RF_PhysDiskAddr_t *qpda, void *bp)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
	RF_SectorCount_t s0len = ppda->numSector, len;
	RF_SectorNum_t suoffset;
	unsigned coeff;
	char *pbuf = ppda->bufPtr;
	char *qbuf = qpda->bufPtr;
	char *buf;
	int delta;

	suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
	len = pda->numSector;
	/* see if pda intersects a recovery pda */
	if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) {
		buf = pda->bufPtr;
		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
		coeff = (coeff % raidPtr->Layout.numDataCol);

		if (suoffset < s0off) {
			delta = s0off - suoffset;
			/* advance the buffer by bytes, not stripe unit IDs */
			buf += rf_RaidAddressToByte(raidPtr, delta);
			suoffset = s0off;
			len -= delta;
		}
		if (suoffset > s0off) {
			delta = suoffset - s0off;
			pbuf += rf_RaidAddressToByte(raidPtr, delta);
			qbuf += rf_RaidAddressToByte(raidPtr, delta);
		}
		if ((suoffset + len) > (s0len + s0off))
			len = s0len + s0off - suoffset;

		/* src, dest, len */
		rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp);

		/* dest, src, len, coeff */
		rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf,
		    rf_RaidAddressToByte(raidPtr, len), coeff);
	}
}
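/*
 * applyPDA() leans on two primitives whose calling conventions are noted
 * inline above.  Their intended byte-level effect is roughly the
 * following sketch (illustrative only: the real routines in rf_pq.c and
 * rf_dagfuncs.c work a machine word at a time, and gf_mul() is a
 * hypothetical stand-in for the GF(2^8) table lookup):
 */
#if 0
	/* rf_bxor(src, dest, len, bp):  dest ^= src, byte for byte */
	for (i = 0; i < len; i++)
		dest[i] ^= src[i];

	/* rf_IncQ(dest, src, len, coeff):  dest ^= coeff * src in GF(2^8) */
	for (i = 0; i < len; i++)
		dest[i] ^= gf_mul(coeff, src[i]);
#endif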
/*
   Recover data in the case of a double failure.  There can be two
   result buffers, one for each chunk of data trying to be recovered.
   The params are pdas that have not been range restricted or otherwise
   politely massaged - this should be done here.  The last params are the
   pdas of P and Q, followed by the raidPtr.  The list can look like

   pda, pda, ... , p pda, q pda, raidptr, asm

   or

   pda, pda, ... , p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm

   depending on whether two chunks of recovery data were required.

   The second condition only arises if there are two failed buffers
   whose lengths do not add up to a stripe unit.
*/

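/*
 * A concrete (hypothetical) instance of the first layout: with three
 * surviving-data pdas, the list is
 *
 *     params[0..2] = data pdas, params[3] = p pda, params[4] = q pda,
 *     params[5] = raidptr, params[6] = asm
 *
 * so np == 7 and the data pda count below works out to d = np - 4 = 3.
 * The two-chunk layout adds a second p/q pda pair, giving d = np - 6.
 */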
int
rf_PQDoubleRecoveryFunc(RF_DagNode_t *node)
{
	int np = node->numParams;
	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np-1].p;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-2].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout);
	int d, i;
	unsigned coeff;
	RF_RaidAddr_t sosAddr, suoffset;
	RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
	int two = 0;
	RF_PhysDiskAddr_t *ppda, *ppda2, *qpda, *qpda2, *pda, npda;
	char *buf;
	int numDataCol = layoutPtr->numDataCol;
	RF_Etimer_t timer;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

	RF_ETIMER_START(timer);

	if (asmap->failedPDAs[1] &&
	    (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
		RF_ASSERT(0);
		ppda = node->params[np-6].p;
		ppda2 = node->params[np-5].p;
		qpda = node->params[np-4].p;
		qpda2 = node->params[np-3].p;
		d = (np-6);
		two = 1;
	} else {
		ppda = node->params[np-4].p;
		qpda = node->params[np-3].p;
		d = (np-4);
	}

	for (i = 0; i < d; i++) {
		pda = node->params[i].p;
		buf = pda->bufPtr;
		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
		len = pda->numSector;
		coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
		/* compute the data unit offset within the column */
		coeff = (coeff % raidPtr->Layout.numDataCol);
		/* see if pda intersects a recovery pda */
		applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
		if (two)
			/* fold this pda into the second recovery pair as well */
			applyPDA(raidPtr, pda, ppda2, qpda2, node->dagHdr->bp);
	}

	/*
	 * ok, we got the parity back to the point where we can recover.  We
	 * now need to determine the coeff of the columns that need to be
	 * recovered.  Note that only the single-stripe-unit case is handled
	 * here.
	 */

	if (asmap->failedPDAs[1] == NULL) {	/* only a single stripe unit to recover. */
		pda = asmap->failedPDAs[0];
		sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
		/* need to determine the column of the other failed disk */
		coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
		/* compute the data unit offset within the column */
		coeff = (coeff % raidPtr->Layout.numDataCol);
		for (i = 0; i < numDataCol; i++) {
			npda.raidAddress = sosAddr + (i * secPerSU);
			(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress,
			    &(npda.row), &(npda.col), &(npda.startSector), 0);
			/* skip over dead disks */
			if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
				if (i != coeff)
					break;
		}
		RF_ASSERT(i < numDataCol);
		RF_ASSERT(two == 0);
		/*
		 * recover the data.  Since we only want to recover one
		 * column, we overwrite the parity with the other one.
		 */
		if (coeff < i)	/* recovering 'a' */
			rf_PQ_recover((unsigned long *) ppda->bufPtr,
			    (unsigned long *) qpda->bufPtr,
			    (unsigned long *) pda->bufPtr,
			    (unsigned long *) ppda->bufPtr,
			    rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
		else		/* recovering 'b' */
			rf_PQ_recover((unsigned long *) ppda->bufPtr,
			    (unsigned long *) qpda->bufPtr,
			    (unsigned long *) ppda->bufPtr,
			    (unsigned long *) pda->bufPtr,
			    rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
	} else
		RF_PANIC();

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	if (tracerec)
		tracerec->q_us += RF_ETIMER_VAL_US(timer);
	rf_GenericWakeupFunc(node, 0);
	return (0);
}

int
rf_PQWriteDoubleRecoveryFunc(RF_DagNode_t *node)
{
	/*
	 * The situation:
	 *
	 * We are doing a write that hits only one failed data unit.  The
	 * other failed data unit is not being overwritten, so we need to
	 * generate it.
	 *
	 * For the moment, we assume all the nonfailed data being written is
	 * in the shadow of the failed data unit.  (i.e., either a single
	 * data unit write or the entire failed stripe unit is being
	 * overwritten.)
	 *
	 * Recovery strategy:
	 * apply the recovery data to the parity and q.
	 * Use P & Q to recover the second failed data unit in P.
	 * Zero fill Q, then apply the recovered data to p.
	 * Then apply the data being written to the failed drive.
	 * Then walk through the surviving drives, applying new data when it
	 * exists, otherwise the recovery data.  Quite a mess.
	 *
	 * The params:
	 *
	 * read pda0, read pda1, ... read pda (numDataCol-3),
	 * write pda0, ... , write pda (numStripeUnitAccess - numDataFailed),
	 * failed pda, raidPtr, asmap
	 */

	int np = node->numParams;
	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np-1].p;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-2].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout);
	int i;
	RF_RaidAddr_t sosAddr;
	unsigned coeff;
	RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
	RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
	int numDataCol = layoutPtr->numDataCol;
	RF_Etimer_t timer;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

	RF_ASSERT(node->numResults == 2);
	RF_ASSERT(asmap->failedPDAs[1] == NULL);
	RF_ETIMER_START(timer);
	ppda = node->results[0];
	qpda = node->results[1];
	/* apply the recovery data */
	for (i = 0; i < numDataCol - 2; i++)
		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);

	/* determine the other failed data unit */
	pda = asmap->failedPDAs[0];
	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
	/* need to determine the column of the other failed disk */
	coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
	/* compute the data unit offset within the column */
	coeff = (coeff % raidPtr->Layout.numDataCol);
	for (i = 0; i < numDataCol; i++) {
		npda.raidAddress = sosAddr + (i * secPerSU);
		(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress,
		    &(npda.row), &(npda.col), &(npda.startSector), 0);
		/* skip over dead disks */
		if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
			if (i != coeff)
				break;
	}
	RF_ASSERT(i < numDataCol);
	/*
	 * recover the data.  The column we want to recover we write over the
	 * parity.  The column we don't care about we dump in q.
	 */
	if (coeff < i)		/* recovering 'a' */
		rf_PQ_recover((unsigned long *) ppda->bufPtr,
		    (unsigned long *) qpda->bufPtr,
		    (unsigned long *) ppda->bufPtr,
		    (unsigned long *) qpda->bufPtr,
		    rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
	else			/* recovering 'b' */
		rf_PQ_recover((unsigned long *) ppda->bufPtr,
		    (unsigned long *) qpda->bufPtr,
		    (unsigned long *) qpda->bufPtr,
		    (unsigned long *) ppda->bufPtr,
		    rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);

	/* OK.  The valid data is in P.  Zero fill Q, then inc it into it. */
	bzero(qpda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector));
	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr,
	    rf_RaidAddressToByte(raidPtr, qpda->numSector), i);

	/* now apply all the write data to the buffer */
	/* single stripe unit write case: the failed data is the only thing we are writing. */
	RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
	/* dest, src, len, coeff */
	rf_IncQ((unsigned long *) qpda->bufPtr,
	    (unsigned long *) asmap->failedPDAs[0]->bufPtr,
	    rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
	rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr,
	    rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp);

	/* now apply all the recovery data */
	for (i = 0; i < numDataCol - 2; i++)
		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	if (tracerec)
		tracerec->q_us += RF_ETIMER_VAL_US(timer);

	rf_GenericWakeupFunc(node, 0);
	return (0);
}

RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
{
	RF_PANIC();
}

/*
   Two lost data unit write case.

   There are really two cases here:

   (1) The write completely covers the two lost data units.
       In that case, a reconstruct write that doesn't write the
       failed data units will do the correct thing.  So in this case,
       the dag looks like

            full stripe read of surviving data units (not being overwritten)
            write new data (ignoring failed units)   compute P&Q
            write P&Q

   (2) The write does not completely cover both failed data units
       (but touches at least one of them).  Then we need to do the
       equivalent of a reconstruct read to recover the missing data
       unit from the other stripe.

       For any data we are writing that is not in the "shadow"
       of the failed units, we need to do a four cycle update.
       PANIC on this case for now.
*/

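/*
 * A worked instance of the dispatch below (numbers hypothetical): with
 * sectorsPerSU == 32 and both failed data units fully covered by the
 * write, sum == 32 + 32 == 2 * sectorsPerSU, so the large-write path of
 * case (1) is taken.  A smaller sum that still satisfies the small-write
 * test falls through to rf_PQ_DDSimpleSmallWrite(); anything else panics.
 */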
RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
	int sum;
	int nf = asmap->numDataFailed;

	sum = asmap->failedPDAs[0]->numSector;
	if (nf == 2)
		sum += asmap->failedPDAs[1]->numSector;

	if ((nf == 2) && (sum == (2 * sectorsPerSU))) {
		/* large write case */
		rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
		return;
	}

	if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) {
		/* small write case: no user data falls outside the shadow of the failed units */
		rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
		return;
	}
	RF_PANIC();
}


RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
{
	rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList,
	    "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
}

#endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */