/* $NetBSD: rf_pqdegdags.c,v 1.3 1999/02/05 00:06:15 oster Exp $ */
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Daniel Stodolsky
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29 /*
30 * rf_pqdegdags.c
31 * Degraded mode dags for double fault cases.
32 */
33
34
35 #include "rf_archs.h"
36
37 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
38
39 #include "rf_types.h"
40 #include "rf_raid.h"
41 #include "rf_dag.h"
42 #include "rf_dagfuncs.h"
43 #include "rf_dagutils.h"
44 #include "rf_etimer.h"
45 #include "rf_acctrace.h"
46 #include "rf_general.h"
47 #include "rf_pqdegdags.h"
48 #include "rf_pq.h"
49 #include "rf_sys.h"
50
51 static void
52 applyPDA(RF_Raid_t * raidPtr, RF_PhysDiskAddr_t * pda, RF_PhysDiskAddr_t * ppda,
53 RF_PhysDiskAddr_t * qpda, void *bp);
54
55 /*
56 Two data drives have failed, and we are doing a read that covers one of them.
57 We may also be reading some of the surviving drives.
58
59
60 *****************************************************************************************
61 *
62 * creates a DAG to perform a degraded-mode read of data within one stripe.
63 * This DAG is as follows:
64 *
65 * Hdr
66 * |
67 * Block
68 * / / \ \ \ \
69 * Rud ... Rud Rrd ... Rrd Rp Rq
70 * | \ | \ | \ | \ | \ | \
71 *
72 * | |
73 * Unblock X
74 * \ /
75 * ------ T ------
76 *
77 * Each R node is a successor of the L node
78 * One successor arc from each R node goes to U, and the other to X
79 * There is one Rud for each chunk of surviving user data requested by the user,
80 * and one Rrd for each chunk of surviving user data _not_ being read by the user
81 * R = read, ud = user data, rd = recovery (surviving) data, p = P data, q = Qdata
82 * X = pq recovery node, T = terminate
83 *
84 * The block & unblock nodes are leftovers from a previous version. They
85 * do nothing, but I haven't deleted them because it would be a tremendous
86 * effort to put them back in.
87 *
88 * Note: The target buffer for the XOR node is set to the actual user buffer where the
89 * failed data is supposed to end up. This buffer is zero'd by the code here. Thus,
90 * if you create a degraded read dag, use it, and then re-use, you have to be sure to
91 * zero the target buffer prior to the re-use.
92 *
 * Every buffer read is passed to the pq recovery node, whose job it is to sort out what's
 * needed and what's not.
95 ****************************************************************************************/
/*
 * init a disk node with 2 successors and one predecessor.
 * Successor 0 is wired to the unblock node, successor 1 to the recovery
 * node; the single antecedent is the block node.  Relies on dag_h,
 * allocList, unblockNode, recoveryNode and blockNode being in scope at
 * the expansion site.
 */
#define INIT_DISK_NODE(node,name) \
rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \
(node)->succedents[0] = unblockNode; \
(node)->succedents[1] = recoveryNode; \
(node)->antecedents[0] = blockNode; \
(node)->antType[0] = rf_control

/*
 * Fill in the standard 4 params of a disk node: the pda, its buffer,
 * the parity stripe ID, and the packed priority/reconstruct-unit word.
 * Relies on parityStripeID and which_ru being in scope at the
 * expansion site.
 */
#define DISK_NODE_PARAMS(_node_,_p_) \
(_node_).params[0].p = _p_ ; \
(_node_).params[1].p = (_p_)->bufPtr; \
(_node_).params[2].v = parityStripeID; \
(_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru)

/* a disk node's pda is always param 0 */
#define DISK_NODE_PDA(node) ((node)->params[0].p)
111
/*
 * Build the DAG for a double-degraded read: defer to the generic
 * double-degraded read builder, supplying the PQ-specific Q-read node
 * name and the PQ recovery execute function defined below.
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
{
	rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
	    "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
}
117
118 static void
119 applyPDA(raidPtr, pda, ppda, qpda, bp)
120 RF_Raid_t *raidPtr;
121 RF_PhysDiskAddr_t *pda;
122 RF_PhysDiskAddr_t *ppda;
123 RF_PhysDiskAddr_t *qpda;
124 void *bp;
125 {
126 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
127 RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
128 RF_SectorCount_t s0len = ppda->numSector, len;
129 RF_SectorNum_t suoffset;
130 unsigned coeff;
131 char *pbuf = ppda->bufPtr;
132 char *qbuf = qpda->bufPtr;
133 char *buf;
134 int delta;
135
136 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
137 len = pda->numSector;
138 /* see if pda intersects a recovery pda */
139 if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) {
140 buf = pda->bufPtr;
141 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
142 coeff = (coeff % raidPtr->Layout.numDataCol);
143
144 if (suoffset < s0off) {
145 delta = s0off - suoffset;
146 buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
147 suoffset = s0off;
148 len -= delta;
149 }
150 if (suoffset > s0off) {
151 delta = suoffset - s0off;
152 pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
153 qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
154 }
155 if ((suoffset + len) > (s0len + s0off))
156 len = s0len + s0off - suoffset;
157
158 /* src, dest, len */
159 rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp);
160
161 /* dest, src, len, coeff */
162 rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf, rf_RaidAddressToByte(raidPtr, len), coeff);
163 }
164 }
165 /*
166 Recover data in the case of a double failure. There can be two
167 result buffers, one for each chunk of data trying to be recovered.
168 The params are pda's that have not been range restricted or otherwise
169 politely massaged - this should be done here. The last params are the
170 pdas of P and Q, followed by the raidPtr. The list can look like
171
172 pda, pda, ... , p pda, q pda, raidptr, asm
173
174 or
175
176 pda, pda, ... , p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm
177
depending on whether two chunks of recovery data were required.

The second condition only arises if there are two failed buffers
whose lengths do not add up to a stripe unit.
182 */
183
184
185 int
186 rf_PQDoubleRecoveryFunc(node)
187 RF_DagNode_t *node;
188 {
189 int np = node->numParams;
190 RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
191 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
192 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
193 int d, i;
194 unsigned coeff;
195 RF_RaidAddr_t sosAddr, suoffset;
196 RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
197 int two = 0;
198 RF_PhysDiskAddr_t *ppda, *ppda2, *qpda, *qpda2, *pda, npda;
199 char *buf;
200 int numDataCol = layoutPtr->numDataCol;
201 RF_Etimer_t timer;
202 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
203
204 RF_ETIMER_START(timer);
205
206 if (asmap->failedPDAs[1] &&
207 (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
208 RF_ASSERT(0);
209 ppda = node->params[np - 6].p;
210 ppda2 = node->params[np - 5].p;
211 qpda = node->params[np - 4].p;
212 qpda2 = node->params[np - 3].p;
213 d = (np - 6);
214 two = 1;
215 } else {
216 ppda = node->params[np - 4].p;
217 qpda = node->params[np - 3].p;
218 d = (np - 4);
219 }
220
221 for (i = 0; i < d; i++) {
222 pda = node->params[i].p;
223 buf = pda->bufPtr;
224 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
225 len = pda->numSector;
226 coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
227 /* compute the data unit offset within the column */
228 coeff = (coeff % raidPtr->Layout.numDataCol);
229 /* see if pda intersects a recovery pda */
230 applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
231 if (two)
232 applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
233 }
234
235 /* ok, we got the parity back to the point where we can recover. We
236 * now need to determine the coeff of the columns that need to be
237 * recovered. We can also only need to recover a single stripe unit. */
238
239 if (asmap->failedPDAs[1] == NULL) { /* only a single stripe unit
240 * to recover. */
241 pda = asmap->failedPDAs[0];
242 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
243 /* need to determine the column of the other failed disk */
244 coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
245 /* compute the data unit offset within the column */
246 coeff = (coeff % raidPtr->Layout.numDataCol);
247 for (i = 0; i < numDataCol; i++) {
248 npda.raidAddress = sosAddr + (i * secPerSU);
249 (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
250 /* skip over dead disks */
251 if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
252 if (i != coeff)
253 break;
254 }
255 RF_ASSERT(i < numDataCol);
256 RF_ASSERT(two == 0);
257 /* recover the data. Since we need only want to recover one
258 * column, we overwrite the parity with the other one. */
259 if (coeff < i) /* recovering 'a' */
260 rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) pda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
261 else /* recovering 'b' */
262 rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) pda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
263 } else
264 RF_PANIC();
265
266 RF_ETIMER_STOP(timer);
267 RF_ETIMER_EVAL(timer);
268 if (tracerec)
269 tracerec->q_us += RF_ETIMER_VAL_US(timer);
270 rf_GenericWakeupFunc(node, 0);
271 return (0);
272 }
273
/*
 * Recovery node execute function for a double-degraded small write.
 * See the strategy comment inside.  Always returns 0 and wakes the
 * node's successors via rf_GenericWakeupFunc().
 */
int
rf_PQWriteDoubleRecoveryFunc(node)
	RF_DagNode_t *node;
{
	/* The situation:
	 *
	 * We are doing a write that hits only one failed data unit. The other
	 * failed data unit is not being overwritten, so we need to generate
	 * it.
	 *
	 * For the moment, we assume all the nonfailed data being written is in
	 * the shadow of the failed data unit. (i.e., either a single data
	 * unit write or the entire failed stripe unit is being overwritten. )
	 *
	 * Recovery strategy: apply the recovery data to the parity and q. Use P
	 * & Q to recover the second failed data unit in P. Zero fill Q, then
	 * apply the recovered data to p. Then apply the data being written to
	 * the failed drive. Then walk through the surviving drives, applying
	 * new data when it exists, otherwise the recovery data. Quite a mess.
	 *
	 *
	 * The params
	 *
	 * read pda0, read pda1, ... read pda (numDataCol-3), write pda0, ... ,
	 * write pda (numStripeUnitAccess - numDataFailed), failed pda,
	 * raidPtr, asmap */

	int np = node->numParams;
	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
	int i;
	RF_RaidAddr_t sosAddr;
	unsigned coeff;
	RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
	RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
	int numDataCol = layoutPtr->numDataCol;
	RF_Etimer_t timer;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

	/* results[0]/results[1] hold the P and Q pdas for this write */
	RF_ASSERT(node->numResults == 2);
	RF_ASSERT(asmap->failedPDAs[1] == NULL);
	RF_ETIMER_START(timer);
	ppda = node->results[0];
	qpda = node->results[1];
	/* apply the recovery data */
	/* NOTE(review): presumably params[0..numDataCol-3] are the read
	 * (recovery) pdas per the param layout above -- confirm with the
	 * DAG construction code. */
	for (i = 0; i < numDataCol - 2; i++)
		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);

	/* determine the other failed data unit */
	pda = asmap->failedPDAs[0];
	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
	/* need to determine the column of the other failed disk */
	coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
	/* compute the data unit offset within the column */
	coeff = (coeff % raidPtr->Layout.numDataCol);
	/* scan the stripe for the other dead column (i != coeff) */
	for (i = 0; i < numDataCol; i++) {
		npda.raidAddress = sosAddr + (i * secPerSU);
		(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
		/* skip over dead disks */
		if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
			if (i != coeff)
				break;
	}
	RF_ASSERT(i < numDataCol);
	/* recover the data. The column we want to recover we write over the
	 * parity. The column we don't care about we dump in q. */
	if (coeff < i)		/* recovering 'a' */
		rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
	else			/* recovering 'b' */
		rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);

	/* OK. The valid data is in P. Zero fill Q, then inc it into it. */
	/* (bzero is a legacy BSD interface; kept for consistency with the
	 * rest of this driver) */
	bzero(qpda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector));
	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), i);

	/* now apply all the write data to the buffer */
	/* single stripe unit write case: the failed data is only thing we are
	 * writing. */
	RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
	/* dest, src, len, coeff */
	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) asmap->failedPDAs[0]->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
	rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr, rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp);

	/* now apply all the recovery data */
	for (i = 0; i < numDataCol - 2; i++)
		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	if (tracerec)
		tracerec->q_us += RF_ETIMER_VAL_US(timer);

	rf_GenericWakeupFunc(node, 0);
	return (0);
}
/*
 * Double-degraded large (reconstruct) write: not implemented.  Any
 * caller that dispatches here panics.
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
{
	RF_PANIC();
}
374 /*
375 Two lost data unit write case.
376
377 There are really two cases here:
378
379 (1) The write completely covers the two lost data units.
380 In that case, a reconstruct write that doesn't write the
381 failed data units will do the correct thing. So in this case,
382 the dag looks like
383
384 full stripe read of surviving data units (not being overwriten)
385 write new data (ignoring failed units) compute P&Q
386 write P&Q
387
388
389 (2) The write does not completely cover both failed data units
390 (but touches at least one of them). Then we need to do the
391 equivalent of a reconstruct read to recover the missing data
392 unit from the other stripe.
393
394 For any data we are writing that is not in the "shadow"
395 of the failed units, we need to do a four cycle update.
396 PANIC on this case. for now
397
398 */
399
400 RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
401 {
402 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
403 RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
404 int sum;
405 int nf = asmap->numDataFailed;
406
407 sum = asmap->failedPDAs[0]->numSector;
408 if (nf == 2)
409 sum += asmap->failedPDAs[1]->numSector;
410
411 if ((nf == 2) && (sum == (2 * sectorsPerSU))) {
412 /* large write case */
413 rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
414 return;
415 }
416 if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) {
417 /* small write case, no user data not in shadow */
418 rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
419 return;
420 }
421 RF_PANIC();
422 }
/*
 * Double-degraded simple small write: defer to the generic
 * double-degraded small-write builder, supplying the PQ-specific
 * Q-read/Q-write node names and the write recovery function above.
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
{
	rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
}
427 #endif /* (RF_INCLUDE_DECL_PQ > 0) ||
428 * (RF_INCLUDE_RAID6 > 0) */
429