/* $NetBSD: rf_pqdegdags.c,v 1.4 1999/08/13 03:41:57 oster Exp $ */
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Daniel Stodolsky
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29 /*
30 * rf_pqdegdags.c
31 * Degraded mode dags for double fault cases.
32 */
33
34
35 #include "rf_archs.h"
36
37 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
38
39 #include "rf_types.h"
40 #include "rf_raid.h"
41 #include "rf_dag.h"
42 #include "rf_dagfuncs.h"
43 #include "rf_dagutils.h"
44 #include "rf_etimer.h"
45 #include "rf_acctrace.h"
46 #include "rf_general.h"
47 #include "rf_pqdegdags.h"
48 #include "rf_pq.h"
49
50 static void
51 applyPDA(RF_Raid_t * raidPtr, RF_PhysDiskAddr_t * pda, RF_PhysDiskAddr_t * ppda,
52 RF_PhysDiskAddr_t * qpda, void *bp);
53
54 /*
55 Two data drives have failed, and we are doing a read that covers one of them.
56 We may also be reading some of the surviving drives.
57
58
59 *****************************************************************************************
60 *
61 * creates a DAG to perform a degraded-mode read of data within one stripe.
62 * This DAG is as follows:
63 *
64 * Hdr
65 * |
66 * Block
67 * / / \ \ \ \
68 * Rud ... Rud Rrd ... Rrd Rp Rq
69 * | \ | \ | \ | \ | \ | \
70 *
71 * | |
72 * Unblock X
73 * \ /
74 * ------ T ------
75 *
76 * Each R node is a successor of the L node
77 * One successor arc from each R node goes to U, and the other to X
78 * There is one Rud for each chunk of surviving user data requested by the user,
79 * and one Rrd for each chunk of surviving user data _not_ being read by the user
80 * R = read, ud = user data, rd = recovery (surviving) data, p = P data, q = Qdata
81 * X = pq recovery node, T = terminate
82 *
83 * The block & unblock nodes are leftovers from a previous version. They
84 * do nothing, but I haven't deleted them because it would be a tremendous
85 * effort to put them back in.
86 *
87 * Note: The target buffer for the XOR node is set to the actual user buffer where the
88 * failed data is supposed to end up. This buffer is zero'd by the code here. Thus,
89 * if you create a degraded read dag, use it, and then re-use, you have to be sure to
90 * zero the target buffer prior to the re-use.
91 *
 * Every buffer read is passed to the pq recovery node, whose job it is to sort
 * out what's needed and what's not.
94 ****************************************************************************************/
/*
 * Init a disk-read node with 2 successors and one predecessor.
 * Successor 0 is the (no-op) unblock node, successor 1 the PQ recovery
 * node; the single antecedent is the block node (see the DAG picture
 * above).  Relies on unblockNode, recoveryNode, blockNode, dag_h and
 * allocList being in scope at the call site.
 */
#define INIT_DISK_NODE(node,name) \
rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \
(node)->succedents[0] = unblockNode; \
(node)->succedents[1] = recoveryNode; \
(node)->antecedents[0] = blockNode; \
(node)->antType[0] = rf_control

/*
 * Fill in the standard 4-parameter layout of a disk node: the pda, its
 * buffer, the parity stripe ID, and the priority/reconstruction-unit
 * word.  Relies on parityStripeID and which_ru being in scope at the
 * call site.
 */
#define DISK_NODE_PARAMS(_node_,_p_) \
(_node_).params[0].p = _p_ ; \
(_node_).params[1].p = (_p_)->bufPtr; \
(_node_).params[2].v = parityStripeID; \
(_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru)

/* retrieve the pda (param 0) stashed in a disk node */
#define DISK_NODE_PDA(node) ((node)->params[0].p)
110
/*
 * Create the PQ degraded-mode double-failure read DAG: delegates to the
 * generic rf_DoubleDegRead() with PQ-specific node names and the PQ
 * recovery function as the X (recovery) node.
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
{
	rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
	    "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
}
116
117 static void
118 applyPDA(raidPtr, pda, ppda, qpda, bp)
119 RF_Raid_t *raidPtr;
120 RF_PhysDiskAddr_t *pda;
121 RF_PhysDiskAddr_t *ppda;
122 RF_PhysDiskAddr_t *qpda;
123 void *bp;
124 {
125 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
126 RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
127 RF_SectorCount_t s0len = ppda->numSector, len;
128 RF_SectorNum_t suoffset;
129 unsigned coeff;
130 char *pbuf = ppda->bufPtr;
131 char *qbuf = qpda->bufPtr;
132 char *buf;
133 int delta;
134
135 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
136 len = pda->numSector;
137 /* see if pda intersects a recovery pda */
138 if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) {
139 buf = pda->bufPtr;
140 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
141 coeff = (coeff % raidPtr->Layout.numDataCol);
142
143 if (suoffset < s0off) {
144 delta = s0off - suoffset;
145 buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
146 suoffset = s0off;
147 len -= delta;
148 }
149 if (suoffset > s0off) {
150 delta = suoffset - s0off;
151 pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
152 qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
153 }
154 if ((suoffset + len) > (s0len + s0off))
155 len = s0len + s0off - suoffset;
156
157 /* src, dest, len */
158 rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp);
159
160 /* dest, src, len, coeff */
161 rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf, rf_RaidAddressToByte(raidPtr, len), coeff);
162 }
163 }
164 /*
165 Recover data in the case of a double failure. There can be two
166 result buffers, one for each chunk of data trying to be recovered.
167 The params are pda's that have not been range restricted or otherwise
168 politely massaged - this should be done here. The last params are the
169 pdas of P and Q, followed by the raidPtr. The list can look like
170
171 pda, pda, ... , p pda, q pda, raidptr, asm
172
173 or
174
175 pda, pda, ... , p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm
176
depending on whether two chunks of recovery data were required.
178
The second condition only arises if there are two failed buffers
whose lengths do not add up to a full stripe unit.
181 */
182
183
184 int
185 rf_PQDoubleRecoveryFunc(node)
186 RF_DagNode_t *node;
187 {
188 int np = node->numParams;
189 RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
190 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
191 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
192 int d, i;
193 unsigned coeff;
194 RF_RaidAddr_t sosAddr, suoffset;
195 RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
196 int two = 0;
197 RF_PhysDiskAddr_t *ppda, *ppda2, *qpda, *qpda2, *pda, npda;
198 char *buf;
199 int numDataCol = layoutPtr->numDataCol;
200 RF_Etimer_t timer;
201 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
202
203 RF_ETIMER_START(timer);
204
205 if (asmap->failedPDAs[1] &&
206 (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
207 RF_ASSERT(0);
208 ppda = node->params[np - 6].p;
209 ppda2 = node->params[np - 5].p;
210 qpda = node->params[np - 4].p;
211 qpda2 = node->params[np - 3].p;
212 d = (np - 6);
213 two = 1;
214 } else {
215 ppda = node->params[np - 4].p;
216 qpda = node->params[np - 3].p;
217 d = (np - 4);
218 }
219
220 for (i = 0; i < d; i++) {
221 pda = node->params[i].p;
222 buf = pda->bufPtr;
223 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
224 len = pda->numSector;
225 coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
226 /* compute the data unit offset within the column */
227 coeff = (coeff % raidPtr->Layout.numDataCol);
228 /* see if pda intersects a recovery pda */
229 applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
230 if (two)
231 applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
232 }
233
234 /* ok, we got the parity back to the point where we can recover. We
235 * now need to determine the coeff of the columns that need to be
236 * recovered. We can also only need to recover a single stripe unit. */
237
238 if (asmap->failedPDAs[1] == NULL) { /* only a single stripe unit
239 * to recover. */
240 pda = asmap->failedPDAs[0];
241 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
242 /* need to determine the column of the other failed disk */
243 coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
244 /* compute the data unit offset within the column */
245 coeff = (coeff % raidPtr->Layout.numDataCol);
246 for (i = 0; i < numDataCol; i++) {
247 npda.raidAddress = sosAddr + (i * secPerSU);
248 (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
249 /* skip over dead disks */
250 if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
251 if (i != coeff)
252 break;
253 }
254 RF_ASSERT(i < numDataCol);
255 RF_ASSERT(two == 0);
256 /* recover the data. Since we need only want to recover one
257 * column, we overwrite the parity with the other one. */
258 if (coeff < i) /* recovering 'a' */
259 rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) pda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
260 else /* recovering 'b' */
261 rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) pda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
262 } else
263 RF_PANIC();
264
265 RF_ETIMER_STOP(timer);
266 RF_ETIMER_EVAL(timer);
267 if (tracerec)
268 tracerec->q_us += RF_ETIMER_VAL_US(timer);
269 rf_GenericWakeupFunc(node, 0);
270 return (0);
271 }
272
/*
 * Recovery node for a double-degraded write.
 *
 * The situation:
 *
 * We are doing a write that hits only one failed data unit.  The other
 * failed data unit is not being overwritten, so we need to generate it.
 *
 * For the moment, we assume all the nonfailed data being written is in
 * the shadow of the failed data unit (i.e., either a single data unit
 * write or the entire failed stripe unit is being overwritten).
 *
 * Recovery strategy: apply the recovery data to the parity and q.  Use
 * P & Q to recover the second failed data unit in P.  Zero fill Q, then
 * apply the recovered data to p.  Then apply the data being written to
 * the failed drive.  Then walk through the surviving drives, applying
 * new data when it exists, otherwise the recovery data.  Quite a mess.
 *
 * The params:
 *
 *   read pda0, read pda1, ... read pda (numDataCol-3), write pda0, ...,
 *   write pda (numStripeUnitAccess - numDataFailed), failed pda,
 *   raidPtr, asmap
 *
 * Returns 0; completion is signalled via rf_GenericWakeupFunc().
 */
int
rf_PQWriteDoubleRecoveryFunc(node)
	RF_DagNode_t *node;
{
	int np = node->numParams;
	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
	int i;
	RF_RaidAddr_t sosAddr;
	unsigned coeff;
	RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
	RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
	int numDataCol = layoutPtr->numDataCol;
	RF_Etimer_t timer;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

	RF_ASSERT(node->numResults == 2);
	RF_ASSERT(asmap->failedPDAs[1] == NULL);
	RF_ETIMER_START(timer);
	/* results 0 and 1 are the P and Q pdas for this stripe */
	ppda = node->results[0];
	qpda = node->results[1];
	/* apply the recovery data */
	for (i = 0; i < numDataCol - 2; i++)
		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);

	/* determine the other failed data unit */
	pda = asmap->failedPDAs[0];
	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
	/* need to determine the column of the other failed disk */
	coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
	/* compute the data unit offset within the column */
	coeff = (coeff % raidPtr->Layout.numDataCol);
	for (i = 0; i < numDataCol; i++) {
		npda.raidAddress = sosAddr + (i * secPerSU);
		(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
		/* skip over dead disks */
		if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
			if (i != coeff)
				break;
	}
	RF_ASSERT(i < numDataCol);
	/* recover the data. The column we want to recover we write over the
	 * parity. The column we don't care about we dump in q. */
	if (coeff < i)		/* recovering 'a' */
		rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
	else			/* recovering 'b' */
		rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);

	/* OK. The valid data is in P. Zero fill Q, then inc the recovered
	 * column (coefficient i) into it. */
	bzero(qpda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector));
	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), i);

	/* now apply all the write data to the buffer */
	/* single stripe unit write case: the failed data is the only thing
	 * we are writing. */
	RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
	/* dest, src, len, coeff */
	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) asmap->failedPDAs[0]->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
	rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr, rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp);

	/* now apply all the recovery data */
	for (i = 0; i < numDataCol - 2; i++)
		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	if (tracerec)
		tracerec->q_us += RF_ETIMER_VAL_US(timer);

	rf_GenericWakeupFunc(node, 0);
	return (0);
}
/*
 * Double-degraded large-write DAG (both failed stripe units completely
 * overwritten): not implemented — panics if reached.
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
{
	RF_PANIC();
}
373 /*
374 Two lost data unit write case.
375
376 There are really two cases here:
377
378 (1) The write completely covers the two lost data units.
379 In that case, a reconstruct write that doesn't write the
380 failed data units will do the correct thing. So in this case,
381 the dag looks like
382
full stripe read of surviving data units (not being overwritten)
384 write new data (ignoring failed units) compute P&Q
385 write P&Q
386
387
388 (2) The write does not completely cover both failed data units
389 (but touches at least one of them). Then we need to do the
390 equivalent of a reconstruct read to recover the missing data
391 unit from the other stripe.
392
393 For any data we are writing that is not in the "shadow"
394 of the failed units, we need to do a four cycle update.
PANIC on this case, for now.
396
397 */
398
399 RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
400 {
401 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
402 RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
403 int sum;
404 int nf = asmap->numDataFailed;
405
406 sum = asmap->failedPDAs[0]->numSector;
407 if (nf == 2)
408 sum += asmap->failedPDAs[1]->numSector;
409
410 if ((nf == 2) && (sum == (2 * sectorsPerSU))) {
411 /* large write case */
412 rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
413 return;
414 }
415 if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) {
416 /* small write case, no user data not in shadow */
417 rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
418 return;
419 }
420 RF_PANIC();
421 }
/*
 * Double-degraded simple small-write DAG: delegates to the generic
 * rf_DoubleDegSmallWrite() with PQ-specific node names and the PQ
 * write-recovery function.
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
{
	rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
}
426 #endif /* (RF_INCLUDE_DECL_PQ > 0) ||
427 * (RF_INCLUDE_RAID6 > 0) */
428