/*	$NetBSD: rf_pqdegdags.c,v 1.6 2001/07/18 06:45:34 thorpej Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Daniel Stodolsky
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * rf_pqdegdags.c
 * Degraded mode dags for double fault cases.
 */


#include "rf_archs.h"

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)

#include "rf_types.h"
#include "rf_raid.h"
#include "rf_dag.h"
#include "rf_dagdegrd.h"
#include "rf_dagdegwr.h"
#include "rf_dagfuncs.h"
#include "rf_dagutils.h"
#include "rf_etimer.h"
#include "rf_acctrace.h"
#include "rf_general.h"
#include "rf_pqdegdags.h"
#include "rf_pq.h"

static void
applyPDA(RF_Raid_t * raidPtr, RF_PhysDiskAddr_t * pda, RF_PhysDiskAddr_t * ppda,
        RF_PhysDiskAddr_t * qpda, void *bp);

/*
   Two data drives have failed, and we are doing a read that covers one of them.
   We may also be reading some of the surviving drives.


 *****************************************************************************************
 *
 * creates a DAG to perform a degraded-mode read of data within one stripe.
 * This DAG is as follows:
 *
 *                               Hdr
 *                                |
 *                              Block
 *                           / /     \  \ \ \
 *                         Rud ... Rud Rrd ... Rrd Rp Rq
 *                         | \     | \  | \    | \  | \ | \
 *
 *                         |                 |
 *                      Unblock              X
 *                          \               /
 *                           ------ T ------
 *
 * Each R node is a successor of the L node
 * One successor arc from each R node goes to U, and the other to X
 * There is one Rud for each chunk of surviving user data requested by the user,
 * and one Rrd for each chunk of surviving user data _not_ being read by the user
 * R = read, ud = user data, rd = recovery (surviving) data, p = P data, q = Q data
 * X = pq recovery node, T = terminate
 *
 * The block & unblock nodes are leftovers from a previous version.  They
 * do nothing, but I haven't deleted them because it would be a tremendous
 * effort to put them back in.
 *
 * Note:  The target buffer for the XOR node is set to the actual user buffer where the
 * failed data is supposed to end up.  This buffer is zeroed by the code here.  Thus,
 * if you create a degraded read dag, use it, and then re-use it, you have to be sure to
 * zero the target buffer prior to the re-use.
 *
 * Every buffer read is passed to the pq recovery node, whose job it is to sort out
 * what is needed and what is not.
 ****************************************************************************************/
/* init a disk node with 2 successors and one predecessor */
#define INIT_DISK_NODE(node,name) \
rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \
(node)->succedents[0] = unblockNode; \
(node)->succedents[1] = recoveryNode; \
(node)->antecedents[0] = blockNode; \
(node)->antType[0] = rf_control

#define DISK_NODE_PARAMS(_node_,_p_) \
        (_node_).params[0].p = _p_ ; \
        (_node_).params[1].p = (_p_)->bufPtr; \
        (_node_).params[2].v = parityStripeID; \
        (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru)

#define DISK_NODE_PDA(node)  ((node)->params[0].p)
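
/*
 * Illustrative sketch only: this file's DAG nodes are actually built by
 * rf_DoubleDegRead() in rf_dagdegrd.c.  The function, node and pda below are
 * hypothetical, and the surrounding locals the macros expect (blockNode,
 * unblockNode, recoveryNode, dag_h, allocList, parityStripeID, which_ru)
 * are assumed to exist in the enclosing DAG-creation routine.
 */
#if 0
static void
example_init_read_node(RF_DagNode_t *rudNode, RF_PhysDiskAddr_t *pda)
{
        INIT_DISK_NODE(rudNode, "Rud");         /* disk read node: 2 successors, 1 antecedent, 4 params */
        DISK_NODE_PARAMS(*rudNode, pda);        /* pda, buffer, parity stripe ID, priority/reconstruct unit */
}
#endif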

RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
{
        rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
            "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
}
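
/*
 * The DAG pictured above is built by rf_DoubleDegRead() (rf_dagdegrd.c); the
 * call above just supplies the node names ("Rq", "PQ Recovery") and installs
 * rf_PQDoubleRecoveryFunc() as the function of the recovery ("X") node.
 */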

static void
applyPDA(raidPtr, pda, ppda, qpda, bp)
        RF_Raid_t *raidPtr;
        RF_PhysDiskAddr_t *pda;
        RF_PhysDiskAddr_t *ppda;
        RF_PhysDiskAddr_t *qpda;
        void   *bp;
{
        RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
        RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
        RF_SectorCount_t s0len = ppda->numSector, len;
        RF_SectorNum_t suoffset;
        unsigned coeff;
        char   *pbuf = ppda->bufPtr;
        char   *qbuf = qpda->bufPtr;
        char   *buf;
        int     delta;

        suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
        len = pda->numSector;
        /* see if pda intersects a recovery pda */
        if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) {
                buf = pda->bufPtr;
                coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
                coeff = (coeff % raidPtr->Layout.numDataCol);

                if (suoffset < s0off) {
                        delta = s0off - suoffset;
                        buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
                        suoffset = s0off;
                        len -= delta;
                }
                if (suoffset > s0off) {
                        delta = suoffset - s0off;
                        pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
                        qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
                }
                if ((suoffset + len) > (s0len + s0off))
                        len = s0len + s0off - suoffset;

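                /*
                 * Roughly: rf_bxor() XORs the overlapping data into the P
                 * buffer, and rf_IncQ() (see rf_pq.c) accumulates the same
                 * data into the Q buffer weighted by a per-column
                 * coefficient in GF(2^8).
                 */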
                /* src, dest, len */
                rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp);

                /* dest, src, len, coeff */
                rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf, rf_RaidAddressToByte(raidPtr, len), coeff);
        }
}
/*
   Recover data in the case of a double failure.  There can be two
   result buffers, one for each chunk of data trying to be recovered.
   The params are pda's that have not been range restricted or otherwise
   politely massaged - this should be done here.  The last params are the
   pdas of P and Q, followed by the raidPtr.  The list can look like

   pda, pda, ... , p pda, q pda, raidptr, asm

   or

   pda, pda, ... , p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm

   depending on whether two chunks of recovery data were required.

   The second condition only arises if there are two failed buffers
   whose lengths do not add up to a stripe unit.
*/
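
/*
 * A minimal reader's sketch (not RAIDframe code) of the algebra the recovery
 * node relies on.  gf_mul()/gf_div() are hypothetical GF(2^8) helpers, and
 * ca/cb stand for the column coefficients of the two failed data units;
 * compare rf_PQ_recover() in rf_pq.c, which applies this kind of relation
 * buffer-wide.
 */
#if 0
/*
 * Once every surviving column has been folded into P and Q (which is what
 * applyPDA() does), the buffers hold only the two missing columns Da, Db:
 *
 *      p = Da ^ Db
 *      q = gf_mul(ca, Da) ^ gf_mul(cb, Db)
 *
 * Solving this 2x2 system over GF(2^8):
 *
 *      Db = gf_div(q ^ gf_mul(ca, p), ca ^ cb)
 *      Da = p ^ Db
 */
static void
example_pq_recover(const unsigned char *p, const unsigned char *q,
        unsigned char *da, unsigned char *db, int len, unsigned ca, unsigned cb)
{
        int     i;

        for (i = 0; i < len; i++) {
                db[i] = gf_div(q[i] ^ gf_mul(ca, p[i]), ca ^ cb);
                da[i] = p[i] ^ db[i];
        }
}
#endif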


int
rf_PQDoubleRecoveryFunc(node)
        RF_DagNode_t *node;
{
        int     np = node->numParams;
        RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
        RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
        RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
        int     d, i;
        unsigned coeff;
        RF_RaidAddr_t sosAddr, suoffset;
        RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
        int     two = 0;
        RF_PhysDiskAddr_t *ppda, *ppda2, *qpda, *qpda2, *pda, npda;
        char   *buf;
        int     numDataCol = layoutPtr->numDataCol;
        RF_Etimer_t timer;
        RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

        RF_ETIMER_START(timer);

        if (asmap->failedPDAs[1] &&
            (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
                RF_ASSERT(0);
                ppda = node->params[np - 6].p;
                ppda2 = node->params[np - 5].p;
                qpda = node->params[np - 4].p;
                qpda2 = node->params[np - 3].p;
                d = (np - 6);
                two = 1;
        } else {
                ppda = node->params[np - 4].p;
                qpda = node->params[np - 3].p;
                d = (np - 4);
        }

        for (i = 0; i < d; i++) {
                pda = node->params[i].p;
                buf = pda->bufPtr;
                suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
                len = pda->numSector;
                coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
                /* compute the data unit offset within the column */
                coeff = (coeff % raidPtr->Layout.numDataCol);
                /* see if pda intersects a recovery pda */
                applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
                if (two)
                        /* fold into the second recovery pda pair as well */
                        applyPDA(raidPtr, pda, ppda2, qpda2, node->dagHdr->bp);
        }

        /* ok, we got the parity back to the point where we can recover.  We
         * now need to determine the coeff of the columns that need to be
         * recovered.  Note that we only ever need to recover a single stripe
         * unit here. */

        if (asmap->failedPDAs[1] == NULL) {     /* only a single stripe unit
                                                 * to recover. */
                pda = asmap->failedPDAs[0];
                sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
                /* need to determine the column of the other failed disk */
                coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
                /* compute the data unit offset within the column */
                coeff = (coeff % raidPtr->Layout.numDataCol);
                for (i = 0; i < numDataCol; i++) {
                        npda.raidAddress = sosAddr + (i * secPerSU);
                        (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
                        /* skip over dead disks */
                        if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
                                if (i != coeff)
                                        break;
                }
                RF_ASSERT(i < numDataCol);
                RF_ASSERT(two == 0);
                /* recover the data.  Since we only want to recover one
                 * column, we overwrite the parity with the other one. */
                if (coeff < i)  /* recovering 'a' */
                        rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) pda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
                else            /* recovering 'b' */
                        rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) pda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
        } else
                RF_PANIC();

        RF_ETIMER_STOP(timer);
        RF_ETIMER_EVAL(timer);
        if (tracerec)
                tracerec->q_us += RF_ETIMER_VAL_US(timer);
        rf_GenericWakeupFunc(node, 0);
        return (0);
}

int
rf_PQWriteDoubleRecoveryFunc(node)
        RF_DagNode_t *node;
{
        /* The situation:
         *
         * We are doing a write that hits only one failed data unit.  The other
         * failed data unit is not being overwritten, so we need to generate
         * it.
         *
         * For the moment, we assume all the nonfailed data being written is in
         * the shadow of the failed data unit.  (i.e., either a single data
         * unit write or the entire failed stripe unit is being overwritten.)
         *
         * Recovery strategy: apply the recovery data to the parity and q.  Use P
         * & Q to recover the second failed data unit in P.  Zero fill Q, then
         * apply the recovered data to p.  Then apply the data being written to
         * the failed drive.  Then walk through the surviving drives, applying
         * new data when it exists, otherwise the recovery data.  Quite a mess.
         *
         *
         * The params
         *
         * read pda0, read pda1, ... read pda (numDataCol-3), write pda0, ... ,
         * write pda (numStripeUnitAccess - numDataFailed), failed pda,
         * raidPtr, asmap */
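
        /*
         * In equation form, the strategy above amounts to writing out
         *
         *      P = Drec ^ Dnew ^ (xor of the surviving data units)
         *      Q = c_rec * Drec ^ c_new * Dnew ^ (sum of c_j * Dj over the
         *          surviving units, all arithmetic in GF(2^8))
         *
         * where Drec is the regenerated (unwritten) failed unit and Dnew is
         * the data being written to the other failed unit - i.e. a full
         * reconstruct-write of P and Q for this stripe unit.
         */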

        int     np = node->numParams;
        RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
        RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
        RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
        int     i;
        RF_RaidAddr_t sosAddr;
        unsigned coeff;
        RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
        RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
        int     numDataCol = layoutPtr->numDataCol;
        RF_Etimer_t timer;
        RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

        RF_ASSERT(node->numResults == 2);
        RF_ASSERT(asmap->failedPDAs[1] == NULL);
        RF_ETIMER_START(timer);
        ppda = node->results[0];
        qpda = node->results[1];
        /* apply the recovery data */
        for (i = 0; i < numDataCol - 2; i++)
                applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);

        /* determine the other failed data unit */
        pda = asmap->failedPDAs[0];
        sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
        /* need to determine the column of the other failed disk */
        coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
        /* compute the data unit offset within the column */
        coeff = (coeff % raidPtr->Layout.numDataCol);
        for (i = 0; i < numDataCol; i++) {
                npda.raidAddress = sosAddr + (i * secPerSU);
                (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
                /* skip over dead disks */
                if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
                        if (i != coeff)
                                break;
        }
        RF_ASSERT(i < numDataCol);
        /* recover the data.  The column we want to recover we write over the
         * parity.  The column we don't care about we dump in q. */
        if (coeff < i)          /* recovering 'a' */
                rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
        else                    /* recovering 'b' */
                rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);

        /* OK.  The valid data is in P.  Zero fill Q, then accumulate the
         * recovered data into it. */
        memset(qpda->bufPtr, 0, rf_RaidAddressToByte(raidPtr, qpda->numSector));
        rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), i);

        /* now apply all the write data to the buffer */
        /* single stripe unit write case: the failed data is the only thing
         * we are writing. */
        RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
        /* dest, src, len, coeff */
        rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) asmap->failedPDAs[0]->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
        rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr, rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp);

        /* now apply all the recovery data */
        for (i = 0; i < numDataCol - 2; i++)
                applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);

        RF_ETIMER_STOP(timer);
        RF_ETIMER_EVAL(timer);
        if (tracerec)
                tracerec->q_us += RF_ETIMER_VAL_US(timer);

        rf_GenericWakeupFunc(node, 0);
        return (0);
}
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
{
        RF_PANIC();
}
/*
   Two lost data unit write case.

   There are really two cases here:

   (1) The write completely covers the two lost data units.
       In that case, a reconstruct write that doesn't write the
       failed data units will do the correct thing.  So in this case,
       the dag looks like

            full stripe read of surviving data units (not being overwritten)
            write new data (ignoring failed units)  compute P&Q
            write P&Q

   (2) The write does not completely cover both failed data units
       (but touches at least one of them).  Then we need to do the
       equivalent of a reconstruct read to recover the missing data
       unit from the other stripe.

       For any data we are writing that is not in the "shadow"
       of the failed units, we need to do a four cycle update.
       PANIC on this case for now.

*/
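
/*
 * For example (sizes illustrative only): with 32-sector stripe units and a
 * write that fully covers both failed units, nf == 2 and sum == 64 ==
 * 2 * sectorsPerSU below, so the large-write path is taken; if the write
 * covers only, say, 32 of those 64 sectors, sum == sectorsPerSU and the
 * simple small-write path is used instead.
 */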

RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
{
        RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
        RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
        int     sum;
        int     nf = asmap->numDataFailed;

        sum = asmap->failedPDAs[0]->numSector;
        if (nf == 2)
                sum += asmap->failedPDAs[1]->numSector;

        if ((nf == 2) && (sum == (2 * sectorsPerSU))) {
                /* large write case */
                rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
                return;
        }
        if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) {
                /* small write case, all user data in the shadow of the
                 * failed units */
                rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
                return;
        }
        RF_PANIC();
}
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
{
        rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
}
#endif                          /* (RF_INCLUDE_DECL_PQ > 0) ||
                                 * (RF_INCLUDE_RAID6 > 0) */
