rf_pqdegdags.c revision 1.7 1 /* $NetBSD: rf_pqdegdags.c,v 1.7 2001/10/04 15:58:55 oster Exp $ */
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Daniel Stodolsky
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29 /*
30 * rf_pqdegdags.c
31 * Degraded mode dags for double fault cases.
32 */
33
34
35 #include "rf_archs.h"
36
37 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
38
39 #include <dev/raidframe/raidframevar.h>
40
41 #include "rf_raid.h"
42 #include "rf_dag.h"
43 #include "rf_dagdegrd.h"
44 #include "rf_dagdegwr.h"
45 #include "rf_dagfuncs.h"
46 #include "rf_dagutils.h"
47 #include "rf_etimer.h"
48 #include "rf_acctrace.h"
49 #include "rf_general.h"
50 #include "rf_pqdegdags.h"
51 #include "rf_pq.h"
52
53 static void
54 applyPDA(RF_Raid_t * raidPtr, RF_PhysDiskAddr_t * pda, RF_PhysDiskAddr_t * ppda,
55 RF_PhysDiskAddr_t * qpda, void *bp);
56
57 /*
58 Two data drives have failed, and we are doing a read that covers one of them.
59 We may also be reading some of the surviving drives.
60
61
62 *****************************************************************************************
63 *
64 * creates a DAG to perform a degraded-mode read of data within one stripe.
65 * This DAG is as follows:
66 *
67 * Hdr
68 * |
69 * Block
70 * / / \ \ \ \
71 * Rud ... Rud Rrd ... Rrd Rp Rq
72 * | \ | \ | \ | \ | \ | \
73 *
74 * | |
75 * Unblock X
76 * \ /
77 * ------ T ------
78 *
79 * Each R node is a successor of the L node
80 * One successor arc from each R node goes to U, and the other to X
81 * There is one Rud for each chunk of surviving user data requested by the user,
82 * and one Rrd for each chunk of surviving user data _not_ being read by the user
83 * R = read, ud = user data, rd = recovery (surviving) data, p = P data, q = Qdata
84 * X = pq recovery node, T = terminate
85 *
86 * The block & unblock nodes are leftovers from a previous version. They
87 * do nothing, but I haven't deleted them because it would be a tremendous
88 * effort to put them back in.
89 *
90 * Note: The target buffer for the XOR node is set to the actual user buffer where the
91 * failed data is supposed to end up. This buffer is zero'd by the code here. Thus,
92 * if you create a degraded read dag, use it, and then re-use, you have to be sure to
93 * zero the target buffer prior to the re-use.
94 *
95 * Every buffer read is passed to the pq recovery node, whose job it is to sort out what's
96 * needed and what's not.
97 ****************************************************************************************/
/*
 * Initialise a disk-read node with two successors and one predecessor.
 *
 * The node is set up through rf_InitNode() with rf_DiskReadFunc /
 * rf_DiskReadUndoFunc / rf_GenericWakeupFunc, then wired so that its
 * successors are unblockNode and recoveryNode and its single (control)
 * antecedent is blockNode.
 *
 * The expansion site must have dag_h, allocList, blockNode, unblockNode
 * and recoveryNode in scope.  "node" is evaluated several times, so do not
 * pass an expression with side effects.
 *
 * The body is wrapped in do { } while (0) so that the macro acts as one
 * statement (for instance as the body of an unbraced if/else); the caller
 * still writes the trailing semicolon, exactly as before.
 */
#define INIT_DISK_NODE(node,name) \
	do { \
		rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, \
		    rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2, 1, 4, 0, \
		    dag_h, name, allocList); \
		(node)->succedents[0] = unblockNode; \
		(node)->succedents[1] = recoveryNode; \
		(node)->antecedents[0] = blockNode; \
		(node)->antType[0] = rf_control; \
	} while (0)
105
/*
 * Fill the four parameter slots of a disk node from the physical disk
 * address _p_:
 *   params[0].p  the PDA itself
 *   params[1].p  the PDA's data buffer
 *   params[2].v  parityStripeID
 *   params[3].v  the word built by RF_CREATE_PARAM3() from
 *                RF_IO_NORMAL_PRIORITY and which_ru
 *
 * _node_ is the node object itself (accessed with '.'), not a pointer.
 * parityStripeID and which_ru must be in scope at the expansion site.
 * _p_ is evaluated twice; it is parenthesised at every use so that any
 * pointer-valued expression can be passed.
 *
 * Wrapped in do { } while (0) so the four assignments form a single
 * statement; the caller supplies the trailing semicolon as before.
 */
#define DISK_NODE_PARAMS(_node_,_p_) \
	do { \
		(_node_).params[0].p = (_p_); \
		(_node_).params[1].p = (_p_)->bufPtr; \
		(_node_).params[2].v = parityStripeID; \
		(_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, \
		    0, 0, which_ru); \
	} while (0)
111
/* Yield the pointer held in the first parameter slot of a disk node (its PDA). */
#define DISK_NODE_PDA(_node_) ((_node_)->params[0].p)
113
114 RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
115 {
116 rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
117 "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
118 }
119
120 static void
121 applyPDA(raidPtr, pda, ppda, qpda, bp)
122 RF_Raid_t *raidPtr;
123 RF_PhysDiskAddr_t *pda;
124 RF_PhysDiskAddr_t *ppda;
125 RF_PhysDiskAddr_t *qpda;
126 void *bp;
127 {
128 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
129 RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
130 RF_SectorCount_t s0len = ppda->numSector, len;
131 RF_SectorNum_t suoffset;
132 unsigned coeff;
133 char *pbuf = ppda->bufPtr;
134 char *qbuf = qpda->bufPtr;
135 char *buf;
136 int delta;
137
138 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
139 len = pda->numSector;
140 /* see if pda intersects a recovery pda */
141 if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) {
142 buf = pda->bufPtr;
143 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
144 coeff = (coeff % raidPtr->Layout.numDataCol);
145
146 if (suoffset < s0off) {
147 delta = s0off - suoffset;
148 buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
149 suoffset = s0off;
150 len -= delta;
151 }
152 if (suoffset > s0off) {
153 delta = suoffset - s0off;
154 pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
155 qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
156 }
157 if ((suoffset + len) > (s0len + s0off))
158 len = s0len + s0off - suoffset;
159
160 /* src, dest, len */
161 rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp);
162
163 /* dest, src, len, coeff */
164 rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf, rf_RaidAddressToByte(raidPtr, len), coeff);
165 }
166 }
167 /*
168 Recover data in the case of a double failure. There can be two
169 result buffers, one for each chunk of data trying to be recovered.
170 The params are pda's that have not been range restricted or otherwise
171 politely massaged - this should be done here. The last params are the
172 pdas of P and Q, followed by the raidPtr. The list can look like
173
174 pda, pda, ... , p pda, q pda, raidptr, asm
175
176 or
177
178 pda, pda, ... , p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm
179
180 depending on whether two chunks of recovery data were required.
181
182 The second condition only arises if there are two failed buffers
183 whose lengths do not add up to a stripe unit.
184 */
185
186
187 int
188 rf_PQDoubleRecoveryFunc(node)
189 RF_DagNode_t *node;
190 {
191 int np = node->numParams;
192 RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
193 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
194 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
195 int d, i;
196 unsigned coeff;
197 RF_RaidAddr_t sosAddr, suoffset;
198 RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
199 int two = 0;
200 RF_PhysDiskAddr_t *ppda, *ppda2, *qpda, *qpda2, *pda, npda;
201 char *buf;
202 int numDataCol = layoutPtr->numDataCol;
203 RF_Etimer_t timer;
204 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
205
206 RF_ETIMER_START(timer);
207
208 if (asmap->failedPDAs[1] &&
209 (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
210 RF_ASSERT(0);
211 ppda = node->params[np - 6].p;
212 ppda2 = node->params[np - 5].p;
213 qpda = node->params[np - 4].p;
214 qpda2 = node->params[np - 3].p;
215 d = (np - 6);
216 two = 1;
217 } else {
218 ppda = node->params[np - 4].p;
219 qpda = node->params[np - 3].p;
220 d = (np - 4);
221 }
222
223 for (i = 0; i < d; i++) {
224 pda = node->params[i].p;
225 buf = pda->bufPtr;
226 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
227 len = pda->numSector;
228 coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
229 /* compute the data unit offset within the column */
230 coeff = (coeff % raidPtr->Layout.numDataCol);
231 /* see if pda intersects a recovery pda */
232 applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
233 if (two)
234 applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
235 }
236
237 /* ok, we got the parity back to the point where we can recover. We
238 * now need to determine the coeff of the columns that need to be
239 * recovered. We can also only need to recover a single stripe unit. */
240
241 if (asmap->failedPDAs[1] == NULL) { /* only a single stripe unit
242 * to recover. */
243 pda = asmap->failedPDAs[0];
244 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
245 /* need to determine the column of the other failed disk */
246 coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
247 /* compute the data unit offset within the column */
248 coeff = (coeff % raidPtr->Layout.numDataCol);
249 for (i = 0; i < numDataCol; i++) {
250 npda.raidAddress = sosAddr + (i * secPerSU);
251 (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
252 /* skip over dead disks */
253 if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
254 if (i != coeff)
255 break;
256 }
257 RF_ASSERT(i < numDataCol);
258 RF_ASSERT(two == 0);
259 /* recover the data. Since we need only want to recover one
260 * column, we overwrite the parity with the other one. */
261 if (coeff < i) /* recovering 'a' */
262 rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) pda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
263 else /* recovering 'b' */
264 rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) pda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
265 } else
266 RF_PANIC();
267
268 RF_ETIMER_STOP(timer);
269 RF_ETIMER_EVAL(timer);
270 if (tracerec)
271 tracerec->q_us += RF_ETIMER_VAL_US(timer);
272 rf_GenericWakeupFunc(node, 0);
273 return (0);
274 }
275
276 int
277 rf_PQWriteDoubleRecoveryFunc(node)
278 RF_DagNode_t *node;
279 {
280 /* The situation:
281 *
282 * We are doing a write that hits only one failed data unit. The other
283 * failed data unit is not being overwritten, so we need to generate
284 * it.
285 *
286 * For the moment, we assume all the nonfailed data being written is in
287 * the shadow of the failed data unit. (i.e,, either a single data
288 * unit write or the entire failed stripe unit is being overwritten. )
289 *
290 * Recovery strategy: apply the recovery data to the parity and q. Use P
291 * & Q to recover the second failed data unit in P. Zero fill Q, then
292 * apply the recovered data to p. Then apply the data being written to
293 * the failed drive. Then walk through the surviving drives, applying
294 * new data when it exists, othewise the recovery data. Quite a mess.
295 *
296 *
297 * The params
298 *
299 * read pda0, read pda1, ... read pda (numDataCol-3), write pda0, ... ,
300 * write pda (numStripeUnitAccess - numDataFailed), failed pda,
301 * raidPtr, asmap */
302
303 int np = node->numParams;
304 RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
305 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
306 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
307 int i;
308 RF_RaidAddr_t sosAddr;
309 unsigned coeff;
310 RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
311 RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
312 int numDataCol = layoutPtr->numDataCol;
313 RF_Etimer_t timer;
314 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
315
316 RF_ASSERT(node->numResults == 2);
317 RF_ASSERT(asmap->failedPDAs[1] == NULL);
318 RF_ETIMER_START(timer);
319 ppda = node->results[0];
320 qpda = node->results[1];
321 /* apply the recovery data */
322 for (i = 0; i < numDataCol - 2; i++)
323 applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);
324
325 /* determine the other failed data unit */
326 pda = asmap->failedPDAs[0];
327 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
328 /* need to determine the column of the other failed disk */
329 coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
330 /* compute the data unit offset within the column */
331 coeff = (coeff % raidPtr->Layout.numDataCol);
332 for (i = 0; i < numDataCol; i++) {
333 npda.raidAddress = sosAddr + (i * secPerSU);
334 (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
335 /* skip over dead disks */
336 if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
337 if (i != coeff)
338 break;
339 }
340 RF_ASSERT(i < numDataCol);
341 /* recover the data. The column we want to recover we write over the
342 * parity. The column we don't care about we dump in q. */
343 if (coeff < i) /* recovering 'a' */
344 rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
345 else /* recovering 'b' */
346 rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
347
348 /* OK. The valid data is in P. Zero fill Q, then inc it into it. */
349 memset(qpda->bufPtr, 0, rf_RaidAddressToByte(raidPtr, qpda->numSector));
350 rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), i);
351
352 /* now apply all the write data to the buffer */
353 /* single stripe unit write case: the failed data is only thing we are
354 * writing. */
355 RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
356 /* dest, src, len, coeff */
357 rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) asmap->failedPDAs[0]->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
358 rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr, rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp);
359
360 /* now apply all the recovery data */
361 for (i = 0; i < numDataCol - 2; i++)
362 applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);
363
364 RF_ETIMER_STOP(timer);
365 RF_ETIMER_EVAL(timer);
366 if (tracerec)
367 tracerec->q_us += RF_ETIMER_VAL_US(timer);
368
369 rf_GenericWakeupFunc(node, 0);
370 return (0);
371 }
/*
 * DAG creation entry point for the large-write case with two lost data
 * units.  There is no implementation: the body is a bare RF_PANIC().
 *
 * The parameter list comes from the RF_CREATE_DAG_FUNC_DECL macro.
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
{
	RF_PANIC();
}
376 /*
377 Two lost data unit write case.
378
379 There are really two cases here:
380
381 (1) The write completely covers the two lost data units.
382 In that case, a reconstruct write that doesn't write the
383 failed data units will do the correct thing. So in this case,
384 the dag looks like
385
386 full stripe read of surviving data units (not being overwritten)
387 write new data (ignoring failed units) compute P&Q
388 write P&Q
389
390
391 (2) The write does not completely cover both failed data units
392 (but touches at least one of them). Then we need to do the
393 equivalent of a reconstruct read to recover the missing data
394 unit from the other stripe.
395
396 For any data we are writing that is not in the "shadow"
397 of the failed units, we need to do a four cycle update.
398 PANIC on this case, for now.
399
400 */
401
402 RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
403 {
404 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
405 RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
406 int sum;
407 int nf = asmap->numDataFailed;
408
409 sum = asmap->failedPDAs[0]->numSector;
410 if (nf == 2)
411 sum += asmap->failedPDAs[1]->numSector;
412
413 if ((nf == 2) && (sum == (2 * sectorsPerSU))) {
414 /* large write case */
415 rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
416 return;
417 }
418 if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) {
419 /* small write case, no user data not in shadow */
420 rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
421 return;
422 }
423 RF_PANIC();
424 }
425 RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
426 {
427 rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
428 }
429 #endif /* (RF_INCLUDE_DECL_PQ > 0) ||
430 * (RF_INCLUDE_RAID6 > 0) */
431