rf_dagdegwr.c revision 1.14 1 /* $NetBSD: rf_dagdegwr.c,v 1.14 2003/12/30 21:59:03 oster Exp $ */
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29 /*
30 * rf_dagdegwr.c
31 *
32 * code for creating degraded write DAGs
33 *
34 */
35
36 #include <sys/cdefs.h>
37 __KERNEL_RCSID(0, "$NetBSD: rf_dagdegwr.c,v 1.14 2003/12/30 21:59:03 oster Exp $");
38
39 #include <dev/raidframe/raidframevar.h>
40
41 #include "rf_raid.h"
42 #include "rf_dag.h"
43 #include "rf_dagutils.h"
44 #include "rf_dagfuncs.h"
45 #include "rf_debugMem.h"
46 #include "rf_general.h"
47 #include "rf_dagdegwr.h"
48
49
50 /******************************************************************************
51 *
52 * General comments on DAG creation:
53 *
54 * All DAGs in this file use roll-away error recovery. Each DAG has a single
55 * commit node, usually called "Cmt." If an error occurs before the Cmt node
56 * is reached, the execution engine will halt forward execution and work
57 * backward through the graph, executing the undo functions. Assuming that
 * each node in the graph prior to the Cmt node is undoable and atomic - or -
 * does not make changes to permanent state, the graph will fail atomically.
60 * If an error occurs after the Cmt node executes, the engine will roll-forward
61 * through the graph, blindly executing nodes until it reaches the end.
62 * If a graph reaches the end, it is assumed to have completed successfully.
63 *
64 * A graph has only 1 Cmt node.
65 *
66 */
67
68
69 /******************************************************************************
70 *
71 * The following wrappers map the standard DAG creation interface to the
72 * DAG creation routines. Additionally, these wrappers enable experimentation
73 * with new DAG structures by providing an extra level of indirection, allowing
74 * the DAG creation routines to be replaced at this single point.
75 */
76
77 static
78 RF_CREATE_DAG_FUNC_DECL(rf_CreateSimpleDegradedWriteDAG)
79 {
80 rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp,
81 flags, allocList, 1, rf_RecoveryXorFunc, RF_TRUE);
82 }
83
84 void
85 rf_CreateDegradedWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
86 RF_DagHeader_t *dag_h, void *bp,
87 RF_RaidAccessFlags_t flags,
88 RF_AllocListElem_t *allocList)
89 {
90
91 RF_ASSERT(asmap->numDataFailed == 1);
92 dag_h->creator = "DegradedWriteDAG";
93
94 /*
95 * if the access writes only a portion of the failed unit, and also
96 * writes some portion of at least one surviving unit, we create two
97 * DAGs, one for the failed component and one for the non-failed
98 * component, and do them sequentially. Note that the fact that we're
99 * accessing only a portion of the failed unit indicates that the
100 * access either starts or ends in the failed unit, and hence we need
101 * create only two dags. This is inefficient in that the same data or
102 * parity can get read and written twice using this structure. I need
103 * to fix this to do the access all at once.
104 */
105 RF_ASSERT(!(asmap->numStripeUnitsAccessed != 1 &&
106 asmap->failedPDAs[0]->numSector !=
107 raidPtr->Layout.sectorsPerStripeUnit));
108 rf_CreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags,
109 allocList);
110 }
111
112
113
114 /******************************************************************************
115 *
116 * DAG creation code begins here
117 */
118
119
120
/******************************************************************************
 *
 * CommonCreateSimpleDegradedWriteDAG -- creates a DAG to do a degraded-mode
 * write, which is as follows
 *
 *                                        / {Wnq} --\
 * hdr -> blockNode ->  Rod -> Xor -> Cmt -> Wnp ----> unblock -> term
 *                  \  {Rod} /            \  Wnd ---/
 *                                        \ {Wnd} -/
 *
 * (The "Rod" nodes of the picture are the nodes named "Rrd" in the code.)
 *
 * commit nodes: Xor, Wnd
 * NOTE(review): the code below builds one dedicated "Cmt" node and sets
 * dag_h->numCommitNodes to 1, so the line above looks out of date -- confirm.
 *
 * IMPORTANT:
 * This DAG generator does not work for double-degraded archs since it does not
 * generate Q
 *
 * This dag is essentially identical to the large-write dag, except that the
 * write to the failed data unit is suppressed.
 *
 * IMPORTANT:  this dag does not work in the case where the access writes only
 * a portion of the failed unit, and also writes some portion of at least one
 * surviving SU.  this case is handled in CreateDegradedWriteDAG above.
 *
 * The block & unblock nodes are leftovers from a previous version.  They
 * do nothing, but I haven't deleted them because it would be a tremendous
 * effort to put them back in.
 *
 * This dag is used whenever one of the data units in a write has failed.
 * If it is the parity unit that failed, the nonredundant write dag (below)
 * is used.
 *
 * Parameters:
 *   raidPtr             array descriptor; its Layout supplies the stripe
 *                       unit size
 *   asmap               the access; numDataFailed must be 1 and
 *                       failedPDAs[0] is the PDA aimed at the failed disk
 *   dag_h               DAG header; creator, the commit counters and
 *                       succedents[0] are filled in here
 *   bp, flags           not referenced by this function
 *   allocList           handed to every RF_MallocAndAdd and rf_InitNode call
 *   nfaults             number of redundancy writes: Wnp is always built,
 *                       Wnq only when nfaults == 2
 *   redFunc             function installed in the xor node
 *   allowBufferRecycle  when nonzero, a read buffer of the second recovery
 *                       ASM whose sector count equals the failed PDA's may
 *                       double as the XOR target buffer
 *****************************************************************************/

void
rf_CommonCreateSimpleDegradedWriteDAG(RF_Raid_t *raidPtr,
				      RF_AccessStripeMap_t *asmap,
				      RF_DagHeader_t *dag_h, void *bp,
				      RF_RaidAccessFlags_t flags,
				      RF_AllocListElem_t *allocList,
				      int nfaults,
				      int (*redFunc) (RF_DagNode_t *),
				      int allowBufferRecycle)
{
	int     nNodes, nRrdNodes, nWndNodes, nXorBufs, i, j, paramNum,
	        rdnodesFaked;
	RF_DagNode_t *blockNode, *unblockNode, *wnpNode, *wnqNode, *termNode;
	RF_DagNode_t *nodes, *wndNodes, *rrdNodes, *xorNode, *commitNode;
	RF_SectorCount_t sectorsPerSU;
	RF_ReconUnitNum_t which_ru;
	char   *xorTargetBuf = NULL;	/* the target buffer for the XOR
					 * operation */
	char   *overlappingPDAs;/* a temporary array of flags */
	RF_AccessStripeMapHeader_t *new_asm_h[2];
	RF_PhysDiskAddr_t *pda, *parityPDA;
	RF_StripeNum_t parityStripeID;
	RF_PhysDiskAddr_t *failedPDA;
	RF_RaidLayout_t *layoutPtr;

	layoutPtr = &(raidPtr->Layout);
	parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress,
	    &which_ru);
	sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
	/* failedPDA points to the pda within the asm that targets the failed
	 * disk */
	failedPDA = asmap->failedPDAs[0];

	if (rf_dagDebug)
		printf("[Creating degraded-write DAG]\n");

	RF_ASSERT(asmap->numDataFailed == 1);
	/* replaces whatever creator string the caller may have stored */
	dag_h->creator = "SimpleDegradedWriteDAG";

	/*
         * Generate two ASMs identifying the surviving data
         * we need in order to recover the lost data.
         */
	/* overlappingPDAs array must be zero'd */
	/* NOTE(review): nothing here clears the array, so RF_Malloc is
	 * presumably relied upon to return zeroed memory -- confirm. */
	RF_Malloc(overlappingPDAs, asmap->numStripeUnitsAccessed * sizeof(char), (char *));
	rf_GenerateFailedAccessASMs(raidPtr, asmap, failedPDA, dag_h, new_asm_h,
	    &nXorBufs, NULL, overlappingPDAs, allocList);

	/* create all the nodes at once */
	nWndNodes = asmap->numStripeUnitsAccessed - 1;	/* no access is
							 * generated for the
							 * failed pda */

	/* one read node per stripe unit of each recovery ASM that exists */
	nRrdNodes = ((new_asm_h[0]) ? new_asm_h[0]->stripeMap->numStripeUnitsAccessed : 0) +
	    ((new_asm_h[1]) ? new_asm_h[1]->stripeMap->numStripeUnitsAccessed : 0);
	/*
         * XXX
         *
         * There's a bug with a complete stripe overwrite- that means 0 reads
         * of old data, and the rest of the DAG generation code doesn't like
         * that. A release is coming, and I don't wanna risk breaking a critical
         * DAG generator, so here's what I'm gonna do- if there's no read nodes,
         * I'm gonna fake there being a read node, and I'm gonna swap in a
         * no-op node in its place (to make all the link-up code happy).
         * This should be fixed at some point.  --jimz
         */
	if (nRrdNodes == 0) {
		nRrdNodes = 1;
		rdnodesFaked = 1;
	} else {
		rdnodesFaked = 0;
	}
	/* lock, unlock, xor, Wnd, Rrd, W(nfaults) */
	/* the 5 fixed nodes are block, commit, unblock, term and xor */
	nNodes = 5 + nfaults + nWndNodes + nRrdNodes;
	RF_MallocAndAdd(nodes, nNodes * sizeof(RF_DagNode_t),
	    (RF_DagNode_t *), allocList);
	/* carve the single allocation into the individual nodes and arrays */
	i = 0;
	blockNode = &nodes[i];
	i += 1;
	commitNode = &nodes[i];
	i += 1;
	unblockNode = &nodes[i];
	i += 1;
	termNode = &nodes[i];
	i += 1;
	xorNode = &nodes[i];
	i += 1;
	wnpNode = &nodes[i];
	i += 1;
	wndNodes = &nodes[i];
	i += nWndNodes;
	rrdNodes = &nodes[i];
	i += nRrdNodes;
	if (nfaults == 2) {
		wnqNode = &nodes[i];
		i += 1;
	} else {
		wnqNode = NULL;
	}
	RF_ASSERT(i == nNodes);

	/* this dag can not commit until all rrd and xor Nodes have completed */
	dag_h->numCommitNodes = 1;
	dag_h->numCommits = 0;
	dag_h->numSuccedents = 1;

	RF_ASSERT(nRrdNodes > 0);
	/*
	 * The four counts passed to rf_InitNode after the wakeup function
	 * are, in order, successors, antecedents, params and results; the
	 * assertions in the link-up code further down check the first two.
	 */
	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
	    NULL, nRrdNodes, 0, 0, 0, dag_h, "Nil", allocList);
	rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
	    NULL, nWndNodes + nfaults, 1, 0, 0, dag_h, "Cmt", allocList);
	rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
	    NULL, 1, nWndNodes + nfaults, 0, 0, dag_h, "Nil", allocList);
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
	    NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
	rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1,
	    nRrdNodes, 2 * nXorBufs + 2, nfaults, dag_h, "Xrc", allocList);

	/*
         * Fill in the Rrd nodes. If any of the rrd buffers are the same size as
         * the failed buffer, save a pointer to it so we can use it as the target
         * of the XOR. The pdas in the rrd nodes have been range-restricted, so if
         * a buffer is the same size as the failed buffer, it must also be at the
         * same alignment within the SU.
         */
	/* NOTE(review): only the loop over new_asm_h[1] below looks for a
	 * recyclable buffer; the loop over new_asm_h[0] never does. */
	i = 0;
	if (new_asm_h[0]) {
		for (i = 0, pda = new_asm_h[0]->stripeMap->physInfo;
		    i < new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
		    i++, pda = pda->next) {
			rf_InitNode(&rrdNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
			    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rrd", allocList);
			RF_ASSERT(pda);
			rrdNodes[i].params[0].p = pda;
			rrdNodes[i].params[1].p = pda->bufPtr;
			rrdNodes[i].params[2].v = parityStripeID;
			rrdNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
		}
	}
	/* i now equals the number of stripe units accessed in new_asm_h[0] */
	if (new_asm_h[1]) {
		for (j = 0, pda = new_asm_h[1]->stripeMap->physInfo;
		    j < new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
		    j++, pda = pda->next) {
			rf_InitNode(&rrdNodes[i + j], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
			    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rrd", allocList);
			RF_ASSERT(pda);
			rrdNodes[i + j].params[0].p = pda;
			rrdNodes[i + j].params[1].p = pda->bufPtr;
			rrdNodes[i + j].params[2].v = parityStripeID;
			rrdNodes[i + j].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
			/* the last matching buffer wins */
			if (allowBufferRecycle && (pda->numSector == failedPDA->numSector))
				xorTargetBuf = pda->bufPtr;
		}
	}
	if (rdnodesFaked) {
		/*
	         * This is where we'll init that fake noop read node
	         * (XXX should the wakeup func be different?)
	         */
		rf_InitNode(&rrdNodes[0], rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
		    NULL, 1, 1, 0, 0, dag_h, "RrN", allocList);
	}
	/*
         * Make a PDA for the parity unit.  The parity PDA should start at
         * the same offset into the SU as the failed PDA.
         */
	/* Danner comment: I don't think this copy is really necessary. We are
	 * in one of two cases here. (1) The entire failed unit is written.
	 * Then asmap->parityInfo will describe the entire parity. (2) We are
	 * only writing a subset of the failed unit and nothing else. Then the
	 * asmap->parityInfo describes the failed unit and the copy can also
	 * be avoided. */

	RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
	parityPDA->col = asmap->parityInfo->col;
	/* start of the parity stripe unit plus the failed PDA's offset
	 * within its own stripe unit */
	parityPDA->startSector = ((asmap->parityInfo->startSector / sectorsPerSU)
	    * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
	parityPDA->numSector = failedPDA->numSector;

	/* no read buffer could be recycled: allocate a target of the failed
	 * PDA's size */
	if (!xorTargetBuf) {
		RF_MallocAndAdd(xorTargetBuf,
		    rf_RaidAddressToByte(raidPtr, failedPDA->numSector), (char *), allocList);
	}
	/* init the Wnp node */
	rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
	    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnp", allocList);
	wnpNode->params[0].p = parityPDA;
	wnpNode->params[1].p = xorTargetBuf;
	wnpNode->params[2].v = parityStripeID;
	wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);

	/* fill in the Wnq Node */
	if (nfaults == 2) {
		{
			/* parityPDA is reused as a scratch pointer; the Wnp
			 * node keeps the PDA allocated above */
			RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t),
			    (RF_PhysDiskAddr_t *), allocList);
			parityPDA->col = asmap->qInfo->col;
			parityPDA->startSector = ((asmap->qInfo->startSector / sectorsPerSU)
			    * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
			parityPDA->numSector = failedPDA->numSector;

			rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
			    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnq", allocList);
			wnqNode->params[0].p = parityPDA;
			/* Wnq writes from the xor node's second result buffer */
			RF_MallocAndAdd(xorNode->results[1],
			    rf_RaidAddressToByte(raidPtr, failedPDA->numSector), (char *), allocList);
			wnqNode->params[1].p = xorNode->results[1];
			wnqNode->params[2].v = parityStripeID;
			wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
		}
	}
	/* fill in the Wnd nodes */
	for (pda = asmap->physInfo, i = 0; i < nWndNodes; i++, pda = pda->next) {
		if (pda == failedPDA) {
			/* cancel the loop's i++ so that skipping the failed
			 * PDA does not consume a Wnd slot */
			i--;
			continue;
		}
		rf_InitNode(&wndNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
		    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
		RF_ASSERT(pda);
		wndNodes[i].params[0].p = pda;
		wndNodes[i].params[1].p = pda->bufPtr;
		wndNodes[i].params[2].v = parityStripeID;
		wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
	}

	/* fill in the results of the xor node */
	xorNode->results[0] = xorTargetBuf;

	/* fill in the params of the xor node */

	/* params are laid out as (pda, buffer) pairs */
	paramNum = 0;
	if (rdnodesFaked == 0) {
		for (i = 0; i < nRrdNodes; i++) {
			/* all the Rrd nodes need to be xored together */
			xorNode->params[paramNum++] = rrdNodes[i].params[0];
			xorNode->params[paramNum++] = rrdNodes[i].params[1];
		}
	}
	for (i = 0; i < nWndNodes; i++) {
		/* any Wnd nodes that overlap the failed access need to be
		 * xored in */
		if (overlappingPDAs[i]) {
			/* work on a private copy so that the Wnd node's own
			 * PDA is left as it is when the copy is restricted */
			RF_MallocAndAdd(pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
			memcpy((char *) pda, (char *) wndNodes[i].params[0].p, sizeof(RF_PhysDiskAddr_t));
			rf_RangeRestrictPDA(raidPtr, failedPDA, pda, RF_RESTRICT_DOBUFFER, 0);
			xorNode->params[paramNum++].p = pda;
			xorNode->params[paramNum++].p = pda->bufPtr;
		}
	}
	/* the flag array was obtained with plain RF_Malloc, not through
	 * allocList, so it is released by hand */
	RF_Free(overlappingPDAs, asmap->numStripeUnitsAccessed * sizeof(char));

	/*
         * Install the failed PDA into the xor param list so that the
         * new data gets xor'd in.
         */
	xorNode->params[paramNum++].p = failedPDA;
	xorNode->params[paramNum++].p = failedPDA->bufPtr;

	/*
         * The last 2 params to the recovery xor node are always the failed
         * PDA and the raidPtr. install the failedPDA even though we have just
         * done so above. This allows us to use the same XOR function for both
         * degraded reads and degraded writes.
         */
	xorNode->params[paramNum++].p = failedPDA;
	xorNode->params[paramNum++].p = raidPtr;
	RF_ASSERT(paramNum == 2 * nXorBufs + 2);

	/*
         * Code to link nodes begins here
         */

	/* link header to block node */
	RF_ASSERT(blockNode->numAntecedents == 0);
	dag_h->succedents[0] = blockNode;

	/* link block node to rd nodes */
	RF_ASSERT(blockNode->numSuccedents == nRrdNodes);
	for (i = 0; i < nRrdNodes; i++) {
		RF_ASSERT(rrdNodes[i].numAntecedents == 1);
		blockNode->succedents[i] = &rrdNodes[i];
		rrdNodes[i].antecedents[0] = blockNode;
		rrdNodes[i].antType[0] = rf_control;
	}

	/* link read nodes to xor node */
	RF_ASSERT(xorNode->numAntecedents == nRrdNodes);
	for (i = 0; i < nRrdNodes; i++) {
		RF_ASSERT(rrdNodes[i].numSuccedents == 1);
		rrdNodes[i].succedents[0] = xorNode;
		xorNode->antecedents[i] = &rrdNodes[i];
		xorNode->antType[i] = rf_trueData;
	}

	/* link xor node to commit node */
	RF_ASSERT(xorNode->numSuccedents == 1);
	RF_ASSERT(commitNode->numAntecedents == 1);
	xorNode->succedents[0] = commitNode;
	commitNode->antecedents[0] = xorNode;
	commitNode->antType[0] = rf_control;

	/* link commit node to wnd nodes */
	RF_ASSERT(commitNode->numSuccedents == nfaults + nWndNodes);
	for (i = 0; i < nWndNodes; i++) {
		RF_ASSERT(wndNodes[i].numAntecedents == 1);
		commitNode->succedents[i] = &wndNodes[i];
		wndNodes[i].antecedents[0] = commitNode;
		wndNodes[i].antType[0] = rf_control;
	}

	/* link the commit node to wnp, wnq nodes */
	/* they occupy the successor slots right after the Wnd nodes */
	RF_ASSERT(wnpNode->numAntecedents == 1);
	commitNode->succedents[nWndNodes] = wnpNode;
	wnpNode->antecedents[0] = commitNode;
	wnpNode->antType[0] = rf_control;
	if (nfaults == 2) {
		RF_ASSERT(wnqNode->numAntecedents == 1);
		commitNode->succedents[nWndNodes + 1] = wnqNode;
		wnqNode->antecedents[0] = commitNode;
		wnqNode->antType[0] = rf_control;
	}
	/* link write new data nodes to unblock node */
	RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nfaults));
	for (i = 0; i < nWndNodes; i++) {
		RF_ASSERT(wndNodes[i].numSuccedents == 1);
		wndNodes[i].succedents[0] = unblockNode;
		unblockNode->antecedents[i] = &wndNodes[i];
		unblockNode->antType[i] = rf_control;
	}

	/* link write new parity node to unblock node */
	RF_ASSERT(wnpNode->numSuccedents == 1);
	wnpNode->succedents[0] = unblockNode;
	unblockNode->antecedents[nWndNodes] = wnpNode;
	unblockNode->antType[nWndNodes] = rf_control;

	/* link write new q node to unblock node */
	if (nfaults == 2) {
		RF_ASSERT(wnqNode->numSuccedents == 1);
		wnqNode->succedents[0] = unblockNode;
		unblockNode->antecedents[nWndNodes + 1] = wnqNode;
		unblockNode->antType[nWndNodes + 1] = rf_control;
	}
	/* link unblock node to term node */
	RF_ASSERT(unblockNode->numSuccedents == 1);
	RF_ASSERT(termNode->numAntecedents == 1);
	RF_ASSERT(termNode->numSuccedents == 0);
	unblockNode->succedents[0] = termNode;
	termNode->antecedents[0] = unblockNode;
	termNode->antType[0] = rf_control;
}
/*
 * CONS_PDA(info, start, num) -- fill in *pda_p as a PDA of `num' sectors on
 * the column of asmap->info, beginning `start' sectors past the start of the
 * stripe unit that contains asmap->info->startSector, with next set to NULL
 * and a newly allocated buffer of rf_RaidAddressToByte(raidPtr, num) bytes.
 *
 * `info' names a member of *asmap (parityInfo or qInfo).  The macro expands
 * in the caller's scope and uses the caller's pda_p, asmap, secPerSU,
 * raidPtr and allocList.  `num' is evaluated twice, so it must be free of
 * side effects.
 *
 * The body is wrapped in do { } while (0) so that an invocation followed by
 * a semicolon is exactly one statement, and the `start' and `num' arguments
 * are parenthesized so that expression arguments are taken as a unit.
 */
#define CONS_PDA(info,start,num) do { \
	pda_p->col = asmap->info->col; \
	pda_p->startSector = ((asmap->info->startSector / secPerSU) * secPerSU) + (start); \
	pda_p->numSector = (num); \
	pda_p->next = NULL; \
	RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, (num)), (char *), allocList); \
} while (0)
512 #if (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_EVENODD > 0)
513 void
514 rf_WriteGenerateFailedAccessASMs(
515 RF_Raid_t * raidPtr,
516 RF_AccessStripeMap_t * asmap,
517 RF_PhysDiskAddr_t ** pdap,
518 int *nNodep,
519 RF_PhysDiskAddr_t ** pqpdap,
520 int *nPQNodep,
521 RF_AllocListElem_t * allocList)
522 {
523 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
524 int PDAPerDisk, i;
525 RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
526 int numDataCol = layoutPtr->numDataCol;
527 int state;
528 unsigned napdas;
529 RF_SectorNum_t fone_start, fone_end, ftwo_start = 0, ftwo_end;
530 RF_PhysDiskAddr_t *fone = asmap->failedPDAs[0], *ftwo = asmap->failedPDAs[1];
531 RF_PhysDiskAddr_t *pda_p;
532 RF_RaidAddr_t sosAddr;
533
534 /* determine how many pda's we will have to generate per unaccess
535 * stripe. If there is only one failed data unit, it is one; if two,
536 * possibly two, depending wether they overlap. */
537
538 fone_start = rf_StripeUnitOffset(layoutPtr, fone->startSector);
539 fone_end = fone_start + fone->numSector;
540
541 if (asmap->numDataFailed == 1) {
542 PDAPerDisk = 1;
543 state = 1;
544 RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
545 pda_p = *pqpdap;
546 /* build p */
547 CONS_PDA(parityInfo, fone_start, fone->numSector);
548 pda_p->type = RF_PDA_TYPE_PARITY;
549 pda_p++;
550 /* build q */
551 CONS_PDA(qInfo, fone_start, fone->numSector);
552 pda_p->type = RF_PDA_TYPE_Q;
553 } else {
554 ftwo_start = rf_StripeUnitOffset(layoutPtr, ftwo->startSector);
555 ftwo_end = ftwo_start + ftwo->numSector;
556 if (fone->numSector + ftwo->numSector > secPerSU) {
557 PDAPerDisk = 1;
558 state = 2;
559 RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
560 pda_p = *pqpdap;
561 CONS_PDA(parityInfo, 0, secPerSU);
562 pda_p->type = RF_PDA_TYPE_PARITY;
563 pda_p++;
564 CONS_PDA(qInfo, 0, secPerSU);
565 pda_p->type = RF_PDA_TYPE_Q;
566 } else {
567 PDAPerDisk = 2;
568 state = 3;
569 /* four of them, fone, then ftwo */
570 RF_MallocAndAdd(*pqpdap, 4 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
571 pda_p = *pqpdap;
572 CONS_PDA(parityInfo, fone_start, fone->numSector);
573 pda_p->type = RF_PDA_TYPE_PARITY;
574 pda_p++;
575 CONS_PDA(qInfo, fone_start, fone->numSector);
576 pda_p->type = RF_PDA_TYPE_Q;
577 pda_p++;
578 CONS_PDA(parityInfo, ftwo_start, ftwo->numSector);
579 pda_p->type = RF_PDA_TYPE_PARITY;
580 pda_p++;
581 CONS_PDA(qInfo, ftwo_start, ftwo->numSector);
582 pda_p->type = RF_PDA_TYPE_Q;
583 }
584 }
585 /* figure out number of nonaccessed pda */
586 napdas = PDAPerDisk * (numDataCol - 2);
587 *nPQNodep = PDAPerDisk;
588
589 *nNodep = napdas;
590 if (napdas == 0)
591 return; /* short circuit */
592
593 /* allocate up our list of pda's */
594
595 RF_MallocAndAdd(pda_p, napdas * sizeof(RF_PhysDiskAddr_t),
596 (RF_PhysDiskAddr_t *), allocList);
597 *pdap = pda_p;
598
599 /* linkem together */
600 for (i = 0; i < (napdas - 1); i++)
601 pda_p[i].next = pda_p + (i + 1);
602
603 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
604 for (i = 0; i < numDataCol; i++) {
605 if ((pda_p - (*pdap)) == napdas)
606 continue;
607 pda_p->type = RF_PDA_TYPE_DATA;
608 pda_p->raidAddress = sosAddr + (i * secPerSU);
609 (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0);
610 /* skip over dead disks */
611 if (RF_DEAD_DISK(raidPtr->Disks[pda_p->col].status))
612 continue;
613 switch (state) {
614 case 1: /* fone */
615 pda_p->numSector = fone->numSector;
616 pda_p->raidAddress += fone_start;
617 pda_p->startSector += fone_start;
618 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
619 break;
620 case 2: /* full stripe */
621 pda_p->numSector = secPerSU;
622 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, secPerSU), (char *), allocList);
623 break;
624 case 3: /* two slabs */
625 pda_p->numSector = fone->numSector;
626 pda_p->raidAddress += fone_start;
627 pda_p->startSector += fone_start;
628 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
629 pda_p++;
630 pda_p->type = RF_PDA_TYPE_DATA;
631 pda_p->raidAddress = sosAddr + (i * secPerSU);
632 (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0);
633 pda_p->numSector = ftwo->numSector;
634 pda_p->raidAddress += ftwo_start;
635 pda_p->startSector += ftwo_start;
636 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
637 break;
638 default:
639 RF_PANIC();
640 }
641 pda_p++;
642 }
643
644 RF_ASSERT(pda_p - *pdap == napdas);
645 return;
646 }
/* The PDA stored as the first parameter of a disk read/write node. */
#define DISK_NODE_PDA(node)  ((node)->params[0].p)

/*
 * DISK_NODE_PARAMS(node, pda) -- fill the four parameters of a disk node:
 * the PDA, the PDA's buffer, the parity stripe ID, and the word built by
 * RF_CREATE_PARAM3 from the normal I/O priority and which_ru.
 *
 * `node' is a node object (not a pointer).  parityStripeID and which_ru are
 * taken from the scope in which the macro is expanded.  `pda' is evaluated
 * twice, so it must be free of side effects.
 *
 * The body is wrapped in do { } while (0) so that an invocation followed by
 * a semicolon is exactly one statement.
 */
#define DISK_NODE_PARAMS(_node_,_p_) do { \
	(_node_).params[0].p = (_p_); \
	(_node_).params[1].p = (_p_)->bufPtr; \
	(_node_).params[2].v = parityStripeID; \
	(_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); \
} while (0)
654
655 void
656 rf_DoubleDegSmallWrite(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
657 RF_DagHeader_t *dag_h, void *bp,
658 RF_RaidAccessFlags_t flags,
659 RF_AllocListElem_t *allocList,
660 char *redundantReadNodeName,
661 char *redundantWriteNodeName,
662 char *recoveryNodeName,
663 int (*recovFunc) (RF_DagNode_t *))
664 {
665 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
666 RF_DagNode_t *nodes, *wudNodes, *rrdNodes, *recoveryNode, *blockNode,
667 *unblockNode, *rpNodes, *rqNodes, *wpNodes, *wqNodes, *termNode;
668 RF_PhysDiskAddr_t *pda, *pqPDAs;
669 RF_PhysDiskAddr_t *npdas;
670 int nWriteNodes, nNodes, nReadNodes, nRrdNodes, nWudNodes, i;
671 RF_ReconUnitNum_t which_ru;
672 int nPQNodes;
673 RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress, &which_ru);
674
675 /* simple small write case - First part looks like a reconstruct-read
676 * of the failed data units. Then a write of all data units not
677 * failed. */
678
679
680 /* Hdr | ------Block- / / \ Rrd Rrd ... Rrd Rp Rq \ \
681 * / -------PQ----- / \ \ Wud Wp WQ \ | /
682 * --Unblock- | T
683 *
684 * Rrd = read recovery data (potentially none) Wud = write user data
685 * (not incl. failed disks) Wp = Write P (could be two) Wq = Write Q
686 * (could be two)
687 *
688 */
689
690 rf_WriteGenerateFailedAccessASMs(raidPtr, asmap, &npdas, &nRrdNodes, &pqPDAs, &nPQNodes, allocList);
691
692 RF_ASSERT(asmap->numDataFailed == 1);
693
694 nWudNodes = asmap->numStripeUnitsAccessed - (asmap->numDataFailed);
695 nReadNodes = nRrdNodes + 2 * nPQNodes;
696 nWriteNodes = nWudNodes + 2 * nPQNodes;
697 nNodes = 4 + nReadNodes + nWriteNodes;
698
699 RF_MallocAndAdd(nodes, nNodes * sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
700 blockNode = nodes;
701 unblockNode = blockNode + 1;
702 termNode = unblockNode + 1;
703 recoveryNode = termNode + 1;
704 rrdNodes = recoveryNode + 1;
705 rpNodes = rrdNodes + nRrdNodes;
706 rqNodes = rpNodes + nPQNodes;
707 wudNodes = rqNodes + nPQNodes;
708 wpNodes = wudNodes + nWudNodes;
709 wqNodes = wpNodes + nPQNodes;
710
711 dag_h->creator = "PQ_DDSimpleSmallWrite";
712 dag_h->numSuccedents = 1;
713 dag_h->succedents[0] = blockNode;
714 rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
715 termNode->antecedents[0] = unblockNode;
716 termNode->antType[0] = rf_control;
717
718 /* init the block and unblock nodes */
719 /* The block node has all the read nodes as successors */
720 rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nReadNodes, 0, 0, 0, dag_h, "Nil", allocList);
721 for (i = 0; i < nReadNodes; i++)
722 blockNode->succedents[i] = rrdNodes + i;
723
724 /* The unblock node has all the writes as successors */
725 rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nWriteNodes, 0, 0, dag_h, "Nil", allocList);
726 for (i = 0; i < nWriteNodes; i++) {
727 unblockNode->antecedents[i] = wudNodes + i;
728 unblockNode->antType[i] = rf_control;
729 }
730 unblockNode->succedents[0] = termNode;
731
732 #define INIT_READ_NODE(node,name) \
733 rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, allocList); \
734 (node)->succedents[0] = recoveryNode; \
735 (node)->antecedents[0] = blockNode; \
736 (node)->antType[0] = rf_control;
737
738 /* build the read nodes */
739 pda = npdas;
740 for (i = 0; i < nRrdNodes; i++, pda = pda->next) {
741 INIT_READ_NODE(rrdNodes + i, "rrd");
742 DISK_NODE_PARAMS(rrdNodes[i], pda);
743 }
744
745 /* read redundancy pdas */
746 pda = pqPDAs;
747 INIT_READ_NODE(rpNodes, "Rp");
748 RF_ASSERT(pda);
749 DISK_NODE_PARAMS(rpNodes[0], pda);
750 pda++;
751 INIT_READ_NODE(rqNodes, redundantReadNodeName);
752 RF_ASSERT(pda);
753 DISK_NODE_PARAMS(rqNodes[0], pda);
754 if (nPQNodes == 2) {
755 pda++;
756 INIT_READ_NODE(rpNodes + 1, "Rp");
757 RF_ASSERT(pda);
758 DISK_NODE_PARAMS(rpNodes[1], pda);
759 pda++;
760 INIT_READ_NODE(rqNodes + 1, redundantReadNodeName);
761 RF_ASSERT(pda);
762 DISK_NODE_PARAMS(rqNodes[1], pda);
763 }
764 /* the recovery node has all reads as precedessors and all writes as
765 * successors. It generates a result for every write P or write Q
766 * node. As parameters, it takes a pda per read and a pda per stripe
767 * of user data written. It also takes as the last params the raidPtr
768 * and asm. For results, it takes PDA for P & Q. */
769
770
771 rf_InitNode(recoveryNode, rf_wait, RF_FALSE, recovFunc, rf_NullNodeUndoFunc, NULL,
772 nWriteNodes, /* succesors */
773 nReadNodes, /* preds */
774 nReadNodes + nWudNodes + 3, /* params */
775 2 * nPQNodes, /* results */
776 dag_h, recoveryNodeName, allocList);
777
778
779
780 for (i = 0; i < nReadNodes; i++) {
781 recoveryNode->antecedents[i] = rrdNodes + i;
782 recoveryNode->antType[i] = rf_control;
783 recoveryNode->params[i].p = DISK_NODE_PDA(rrdNodes + i);
784 }
785 for (i = 0; i < nWudNodes; i++) {
786 recoveryNode->succedents[i] = wudNodes + i;
787 }
788 recoveryNode->params[nReadNodes + nWudNodes].p = asmap->failedPDAs[0];
789 recoveryNode->params[nReadNodes + nWudNodes + 1].p = raidPtr;
790 recoveryNode->params[nReadNodes + nWudNodes + 2].p = asmap;
791
792 for (; i < nWriteNodes; i++)
793 recoveryNode->succedents[i] = wudNodes + i;
794
795 pda = pqPDAs;
796 recoveryNode->results[0] = pda;
797 pda++;
798 recoveryNode->results[1] = pda;
799 if (nPQNodes == 2) {
800 pda++;
801 recoveryNode->results[2] = pda;
802 pda++;
803 recoveryNode->results[3] = pda;
804 }
805 /* fill writes */
806 #define INIT_WRITE_NODE(node,name) \
807 rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, allocList); \
808 (node)->succedents[0] = unblockNode; \
809 (node)->antecedents[0] = recoveryNode; \
810 (node)->antType[0] = rf_control;
811
812 pda = asmap->physInfo;
813 for (i = 0; i < nWudNodes; i++) {
814 INIT_WRITE_NODE(wudNodes + i, "Wd");
815 DISK_NODE_PARAMS(wudNodes[i], pda);
816 recoveryNode->params[nReadNodes + i].p = DISK_NODE_PDA(wudNodes + i);
817 pda = pda->next;
818 }
819 /* write redundancy pdas */
820 pda = pqPDAs;
821 INIT_WRITE_NODE(wpNodes, "Wp");
822 RF_ASSERT(pda);
823 DISK_NODE_PARAMS(wpNodes[0], pda);
824 pda++;
825 INIT_WRITE_NODE(wqNodes, "Wq");
826 RF_ASSERT(pda);
827 DISK_NODE_PARAMS(wqNodes[0], pda);
828 if (nPQNodes == 2) {
829 pda++;
830 INIT_WRITE_NODE(wpNodes + 1, "Wp");
831 RF_ASSERT(pda);
832 DISK_NODE_PARAMS(wpNodes[1], pda);
833 pda++;
834 INIT_WRITE_NODE(wqNodes + 1, "Wq");
835 RF_ASSERT(pda);
836 DISK_NODE_PARAMS(wqNodes[1], pda);
837 }
838 }
839 #endif /* (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_EVENODD > 0) */
840