/*	$NetBSD: rf_dagffwr.c,v 1.13 2003/12/30 21:59:03 oster Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * rf_dagffwr.c
 *
 * code for creating fault-free write DAGs
 *
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_dagffwr.c,v 1.13 2003/12/30 21:59:03 oster Exp $");

#include <dev/raidframe/raidframevar.h>

#include "rf_raid.h"
#include "rf_dag.h"
#include "rf_dagutils.h"
#include "rf_dagfuncs.h"
#include "rf_debugMem.h"
#include "rf_dagffrd.h"
#include "rf_general.h"
#include "rf_dagffwr.h"
49
/******************************************************************************
 *
 * General comments on DAG creation:
 *
 * All DAGs in this file use roll-away error recovery.  Each DAG has a single
 * commit node, usually called "Cmt."  If an error occurs before the Cmt node
 * is reached, the execution engine will halt forward execution and work
 * backward through the graph, executing the undo functions.  Assuming that
 * each node in the graph prior to the Cmt node is undoable and atomic - or -
 * does not make changes to permanent state, the graph will fail atomically.
 * If an error occurs after the Cmt node executes, the engine will roll-forward
 * through the graph, blindly executing nodes until it reaches the end.
 * If a graph reaches the end, it is assumed to have completed successfully.
 *
 * A graph has only 1 Cmt node.
 *
 */


/******************************************************************************
 *
 * The following wrappers map the standard DAG creation interface to the
 * DAG creation routines.  Additionally, these wrappers enable experimentation
 * with new DAG structures by providing an extra level of indirection, allowing
 * the DAG creation routines to be replaced at this single point.
 */
77
78 void
79 rf_CreateNonRedundantWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
80 RF_DagHeader_t *dag_h, void *bp,
81 RF_RaidAccessFlags_t flags,
82 RF_AllocListElem_t *allocList,
83 RF_IoType_t type)
84 {
85 rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
86 RF_IO_TYPE_WRITE);
87 }
88
89 void
90 rf_CreateRAID0WriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
91 RF_DagHeader_t *dag_h, void *bp,
92 RF_RaidAccessFlags_t flags,
93 RF_AllocListElem_t *allocList,
94 RF_IoType_t type)
95 {
96 rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
97 RF_IO_TYPE_WRITE);
98 }
99
100 void
101 rf_CreateSmallWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
102 RF_DagHeader_t *dag_h, void *bp,
103 RF_RaidAccessFlags_t flags,
104 RF_AllocListElem_t *allocList)
105 {
106 /* "normal" rollaway */
107 rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
108 &rf_xorFuncs, NULL);
109 }
110
111 void
112 rf_CreateLargeWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
113 RF_DagHeader_t *dag_h, void *bp,
114 RF_RaidAccessFlags_t flags,
115 RF_AllocListElem_t *allocList)
116 {
117 /* "normal" rollaway */
118 rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
119 1, rf_RegularXorFunc, RF_TRUE);
120 }
121
/******************************************************************************
 *
 * DAG creation code begins here
 */


/******************************************************************************
 *
 * creates a DAG to perform a large-write operation:
 *
 *           / Rod \           / Wnd \
 * H -- block- Rod - Xor - Cmt - Wnd --- T
 *           \ Rod /           \ Wnp /
 *                              \[Wnq]/
 *
 * The XOR node also does the Q calculation in the P+Q architecture.
 * All nodes before the commit node (Cmt) are assumed to be atomic and
 * undoable - or - they make no changes to permanent state.
 *
 * Rod = read old data
 * Cmt = commit node
 * Wnp = write new parity
 * Wnd = write new data
 * Wnq = write new "q"
 * [] denotes optional segments in the graph
 *
 * Parameters:  raidPtr   - description of the physical array
 *              asmap     - logical & physical addresses for this access
 *              bp        - buffer ptr (holds write data)
 *              flags     - general flags (e.g. disk locking)
 *              allocList - list of memory allocated in DAG creation
 *              nfaults   - number of faults array can tolerate
 *                          (equal to # redundancy units in stripe)
 *              redfuncs  - list of redundancy generating functions
 *
 *****************************************************************************/
159
160 void
161 rf_CommonCreateLargeWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
162 RF_DagHeader_t *dag_h, void *bp,
163 RF_RaidAccessFlags_t flags,
164 RF_AllocListElem_t *allocList,
165 int nfaults, int (*redFunc) (RF_DagNode_t *),
166 int allowBufferRecycle)
167 {
168 RF_DagNode_t *nodes, *wndNodes, *rodNodes, *xorNode, *wnpNode;
169 RF_DagNode_t *wnqNode, *blockNode, *commitNode, *termNode;
170 int nWndNodes, nRodNodes, i, nodeNum, asmNum;
171 RF_AccessStripeMapHeader_t *new_asm_h[2];
172 RF_StripeNum_t parityStripeID;
173 char *sosBuffer, *eosBuffer;
174 RF_ReconUnitNum_t which_ru;
175 RF_RaidLayout_t *layoutPtr;
176 RF_PhysDiskAddr_t *pda;
177
178 layoutPtr = &(raidPtr->Layout);
179 parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress,
180 &which_ru);
181
182 if (rf_dagDebug) {
183 printf("[Creating large-write DAG]\n");
184 }
185 dag_h->creator = "LargeWriteDAG";
186
187 dag_h->numCommitNodes = 1;
188 dag_h->numCommits = 0;
189 dag_h->numSuccedents = 1;
190
191 /* alloc the nodes: Wnd, xor, commit, block, term, and Wnp */
192 nWndNodes = asmap->numStripeUnitsAccessed;
193 RF_MallocAndAdd(nodes,
194 (nWndNodes + 4 + nfaults) * sizeof(RF_DagNode_t),
195 (RF_DagNode_t *), allocList);
196 i = 0;
197 wndNodes = &nodes[i];
198 i += nWndNodes;
199 xorNode = &nodes[i];
200 i += 1;
201 wnpNode = &nodes[i];
202 i += 1;
203 blockNode = &nodes[i];
204 i += 1;
205 commitNode = &nodes[i];
206 i += 1;
207 termNode = &nodes[i];
208 i += 1;
209 if (nfaults == 2) {
210 wnqNode = &nodes[i];
211 i += 1;
212 } else {
213 wnqNode = NULL;
214 }
215 rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h, new_asm_h,
216 &nRodNodes, &sosBuffer, &eosBuffer, allocList);
217 if (nRodNodes > 0) {
218 RF_MallocAndAdd(rodNodes, nRodNodes * sizeof(RF_DagNode_t),
219 (RF_DagNode_t *), allocList);
220 } else {
221 rodNodes = NULL;
222 }
223
224 /* begin node initialization */
225 if (nRodNodes > 0) {
226 rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
227 NULL, nRodNodes, 0, 0, 0, dag_h, "Nil", allocList);
228 } else {
229 rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
230 NULL, 1, 0, 0, 0, dag_h, "Nil", allocList);
231 }
232
233 rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL,
234 nWndNodes + nfaults, 1, 0, 0, dag_h, "Cmt", allocList);
235 rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL,
236 0, nWndNodes + nfaults, 0, 0, dag_h, "Trm", allocList);
237
238 /* initialize the Rod nodes */
239 for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) {
240 if (new_asm_h[asmNum]) {
241 pda = new_asm_h[asmNum]->stripeMap->physInfo;
242 while (pda) {
243 rf_InitNode(&rodNodes[nodeNum], rf_wait, RF_FALSE, rf_DiskReadFunc,
244 rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
245 "Rod", allocList);
246 rodNodes[nodeNum].params[0].p = pda;
247 rodNodes[nodeNum].params[1].p = pda->bufPtr;
248 rodNodes[nodeNum].params[2].v = parityStripeID;
249 rodNodes[nodeNum].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
250 0, 0, which_ru);
251 nodeNum++;
252 pda = pda->next;
253 }
254 }
255 }
256 RF_ASSERT(nodeNum == nRodNodes);
257
258 /* initialize the wnd nodes */
259 pda = asmap->physInfo;
260 for (i = 0; i < nWndNodes; i++) {
261 rf_InitNode(&wndNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
262 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
263 RF_ASSERT(pda != NULL);
264 wndNodes[i].params[0].p = pda;
265 wndNodes[i].params[1].p = pda->bufPtr;
266 wndNodes[i].params[2].v = parityStripeID;
267 wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
268 pda = pda->next;
269 }
270
271 /* initialize the redundancy node */
272 if (nRodNodes > 0) {
273 rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1,
274 nRodNodes, 2 * (nWndNodes + nRodNodes) + 1, nfaults, dag_h,
275 "Xr ", allocList);
276 } else {
277 rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1,
278 1, 2 * (nWndNodes + nRodNodes) + 1, nfaults, dag_h, "Xr ", allocList);
279 }
280 xorNode->flags |= RF_DAGNODE_FLAG_YIELD;
281 for (i = 0; i < nWndNodes; i++) {
282 xorNode->params[2 * i + 0] = wndNodes[i].params[0]; /* pda */
283 xorNode->params[2 * i + 1] = wndNodes[i].params[1]; /* buf ptr */
284 }
285 for (i = 0; i < nRodNodes; i++) {
286 xorNode->params[2 * (nWndNodes + i) + 0] = rodNodes[i].params[0]; /* pda */
287 xorNode->params[2 * (nWndNodes + i) + 1] = rodNodes[i].params[1]; /* buf ptr */
288 }
289 /* xor node needs to get at RAID information */
290 xorNode->params[2 * (nWndNodes + nRodNodes)].p = raidPtr;
291
292 /*
293 * Look for an Rod node that reads a complete SU. If none, alloc a buffer
294 * to receive the parity info. Note that we can't use a new data buffer
295 * because it will not have gotten written when the xor occurs.
296 */
297 if (allowBufferRecycle) {
298 for (i = 0; i < nRodNodes; i++) {
299 if (((RF_PhysDiskAddr_t *) rodNodes[i].params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit)
300 break;
301 }
302 }
303 if ((!allowBufferRecycle) || (i == nRodNodes)) {
304 RF_MallocAndAdd(xorNode->results[0],
305 rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit),
306 (void *), allocList);
307 } else {
308 xorNode->results[0] = rodNodes[i].params[1].p;
309 }
310
311 /* initialize the Wnp node */
312 rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
313 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnp", allocList);
314 wnpNode->params[0].p = asmap->parityInfo;
315 wnpNode->params[1].p = xorNode->results[0];
316 wnpNode->params[2].v = parityStripeID;
317 wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
318 /* parityInfo must describe entire parity unit */
319 RF_ASSERT(asmap->parityInfo->next == NULL);
320
321 if (nfaults == 2) {
322 /*
323 * We never try to recycle a buffer for the Q calcuation
324 * in addition to the parity. This would cause two buffers
325 * to get smashed during the P and Q calculation, guaranteeing
326 * one would be wrong.
327 */
328 RF_MallocAndAdd(xorNode->results[1],
329 rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit),
330 (void *), allocList);
331 rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
332 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnq", allocList);
333 wnqNode->params[0].p = asmap->qInfo;
334 wnqNode->params[1].p = xorNode->results[1];
335 wnqNode->params[2].v = parityStripeID;
336 wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
337 /* parityInfo must describe entire parity unit */
338 RF_ASSERT(asmap->parityInfo->next == NULL);
339 }
340 /*
341 * Connect nodes to form graph.
342 */
343
344 /* connect dag header to block node */
345 RF_ASSERT(blockNode->numAntecedents == 0);
346 dag_h->succedents[0] = blockNode;
347
348 if (nRodNodes > 0) {
349 /* connect the block node to the Rod nodes */
350 RF_ASSERT(blockNode->numSuccedents == nRodNodes);
351 RF_ASSERT(xorNode->numAntecedents == nRodNodes);
352 for (i = 0; i < nRodNodes; i++) {
353 RF_ASSERT(rodNodes[i].numAntecedents == 1);
354 blockNode->succedents[i] = &rodNodes[i];
355 rodNodes[i].antecedents[0] = blockNode;
356 rodNodes[i].antType[0] = rf_control;
357
358 /* connect the Rod nodes to the Xor node */
359 RF_ASSERT(rodNodes[i].numSuccedents == 1);
360 rodNodes[i].succedents[0] = xorNode;
361 xorNode->antecedents[i] = &rodNodes[i];
362 xorNode->antType[i] = rf_trueData;
363 }
364 } else {
365 /* connect the block node to the Xor node */
366 RF_ASSERT(blockNode->numSuccedents == 1);
367 RF_ASSERT(xorNode->numAntecedents == 1);
368 blockNode->succedents[0] = xorNode;
369 xorNode->antecedents[0] = blockNode;
370 xorNode->antType[0] = rf_control;
371 }
372
373 /* connect the xor node to the commit node */
374 RF_ASSERT(xorNode->numSuccedents == 1);
375 RF_ASSERT(commitNode->numAntecedents == 1);
376 xorNode->succedents[0] = commitNode;
377 commitNode->antecedents[0] = xorNode;
378 commitNode->antType[0] = rf_control;
379
380 /* connect the commit node to the write nodes */
381 RF_ASSERT(commitNode->numSuccedents == nWndNodes + nfaults);
382 for (i = 0; i < nWndNodes; i++) {
383 RF_ASSERT(wndNodes->numAntecedents == 1);
384 commitNode->succedents[i] = &wndNodes[i];
385 wndNodes[i].antecedents[0] = commitNode;
386 wndNodes[i].antType[0] = rf_control;
387 }
388 RF_ASSERT(wnpNode->numAntecedents == 1);
389 commitNode->succedents[nWndNodes] = wnpNode;
390 wnpNode->antecedents[0] = commitNode;
391 wnpNode->antType[0] = rf_trueData;
392 if (nfaults == 2) {
393 RF_ASSERT(wnqNode->numAntecedents == 1);
394 commitNode->succedents[nWndNodes + 1] = wnqNode;
395 wnqNode->antecedents[0] = commitNode;
396 wnqNode->antType[0] = rf_trueData;
397 }
398 /* connect the write nodes to the term node */
399 RF_ASSERT(termNode->numAntecedents == nWndNodes + nfaults);
400 RF_ASSERT(termNode->numSuccedents == 0);
401 for (i = 0; i < nWndNodes; i++) {
402 RF_ASSERT(wndNodes->numSuccedents == 1);
403 wndNodes[i].succedents[0] = termNode;
404 termNode->antecedents[i] = &wndNodes[i];
405 termNode->antType[i] = rf_control;
406 }
407 RF_ASSERT(wnpNode->numSuccedents == 1);
408 wnpNode->succedents[0] = termNode;
409 termNode->antecedents[nWndNodes] = wnpNode;
410 termNode->antType[nWndNodes] = rf_control;
411 if (nfaults == 2) {
412 RF_ASSERT(wnqNode->numSuccedents == 1);
413 wnqNode->succedents[0] = termNode;
414 termNode->antecedents[nWndNodes + 1] = wnqNode;
415 termNode->antType[nWndNodes + 1] = rf_control;
416 }
417 }
/******************************************************************************
 *
 * creates a DAG to perform a small-write operation (either raid 5 or pq),
 * which is as follows:
 *
 * Hdr -> Nil -> Rop -> Xor -> Cmt ----> Wnp [Unp] --> Trm
 *            \- Rod X      /     \----> Wnd [Und]-/
 *           [\- Rod X     /       \---> Wnd [Und]-/]
 *           [\- Roq -> Q /         \--> Wnq [Unq]-/]
 *
 * Rop = read old parity
 * Rod = read old data
 * Roq = read old "q"
 * Cmt = commit node
 * Und = unlock data disk
 * Unp = unlock parity disk
 * Unq = unlock q disk
 * Wnp = write new parity
 * Wnd = write new data
 * Wnq = write new "q"
 * [ ] denotes optional segments in the graph
 *
 * Parameters:  raidPtr   - description of the physical array
 *              asmap     - logical & physical addresses for this access
 *              bp        - buffer ptr (holds write data)
 *              flags     - general flags (e.g. disk locking)
 *              allocList - list of memory allocated in DAG creation
 *              pfuncs    - list of parity generating functions
 *              qfuncs    - list of q generating functions
 *
 * A null qfuncs indicates single fault tolerant
 *****************************************************************************/
450
451 void
452 rf_CommonCreateSmallWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
453 RF_DagHeader_t *dag_h, void *bp,
454 RF_RaidAccessFlags_t flags,
455 RF_AllocListElem_t *allocList,
456 const RF_RedFuncs_t *pfuncs,
457 const RF_RedFuncs_t *qfuncs)
458 {
459 RF_DagNode_t *readDataNodes, *readParityNodes, *readQNodes, *termNode;
460 RF_DagNode_t *unlockDataNodes, *unlockParityNodes, *unlockQNodes;
461 RF_DagNode_t *xorNodes, *qNodes, *blockNode, *commitNode, *nodes;
462 RF_DagNode_t *writeDataNodes, *writeParityNodes, *writeQNodes;
463 int i, j, nNodes, totalNumNodes, lu_flag;
464 RF_ReconUnitNum_t which_ru;
465 int (*func) (RF_DagNode_t *), (*undoFunc) (RF_DagNode_t *);
466 int (*qfunc) (RF_DagNode_t *);
467 int numDataNodes, numParityNodes;
468 RF_StripeNum_t parityStripeID;
469 RF_PhysDiskAddr_t *pda;
470 char *name, *qname;
471 long nfaults;
472
473 nfaults = qfuncs ? 2 : 1;
474 lu_flag = (rf_enableAtomicRMW) ? 1 : 0; /* lock/unlock flag */
475
476 parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
477 asmap->raidAddress, &which_ru);
478 pda = asmap->physInfo;
479 numDataNodes = asmap->numStripeUnitsAccessed;
480 numParityNodes = (asmap->parityInfo->next) ? 2 : 1;
481
482 if (rf_dagDebug) {
483 printf("[Creating small-write DAG]\n");
484 }
485 RF_ASSERT(numDataNodes > 0);
486 dag_h->creator = "SmallWriteDAG";
487
488 dag_h->numCommitNodes = 1;
489 dag_h->numCommits = 0;
490 dag_h->numSuccedents = 1;
491
492 /*
493 * DAG creation occurs in four steps:
494 * 1. count the number of nodes in the DAG
495 * 2. create the nodes
496 * 3. initialize the nodes
497 * 4. connect the nodes
498 */
499
500 /*
501 * Step 1. compute number of nodes in the graph
502 */
503
504 /* number of nodes: a read and write for each data unit a redundancy
505 * computation node for each parity node (nfaults * nparity) a read
506 * and write for each parity unit a block and commit node (2) a
507 * terminate node if atomic RMW an unlock node for each data unit,
508 * redundancy unit */
509 totalNumNodes = (2 * numDataNodes) + (nfaults * numParityNodes)
510 + (nfaults * 2 * numParityNodes) + 3;
511 if (lu_flag) {
512 totalNumNodes += (numDataNodes + (nfaults * numParityNodes));
513 }
514 /*
515 * Step 2. create the nodes
516 */
517 RF_MallocAndAdd(nodes, totalNumNodes * sizeof(RF_DagNode_t),
518 (RF_DagNode_t *), allocList);
519 i = 0;
520 blockNode = &nodes[i];
521 i += 1;
522 commitNode = &nodes[i];
523 i += 1;
524 readDataNodes = &nodes[i];
525 i += numDataNodes;
526 readParityNodes = &nodes[i];
527 i += numParityNodes;
528 writeDataNodes = &nodes[i];
529 i += numDataNodes;
530 writeParityNodes = &nodes[i];
531 i += numParityNodes;
532 xorNodes = &nodes[i];
533 i += numParityNodes;
534 termNode = &nodes[i];
535 i += 1;
536 if (lu_flag) {
537 unlockDataNodes = &nodes[i];
538 i += numDataNodes;
539 unlockParityNodes = &nodes[i];
540 i += numParityNodes;
541 } else {
542 unlockDataNodes = unlockParityNodes = NULL;
543 }
544 if (nfaults == 2) {
545 readQNodes = &nodes[i];
546 i += numParityNodes;
547 writeQNodes = &nodes[i];
548 i += numParityNodes;
549 qNodes = &nodes[i];
550 i += numParityNodes;
551 if (lu_flag) {
552 unlockQNodes = &nodes[i];
553 i += numParityNodes;
554 } else {
555 unlockQNodes = NULL;
556 }
557 } else {
558 readQNodes = writeQNodes = qNodes = unlockQNodes = NULL;
559 }
560 RF_ASSERT(i == totalNumNodes);
561
562 /*
563 * Step 3. initialize the nodes
564 */
565 /* initialize block node (Nil) */
566 nNodes = numDataNodes + (nfaults * numParityNodes);
567 rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
568 NULL, nNodes, 0, 0, 0, dag_h, "Nil", allocList);
569
570 /* initialize commit node (Cmt) */
571 rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
572 NULL, nNodes, (nfaults * numParityNodes), 0, 0, dag_h, "Cmt", allocList);
573
574 /* initialize terminate node (Trm) */
575 rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
576 NULL, 0, nNodes, 0, 0, dag_h, "Trm", allocList);
577
578 /* initialize nodes which read old data (Rod) */
579 for (i = 0; i < numDataNodes; i++) {
580 rf_InitNode(&readDataNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
581 rf_GenericWakeupFunc, (nfaults * numParityNodes), 1, 4, 0, dag_h,
582 "Rod", allocList);
583 RF_ASSERT(pda != NULL);
584 /* physical disk addr desc */
585 readDataNodes[i].params[0].p = pda;
586 /* buffer to hold old data */
587 readDataNodes[i].params[1].p = rf_AllocBuffer(raidPtr,
588 dag_h, pda, allocList);
589 readDataNodes[i].params[2].v = parityStripeID;
590 readDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
591 lu_flag, 0, which_ru);
592 pda = pda->next;
593 for (j = 0; j < readDataNodes[i].numSuccedents; j++) {
594 readDataNodes[i].propList[j] = NULL;
595 }
596 }
597
598 /* initialize nodes which read old parity (Rop) */
599 pda = asmap->parityInfo;
600 i = 0;
601 for (i = 0; i < numParityNodes; i++) {
602 RF_ASSERT(pda != NULL);
603 rf_InitNode(&readParityNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc,
604 rf_DiskReadUndoFunc, rf_GenericWakeupFunc, numParityNodes, 1, 4,
605 0, dag_h, "Rop", allocList);
606 readParityNodes[i].params[0].p = pda;
607 /* buffer to hold old parity */
608 readParityNodes[i].params[1].p = rf_AllocBuffer(raidPtr,
609 dag_h, pda, allocList);
610 readParityNodes[i].params[2].v = parityStripeID;
611 readParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
612 lu_flag, 0, which_ru);
613 pda = pda->next;
614 for (j = 0; j < readParityNodes[i].numSuccedents; j++) {
615 readParityNodes[i].propList[0] = NULL;
616 }
617 }
618
619 /* initialize nodes which read old Q (Roq) */
620 if (nfaults == 2) {
621 pda = asmap->qInfo;
622 for (i = 0; i < numParityNodes; i++) {
623 RF_ASSERT(pda != NULL);
624 rf_InitNode(&readQNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
625 rf_GenericWakeupFunc, numParityNodes, 1, 4, 0, dag_h, "Roq", allocList);
626 readQNodes[i].params[0].p = pda;
627 /* buffer to hold old Q */
628 readQNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda,
629 allocList);
630 readQNodes[i].params[2].v = parityStripeID;
631 readQNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
632 lu_flag, 0, which_ru);
633 pda = pda->next;
634 for (j = 0; j < readQNodes[i].numSuccedents; j++) {
635 readQNodes[i].propList[0] = NULL;
636 }
637 }
638 }
639 /* initialize nodes which write new data (Wnd) */
640 pda = asmap->physInfo;
641 for (i = 0; i < numDataNodes; i++) {
642 RF_ASSERT(pda != NULL);
643 rf_InitNode(&writeDataNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc,
644 rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
645 "Wnd", allocList);
646 /* physical disk addr desc */
647 writeDataNodes[i].params[0].p = pda;
648 /* buffer holding new data to be written */
649 writeDataNodes[i].params[1].p = pda->bufPtr;
650 writeDataNodes[i].params[2].v = parityStripeID;
651 writeDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
652 0, 0, which_ru);
653 if (lu_flag) {
654 /* initialize node to unlock the disk queue */
655 rf_InitNode(&unlockDataNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc,
656 rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h,
657 "Und", allocList);
658 /* physical disk addr desc */
659 unlockDataNodes[i].params[0].p = pda;
660 unlockDataNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
661 0, lu_flag, which_ru);
662 }
663 pda = pda->next;
664 }
665
666 /*
667 * Initialize nodes which compute new parity and Q.
668 */
669 /*
670 * We use the simple XOR func in the double-XOR case, and when
671 * we're accessing only a portion of one stripe unit. The distinction
672 * between the two is that the regular XOR func assumes that the targbuf
673 * is a full SU in size, and examines the pda associated with the buffer
674 * to decide where within the buffer to XOR the data, whereas
675 * the simple XOR func just XORs the data into the start of the buffer.
676 */
677 if ((numParityNodes == 2) || ((numDataNodes == 1)
678 && (asmap->totalSectorsAccessed < raidPtr->Layout.sectorsPerStripeUnit))) {
679 func = pfuncs->simple;
680 undoFunc = rf_NullNodeUndoFunc;
681 name = pfuncs->SimpleName;
682 if (qfuncs) {
683 qfunc = qfuncs->simple;
684 qname = qfuncs->SimpleName;
685 } else {
686 qfunc = NULL;
687 qname = NULL;
688 }
689 } else {
690 func = pfuncs->regular;
691 undoFunc = rf_NullNodeUndoFunc;
692 name = pfuncs->RegularName;
693 if (qfuncs) {
694 qfunc = qfuncs->regular;
695 qname = qfuncs->RegularName;
696 } else {
697 qfunc = NULL;
698 qname = NULL;
699 }
700 }
701 /*
702 * Initialize the xor nodes: params are {pda,buf}
703 * from {Rod,Wnd,Rop} nodes, and raidPtr
704 */
705 if (numParityNodes == 2) {
706 /* double-xor case */
707 for (i = 0; i < numParityNodes; i++) {
708 /* note: no wakeup func for xor */
709 rf_InitNode(&xorNodes[i], rf_wait, RF_FALSE, func, undoFunc, NULL,
710 1, (numDataNodes + numParityNodes), 7, 1, dag_h, name, allocList);
711 xorNodes[i].flags |= RF_DAGNODE_FLAG_YIELD;
712 xorNodes[i].params[0] = readDataNodes[i].params[0];
713 xorNodes[i].params[1] = readDataNodes[i].params[1];
714 xorNodes[i].params[2] = readParityNodes[i].params[0];
715 xorNodes[i].params[3] = readParityNodes[i].params[1];
716 xorNodes[i].params[4] = writeDataNodes[i].params[0];
717 xorNodes[i].params[5] = writeDataNodes[i].params[1];
718 xorNodes[i].params[6].p = raidPtr;
719 /* use old parity buf as target buf */
720 xorNodes[i].results[0] = readParityNodes[i].params[1].p;
721 if (nfaults == 2) {
722 /* note: no wakeup func for qor */
723 rf_InitNode(&qNodes[i], rf_wait, RF_FALSE, qfunc, undoFunc, NULL, 1,
724 (numDataNodes + numParityNodes), 7, 1, dag_h, qname, allocList);
725 qNodes[i].params[0] = readDataNodes[i].params[0];
726 qNodes[i].params[1] = readDataNodes[i].params[1];
727 qNodes[i].params[2] = readQNodes[i].params[0];
728 qNodes[i].params[3] = readQNodes[i].params[1];
729 qNodes[i].params[4] = writeDataNodes[i].params[0];
730 qNodes[i].params[5] = writeDataNodes[i].params[1];
731 qNodes[i].params[6].p = raidPtr;
732 /* use old Q buf as target buf */
733 qNodes[i].results[0] = readQNodes[i].params[1].p;
734 }
735 }
736 } else {
737 /* there is only one xor node in this case */
738 rf_InitNode(&xorNodes[0], rf_wait, RF_FALSE, func, undoFunc, NULL, 1,
739 (numDataNodes + numParityNodes),
740 (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h, name, allocList);
741 xorNodes[0].flags |= RF_DAGNODE_FLAG_YIELD;
742 for (i = 0; i < numDataNodes + 1; i++) {
743 /* set up params related to Rod and Rop nodes */
744 xorNodes[0].params[2 * i + 0] = readDataNodes[i].params[0]; /* pda */
745 xorNodes[0].params[2 * i + 1] = readDataNodes[i].params[1]; /* buffer ptr */
746 }
747 for (i = 0; i < numDataNodes; i++) {
748 /* set up params related to Wnd and Wnp nodes */
749 xorNodes[0].params[2 * (numDataNodes + 1 + i) + 0] = /* pda */
750 writeDataNodes[i].params[0];
751 xorNodes[0].params[2 * (numDataNodes + 1 + i) + 1] = /* buffer ptr */
752 writeDataNodes[i].params[1];
753 }
754 /* xor node needs to get at RAID information */
755 xorNodes[0].params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr;
756 xorNodes[0].results[0] = readParityNodes[0].params[1].p;
757 if (nfaults == 2) {
758 rf_InitNode(&qNodes[0], rf_wait, RF_FALSE, qfunc, undoFunc, NULL, 1,
759 (numDataNodes + numParityNodes),
760 (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h,
761 qname, allocList);
762 for (i = 0; i < numDataNodes; i++) {
763 /* set up params related to Rod */
764 qNodes[0].params[2 * i + 0] = readDataNodes[i].params[0]; /* pda */
765 qNodes[0].params[2 * i + 1] = readDataNodes[i].params[1]; /* buffer ptr */
766 }
767 /* and read old q */
768 qNodes[0].params[2 * numDataNodes + 0] = /* pda */
769 readQNodes[0].params[0];
770 qNodes[0].params[2 * numDataNodes + 1] = /* buffer ptr */
771 readQNodes[0].params[1];
772 for (i = 0; i < numDataNodes; i++) {
773 /* set up params related to Wnd nodes */
774 qNodes[0].params[2 * (numDataNodes + 1 + i) + 0] = /* pda */
775 writeDataNodes[i].params[0];
776 qNodes[0].params[2 * (numDataNodes + 1 + i) + 1] = /* buffer ptr */
777 writeDataNodes[i].params[1];
778 }
779 /* xor node needs to get at RAID information */
780 qNodes[0].params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr;
781 qNodes[0].results[0] = readQNodes[0].params[1].p;
782 }
783 }
784
785 /* initialize nodes which write new parity (Wnp) */
786 pda = asmap->parityInfo;
787 for (i = 0; i < numParityNodes; i++) {
788 rf_InitNode(&writeParityNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc,
789 rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
790 "Wnp", allocList);
791 RF_ASSERT(pda != NULL);
792 writeParityNodes[i].params[0].p = pda; /* param 1 (bufPtr)
793 * filled in by xor node */
794 writeParityNodes[i].params[1].p = xorNodes[i].results[0]; /* buffer pointer for
795 * parity write
796 * operation */
797 writeParityNodes[i].params[2].v = parityStripeID;
798 writeParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
799 0, 0, which_ru);
800 if (lu_flag) {
801 /* initialize node to unlock the disk queue */
802 rf_InitNode(&unlockParityNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc,
803 rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h,
804 "Unp", allocList);
805 unlockParityNodes[i].params[0].p = pda; /* physical disk addr
806 * desc */
807 unlockParityNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
808 0, lu_flag, which_ru);
809 }
810 pda = pda->next;
811 }
812
813 /* initialize nodes which write new Q (Wnq) */
814 if (nfaults == 2) {
815 pda = asmap->qInfo;
816 for (i = 0; i < numParityNodes; i++) {
817 rf_InitNode(&writeQNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc,
818 rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
819 "Wnq", allocList);
820 RF_ASSERT(pda != NULL);
821 writeQNodes[i].params[0].p = pda; /* param 1 (bufPtr)
822 * filled in by xor node */
823 writeQNodes[i].params[1].p = qNodes[i].results[0]; /* buffer pointer for
824 * parity write
825 * operation */
826 writeQNodes[i].params[2].v = parityStripeID;
827 writeQNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
828 0, 0, which_ru);
829 if (lu_flag) {
830 /* initialize node to unlock the disk queue */
831 rf_InitNode(&unlockQNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc,
832 rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h,
833 "Unq", allocList);
834 unlockQNodes[i].params[0].p = pda; /* physical disk addr
835 * desc */
836 unlockQNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
837 0, lu_flag, which_ru);
838 }
839 pda = pda->next;
840 }
841 }
842 /*
843 * Step 4. connect the nodes.
844 */
845
846 /* connect header to block node */
847 dag_h->succedents[0] = blockNode;
848
849 /* connect block node to read old data nodes */
850 RF_ASSERT(blockNode->numSuccedents == (numDataNodes + (numParityNodes * nfaults)));
851 for (i = 0; i < numDataNodes; i++) {
852 blockNode->succedents[i] = &readDataNodes[i];
853 RF_ASSERT(readDataNodes[i].numAntecedents == 1);
854 readDataNodes[i].antecedents[0] = blockNode;
855 readDataNodes[i].antType[0] = rf_control;
856 }
857
858 /* connect block node to read old parity nodes */
859 for (i = 0; i < numParityNodes; i++) {
860 blockNode->succedents[numDataNodes + i] = &readParityNodes[i];
861 RF_ASSERT(readParityNodes[i].numAntecedents == 1);
862 readParityNodes[i].antecedents[0] = blockNode;
863 readParityNodes[i].antType[0] = rf_control;
864 }
865
866 /* connect block node to read old Q nodes */
867 if (nfaults == 2) {
868 for (i = 0; i < numParityNodes; i++) {
869 blockNode->succedents[numDataNodes + numParityNodes + i] = &readQNodes[i];
870 RF_ASSERT(readQNodes[i].numAntecedents == 1);
871 readQNodes[i].antecedents[0] = blockNode;
872 readQNodes[i].antType[0] = rf_control;
873 }
874 }
875 /* connect read old data nodes to xor nodes */
876 for (i = 0; i < numDataNodes; i++) {
877 RF_ASSERT(readDataNodes[i].numSuccedents == (nfaults * numParityNodes));
878 for (j = 0; j < numParityNodes; j++) {
879 RF_ASSERT(xorNodes[j].numAntecedents == numDataNodes + numParityNodes);
880 readDataNodes[i].succedents[j] = &xorNodes[j];
881 xorNodes[j].antecedents[i] = &readDataNodes[i];
882 xorNodes[j].antType[i] = rf_trueData;
883 }
884 }
885
886 /* connect read old data nodes to q nodes */
887 if (nfaults == 2) {
888 for (i = 0; i < numDataNodes; i++) {
889 for (j = 0; j < numParityNodes; j++) {
890 RF_ASSERT(qNodes[j].numAntecedents == numDataNodes + numParityNodes);
891 readDataNodes[i].succedents[numParityNodes + j] = &qNodes[j];
892 qNodes[j].antecedents[i] = &readDataNodes[i];
893 qNodes[j].antType[i] = rf_trueData;
894 }
895 }
896 }
897 /* connect read old parity nodes to xor nodes */
898 for (i = 0; i < numParityNodes; i++) {
899 RF_ASSERT(readParityNodes[i].numSuccedents == numParityNodes);
900 for (j = 0; j < numParityNodes; j++) {
901 readParityNodes[i].succedents[j] = &xorNodes[j];
902 xorNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
903 xorNodes[j].antType[numDataNodes + i] = rf_trueData;
904 }
905 }
906
907 /* connect read old q nodes to q nodes */
908 if (nfaults == 2) {
909 for (i = 0; i < numParityNodes; i++) {
910 RF_ASSERT(readParityNodes[i].numSuccedents == numParityNodes);
911 for (j = 0; j < numParityNodes; j++) {
912 readQNodes[i].succedents[j] = &qNodes[j];
913 qNodes[j].antecedents[numDataNodes + i] = &readQNodes[i];
914 qNodes[j].antType[numDataNodes + i] = rf_trueData;
915 }
916 }
917 }
918 /* connect xor nodes to commit node */
919 RF_ASSERT(commitNode->numAntecedents == (nfaults * numParityNodes));
920 for (i = 0; i < numParityNodes; i++) {
921 RF_ASSERT(xorNodes[i].numSuccedents == 1);
922 xorNodes[i].succedents[0] = commitNode;
923 commitNode->antecedents[i] = &xorNodes[i];
924 commitNode->antType[i] = rf_control;
925 }
926
927 /* connect q nodes to commit node */
928 if (nfaults == 2) {
929 for (i = 0; i < numParityNodes; i++) {
930 RF_ASSERT(qNodes[i].numSuccedents == 1);
931 qNodes[i].succedents[0] = commitNode;
932 commitNode->antecedents[i + numParityNodes] = &qNodes[i];
933 commitNode->antType[i + numParityNodes] = rf_control;
934 }
935 }
936 /* connect commit node to write nodes */
937 RF_ASSERT(commitNode->numSuccedents == (numDataNodes + (nfaults * numParityNodes)));
938 for (i = 0; i < numDataNodes; i++) {
939 RF_ASSERT(writeDataNodes[i].numAntecedents == 1);
940 commitNode->succedents[i] = &writeDataNodes[i];
941 writeDataNodes[i].antecedents[0] = commitNode;
942 writeDataNodes[i].antType[0] = rf_trueData;
943 }
944 for (i = 0; i < numParityNodes; i++) {
945 RF_ASSERT(writeParityNodes[i].numAntecedents == 1);
946 commitNode->succedents[i + numDataNodes] = &writeParityNodes[i];
947 writeParityNodes[i].antecedents[0] = commitNode;
948 writeParityNodes[i].antType[0] = rf_trueData;
949 }
950 if (nfaults == 2) {
951 for (i = 0; i < numParityNodes; i++) {
952 RF_ASSERT(writeQNodes[i].numAntecedents == 1);
953 commitNode->succedents[i + numDataNodes + numParityNodes] = &writeQNodes[i];
954 writeQNodes[i].antecedents[0] = commitNode;
955 writeQNodes[i].antType[0] = rf_trueData;
956 }
957 }
958 RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
959 RF_ASSERT(termNode->numSuccedents == 0);
960 for (i = 0; i < numDataNodes; i++) {
961 if (lu_flag) {
962 /* connect write new data nodes to unlock nodes */
963 RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
964 RF_ASSERT(unlockDataNodes[i].numAntecedents == 1);
965 writeDataNodes[i].succedents[0] = &unlockDataNodes[i];
966 unlockDataNodes[i].antecedents[0] = &writeDataNodes[i];
967 unlockDataNodes[i].antType[0] = rf_control;
968
969 /* connect unlock nodes to term node */
970 RF_ASSERT(unlockDataNodes[i].numSuccedents == 1);
971 unlockDataNodes[i].succedents[0] = termNode;
972 termNode->antecedents[i] = &unlockDataNodes[i];
973 termNode->antType[i] = rf_control;
974 } else {
975 /* connect write new data nodes to term node */
976 RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
977 RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
978 writeDataNodes[i].succedents[0] = termNode;
979 termNode->antecedents[i] = &writeDataNodes[i];
980 termNode->antType[i] = rf_control;
981 }
982 }
983
984 for (i = 0; i < numParityNodes; i++) {
985 if (lu_flag) {
986 /* connect write new parity nodes to unlock nodes */
987 RF_ASSERT(writeParityNodes[i].numSuccedents == 1);
988 RF_ASSERT(unlockParityNodes[i].numAntecedents == 1);
989 writeParityNodes[i].succedents[0] = &unlockParityNodes[i];
990 unlockParityNodes[i].antecedents[0] = &writeParityNodes[i];
991 unlockParityNodes[i].antType[0] = rf_control;
992
993 /* connect unlock nodes to term node */
994 RF_ASSERT(unlockParityNodes[i].numSuccedents == 1);
995 unlockParityNodes[i].succedents[0] = termNode;
996 termNode->antecedents[numDataNodes + i] = &unlockParityNodes[i];
997 termNode->antType[numDataNodes + i] = rf_control;
998 } else {
999 RF_ASSERT(writeParityNodes[i].numSuccedents == 1);
1000 writeParityNodes[i].succedents[0] = termNode;
1001 termNode->antecedents[numDataNodes + i] = &writeParityNodes[i];
1002 termNode->antType[numDataNodes + i] = rf_control;
1003 }
1004 }
1005
1006 if (nfaults == 2) {
1007 for (i = 0; i < numParityNodes; i++) {
1008 if (lu_flag) {
1009 /* connect write new Q nodes to unlock nodes */
1010 RF_ASSERT(writeQNodes[i].numSuccedents == 1);
1011 RF_ASSERT(unlockQNodes[i].numAntecedents == 1);
1012 writeQNodes[i].succedents[0] = &unlockQNodes[i];
1013 unlockQNodes[i].antecedents[0] = &writeQNodes[i];
1014 unlockQNodes[i].antType[0] = rf_control;
1015
1016 /* connect unlock nodes to unblock node */
1017 RF_ASSERT(unlockQNodes[i].numSuccedents == 1);
1018 unlockQNodes[i].succedents[0] = termNode;
1019 termNode->antecedents[numDataNodes + numParityNodes + i] = &unlockQNodes[i];
1020 termNode->antType[numDataNodes + numParityNodes + i] = rf_control;
1021 } else {
1022 RF_ASSERT(writeQNodes[i].numSuccedents == 1);
1023 writeQNodes[i].succedents[0] = termNode;
1024 termNode->antecedents[numDataNodes + numParityNodes + i] = &writeQNodes[i];
1025 termNode->antType[numDataNodes + numParityNodes + i] = rf_control;
1026 }
1027 }
1028 }
1029 }
1030
1031
1032 /******************************************************************************
1033 * create a write graph (fault-free or degraded) for RAID level 1
1034 *
1035 * Hdr -> Commit -> Wpd -> Nil -> Trm
1036 * -> Wsd ->
1037 *
1038 * The "Wpd" node writes data to the primary copy in the mirror pair
1039 * The "Wsd" node writes data to the secondary copy in the mirror pair
1040 *
 * Parameters:  raidPtr   - description of the physical array
 *              asmap     - logical & physical addresses for this access
 *              dag_h     - DAG header to fill in
 *              bp        - buffer ptr (holds write data)
 *              flags     - general flags (e.g. disk locking)
 *              allocList - list of memory allocated in DAG creation
1046 *****************************************************************************/
1047
1048 void
1049 rf_CreateRaidOneWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
1050 RF_DagHeader_t *dag_h, void *bp,
1051 RF_RaidAccessFlags_t flags,
1052 RF_AllocListElem_t *allocList)
1053 {
1054 RF_DagNode_t *unblockNode, *termNode, *commitNode;
1055 RF_DagNode_t *nodes, *wndNode, *wmirNode;
1056 int nWndNodes, nWmirNodes, i;
1057 RF_ReconUnitNum_t which_ru;
1058 RF_PhysDiskAddr_t *pda, *pdaP;
1059 RF_StripeNum_t parityStripeID;
1060
1061 parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
1062 asmap->raidAddress, &which_ru);
1063 if (rf_dagDebug) {
1064 printf("[Creating RAID level 1 write DAG]\n");
1065 }
1066 dag_h->creator = "RaidOneWriteDAG";
1067
1068 /* 2 implies access not SU aligned */
1069 nWmirNodes = (asmap->parityInfo->next) ? 2 : 1;
1070 nWndNodes = (asmap->physInfo->next) ? 2 : 1;
1071
1072 /* alloc the Wnd nodes and the Wmir node */
1073 if (asmap->numDataFailed == 1)
1074 nWndNodes--;
1075 if (asmap->numParityFailed == 1)
1076 nWmirNodes--;
1077
1078 /* total number of nodes = nWndNodes + nWmirNodes + (commit + unblock
1079 * + terminator) */
1080 RF_MallocAndAdd(nodes,
1081 (nWndNodes + nWmirNodes + 3) * sizeof(RF_DagNode_t),
1082 (RF_DagNode_t *), allocList);
1083 i = 0;
1084 wndNode = &nodes[i];
1085 i += nWndNodes;
1086 wmirNode = &nodes[i];
1087 i += nWmirNodes;
1088 commitNode = &nodes[i];
1089 i += 1;
1090 unblockNode = &nodes[i];
1091 i += 1;
1092 termNode = &nodes[i];
1093 i += 1;
1094 RF_ASSERT(i == (nWndNodes + nWmirNodes + 3));
1095
1096 /* this dag can commit immediately */
1097 dag_h->numCommitNodes = 1;
1098 dag_h->numCommits = 0;
1099 dag_h->numSuccedents = 1;
1100
1101 /* initialize the commit, unblock, and term nodes */
1102 rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
1103 NULL, (nWndNodes + nWmirNodes), 0, 0, 0, dag_h, "Cmt", allocList);
1104 rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
1105 NULL, 1, (nWndNodes + nWmirNodes), 0, 0, dag_h, "Nil", allocList);
1106 rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
1107 NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
1108
1109 /* initialize the wnd nodes */
1110 if (nWndNodes > 0) {
1111 pda = asmap->physInfo;
1112 for (i = 0; i < nWndNodes; i++) {
1113 rf_InitNode(&wndNode[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
1114 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wpd", allocList);
1115 RF_ASSERT(pda != NULL);
1116 wndNode[i].params[0].p = pda;
1117 wndNode[i].params[1].p = pda->bufPtr;
1118 wndNode[i].params[2].v = parityStripeID;
1119 wndNode[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
1120 pda = pda->next;
1121 }
1122 RF_ASSERT(pda == NULL);
1123 }
1124 /* initialize the mirror nodes */
1125 if (nWmirNodes > 0) {
1126 pda = asmap->physInfo;
1127 pdaP = asmap->parityInfo;
1128 for (i = 0; i < nWmirNodes; i++) {
1129 rf_InitNode(&wmirNode[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
1130 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wsd", allocList);
1131 RF_ASSERT(pda != NULL);
1132 wmirNode[i].params[0].p = pdaP;
1133 wmirNode[i].params[1].p = pda->bufPtr;
1134 wmirNode[i].params[2].v = parityStripeID;
1135 wmirNode[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
1136 pda = pda->next;
1137 pdaP = pdaP->next;
1138 }
1139 RF_ASSERT(pda == NULL);
1140 RF_ASSERT(pdaP == NULL);
1141 }
1142 /* link the header node to the commit node */
1143 RF_ASSERT(dag_h->numSuccedents == 1);
1144 RF_ASSERT(commitNode->numAntecedents == 0);
1145 dag_h->succedents[0] = commitNode;
1146
1147 /* link the commit node to the write nodes */
1148 RF_ASSERT(commitNode->numSuccedents == (nWndNodes + nWmirNodes));
1149 for (i = 0; i < nWndNodes; i++) {
1150 RF_ASSERT(wndNode[i].numAntecedents == 1);
1151 commitNode->succedents[i] = &wndNode[i];
1152 wndNode[i].antecedents[0] = commitNode;
1153 wndNode[i].antType[0] = rf_control;
1154 }
1155 for (i = 0; i < nWmirNodes; i++) {
1156 RF_ASSERT(wmirNode[i].numAntecedents == 1);
1157 commitNode->succedents[i + nWndNodes] = &wmirNode[i];
1158 wmirNode[i].antecedents[0] = commitNode;
1159 wmirNode[i].antType[0] = rf_control;
1160 }
1161
1162 /* link the write nodes to the unblock node */
1163 RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nWmirNodes));
1164 for (i = 0; i < nWndNodes; i++) {
1165 RF_ASSERT(wndNode[i].numSuccedents == 1);
1166 wndNode[i].succedents[0] = unblockNode;
1167 unblockNode->antecedents[i] = &wndNode[i];
1168 unblockNode->antType[i] = rf_control;
1169 }
1170 for (i = 0; i < nWmirNodes; i++) {
1171 RF_ASSERT(wmirNode[i].numSuccedents == 1);
1172 wmirNode[i].succedents[0] = unblockNode;
1173 unblockNode->antecedents[i + nWndNodes] = &wmirNode[i];
1174 unblockNode->antType[i + nWndNodes] = rf_control;
1175 }
1176
1177 /* link the unblock node to the term node */
1178 RF_ASSERT(unblockNode->numSuccedents == 1);
1179 RF_ASSERT(termNode->numAntecedents == 1);
1180 RF_ASSERT(termNode->numSuccedents == 0);
1181 unblockNode->succedents[0] = termNode;
1182 termNode->antecedents[0] = unblockNode;
1183 termNode->antType[0] = rf_control;
1184 }
1185