rf_dagffwr.c revision 1.15 1 /* $NetBSD: rf_dagffwr.c,v 1.15 2004/01/09 23:26:17 oster Exp $ */
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
/*
 * rf_dagffwr.c
 *
 * code for creating fault-free write DAGs
 *
 */
35
36 #include <sys/cdefs.h>
37 __KERNEL_RCSID(0, "$NetBSD: rf_dagffwr.c,v 1.15 2004/01/09 23:26:17 oster Exp $");
38
39 #include <dev/raidframe/raidframevar.h>
40
41 #include "rf_raid.h"
42 #include "rf_dag.h"
43 #include "rf_dagutils.h"
44 #include "rf_dagfuncs.h"
45 #include "rf_debugMem.h"
46 #include "rf_dagffrd.h"
47 #include "rf_general.h"
48 #include "rf_dagffwr.h"
49
50 /******************************************************************************
51 *
52 * General comments on DAG creation:
53 *
54 * All DAGs in this file use roll-away error recovery. Each DAG has a single
55 * commit node, usually called "Cmt." If an error occurs before the Cmt node
56 * is reached, the execution engine will halt forward execution and work
57 * backward through the graph, executing the undo functions. Assuming that
58 * each node in the graph prior to the Cmt node are undoable and atomic - or -
59 * does not make changes to permanent state, the graph will fail atomically.
60 * If an error occurs after the Cmt node executes, the engine will roll-forward
61 * through the graph, blindly executing nodes until it reaches the end.
62 * If a graph reaches the end, it is assumed to have completed successfully.
63 *
64 * A graph has only 1 Cmt node.
65 *
66 */
67
68
69 /******************************************************************************
70 *
71 * The following wrappers map the standard DAG creation interface to the
72 * DAG creation routines. Additionally, these wrappers enable experimentation
73 * with new DAG structures by providing an extra level of indirection, allowing
74 * the DAG creation routines to be replaced at this single point.
75 */
76
77
78 void
79 rf_CreateNonRedundantWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
80 RF_DagHeader_t *dag_h, void *bp,
81 RF_RaidAccessFlags_t flags,
82 RF_AllocListElem_t *allocList,
83 RF_IoType_t type)
84 {
85 rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
86 RF_IO_TYPE_WRITE);
87 }
88
89 void
90 rf_CreateRAID0WriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
91 RF_DagHeader_t *dag_h, void *bp,
92 RF_RaidAccessFlags_t flags,
93 RF_AllocListElem_t *allocList,
94 RF_IoType_t type)
95 {
96 rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
97 RF_IO_TYPE_WRITE);
98 }
99
100 void
101 rf_CreateSmallWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
102 RF_DagHeader_t *dag_h, void *bp,
103 RF_RaidAccessFlags_t flags,
104 RF_AllocListElem_t *allocList)
105 {
106 /* "normal" rollaway */
107 rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags,
108 allocList, &rf_xorFuncs, NULL);
109 }
110
111 void
112 rf_CreateLargeWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
113 RF_DagHeader_t *dag_h, void *bp,
114 RF_RaidAccessFlags_t flags,
115 RF_AllocListElem_t *allocList)
116 {
117 /* "normal" rollaway */
118 rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags,
119 allocList, 1, rf_RegularXorFunc, RF_TRUE);
120 }
121
122
123 /******************************************************************************
124 *
125 * DAG creation code begins here
126 */
127
128
129 /******************************************************************************
130 *
131 * creates a DAG to perform a large-write operation:
132 *
133 * / Rod \ / Wnd \
134 * H -- block- Rod - Xor - Cmt - Wnd --- T
135 * \ Rod / \ Wnp /
136 * \[Wnq]/
137 *
138 * The XOR node also does the Q calculation in the P+Q architecture.
 * All nodes before the commit node (Cmt) are assumed to be atomic and
 * undoable - or - they make no changes to permanent state.
141 *
142 * Rod = read old data
143 * Cmt = commit node
144 * Wnp = write new parity
145 * Wnd = write new data
146 * Wnq = write new "q"
147 * [] denotes optional segments in the graph
148 *
149 * Parameters: raidPtr - description of the physical array
150 * asmap - logical & physical addresses for this access
151 * bp - buffer ptr (holds write data)
152 * flags - general flags (e.g. disk locking)
153 * allocList - list of memory allocated in DAG creation
154 * nfaults - number of faults array can tolerate
155 * (equal to # redundancy units in stripe)
156 * redfuncs - list of redundancy generating functions
157 *
158 *****************************************************************************/
159
160 void
161 rf_CommonCreateLargeWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
162 RF_DagHeader_t *dag_h, void *bp,
163 RF_RaidAccessFlags_t flags,
164 RF_AllocListElem_t *allocList,
165 int nfaults, int (*redFunc) (RF_DagNode_t *),
166 int allowBufferRecycle)
167 {
168 RF_DagNode_t *nodes, *wndNodes, *rodNodes, *xorNode, *wnpNode;
169 RF_DagNode_t *wnqNode, *blockNode, *commitNode, *termNode;
170 int nWndNodes, nRodNodes, i, nodeNum, asmNum;
171 RF_AccessStripeMapHeader_t *new_asm_h[2];
172 RF_StripeNum_t parityStripeID;
173 char *sosBuffer, *eosBuffer;
174 RF_ReconUnitNum_t which_ru;
175 RF_RaidLayout_t *layoutPtr;
176 RF_PhysDiskAddr_t *pda;
177
178 layoutPtr = &(raidPtr->Layout);
179 parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr,
180 asmap->raidAddress,
181 &which_ru);
182
183 if (rf_dagDebug) {
184 printf("[Creating large-write DAG]\n");
185 }
186 dag_h->creator = "LargeWriteDAG";
187
188 dag_h->numCommitNodes = 1;
189 dag_h->numCommits = 0;
190 dag_h->numSuccedents = 1;
191
192 /* alloc the nodes: Wnd, xor, commit, block, term, and Wnp */
193 nWndNodes = asmap->numStripeUnitsAccessed;
194 RF_MallocAndAdd(nodes,
195 (nWndNodes + 4 + nfaults) * sizeof(RF_DagNode_t),
196 (RF_DagNode_t *), allocList);
197 i = 0;
198 wndNodes = &nodes[i];
199 i += nWndNodes;
200 xorNode = &nodes[i];
201 i += 1;
202 wnpNode = &nodes[i];
203 i += 1;
204 blockNode = &nodes[i];
205 i += 1;
206 commitNode = &nodes[i];
207 i += 1;
208 termNode = &nodes[i];
209 i += 1;
210 if (nfaults == 2) {
211 wnqNode = &nodes[i];
212 i += 1;
213 } else {
214 wnqNode = NULL;
215 }
216 rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h,
217 new_asm_h, &nRodNodes, &sosBuffer,
218 &eosBuffer, allocList);
219 if (nRodNodes > 0) {
220 RF_MallocAndAdd(rodNodes, nRodNodes * sizeof(RF_DagNode_t),
221 (RF_DagNode_t *), allocList);
222 } else {
223 rodNodes = NULL;
224 }
225
226 /* begin node initialization */
227 if (nRodNodes > 0) {
228 rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
229 rf_NullNodeUndoFunc, NULL, nRodNodes, 0, 0, 0,
230 dag_h, "Nil", allocList);
231 } else {
232 rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
233 rf_NullNodeUndoFunc, NULL, 1, 0, 0, 0,
234 dag_h, "Nil", allocList);
235 }
236
237 rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
238 rf_NullNodeUndoFunc, NULL, nWndNodes + nfaults, 1, 0, 0,
239 dag_h, "Cmt", allocList);
240 rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
241 rf_TerminateUndoFunc, NULL, 0, nWndNodes + nfaults, 0, 0,
242 dag_h, "Trm", allocList);
243
244 /* initialize the Rod nodes */
245 for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) {
246 if (new_asm_h[asmNum]) {
247 pda = new_asm_h[asmNum]->stripeMap->physInfo;
248 while (pda) {
249 rf_InitNode(&rodNodes[nodeNum], rf_wait,
250 RF_FALSE, rf_DiskReadFunc,
251 rf_DiskReadUndoFunc,
252 rf_GenericWakeupFunc,
253 1, 1, 4, 0, dag_h,
254 "Rod", allocList);
255 rodNodes[nodeNum].params[0].p = pda;
256 rodNodes[nodeNum].params[1].p = pda->bufPtr;
257 rodNodes[nodeNum].params[2].v = parityStripeID;
258 rodNodes[nodeNum].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
259 0, 0, which_ru);
260 nodeNum++;
261 pda = pda->next;
262 }
263 }
264 }
265 RF_ASSERT(nodeNum == nRodNodes);
266
267 /* initialize the wnd nodes */
268 pda = asmap->physInfo;
269 for (i = 0; i < nWndNodes; i++) {
270 rf_InitNode(&wndNodes[i], rf_wait, RF_FALSE,
271 rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
272 rf_GenericWakeupFunc, 1, 1, 4, 0,
273 dag_h, "Wnd", allocList);
274 RF_ASSERT(pda != NULL);
275 wndNodes[i].params[0].p = pda;
276 wndNodes[i].params[1].p = pda->bufPtr;
277 wndNodes[i].params[2].v = parityStripeID;
278 wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
279 pda = pda->next;
280 }
281
282 /* initialize the redundancy node */
283 if (nRodNodes > 0) {
284 rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc,
285 rf_NullNodeUndoFunc, NULL, 1,
286 nRodNodes, 2 * (nWndNodes + nRodNodes) + 1,
287 nfaults, dag_h, "Xr ", allocList);
288 } else {
289 rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc,
290 rf_NullNodeUndoFunc, NULL, 1,
291 1, 2 * (nWndNodes + nRodNodes) + 1,
292 nfaults, dag_h, "Xr ", allocList);
293 }
294 xorNode->flags |= RF_DAGNODE_FLAG_YIELD;
295 for (i = 0; i < nWndNodes; i++) {
296 /* pda */
297 xorNode->params[2 * i + 0] = wndNodes[i].params[0];
298 /* buf ptr */
299 xorNode->params[2 * i + 1] = wndNodes[i].params[1];
300 }
301 for (i = 0; i < nRodNodes; i++) {
302 /* pda */
303 xorNode->params[2 * (nWndNodes + i) + 0] = rodNodes[i].params[0];
304 /* buf ptr */
305 xorNode->params[2 * (nWndNodes + i) + 1] = rodNodes[i].params[1];
306 }
307 /* xor node needs to get at RAID information */
308 xorNode->params[2 * (nWndNodes + nRodNodes)].p = raidPtr;
309
310 /*
311 * Look for an Rod node that reads a complete SU. If none,
312 * alloc a buffer to receive the parity info. Note that we
313 * can't use a new data buffer because it will not have gotten
314 * written when the xor occurs. */
315 if (allowBufferRecycle) {
316 for (i = 0; i < nRodNodes; i++) {
317 if (((RF_PhysDiskAddr_t *) rodNodes[i].params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit)
318 break;
319 }
320 }
321 if ((!allowBufferRecycle) || (i == nRodNodes)) {
322 RF_MallocAndAdd(xorNode->results[0],
323 rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit),
324 (void *), allocList);
325 } else {
326 xorNode->results[0] = rodNodes[i].params[1].p;
327 }
328
329 /* initialize the Wnp node */
330 rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc,
331 rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0,
332 dag_h, "Wnp", allocList);
333 wnpNode->params[0].p = asmap->parityInfo;
334 wnpNode->params[1].p = xorNode->results[0];
335 wnpNode->params[2].v = parityStripeID;
336 wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
337 /* parityInfo must describe entire parity unit */
338 RF_ASSERT(asmap->parityInfo->next == NULL);
339
340 if (nfaults == 2) {
341 /*
342 * We never try to recycle a buffer for the Q calcuation
343 * in addition to the parity. This would cause two buffers
344 * to get smashed during the P and Q calculation, guaranteeing
345 * one would be wrong.
346 */
347 RF_MallocAndAdd(xorNode->results[1],
348 rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit),
349 (void *), allocList);
350 rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc,
351 rf_DiskWriteUndoFunc, rf_GenericWakeupFunc,
352 1, 1, 4, 0, dag_h, "Wnq", allocList);
353 wnqNode->params[0].p = asmap->qInfo;
354 wnqNode->params[1].p = xorNode->results[1];
355 wnqNode->params[2].v = parityStripeID;
356 wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
357 /* parityInfo must describe entire parity unit */
358 RF_ASSERT(asmap->parityInfo->next == NULL);
359 }
360 /*
361 * Connect nodes to form graph.
362 */
363
364 /* connect dag header to block node */
365 RF_ASSERT(blockNode->numAntecedents == 0);
366 dag_h->succedents[0] = blockNode;
367
368 if (nRodNodes > 0) {
369 /* connect the block node to the Rod nodes */
370 RF_ASSERT(blockNode->numSuccedents == nRodNodes);
371 RF_ASSERT(xorNode->numAntecedents == nRodNodes);
372 for (i = 0; i < nRodNodes; i++) {
373 RF_ASSERT(rodNodes[i].numAntecedents == 1);
374 blockNode->succedents[i] = &rodNodes[i];
375 rodNodes[i].antecedents[0] = blockNode;
376 rodNodes[i].antType[0] = rf_control;
377
378 /* connect the Rod nodes to the Xor node */
379 RF_ASSERT(rodNodes[i].numSuccedents == 1);
380 rodNodes[i].succedents[0] = xorNode;
381 xorNode->antecedents[i] = &rodNodes[i];
382 xorNode->antType[i] = rf_trueData;
383 }
384 } else {
385 /* connect the block node to the Xor node */
386 RF_ASSERT(blockNode->numSuccedents == 1);
387 RF_ASSERT(xorNode->numAntecedents == 1);
388 blockNode->succedents[0] = xorNode;
389 xorNode->antecedents[0] = blockNode;
390 xorNode->antType[0] = rf_control;
391 }
392
393 /* connect the xor node to the commit node */
394 RF_ASSERT(xorNode->numSuccedents == 1);
395 RF_ASSERT(commitNode->numAntecedents == 1);
396 xorNode->succedents[0] = commitNode;
397 commitNode->antecedents[0] = xorNode;
398 commitNode->antType[0] = rf_control;
399
400 /* connect the commit node to the write nodes */
401 RF_ASSERT(commitNode->numSuccedents == nWndNodes + nfaults);
402 for (i = 0; i < nWndNodes; i++) {
403 RF_ASSERT(wndNodes->numAntecedents == 1);
404 commitNode->succedents[i] = &wndNodes[i];
405 wndNodes[i].antecedents[0] = commitNode;
406 wndNodes[i].antType[0] = rf_control;
407 }
408 RF_ASSERT(wnpNode->numAntecedents == 1);
409 commitNode->succedents[nWndNodes] = wnpNode;
410 wnpNode->antecedents[0] = commitNode;
411 wnpNode->antType[0] = rf_trueData;
412 if (nfaults == 2) {
413 RF_ASSERT(wnqNode->numAntecedents == 1);
414 commitNode->succedents[nWndNodes + 1] = wnqNode;
415 wnqNode->antecedents[0] = commitNode;
416 wnqNode->antType[0] = rf_trueData;
417 }
418 /* connect the write nodes to the term node */
419 RF_ASSERT(termNode->numAntecedents == nWndNodes + nfaults);
420 RF_ASSERT(termNode->numSuccedents == 0);
421 for (i = 0; i < nWndNodes; i++) {
422 RF_ASSERT(wndNodes->numSuccedents == 1);
423 wndNodes[i].succedents[0] = termNode;
424 termNode->antecedents[i] = &wndNodes[i];
425 termNode->antType[i] = rf_control;
426 }
427 RF_ASSERT(wnpNode->numSuccedents == 1);
428 wnpNode->succedents[0] = termNode;
429 termNode->antecedents[nWndNodes] = wnpNode;
430 termNode->antType[nWndNodes] = rf_control;
431 if (nfaults == 2) {
432 RF_ASSERT(wnqNode->numSuccedents == 1);
433 wnqNode->succedents[0] = termNode;
434 termNode->antecedents[nWndNodes + 1] = wnqNode;
435 termNode->antType[nWndNodes + 1] = rf_control;
436 }
437 }
438 /******************************************************************************
439 *
440 * creates a DAG to perform a small-write operation (either raid 5 or pq),
441 * which is as follows:
442 *
443 * Hdr -> Nil -> Rop -> Xor -> Cmt ----> Wnp [Unp] --> Trm
444 * \- Rod X / \----> Wnd [Und]-/
445 * [\- Rod X / \---> Wnd [Und]-/]
446 * [\- Roq -> Q / \--> Wnq [Unq]-/]
447 *
448 * Rop = read old parity
449 * Rod = read old data
450 * Roq = read old "q"
451 * Cmt = commit node
452 * Und = unlock data disk
453 * Unp = unlock parity disk
454 * Unq = unlock q disk
455 * Wnp = write new parity
456 * Wnd = write new data
457 * Wnq = write new "q"
458 * [ ] denotes optional segments in the graph
459 *
460 * Parameters: raidPtr - description of the physical array
461 * asmap - logical & physical addresses for this access
462 * bp - buffer ptr (holds write data)
463 * flags - general flags (e.g. disk locking)
464 * allocList - list of memory allocated in DAG creation
465 * pfuncs - list of parity generating functions
466 * qfuncs - list of q generating functions
467 *
468 * A null qfuncs indicates single fault tolerant
469 *****************************************************************************/
470
471 void
472 rf_CommonCreateSmallWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
473 RF_DagHeader_t *dag_h, void *bp,
474 RF_RaidAccessFlags_t flags,
475 RF_AllocListElem_t *allocList,
476 const RF_RedFuncs_t *pfuncs,
477 const RF_RedFuncs_t *qfuncs)
478 {
479 RF_DagNode_t *readDataNodes, *readParityNodes, *readQNodes, *termNode;
480 RF_DagNode_t *unlockDataNodes, *unlockParityNodes, *unlockQNodes;
481 RF_DagNode_t *xorNodes, *qNodes, *blockNode, *commitNode, *nodes;
482 RF_DagNode_t *writeDataNodes, *writeParityNodes, *writeQNodes;
483 int i, j, nNodes, totalNumNodes, lu_flag;
484 RF_ReconUnitNum_t which_ru;
485 int (*func) (RF_DagNode_t *), (*undoFunc) (RF_DagNode_t *);
486 int (*qfunc) (RF_DagNode_t *);
487 int numDataNodes, numParityNodes;
488 RF_StripeNum_t parityStripeID;
489 RF_PhysDiskAddr_t *pda;
490 char *name, *qname;
491 long nfaults;
492
493 nfaults = qfuncs ? 2 : 1;
494 lu_flag = 0; /* lock/unlock flag */
495
496 parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
497 asmap->raidAddress, &which_ru);
498 pda = asmap->physInfo;
499 numDataNodes = asmap->numStripeUnitsAccessed;
500 numParityNodes = (asmap->parityInfo->next) ? 2 : 1;
501
502 if (rf_dagDebug) {
503 printf("[Creating small-write DAG]\n");
504 }
505 RF_ASSERT(numDataNodes > 0);
506 dag_h->creator = "SmallWriteDAG";
507
508 dag_h->numCommitNodes = 1;
509 dag_h->numCommits = 0;
510 dag_h->numSuccedents = 1;
511
512 /*
513 * DAG creation occurs in four steps:
514 * 1. count the number of nodes in the DAG
515 * 2. create the nodes
516 * 3. initialize the nodes
517 * 4. connect the nodes
518 */
519
520 /*
521 * Step 1. compute number of nodes in the graph
522 */
523
524 /* number of nodes: a read and write for each data unit a
525 * redundancy computation node for each parity node (nfaults *
526 * nparity) a read and write for each parity unit a block and
527 * commit node (2) a terminate node if atomic RMW an unlock
528 * node for each data unit, redundancy unit */
529 totalNumNodes = (2 * numDataNodes) + (nfaults * numParityNodes)
530 + (nfaults * 2 * numParityNodes) + 3;
531 if (lu_flag) {
532 totalNumNodes += (numDataNodes + (nfaults * numParityNodes));
533 }
534 /*
535 * Step 2. create the nodes
536 */
537 RF_MallocAndAdd(nodes, totalNumNodes * sizeof(RF_DagNode_t),
538 (RF_DagNode_t *), allocList);
539 i = 0;
540 blockNode = &nodes[i];
541 i += 1;
542 commitNode = &nodes[i];
543 i += 1;
544 readDataNodes = &nodes[i];
545 i += numDataNodes;
546 readParityNodes = &nodes[i];
547 i += numParityNodes;
548 writeDataNodes = &nodes[i];
549 i += numDataNodes;
550 writeParityNodes = &nodes[i];
551 i += numParityNodes;
552 xorNodes = &nodes[i];
553 i += numParityNodes;
554 termNode = &nodes[i];
555 i += 1;
556 if (lu_flag) {
557 unlockDataNodes = &nodes[i];
558 i += numDataNodes;
559 unlockParityNodes = &nodes[i];
560 i += numParityNodes;
561 } else {
562 unlockDataNodes = unlockParityNodes = NULL;
563 }
564 if (nfaults == 2) {
565 readQNodes = &nodes[i];
566 i += numParityNodes;
567 writeQNodes = &nodes[i];
568 i += numParityNodes;
569 qNodes = &nodes[i];
570 i += numParityNodes;
571 if (lu_flag) {
572 unlockQNodes = &nodes[i];
573 i += numParityNodes;
574 } else {
575 unlockQNodes = NULL;
576 }
577 } else {
578 readQNodes = writeQNodes = qNodes = unlockQNodes = NULL;
579 }
580 RF_ASSERT(i == totalNumNodes);
581
582 /*
583 * Step 3. initialize the nodes
584 */
585 /* initialize block node (Nil) */
586 nNodes = numDataNodes + (nfaults * numParityNodes);
587 rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
588 rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0,
589 dag_h, "Nil", allocList);
590
591 /* initialize commit node (Cmt) */
592 rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
593 rf_NullNodeUndoFunc, NULL, nNodes,
594 (nfaults * numParityNodes), 0, 0, dag_h, "Cmt", allocList);
595
596 /* initialize terminate node (Trm) */
597 rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
598 rf_TerminateUndoFunc, NULL, 0, nNodes, 0, 0,
599 dag_h, "Trm", allocList);
600
601 /* initialize nodes which read old data (Rod) */
602 for (i = 0; i < numDataNodes; i++) {
603 rf_InitNode(&readDataNodes[i], rf_wait, RF_FALSE,
604 rf_DiskReadFunc, rf_DiskReadUndoFunc,
605 rf_GenericWakeupFunc, (nfaults * numParityNodes),
606 1, 4, 0, dag_h, "Rod", allocList);
607 RF_ASSERT(pda != NULL);
608 /* physical disk addr desc */
609 readDataNodes[i].params[0].p = pda;
610 /* buffer to hold old data */
611 readDataNodes[i].params[1].p = rf_AllocBuffer(raidPtr,
612 dag_h, pda, allocList);
613 readDataNodes[i].params[2].v = parityStripeID;
614 readDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
615 lu_flag, 0, which_ru);
616 pda = pda->next;
617 for (j = 0; j < readDataNodes[i].numSuccedents; j++) {
618 readDataNodes[i].propList[j] = NULL;
619 }
620 }
621
622 /* initialize nodes which read old parity (Rop) */
623 pda = asmap->parityInfo;
624 i = 0;
625 for (i = 0; i < numParityNodes; i++) {
626 RF_ASSERT(pda != NULL);
627 rf_InitNode(&readParityNodes[i], rf_wait, RF_FALSE,
628 rf_DiskReadFunc, rf_DiskReadUndoFunc,
629 rf_GenericWakeupFunc, numParityNodes, 1, 4, 0,
630 dag_h, "Rop", allocList);
631 readParityNodes[i].params[0].p = pda;
632 /* buffer to hold old parity */
633 readParityNodes[i].params[1].p = rf_AllocBuffer(raidPtr,
634 dag_h, pda, allocList);
635 readParityNodes[i].params[2].v = parityStripeID;
636 readParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
637 lu_flag, 0, which_ru);
638 pda = pda->next;
639 for (j = 0; j < readParityNodes[i].numSuccedents; j++) {
640 readParityNodes[i].propList[0] = NULL;
641 }
642 }
643
644 /* initialize nodes which read old Q (Roq) */
645 if (nfaults == 2) {
646 pda = asmap->qInfo;
647 for (i = 0; i < numParityNodes; i++) {
648 RF_ASSERT(pda != NULL);
649 rf_InitNode(&readQNodes[i], rf_wait, RF_FALSE,
650 rf_DiskReadFunc, rf_DiskReadUndoFunc,
651 rf_GenericWakeupFunc, numParityNodes,
652 1, 4, 0, dag_h, "Roq", allocList);
653 readQNodes[i].params[0].p = pda;
654 /* buffer to hold old Q */
655 readQNodes[i].params[1].p = rf_AllocBuffer(raidPtr,
656 dag_h, pda,
657 allocList);
658 readQNodes[i].params[2].v = parityStripeID;
659 readQNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
660 lu_flag, 0, which_ru);
661 pda = pda->next;
662 for (j = 0; j < readQNodes[i].numSuccedents; j++) {
663 readQNodes[i].propList[0] = NULL;
664 }
665 }
666 }
667 /* initialize nodes which write new data (Wnd) */
668 pda = asmap->physInfo;
669 for (i = 0; i < numDataNodes; i++) {
670 RF_ASSERT(pda != NULL);
671 rf_InitNode(&writeDataNodes[i], rf_wait, RF_FALSE,
672 rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
673 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
674 "Wnd", allocList);
675 /* physical disk addr desc */
676 writeDataNodes[i].params[0].p = pda;
677 /* buffer holding new data to be written */
678 writeDataNodes[i].params[1].p = pda->bufPtr;
679 writeDataNodes[i].params[2].v = parityStripeID;
680 writeDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
681 0, 0, which_ru);
682 if (lu_flag) {
683 /* initialize node to unlock the disk queue */
684 rf_InitNode(&unlockDataNodes[i], rf_wait, RF_FALSE,
685 rf_DiskUnlockFunc, rf_DiskUnlockUndoFunc,
686 rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h,
687 "Und", allocList);
688 /* physical disk addr desc */
689 unlockDataNodes[i].params[0].p = pda;
690 unlockDataNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
691 0, lu_flag, which_ru);
692 }
693 pda = pda->next;
694 }
695
696 /*
697 * Initialize nodes which compute new parity and Q.
698 */
699 /*
700 * We use the simple XOR func in the double-XOR case, and when
701 * we're accessing only a portion of one stripe unit. The
702 * distinction between the two is that the regular XOR func
703 * assumes that the targbuf is a full SU in size, and examines
704 * the pda associated with the buffer to decide where within
705 * the buffer to XOR the data, whereas the simple XOR func
706 * just XORs the data into the start of the buffer. */
707 if ((numParityNodes == 2) || ((numDataNodes == 1)
708 && (asmap->totalSectorsAccessed <
709 raidPtr->Layout.sectorsPerStripeUnit))) {
710 func = pfuncs->simple;
711 undoFunc = rf_NullNodeUndoFunc;
712 name = pfuncs->SimpleName;
713 if (qfuncs) {
714 qfunc = qfuncs->simple;
715 qname = qfuncs->SimpleName;
716 } else {
717 qfunc = NULL;
718 qname = NULL;
719 }
720 } else {
721 func = pfuncs->regular;
722 undoFunc = rf_NullNodeUndoFunc;
723 name = pfuncs->RegularName;
724 if (qfuncs) {
725 qfunc = qfuncs->regular;
726 qname = qfuncs->RegularName;
727 } else {
728 qfunc = NULL;
729 qname = NULL;
730 }
731 }
732 /*
733 * Initialize the xor nodes: params are {pda,buf}
734 * from {Rod,Wnd,Rop} nodes, and raidPtr
735 */
736 if (numParityNodes == 2) {
737 /* double-xor case */
738 for (i = 0; i < numParityNodes; i++) {
739 /* note: no wakeup func for xor */
740 rf_InitNode(&xorNodes[i], rf_wait, RF_FALSE, func,
741 undoFunc, NULL, 1,
742 (numDataNodes + numParityNodes),
743 7, 1, dag_h, name, allocList);
744 xorNodes[i].flags |= RF_DAGNODE_FLAG_YIELD;
745 xorNodes[i].params[0] = readDataNodes[i].params[0];
746 xorNodes[i].params[1] = readDataNodes[i].params[1];
747 xorNodes[i].params[2] = readParityNodes[i].params[0];
748 xorNodes[i].params[3] = readParityNodes[i].params[1];
749 xorNodes[i].params[4] = writeDataNodes[i].params[0];
750 xorNodes[i].params[5] = writeDataNodes[i].params[1];
751 xorNodes[i].params[6].p = raidPtr;
752 /* use old parity buf as target buf */
753 xorNodes[i].results[0] = readParityNodes[i].params[1].p;
754 if (nfaults == 2) {
755 /* note: no wakeup func for qor */
756 rf_InitNode(&qNodes[i], rf_wait, RF_FALSE,
757 qfunc, undoFunc, NULL, 1,
758 (numDataNodes + numParityNodes),
759 7, 1, dag_h, qname, allocList);
760 qNodes[i].params[0] = readDataNodes[i].params[0];
761 qNodes[i].params[1] = readDataNodes[i].params[1];
762 qNodes[i].params[2] = readQNodes[i].params[0];
763 qNodes[i].params[3] = readQNodes[i].params[1];
764 qNodes[i].params[4] = writeDataNodes[i].params[0];
765 qNodes[i].params[5] = writeDataNodes[i].params[1];
766 qNodes[i].params[6].p = raidPtr;
767 /* use old Q buf as target buf */
768 qNodes[i].results[0] = readQNodes[i].params[1].p;
769 }
770 }
771 } else {
772 /* there is only one xor node in this case */
773 rf_InitNode(&xorNodes[0], rf_wait, RF_FALSE, func,
774 undoFunc, NULL, 1, (numDataNodes + numParityNodes),
775 (2 * (numDataNodes + numDataNodes + 1) + 1), 1,
776 dag_h, name, allocList);
777 xorNodes[0].flags |= RF_DAGNODE_FLAG_YIELD;
778 for (i = 0; i < numDataNodes + 1; i++) {
779 /* set up params related to Rod and Rop nodes */
780 xorNodes[0].params[2 * i + 0] = readDataNodes[i].params[0]; /* pda */
781 xorNodes[0].params[2 * i + 1] = readDataNodes[i].params[1]; /* buffer ptr */
782 }
783 for (i = 0; i < numDataNodes; i++) {
784 /* set up params related to Wnd and Wnp nodes */
785 xorNodes[0].params[2 * (numDataNodes + 1 + i) + 0] = /* pda */
786 writeDataNodes[i].params[0];
787 xorNodes[0].params[2 * (numDataNodes + 1 + i) + 1] = /* buffer ptr */
788 writeDataNodes[i].params[1];
789 }
790 /* xor node needs to get at RAID information */
791 xorNodes[0].params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr;
792 xorNodes[0].results[0] = readParityNodes[0].params[1].p;
793 if (nfaults == 2) {
794 rf_InitNode(&qNodes[0], rf_wait, RF_FALSE, qfunc,
795 undoFunc, NULL, 1,
796 (numDataNodes + numParityNodes),
797 (2 * (numDataNodes + numDataNodes + 1) + 1), 1,
798 dag_h, qname, allocList);
799 for (i = 0; i < numDataNodes; i++) {
800 /* set up params related to Rod */
801 qNodes[0].params[2 * i + 0] = readDataNodes[i].params[0]; /* pda */
802 qNodes[0].params[2 * i + 1] = readDataNodes[i].params[1]; /* buffer ptr */
803 }
804 /* and read old q */
805 qNodes[0].params[2 * numDataNodes + 0] = /* pda */
806 readQNodes[0].params[0];
807 qNodes[0].params[2 * numDataNodes + 1] = /* buffer ptr */
808 readQNodes[0].params[1];
809 for (i = 0; i < numDataNodes; i++) {
810 /* set up params related to Wnd nodes */
811 qNodes[0].params[2 * (numDataNodes + 1 + i) + 0] = /* pda */
812 writeDataNodes[i].params[0];
813 qNodes[0].params[2 * (numDataNodes + 1 + i) + 1] = /* buffer ptr */
814 writeDataNodes[i].params[1];
815 }
816 /* xor node needs to get at RAID information */
817 qNodes[0].params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr;
818 qNodes[0].results[0] = readQNodes[0].params[1].p;
819 }
820 }
821
822 /* initialize nodes which write new parity (Wnp) */
823 pda = asmap->parityInfo;
824 for (i = 0; i < numParityNodes; i++) {
825 rf_InitNode(&writeParityNodes[i], rf_wait, RF_FALSE,
826 rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
827 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
828 "Wnp", allocList);
829 RF_ASSERT(pda != NULL);
830 writeParityNodes[i].params[0].p = pda; /* param 1 (bufPtr)
831 * filled in by xor node */
832 writeParityNodes[i].params[1].p = xorNodes[i].results[0]; /* buffer pointer for
833 * parity write
834 * operation */
835 writeParityNodes[i].params[2].v = parityStripeID;
836 writeParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
837 0, 0, which_ru);
838 if (lu_flag) {
839 /* initialize node to unlock the disk queue */
840 rf_InitNode(&unlockParityNodes[i], rf_wait, RF_FALSE,
841 rf_DiskUnlockFunc, rf_DiskUnlockUndoFunc,
842 rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h,
843 "Unp", allocList);
844 unlockParityNodes[i].params[0].p = pda; /* physical disk addr
845 * desc */
846 unlockParityNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
847 0, lu_flag, which_ru);
848 }
849 pda = pda->next;
850 }
851
852 /* initialize nodes which write new Q (Wnq) */
853 if (nfaults == 2) {
854 pda = asmap->qInfo;
855 for (i = 0; i < numParityNodes; i++) {
856 rf_InitNode(&writeQNodes[i], rf_wait, RF_FALSE,
857 rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
858 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
859 "Wnq", allocList);
860 RF_ASSERT(pda != NULL);
861 writeQNodes[i].params[0].p = pda; /* param 1 (bufPtr)
862 * filled in by xor node */
863 writeQNodes[i].params[1].p = qNodes[i].results[0]; /* buffer pointer for
864 * parity write
865 * operation */
866 writeQNodes[i].params[2].v = parityStripeID;
867 writeQNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
868 0, 0, which_ru);
869 if (lu_flag) {
870 /* initialize node to unlock the disk queue */
871 rf_InitNode(&unlockQNodes[i], rf_wait,
872 RF_FALSE, rf_DiskUnlockFunc,
873 rf_DiskUnlockUndoFunc,
874 rf_GenericWakeupFunc, 1, 1, 2, 0,
875 dag_h, "Unq", allocList);
876 unlockQNodes[i].params[0].p = pda; /* physical disk addr
877 * desc */
878 unlockQNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
879 0, lu_flag, which_ru);
880 }
881 pda = pda->next;
882 }
883 }
884 /*
885 * Step 4. connect the nodes.
886 */
887
888 /* connect header to block node */
889 dag_h->succedents[0] = blockNode;
890
891 /* connect block node to read old data nodes */
892 RF_ASSERT(blockNode->numSuccedents == (numDataNodes + (numParityNodes * nfaults)));
893 for (i = 0; i < numDataNodes; i++) {
894 blockNode->succedents[i] = &readDataNodes[i];
895 RF_ASSERT(readDataNodes[i].numAntecedents == 1);
896 readDataNodes[i].antecedents[0] = blockNode;
897 readDataNodes[i].antType[0] = rf_control;
898 }
899
900 /* connect block node to read old parity nodes */
901 for (i = 0; i < numParityNodes; i++) {
902 blockNode->succedents[numDataNodes + i] = &readParityNodes[i];
903 RF_ASSERT(readParityNodes[i].numAntecedents == 1);
904 readParityNodes[i].antecedents[0] = blockNode;
905 readParityNodes[i].antType[0] = rf_control;
906 }
907
908 /* connect block node to read old Q nodes */
909 if (nfaults == 2) {
910 for (i = 0; i < numParityNodes; i++) {
911 blockNode->succedents[numDataNodes + numParityNodes + i] = &readQNodes[i];
912 RF_ASSERT(readQNodes[i].numAntecedents == 1);
913 readQNodes[i].antecedents[0] = blockNode;
914 readQNodes[i].antType[0] = rf_control;
915 }
916 }
917 /* connect read old data nodes to xor nodes */
918 for (i = 0; i < numDataNodes; i++) {
919 RF_ASSERT(readDataNodes[i].numSuccedents == (nfaults * numParityNodes));
920 for (j = 0; j < numParityNodes; j++) {
921 RF_ASSERT(xorNodes[j].numAntecedents == numDataNodes + numParityNodes);
922 readDataNodes[i].succedents[j] = &xorNodes[j];
923 xorNodes[j].antecedents[i] = &readDataNodes[i];
924 xorNodes[j].antType[i] = rf_trueData;
925 }
926 }
927
928 /* connect read old data nodes to q nodes */
929 if (nfaults == 2) {
930 for (i = 0; i < numDataNodes; i++) {
931 for (j = 0; j < numParityNodes; j++) {
932 RF_ASSERT(qNodes[j].numAntecedents == numDataNodes + numParityNodes);
933 readDataNodes[i].succedents[numParityNodes + j] = &qNodes[j];
934 qNodes[j].antecedents[i] = &readDataNodes[i];
935 qNodes[j].antType[i] = rf_trueData;
936 }
937 }
938 }
939 /* connect read old parity nodes to xor nodes */
940 for (i = 0; i < numParityNodes; i++) {
941 RF_ASSERT(readParityNodes[i].numSuccedents == numParityNodes);
942 for (j = 0; j < numParityNodes; j++) {
943 readParityNodes[i].succedents[j] = &xorNodes[j];
944 xorNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
945 xorNodes[j].antType[numDataNodes + i] = rf_trueData;
946 }
947 }
948
949 /* connect read old q nodes to q nodes */
950 if (nfaults == 2) {
951 for (i = 0; i < numParityNodes; i++) {
952 RF_ASSERT(readParityNodes[i].numSuccedents == numParityNodes);
953 for (j = 0; j < numParityNodes; j++) {
954 readQNodes[i].succedents[j] = &qNodes[j];
955 qNodes[j].antecedents[numDataNodes + i] = &readQNodes[i];
956 qNodes[j].antType[numDataNodes + i] = rf_trueData;
957 }
958 }
959 }
960 /* connect xor nodes to commit node */
961 RF_ASSERT(commitNode->numAntecedents == (nfaults * numParityNodes));
962 for (i = 0; i < numParityNodes; i++) {
963 RF_ASSERT(xorNodes[i].numSuccedents == 1);
964 xorNodes[i].succedents[0] = commitNode;
965 commitNode->antecedents[i] = &xorNodes[i];
966 commitNode->antType[i] = rf_control;
967 }
968
969 /* connect q nodes to commit node */
970 if (nfaults == 2) {
971 for (i = 0; i < numParityNodes; i++) {
972 RF_ASSERT(qNodes[i].numSuccedents == 1);
973 qNodes[i].succedents[0] = commitNode;
974 commitNode->antecedents[i + numParityNodes] = &qNodes[i];
975 commitNode->antType[i + numParityNodes] = rf_control;
976 }
977 }
978 /* connect commit node to write nodes */
979 RF_ASSERT(commitNode->numSuccedents == (numDataNodes + (nfaults * numParityNodes)));
980 for (i = 0; i < numDataNodes; i++) {
981 RF_ASSERT(writeDataNodes[i].numAntecedents == 1);
982 commitNode->succedents[i] = &writeDataNodes[i];
983 writeDataNodes[i].antecedents[0] = commitNode;
984 writeDataNodes[i].antType[0] = rf_trueData;
985 }
986 for (i = 0; i < numParityNodes; i++) {
987 RF_ASSERT(writeParityNodes[i].numAntecedents == 1);
988 commitNode->succedents[i + numDataNodes] = &writeParityNodes[i];
989 writeParityNodes[i].antecedents[0] = commitNode;
990 writeParityNodes[i].antType[0] = rf_trueData;
991 }
992 if (nfaults == 2) {
993 for (i = 0; i < numParityNodes; i++) {
994 RF_ASSERT(writeQNodes[i].numAntecedents == 1);
995 commitNode->succedents[i + numDataNodes + numParityNodes] = &writeQNodes[i];
996 writeQNodes[i].antecedents[0] = commitNode;
997 writeQNodes[i].antType[0] = rf_trueData;
998 }
999 }
1000 RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
1001 RF_ASSERT(termNode->numSuccedents == 0);
1002 for (i = 0; i < numDataNodes; i++) {
1003 if (lu_flag) {
1004 /* connect write new data nodes to unlock nodes */
1005 RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
1006 RF_ASSERT(unlockDataNodes[i].numAntecedents == 1);
1007 writeDataNodes[i].succedents[0] = &unlockDataNodes[i];
1008 unlockDataNodes[i].antecedents[0] = &writeDataNodes[i];
1009 unlockDataNodes[i].antType[0] = rf_control;
1010
1011 /* connect unlock nodes to term node */
1012 RF_ASSERT(unlockDataNodes[i].numSuccedents == 1);
1013 unlockDataNodes[i].succedents[0] = termNode;
1014 termNode->antecedents[i] = &unlockDataNodes[i];
1015 termNode->antType[i] = rf_control;
1016 } else {
1017 /* connect write new data nodes to term node */
1018 RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
1019 RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
1020 writeDataNodes[i].succedents[0] = termNode;
1021 termNode->antecedents[i] = &writeDataNodes[i];
1022 termNode->antType[i] = rf_control;
1023 }
1024 }
1025
1026 for (i = 0; i < numParityNodes; i++) {
1027 if (lu_flag) {
1028 /* connect write new parity nodes to unlock nodes */
1029 RF_ASSERT(writeParityNodes[i].numSuccedents == 1);
1030 RF_ASSERT(unlockParityNodes[i].numAntecedents == 1);
1031 writeParityNodes[i].succedents[0] = &unlockParityNodes[i];
1032 unlockParityNodes[i].antecedents[0] = &writeParityNodes[i];
1033 unlockParityNodes[i].antType[0] = rf_control;
1034
1035 /* connect unlock nodes to term node */
1036 RF_ASSERT(unlockParityNodes[i].numSuccedents == 1);
1037 unlockParityNodes[i].succedents[0] = termNode;
1038 termNode->antecedents[numDataNodes + i] = &unlockParityNodes[i];
1039 termNode->antType[numDataNodes + i] = rf_control;
1040 } else {
1041 RF_ASSERT(writeParityNodes[i].numSuccedents == 1);
1042 writeParityNodes[i].succedents[0] = termNode;
1043 termNode->antecedents[numDataNodes + i] = &writeParityNodes[i];
1044 termNode->antType[numDataNodes + i] = rf_control;
1045 }
1046 }
1047
1048 if (nfaults == 2) {
1049 for (i = 0; i < numParityNodes; i++) {
1050 if (lu_flag) {
1051 /* connect write new Q nodes to unlock nodes */
1052 RF_ASSERT(writeQNodes[i].numSuccedents == 1);
1053 RF_ASSERT(unlockQNodes[i].numAntecedents == 1);
1054 writeQNodes[i].succedents[0] = &unlockQNodes[i];
1055 unlockQNodes[i].antecedents[0] = &writeQNodes[i];
1056 unlockQNodes[i].antType[0] = rf_control;
1057
1058 /* connect unlock nodes to unblock node */
1059 RF_ASSERT(unlockQNodes[i].numSuccedents == 1);
1060 unlockQNodes[i].succedents[0] = termNode;
1061 termNode->antecedents[numDataNodes + numParityNodes + i] = &unlockQNodes[i];
1062 termNode->antType[numDataNodes + numParityNodes + i] = rf_control;
1063 } else {
1064 RF_ASSERT(writeQNodes[i].numSuccedents == 1);
1065 writeQNodes[i].succedents[0] = termNode;
1066 termNode->antecedents[numDataNodes + numParityNodes + i] = &writeQNodes[i];
1067 termNode->antType[numDataNodes + numParityNodes + i] = rf_control;
1068 }
1069 }
1070 }
1071 }
1072
1073
1074 /******************************************************************************
1075 * create a write graph (fault-free or degraded) for RAID level 1
1076 *
1077 * Hdr -> Commit -> Wpd -> Nil -> Trm
1078 * -> Wsd ->
1079 *
1080 * The "Wpd" node writes data to the primary copy in the mirror pair
1081 * The "Wsd" node writes data to the secondary copy in the mirror pair
1082 *
1083 * Parameters: raidPtr - description of the physical array
1084 * asmap - logical & physical addresses for this access
1085 * bp - buffer ptr (holds write data)
1086 * flags - general flags (e.g. disk locking)
1087 * allocList - list of memory allocated in DAG creation
1088 *****************************************************************************/
1089
void
rf_CreateRaidOneWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
			 RF_DagHeader_t *dag_h, void *bp,
			 RF_RaidAccessFlags_t flags,
			 RF_AllocListElem_t *allocList)
{
	/*
	 * Build a RAID-1 (mirror) write DAG:
	 *
	 *     Hdr -> Cmt -> Wpd -> Nil -> Trm
	 *                -> Wsd ->
	 *
	 * "Wpd" nodes write the data to the primary copy (asmap->physInfo),
	 * "Wsd" nodes write the same data to the secondary copy
	 * (asmap->parityInfo).  Because a mirror write needs no old data,
	 * the DAG commits immediately at the Cmt node.
	 *
	 * NOTE(review): bp and flags are not referenced in this function;
	 * presumably kept to match the common DAG-creation signature.
	 */
	RF_DagNode_t *unblockNode, *termNode, *commitNode;
	RF_DagNode_t *nodes, *wndNode, *wmirNode;
	int nWndNodes, nWmirNodes, i;
	RF_ReconUnitNum_t which_ru;
	RF_PhysDiskAddr_t *pda, *pdaP;
	RF_StripeNum_t parityStripeID;

	/* map the raid address to its parity stripe / reconstruction unit;
	 * both are recorded in each write node's params below */
	parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
	    asmap->raidAddress, &which_ru);
	if (rf_dagDebug) {
		printf("[Creating RAID level 1 write DAG]\n");
	}
	dag_h->creator = "RaidOneWriteDAG";

	/* 2 implies access not SU aligned: an unaligned access has a second
	 * pda chained on each list, and therefore needs two write nodes */
	nWmirNodes = (asmap->parityInfo->next) ? 2 : 1;
	nWndNodes = (asmap->physInfo->next) ? 2 : 1;

	/* alloc the Wnd nodes and the Wmir node; in the degraded case a
	 * failed copy simply gets no write node */
	if (asmap->numDataFailed == 1)
		nWndNodes--;
	if (asmap->numParityFailed == 1)
		nWmirNodes--;

	/* total number of nodes = nWndNodes + nWmirNodes + (commit + unblock
	 * + terminator) */
	RF_MallocAndAdd(nodes,
	    (nWndNodes + nWmirNodes + 3) * sizeof(RF_DagNode_t),
	    (RF_DagNode_t *), allocList);
	/* carve the single allocation into the per-role node arrays */
	i = 0;
	wndNode = &nodes[i];
	i += nWndNodes;
	wmirNode = &nodes[i];
	i += nWmirNodes;
	commitNode = &nodes[i];
	i += 1;
	unblockNode = &nodes[i];
	i += 1;
	termNode = &nodes[i];
	i += 1;
	RF_ASSERT(i == (nWndNodes + nWmirNodes + 3));

	/* this dag can commit immediately */
	dag_h->numCommitNodes = 1;
	dag_h->numCommits = 0;
	dag_h->numSuccedents = 1;

	/* initialize the commit, unblock, and term nodes
	 * (rf_InitNode args after the wakeup func are: nSuccedents,
	 * nAntecedents, nParams, nResults — they must agree with the
	 * RF_ASSERTs in the wiring code below) */
	rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
	    rf_NullNodeUndoFunc, NULL, (nWndNodes + nWmirNodes),
	    0, 0, 0, dag_h, "Cmt", allocList);
	rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
	    rf_NullNodeUndoFunc, NULL, 1, (nWndNodes + nWmirNodes),
	    0, 0, dag_h, "Nil", allocList);
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
	    rf_TerminateUndoFunc, NULL, 0, 1, 0, 0,
	    dag_h, "Trm", allocList);

	/* initialize the wnd nodes: one disk write per pda on the primary
	 * (data) copy's chain */
	if (nWndNodes > 0) {
		pda = asmap->physInfo;
		for (i = 0; i < nWndNodes; i++) {
			rf_InitNode(&wndNode[i], rf_wait, RF_FALSE,
			    rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
			    rf_GenericWakeupFunc, 1, 1, 4, 0,
			    dag_h, "Wpd", allocList);
			RF_ASSERT(pda != NULL);
			wndNode[i].params[0].p = pda;		/* physical disk addr */
			wndNode[i].params[1].p = pda->bufPtr;	/* write data buffer */
			wndNode[i].params[2].v = parityStripeID;
			wndNode[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
			pda = pda->next;
		}
		/* the pda chain must be exactly nWndNodes long */
		RF_ASSERT(pda == NULL);
	}
	/* initialize the mirror nodes: write the same data buffers (taken
	 * from the physInfo chain) to the secondary copy's addresses
	 * (taken from the parityInfo chain) */
	if (nWmirNodes > 0) {
		pda = asmap->physInfo;
		pdaP = asmap->parityInfo;
		for (i = 0; i < nWmirNodes; i++) {
			rf_InitNode(&wmirNode[i], rf_wait, RF_FALSE,
			    rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
			    rf_GenericWakeupFunc, 1, 1, 4, 0,
			    dag_h, "Wsd", allocList);
			RF_ASSERT(pda != NULL);
			wmirNode[i].params[0].p = pdaP;		/* secondary copy's disk addr */
			wmirNode[i].params[1].p = pda->bufPtr;	/* same data as the primary write */
			wmirNode[i].params[2].v = parityStripeID;
			wmirNode[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
			pda = pda->next;
			pdaP = pdaP->next;
		}
		/* both chains must be exactly nWmirNodes long */
		RF_ASSERT(pda == NULL);
		RF_ASSERT(pdaP == NULL);
	}
	/* link the header node to the commit node */
	RF_ASSERT(dag_h->numSuccedents == 1);
	RF_ASSERT(commitNode->numAntecedents == 0);
	dag_h->succedents[0] = commitNode;

	/* link the commit node to the write nodes: wnd nodes occupy
	 * succedent slots [0, nWndNodes), wmir nodes the slots after */
	RF_ASSERT(commitNode->numSuccedents == (nWndNodes + nWmirNodes));
	for (i = 0; i < nWndNodes; i++) {
		RF_ASSERT(wndNode[i].numAntecedents == 1);
		commitNode->succedents[i] = &wndNode[i];
		wndNode[i].antecedents[0] = commitNode;
		wndNode[i].antType[0] = rf_control;
	}
	for (i = 0; i < nWmirNodes; i++) {
		RF_ASSERT(wmirNode[i].numAntecedents == 1);
		commitNode->succedents[i + nWndNodes] = &wmirNode[i];
		wmirNode[i].antecedents[0] = commitNode;
		wmirNode[i].antType[0] = rf_control;
	}

	/* link the write nodes to the unblock node, using the same slot
	 * layout as on the commit node's succedent list */
	RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nWmirNodes));
	for (i = 0; i < nWndNodes; i++) {
		RF_ASSERT(wndNode[i].numSuccedents == 1);
		wndNode[i].succedents[0] = unblockNode;
		unblockNode->antecedents[i] = &wndNode[i];
		unblockNode->antType[i] = rf_control;
	}
	for (i = 0; i < nWmirNodes; i++) {
		RF_ASSERT(wmirNode[i].numSuccedents == 1);
		wmirNode[i].succedents[0] = unblockNode;
		unblockNode->antecedents[i + nWndNodes] = &wmirNode[i];
		unblockNode->antType[i + nWndNodes] = rf_control;
	}

	/* link the unblock node to the term node */
	RF_ASSERT(unblockNode->numSuccedents == 1);
	RF_ASSERT(termNode->numAntecedents == 1);
	RF_ASSERT(termNode->numSuccedents == 0);
	unblockNode->succedents[0] = termNode;
	termNode->antecedents[0] = unblockNode;
	termNode->antType[0] = rf_control;
}
1234