rf_dagffwr.c revision 1.5.6.2 1 1.5.6.2 nathanw /* $NetBSD: rf_dagffwr.c,v 1.5.6.2 2001/11/14 19:15:47 nathanw Exp $ */
2 1.1 oster /*
3 1.1 oster * Copyright (c) 1995 Carnegie-Mellon University.
4 1.1 oster * All rights reserved.
5 1.1 oster *
6 1.1 oster * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
7 1.1 oster *
8 1.1 oster * Permission to use, copy, modify and distribute this software and
9 1.1 oster * its documentation is hereby granted, provided that both the copyright
10 1.1 oster * notice and this permission notice appear in all copies of the
11 1.1 oster * software, derivative works or modified versions, and any portions
12 1.1 oster * thereof, and that both notices appear in supporting documentation.
13 1.1 oster *
14 1.1 oster * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 1.1 oster * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 1.1 oster * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 1.1 oster *
18 1.1 oster * Carnegie Mellon requests users of this software to return to
19 1.1 oster *
20 1.1 oster * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 1.1 oster * School of Computer Science
22 1.1 oster * Carnegie Mellon University
23 1.1 oster * Pittsburgh PA 15213-3890
24 1.1 oster *
25 1.1 oster * any improvements or extensions that they make and grant Carnegie the
26 1.1 oster * rights to redistribute these changes.
27 1.1 oster */
28 1.1 oster
29 1.1 oster /*
30 1.1 oster * rf_dagffwr.c
31 1.1 oster *
32 1.1 oster * code for creating fault-free DAGs
33 1.1 oster *
34 1.1 oster */
35 1.5.6.2 nathanw
36 1.5.6.2 nathanw #include <sys/cdefs.h>
37 1.5.6.2 nathanw __KERNEL_RCSID(0, "$NetBSD: rf_dagffwr.c,v 1.5.6.2 2001/11/14 19:15:47 nathanw Exp $");
38 1.1 oster
39 1.5.6.1 nathanw #include <dev/raidframe/raidframevar.h>
40 1.5.6.1 nathanw
41 1.1 oster #include "rf_raid.h"
42 1.1 oster #include "rf_dag.h"
43 1.1 oster #include "rf_dagutils.h"
44 1.1 oster #include "rf_dagfuncs.h"
45 1.1 oster #include "rf_debugMem.h"
46 1.1 oster #include "rf_dagffrd.h"
47 1.1 oster #include "rf_memchunk.h"
48 1.1 oster #include "rf_general.h"
49 1.1 oster #include "rf_dagffwr.h"
50 1.1 oster
51 1.1 oster /******************************************************************************
52 1.1 oster *
53 1.1 oster * General comments on DAG creation:
54 1.3 oster *
55 1.1 oster * All DAGs in this file use roll-away error recovery. Each DAG has a single
56 1.1 oster * commit node, usually called "Cmt." If an error occurs before the Cmt node
57 1.1 oster * is reached, the execution engine will halt forward execution and work
58 1.1 oster * backward through the graph, executing the undo functions. Assuming that
59 1.1 oster * each node in the graph prior to the Cmt node is undoable and atomic - or -
60 1.1 oster * does not make changes to permanent state, the graph will fail atomically.
61 1.1 oster * If an error occurs after the Cmt node executes, the engine will roll-forward
62 1.1 oster * through the graph, blindly executing nodes until it reaches the end.
63 1.1 oster * If a graph reaches the end, it is assumed to have completed successfully.
64 1.1 oster *
65 1.1 oster * A graph has only 1 Cmt node.
66 1.1 oster *
67 1.1 oster */
68 1.1 oster
69 1.1 oster
70 1.1 oster /******************************************************************************
71 1.1 oster *
72 1.1 oster * The following wrappers map the standard DAG creation interface to the
73 1.1 oster * DAG creation routines. Additionally, these wrappers enable experimentation
74 1.1 oster * with new DAG structures by providing an extra level of indirection, allowing
75 1.1 oster * the DAG creation routines to be replaced at this single point.
76 1.1 oster */
77 1.1 oster
78 1.1 oster
79 1.3 oster void
80 1.3 oster rf_CreateNonRedundantWriteDAG(
81 1.3 oster RF_Raid_t * raidPtr,
82 1.3 oster RF_AccessStripeMap_t * asmap,
83 1.3 oster RF_DagHeader_t * dag_h,
84 1.3 oster void *bp,
85 1.3 oster RF_RaidAccessFlags_t flags,
86 1.3 oster RF_AllocListElem_t * allocList,
87 1.3 oster RF_IoType_t type)
88 1.1 oster {
89 1.3 oster rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
90 1.3 oster RF_IO_TYPE_WRITE);
91 1.1 oster }
92 1.1 oster
93 1.3 oster void
94 1.3 oster rf_CreateRAID0WriteDAG(
95 1.3 oster RF_Raid_t * raidPtr,
96 1.3 oster RF_AccessStripeMap_t * asmap,
97 1.3 oster RF_DagHeader_t * dag_h,
98 1.3 oster void *bp,
99 1.3 oster RF_RaidAccessFlags_t flags,
100 1.3 oster RF_AllocListElem_t * allocList,
101 1.3 oster RF_IoType_t type)
102 1.1 oster {
103 1.3 oster rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
104 1.3 oster RF_IO_TYPE_WRITE);
105 1.1 oster }
106 1.1 oster
107 1.3 oster void
108 1.3 oster rf_CreateSmallWriteDAG(
109 1.3 oster RF_Raid_t * raidPtr,
110 1.3 oster RF_AccessStripeMap_t * asmap,
111 1.3 oster RF_DagHeader_t * dag_h,
112 1.3 oster void *bp,
113 1.3 oster RF_RaidAccessFlags_t flags,
114 1.3 oster RF_AllocListElem_t * allocList)
115 1.1 oster {
116 1.3 oster /* "normal" rollaway */
117 1.3 oster rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
118 1.3 oster &rf_xorFuncs, NULL);
119 1.1 oster }
120 1.1 oster
121 1.3 oster void
122 1.3 oster rf_CreateLargeWriteDAG(
123 1.3 oster RF_Raid_t * raidPtr,
124 1.3 oster RF_AccessStripeMap_t * asmap,
125 1.3 oster RF_DagHeader_t * dag_h,
126 1.3 oster void *bp,
127 1.3 oster RF_RaidAccessFlags_t flags,
128 1.3 oster RF_AllocListElem_t * allocList)
129 1.1 oster {
130 1.3 oster /* "normal" rollaway */
131 1.3 oster rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
132 1.3 oster 1, rf_RegularXorFunc, RF_TRUE);
133 1.1 oster }
134 1.1 oster
135 1.1 oster
136 1.1 oster /******************************************************************************
137 1.1 oster *
138 1.1 oster * DAG creation code begins here
139 1.1 oster */
140 1.1 oster
141 1.1 oster
142 1.1 oster /******************************************************************************
143 1.1 oster *
144 1.1 oster * creates a DAG to perform a large-write operation:
145 1.1 oster *
146 1.1 oster * / Rod \ / Wnd \
147 1.1 oster * H -- block- Rod - Xor - Cmt - Wnd --- T
148 1.1 oster * \ Rod / \ Wnp /
149 1.1 oster * \[Wnq]/
150 1.1 oster *
151 1.1 oster * The XOR node also does the Q calculation in the P+Q architecture.
152 1.1 oster * All nodes are before the commit node (Cmt) are assumed to be atomic and
153 1.1 oster * undoable - or - they make no changes to permanent state.
154 1.1 oster *
155 1.1 oster * Rod = read old data
156 1.1 oster * Cmt = commit node
157 1.1 oster * Wnp = write new parity
158 1.1 oster * Wnd = write new data
159 1.1 oster * Wnq = write new "q"
160 1.1 oster * [] denotes optional segments in the graph
161 1.1 oster *
162 1.1 oster * Parameters: raidPtr - description of the physical array
163 1.1 oster * asmap - logical & physical addresses for this access
164 1.1 oster * bp - buffer ptr (holds write data)
165 1.3 oster * flags - general flags (e.g. disk locking)
166 1.1 oster * allocList - list of memory allocated in DAG creation
167 1.1 oster * nfaults - number of faults array can tolerate
168 1.1 oster * (equal to # redundancy units in stripe)
169 1.1 oster * redfuncs - list of redundancy generating functions
170 1.1 oster *
171 1.1 oster *****************************************************************************/
172 1.1 oster
173 1.3 oster void
174 1.3 oster rf_CommonCreateLargeWriteDAG(
175 1.3 oster RF_Raid_t * raidPtr,
176 1.3 oster RF_AccessStripeMap_t * asmap,
177 1.3 oster RF_DagHeader_t * dag_h,
178 1.3 oster void *bp,
179 1.3 oster RF_RaidAccessFlags_t flags,
180 1.3 oster RF_AllocListElem_t * allocList,
181 1.3 oster int nfaults,
182 1.3 oster int (*redFunc) (RF_DagNode_t *),
183 1.3 oster int allowBufferRecycle)
184 1.1 oster {
185 1.3 oster RF_DagNode_t *nodes, *wndNodes, *rodNodes, *xorNode, *wnpNode;
186 1.3 oster RF_DagNode_t *wnqNode, *blockNode, *commitNode, *termNode;
187 1.3 oster int nWndNodes, nRodNodes, i, nodeNum, asmNum;
188 1.3 oster RF_AccessStripeMapHeader_t *new_asm_h[2];
189 1.3 oster RF_StripeNum_t parityStripeID;
190 1.3 oster char *sosBuffer, *eosBuffer;
191 1.3 oster RF_ReconUnitNum_t which_ru;
192 1.3 oster RF_RaidLayout_t *layoutPtr;
193 1.3 oster RF_PhysDiskAddr_t *pda;
194 1.3 oster
195 1.3 oster layoutPtr = &(raidPtr->Layout);
196 1.3 oster parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress,
197 1.3 oster &which_ru);
198 1.3 oster
199 1.3 oster if (rf_dagDebug) {
200 1.3 oster printf("[Creating large-write DAG]\n");
201 1.3 oster }
202 1.3 oster dag_h->creator = "LargeWriteDAG";
203 1.3 oster
204 1.3 oster dag_h->numCommitNodes = 1;
205 1.3 oster dag_h->numCommits = 0;
206 1.3 oster dag_h->numSuccedents = 1;
207 1.3 oster
208 1.3 oster /* alloc the nodes: Wnd, xor, commit, block, term, and Wnp */
209 1.3 oster nWndNodes = asmap->numStripeUnitsAccessed;
210 1.3 oster RF_CallocAndAdd(nodes, nWndNodes + 4 + nfaults, sizeof(RF_DagNode_t),
211 1.3 oster (RF_DagNode_t *), allocList);
212 1.3 oster i = 0;
213 1.3 oster wndNodes = &nodes[i];
214 1.3 oster i += nWndNodes;
215 1.3 oster xorNode = &nodes[i];
216 1.3 oster i += 1;
217 1.3 oster wnpNode = &nodes[i];
218 1.3 oster i += 1;
219 1.3 oster blockNode = &nodes[i];
220 1.3 oster i += 1;
221 1.3 oster commitNode = &nodes[i];
222 1.3 oster i += 1;
223 1.3 oster termNode = &nodes[i];
224 1.3 oster i += 1;
225 1.3 oster if (nfaults == 2) {
226 1.3 oster wnqNode = &nodes[i];
227 1.3 oster i += 1;
228 1.3 oster } else {
229 1.3 oster wnqNode = NULL;
230 1.3 oster }
231 1.3 oster rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h, new_asm_h,
232 1.3 oster &nRodNodes, &sosBuffer, &eosBuffer, allocList);
233 1.3 oster if (nRodNodes > 0) {
234 1.3 oster RF_CallocAndAdd(rodNodes, nRodNodes, sizeof(RF_DagNode_t),
235 1.3 oster (RF_DagNode_t *), allocList);
236 1.3 oster } else {
237 1.3 oster rodNodes = NULL;
238 1.3 oster }
239 1.3 oster
240 1.3 oster /* begin node initialization */
241 1.3 oster if (nRodNodes > 0) {
242 1.3 oster rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
243 1.3 oster NULL, nRodNodes, 0, 0, 0, dag_h, "Nil", allocList);
244 1.3 oster } else {
245 1.3 oster rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
246 1.3 oster NULL, 1, 0, 0, 0, dag_h, "Nil", allocList);
247 1.3 oster }
248 1.3 oster
249 1.3 oster rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL,
250 1.3 oster nWndNodes + nfaults, 1, 0, 0, dag_h, "Cmt", allocList);
251 1.3 oster rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL,
252 1.3 oster 0, nWndNodes + nfaults, 0, 0, dag_h, "Trm", allocList);
253 1.3 oster
254 1.3 oster /* initialize the Rod nodes */
255 1.3 oster for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) {
256 1.3 oster if (new_asm_h[asmNum]) {
257 1.3 oster pda = new_asm_h[asmNum]->stripeMap->physInfo;
258 1.3 oster while (pda) {
259 1.3 oster rf_InitNode(&rodNodes[nodeNum], rf_wait, RF_FALSE, rf_DiskReadFunc,
260 1.3 oster rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
261 1.3 oster "Rod", allocList);
262 1.3 oster rodNodes[nodeNum].params[0].p = pda;
263 1.3 oster rodNodes[nodeNum].params[1].p = pda->bufPtr;
264 1.3 oster rodNodes[nodeNum].params[2].v = parityStripeID;
265 1.3 oster rodNodes[nodeNum].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
266 1.3 oster 0, 0, which_ru);
267 1.3 oster nodeNum++;
268 1.3 oster pda = pda->next;
269 1.3 oster }
270 1.3 oster }
271 1.3 oster }
272 1.3 oster RF_ASSERT(nodeNum == nRodNodes);
273 1.3 oster
274 1.3 oster /* initialize the wnd nodes */
275 1.3 oster pda = asmap->physInfo;
276 1.3 oster for (i = 0; i < nWndNodes; i++) {
277 1.3 oster rf_InitNode(&wndNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
278 1.3 oster rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
279 1.3 oster RF_ASSERT(pda != NULL);
280 1.3 oster wndNodes[i].params[0].p = pda;
281 1.3 oster wndNodes[i].params[1].p = pda->bufPtr;
282 1.3 oster wndNodes[i].params[2].v = parityStripeID;
283 1.3 oster wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
284 1.3 oster pda = pda->next;
285 1.3 oster }
286 1.3 oster
287 1.3 oster /* initialize the redundancy node */
288 1.3 oster if (nRodNodes > 0) {
289 1.3 oster rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1,
290 1.3 oster nRodNodes, 2 * (nWndNodes + nRodNodes) + 1, nfaults, dag_h,
291 1.3 oster "Xr ", allocList);
292 1.3 oster } else {
293 1.3 oster rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1,
294 1.3 oster 1, 2 * (nWndNodes + nRodNodes) + 1, nfaults, dag_h, "Xr ", allocList);
295 1.3 oster }
296 1.3 oster xorNode->flags |= RF_DAGNODE_FLAG_YIELD;
297 1.3 oster for (i = 0; i < nWndNodes; i++) {
298 1.3 oster xorNode->params[2 * i + 0] = wndNodes[i].params[0]; /* pda */
299 1.3 oster xorNode->params[2 * i + 1] = wndNodes[i].params[1]; /* buf ptr */
300 1.3 oster }
301 1.3 oster for (i = 0; i < nRodNodes; i++) {
302 1.3 oster xorNode->params[2 * (nWndNodes + i) + 0] = rodNodes[i].params[0]; /* pda */
303 1.3 oster xorNode->params[2 * (nWndNodes + i) + 1] = rodNodes[i].params[1]; /* buf ptr */
304 1.3 oster }
305 1.3 oster /* xor node needs to get at RAID information */
306 1.3 oster xorNode->params[2 * (nWndNodes + nRodNodes)].p = raidPtr;
307 1.3 oster
308 1.3 oster /*
309 1.3 oster * Look for an Rod node that reads a complete SU. If none, alloc a buffer
310 1.3 oster * to receive the parity info. Note that we can't use a new data buffer
311 1.3 oster * because it will not have gotten written when the xor occurs.
312 1.3 oster */
313 1.3 oster if (allowBufferRecycle) {
314 1.3 oster for (i = 0; i < nRodNodes; i++) {
315 1.3 oster if (((RF_PhysDiskAddr_t *) rodNodes[i].params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit)
316 1.3 oster break;
317 1.3 oster }
318 1.3 oster }
319 1.3 oster if ((!allowBufferRecycle) || (i == nRodNodes)) {
320 1.3 oster RF_CallocAndAdd(xorNode->results[0], 1,
321 1.3 oster rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit),
322 1.3 oster (void *), allocList);
323 1.3 oster } else {
324 1.3 oster xorNode->results[0] = rodNodes[i].params[1].p;
325 1.3 oster }
326 1.3 oster
327 1.3 oster /* initialize the Wnp node */
328 1.3 oster rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
329 1.3 oster rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnp", allocList);
330 1.3 oster wnpNode->params[0].p = asmap->parityInfo;
331 1.3 oster wnpNode->params[1].p = xorNode->results[0];
332 1.3 oster wnpNode->params[2].v = parityStripeID;
333 1.3 oster wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
334 1.3 oster /* parityInfo must describe entire parity unit */
335 1.3 oster RF_ASSERT(asmap->parityInfo->next == NULL);
336 1.3 oster
337 1.3 oster if (nfaults == 2) {
338 1.3 oster /*
339 1.3 oster * We never try to recycle a buffer for the Q calcuation
340 1.3 oster * in addition to the parity. This would cause two buffers
341 1.3 oster * to get smashed during the P and Q calculation, guaranteeing
342 1.3 oster * one would be wrong.
343 1.3 oster */
344 1.3 oster RF_CallocAndAdd(xorNode->results[1], 1,
345 1.3 oster rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit),
346 1.3 oster (void *), allocList);
347 1.3 oster rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
348 1.3 oster rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnq", allocList);
349 1.3 oster wnqNode->params[0].p = asmap->qInfo;
350 1.3 oster wnqNode->params[1].p = xorNode->results[1];
351 1.3 oster wnqNode->params[2].v = parityStripeID;
352 1.3 oster wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
353 1.3 oster /* parityInfo must describe entire parity unit */
354 1.3 oster RF_ASSERT(asmap->parityInfo->next == NULL);
355 1.3 oster }
356 1.3 oster /*
357 1.3 oster * Connect nodes to form graph.
358 1.3 oster */
359 1.3 oster
360 1.3 oster /* connect dag header to block node */
361 1.3 oster RF_ASSERT(blockNode->numAntecedents == 0);
362 1.3 oster dag_h->succedents[0] = blockNode;
363 1.3 oster
364 1.3 oster if (nRodNodes > 0) {
365 1.3 oster /* connect the block node to the Rod nodes */
366 1.3 oster RF_ASSERT(blockNode->numSuccedents == nRodNodes);
367 1.3 oster RF_ASSERT(xorNode->numAntecedents == nRodNodes);
368 1.3 oster for (i = 0; i < nRodNodes; i++) {
369 1.3 oster RF_ASSERT(rodNodes[i].numAntecedents == 1);
370 1.3 oster blockNode->succedents[i] = &rodNodes[i];
371 1.3 oster rodNodes[i].antecedents[0] = blockNode;
372 1.3 oster rodNodes[i].antType[0] = rf_control;
373 1.3 oster
374 1.3 oster /* connect the Rod nodes to the Xor node */
375 1.3 oster RF_ASSERT(rodNodes[i].numSuccedents == 1);
376 1.3 oster rodNodes[i].succedents[0] = xorNode;
377 1.3 oster xorNode->antecedents[i] = &rodNodes[i];
378 1.3 oster xorNode->antType[i] = rf_trueData;
379 1.3 oster }
380 1.3 oster } else {
381 1.3 oster /* connect the block node to the Xor node */
382 1.3 oster RF_ASSERT(blockNode->numSuccedents == 1);
383 1.3 oster RF_ASSERT(xorNode->numAntecedents == 1);
384 1.3 oster blockNode->succedents[0] = xorNode;
385 1.3 oster xorNode->antecedents[0] = blockNode;
386 1.3 oster xorNode->antType[0] = rf_control;
387 1.3 oster }
388 1.3 oster
389 1.3 oster /* connect the xor node to the commit node */
390 1.3 oster RF_ASSERT(xorNode->numSuccedents == 1);
391 1.3 oster RF_ASSERT(commitNode->numAntecedents == 1);
392 1.3 oster xorNode->succedents[0] = commitNode;
393 1.3 oster commitNode->antecedents[0] = xorNode;
394 1.3 oster commitNode->antType[0] = rf_control;
395 1.3 oster
396 1.3 oster /* connect the commit node to the write nodes */
397 1.3 oster RF_ASSERT(commitNode->numSuccedents == nWndNodes + nfaults);
398 1.3 oster for (i = 0; i < nWndNodes; i++) {
399 1.3 oster RF_ASSERT(wndNodes->numAntecedents == 1);
400 1.3 oster commitNode->succedents[i] = &wndNodes[i];
401 1.3 oster wndNodes[i].antecedents[0] = commitNode;
402 1.3 oster wndNodes[i].antType[0] = rf_control;
403 1.3 oster }
404 1.3 oster RF_ASSERT(wnpNode->numAntecedents == 1);
405 1.3 oster commitNode->succedents[nWndNodes] = wnpNode;
406 1.3 oster wnpNode->antecedents[0] = commitNode;
407 1.3 oster wnpNode->antType[0] = rf_trueData;
408 1.3 oster if (nfaults == 2) {
409 1.3 oster RF_ASSERT(wnqNode->numAntecedents == 1);
410 1.3 oster commitNode->succedents[nWndNodes + 1] = wnqNode;
411 1.3 oster wnqNode->antecedents[0] = commitNode;
412 1.3 oster wnqNode->antType[0] = rf_trueData;
413 1.3 oster }
414 1.3 oster /* connect the write nodes to the term node */
415 1.3 oster RF_ASSERT(termNode->numAntecedents == nWndNodes + nfaults);
416 1.3 oster RF_ASSERT(termNode->numSuccedents == 0);
417 1.3 oster for (i = 0; i < nWndNodes; i++) {
418 1.3 oster RF_ASSERT(wndNodes->numSuccedents == 1);
419 1.3 oster wndNodes[i].succedents[0] = termNode;
420 1.3 oster termNode->antecedents[i] = &wndNodes[i];
421 1.3 oster termNode->antType[i] = rf_control;
422 1.3 oster }
423 1.3 oster RF_ASSERT(wnpNode->numSuccedents == 1);
424 1.3 oster wnpNode->succedents[0] = termNode;
425 1.3 oster termNode->antecedents[nWndNodes] = wnpNode;
426 1.3 oster termNode->antType[nWndNodes] = rf_control;
427 1.3 oster if (nfaults == 2) {
428 1.3 oster RF_ASSERT(wnqNode->numSuccedents == 1);
429 1.3 oster wnqNode->succedents[0] = termNode;
430 1.3 oster termNode->antecedents[nWndNodes + 1] = wnqNode;
431 1.3 oster termNode->antType[nWndNodes + 1] = rf_control;
432 1.3 oster }
433 1.1 oster }
434 1.1 oster /******************************************************************************
435 1.1 oster *
436 1.1 oster * creates a DAG to perform a small-write operation (either raid 5 or pq),
437 1.1 oster * which is as follows:
438 1.1 oster *
439 1.1 oster * Hdr -> Nil -> Rop -> Xor -> Cmt ----> Wnp [Unp] --> Trm
440 1.1 oster *            \- Rod X      /     \----> Wnd [Und]-/
441 1.1 oster *           [\- Rod X     /       \---> Wnd [Und]-/]
442 1.1 oster *           [\- Roq -> Q /         \--> Wnq [Unq]-/]
443 1.1 oster *
444 1.1 oster * Rop = read old parity
445 1.1 oster * Rod = read old data
446 1.1 oster * Roq = read old "q"
447 1.1 oster * Cmt = commit node
448 1.1 oster * Und = unlock data disk
449 1.1 oster * Unp = unlock parity disk
450 1.1 oster * Unq = unlock q disk
451 1.1 oster * Wnp = write new parity
452 1.1 oster * Wnd = write new data
453 1.1 oster * Wnq = write new "q"
454 1.1 oster * [ ] denotes optional segments in the graph
455 1.1 oster *
456 1.1 oster * Parameters: raidPtr - description of the physical array
457 1.1 oster * asmap - logical & physical addresses for this access
458 1.1 oster * bp - buffer ptr (holds write data)
459 1.3 oster * flags - general flags (e.g. disk locking)
460 1.1 oster * allocList - list of memory allocated in DAG creation
461 1.1 oster * pfuncs - list of parity generating functions
462 1.1 oster * qfuncs - list of q generating functions
463 1.1 oster *
464 1.1 oster * A null qfuncs indicates single fault tolerant
465 1.1 oster *****************************************************************************/
466 1.1 oster
467 1.3 oster void
468 1.3 oster rf_CommonCreateSmallWriteDAG(
469 1.3 oster RF_Raid_t * raidPtr,
470 1.3 oster RF_AccessStripeMap_t * asmap,
471 1.3 oster RF_DagHeader_t * dag_h,
472 1.3 oster void *bp,
473 1.3 oster RF_RaidAccessFlags_t flags,
474 1.3 oster RF_AllocListElem_t * allocList,
475 1.3 oster RF_RedFuncs_t * pfuncs,
476 1.3 oster RF_RedFuncs_t * qfuncs)
477 1.1 oster {
478 1.3 oster RF_DagNode_t *readDataNodes, *readParityNodes, *readQNodes, *termNode;
479 1.3 oster RF_DagNode_t *unlockDataNodes, *unlockParityNodes, *unlockQNodes;
480 1.3 oster RF_DagNode_t *xorNodes, *qNodes, *blockNode, *commitNode, *nodes;
481 1.3 oster RF_DagNode_t *writeDataNodes, *writeParityNodes, *writeQNodes;
482 1.3 oster int i, j, nNodes, totalNumNodes, lu_flag;
483 1.3 oster RF_ReconUnitNum_t which_ru;
484 1.3 oster int (*func) (RF_DagNode_t *), (*undoFunc) (RF_DagNode_t *);
485 1.3 oster int (*qfunc) (RF_DagNode_t *);
486 1.3 oster int numDataNodes, numParityNodes;
487 1.3 oster RF_StripeNum_t parityStripeID;
488 1.3 oster RF_PhysDiskAddr_t *pda;
489 1.3 oster char *name, *qname;
490 1.3 oster long nfaults;
491 1.3 oster
492 1.3 oster nfaults = qfuncs ? 2 : 1;
493 1.3 oster lu_flag = (rf_enableAtomicRMW) ? 1 : 0; /* lock/unlock flag */
494 1.3 oster
495 1.3 oster parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
496 1.3 oster asmap->raidAddress, &which_ru);
497 1.3 oster pda = asmap->physInfo;
498 1.3 oster numDataNodes = asmap->numStripeUnitsAccessed;
499 1.3 oster numParityNodes = (asmap->parityInfo->next) ? 2 : 1;
500 1.3 oster
501 1.3 oster if (rf_dagDebug) {
502 1.3 oster printf("[Creating small-write DAG]\n");
503 1.3 oster }
504 1.3 oster RF_ASSERT(numDataNodes > 0);
505 1.3 oster dag_h->creator = "SmallWriteDAG";
506 1.3 oster
507 1.3 oster dag_h->numCommitNodes = 1;
508 1.3 oster dag_h->numCommits = 0;
509 1.3 oster dag_h->numSuccedents = 1;
510 1.3 oster
511 1.3 oster /*
512 1.3 oster * DAG creation occurs in four steps:
513 1.3 oster * 1. count the number of nodes in the DAG
514 1.3 oster * 2. create the nodes
515 1.3 oster * 3. initialize the nodes
516 1.3 oster * 4. connect the nodes
517 1.3 oster */
518 1.3 oster
519 1.3 oster /*
520 1.3 oster * Step 1. compute number of nodes in the graph
521 1.3 oster */
522 1.3 oster
523 1.3 oster /* number of nodes: a read and write for each data unit a redundancy
524 1.3 oster * computation node for each parity node (nfaults * nparity) a read
525 1.3 oster * and write for each parity unit a block and commit node (2) a
526 1.3 oster * terminate node if atomic RMW an unlock node for each data unit,
527 1.3 oster * redundancy unit */
528 1.3 oster totalNumNodes = (2 * numDataNodes) + (nfaults * numParityNodes)
529 1.3 oster + (nfaults * 2 * numParityNodes) + 3;
530 1.3 oster if (lu_flag) {
531 1.3 oster totalNumNodes += (numDataNodes + (nfaults * numParityNodes));
532 1.3 oster }
533 1.3 oster /*
534 1.3 oster * Step 2. create the nodes
535 1.3 oster */
536 1.3 oster RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t),
537 1.3 oster (RF_DagNode_t *), allocList);
538 1.3 oster i = 0;
539 1.3 oster blockNode = &nodes[i];
540 1.3 oster i += 1;
541 1.3 oster commitNode = &nodes[i];
542 1.3 oster i += 1;
543 1.3 oster readDataNodes = &nodes[i];
544 1.3 oster i += numDataNodes;
545 1.3 oster readParityNodes = &nodes[i];
546 1.3 oster i += numParityNodes;
547 1.3 oster writeDataNodes = &nodes[i];
548 1.3 oster i += numDataNodes;
549 1.3 oster writeParityNodes = &nodes[i];
550 1.3 oster i += numParityNodes;
551 1.3 oster xorNodes = &nodes[i];
552 1.3 oster i += numParityNodes;
553 1.3 oster termNode = &nodes[i];
554 1.3 oster i += 1;
555 1.3 oster if (lu_flag) {
556 1.3 oster unlockDataNodes = &nodes[i];
557 1.3 oster i += numDataNodes;
558 1.3 oster unlockParityNodes = &nodes[i];
559 1.3 oster i += numParityNodes;
560 1.3 oster } else {
561 1.3 oster unlockDataNodes = unlockParityNodes = NULL;
562 1.3 oster }
563 1.3 oster if (nfaults == 2) {
564 1.3 oster readQNodes = &nodes[i];
565 1.3 oster i += numParityNodes;
566 1.3 oster writeQNodes = &nodes[i];
567 1.3 oster i += numParityNodes;
568 1.3 oster qNodes = &nodes[i];
569 1.3 oster i += numParityNodes;
570 1.3 oster if (lu_flag) {
571 1.3 oster unlockQNodes = &nodes[i];
572 1.3 oster i += numParityNodes;
573 1.3 oster } else {
574 1.3 oster unlockQNodes = NULL;
575 1.3 oster }
576 1.3 oster } else {
577 1.3 oster readQNodes = writeQNodes = qNodes = unlockQNodes = NULL;
578 1.3 oster }
579 1.3 oster RF_ASSERT(i == totalNumNodes);
580 1.3 oster
581 1.3 oster /*
582 1.3 oster * Step 3. initialize the nodes
583 1.3 oster */
584 1.3 oster /* initialize block node (Nil) */
585 1.3 oster nNodes = numDataNodes + (nfaults * numParityNodes);
586 1.3 oster rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
587 1.3 oster NULL, nNodes, 0, 0, 0, dag_h, "Nil", allocList);
588 1.3 oster
589 1.3 oster /* initialize commit node (Cmt) */
590 1.3 oster rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
591 1.3 oster NULL, nNodes, (nfaults * numParityNodes), 0, 0, dag_h, "Cmt", allocList);
592 1.3 oster
593 1.3 oster /* initialize terminate node (Trm) */
594 1.3 oster rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
595 1.3 oster NULL, 0, nNodes, 0, 0, dag_h, "Trm", allocList);
596 1.3 oster
597 1.3 oster /* initialize nodes which read old data (Rod) */
598 1.3 oster for (i = 0; i < numDataNodes; i++) {
599 1.3 oster rf_InitNode(&readDataNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
600 1.3 oster rf_GenericWakeupFunc, (nfaults * numParityNodes), 1, 4, 0, dag_h,
601 1.3 oster "Rod", allocList);
602 1.3 oster RF_ASSERT(pda != NULL);
603 1.3 oster /* physical disk addr desc */
604 1.3 oster readDataNodes[i].params[0].p = pda;
605 1.3 oster /* buffer to hold old data */
606 1.3 oster readDataNodes[i].params[1].p = rf_AllocBuffer(raidPtr,
607 1.3 oster dag_h, pda, allocList);
608 1.3 oster readDataNodes[i].params[2].v = parityStripeID;
609 1.3 oster readDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
610 1.3 oster lu_flag, 0, which_ru);
611 1.3 oster pda = pda->next;
612 1.3 oster for (j = 0; j < readDataNodes[i].numSuccedents; j++) {
613 1.3 oster readDataNodes[i].propList[j] = NULL;
614 1.3 oster }
615 1.3 oster }
616 1.3 oster
617 1.3 oster /* initialize nodes which read old parity (Rop) */
618 1.3 oster pda = asmap->parityInfo;
619 1.3 oster i = 0;
620 1.3 oster for (i = 0; i < numParityNodes; i++) {
621 1.3 oster RF_ASSERT(pda != NULL);
622 1.3 oster rf_InitNode(&readParityNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc,
623 1.3 oster rf_DiskReadUndoFunc, rf_GenericWakeupFunc, numParityNodes, 1, 4,
624 1.3 oster 0, dag_h, "Rop", allocList);
625 1.3 oster readParityNodes[i].params[0].p = pda;
626 1.3 oster /* buffer to hold old parity */
627 1.3 oster readParityNodes[i].params[1].p = rf_AllocBuffer(raidPtr,
628 1.3 oster dag_h, pda, allocList);
629 1.3 oster readParityNodes[i].params[2].v = parityStripeID;
630 1.3 oster readParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
631 1.3 oster lu_flag, 0, which_ru);
632 1.3 oster pda = pda->next;
633 1.3 oster for (j = 0; j < readParityNodes[i].numSuccedents; j++) {
634 1.3 oster readParityNodes[i].propList[0] = NULL;
635 1.3 oster }
636 1.3 oster }
637 1.3 oster
638 1.3 oster /* initialize nodes which read old Q (Roq) */
639 1.3 oster if (nfaults == 2) {
640 1.3 oster pda = asmap->qInfo;
641 1.3 oster for (i = 0; i < numParityNodes; i++) {
642 1.3 oster RF_ASSERT(pda != NULL);
643 1.3 oster rf_InitNode(&readQNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
644 1.3 oster rf_GenericWakeupFunc, numParityNodes, 1, 4, 0, dag_h, "Roq", allocList);
645 1.3 oster readQNodes[i].params[0].p = pda;
646 1.3 oster /* buffer to hold old Q */
647 1.3 oster readQNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda,
648 1.3 oster allocList);
649 1.3 oster readQNodes[i].params[2].v = parityStripeID;
650 1.3 oster readQNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
651 1.3 oster lu_flag, 0, which_ru);
652 1.3 oster pda = pda->next;
653 1.3 oster for (j = 0; j < readQNodes[i].numSuccedents; j++) {
654 1.3 oster readQNodes[i].propList[0] = NULL;
655 1.3 oster }
656 1.3 oster }
657 1.3 oster }
658 1.3 oster /* initialize nodes which write new data (Wnd) */
659 1.3 oster pda = asmap->physInfo;
660 1.3 oster for (i = 0; i < numDataNodes; i++) {
661 1.3 oster RF_ASSERT(pda != NULL);
662 1.3 oster rf_InitNode(&writeDataNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc,
663 1.3 oster rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
664 1.3 oster "Wnd", allocList);
665 1.3 oster /* physical disk addr desc */
666 1.3 oster writeDataNodes[i].params[0].p = pda;
667 1.3 oster /* buffer holding new data to be written */
668 1.3 oster writeDataNodes[i].params[1].p = pda->bufPtr;
669 1.3 oster writeDataNodes[i].params[2].v = parityStripeID;
670 1.3 oster writeDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
671 1.3 oster 0, 0, which_ru);
672 1.3 oster if (lu_flag) {
673 1.3 oster /* initialize node to unlock the disk queue */
674 1.3 oster rf_InitNode(&unlockDataNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc,
675 1.3 oster rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h,
676 1.3 oster "Und", allocList);
677 1.3 oster /* physical disk addr desc */
678 1.3 oster unlockDataNodes[i].params[0].p = pda;
679 1.3 oster unlockDataNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
680 1.3 oster 0, lu_flag, which_ru);
681 1.3 oster }
682 1.3 oster pda = pda->next;
683 1.3 oster }
684 1.3 oster
685 1.3 oster /*
686 1.3 oster * Initialize nodes which compute new parity and Q.
687 1.3 oster */
688 1.3 oster /*
689 1.3 oster * We use the simple XOR func in the double-XOR case, and when
690 1.3 oster * we're accessing only a portion of one stripe unit. The distinction
691 1.3 oster * between the two is that the regular XOR func assumes that the targbuf
692 1.3 oster * is a full SU in size, and examines the pda associated with the buffer
693 1.3 oster * to decide where within the buffer to XOR the data, whereas
694 1.3 oster * the simple XOR func just XORs the data into the start of the buffer.
695 1.3 oster */
696 1.3 oster if ((numParityNodes == 2) || ((numDataNodes == 1)
697 1.3 oster && (asmap->totalSectorsAccessed < raidPtr->Layout.sectorsPerStripeUnit))) {
698 1.3 oster func = pfuncs->simple;
699 1.3 oster undoFunc = rf_NullNodeUndoFunc;
700 1.3 oster name = pfuncs->SimpleName;
701 1.3 oster if (qfuncs) {
702 1.3 oster qfunc = qfuncs->simple;
703 1.3 oster qname = qfuncs->SimpleName;
704 1.3 oster } else {
705 1.3 oster qfunc = NULL;
706 1.3 oster qname = NULL;
707 1.3 oster }
708 1.3 oster } else {
709 1.3 oster func = pfuncs->regular;
710 1.3 oster undoFunc = rf_NullNodeUndoFunc;
711 1.3 oster name = pfuncs->RegularName;
712 1.3 oster if (qfuncs) {
713 1.3 oster qfunc = qfuncs->regular;
714 1.3 oster qname = qfuncs->RegularName;
715 1.3 oster } else {
716 1.3 oster qfunc = NULL;
717 1.3 oster qname = NULL;
718 1.3 oster }
719 1.3 oster }
720 1.3 oster /*
721 1.3 oster * Initialize the xor nodes: params are {pda,buf}
722 1.3 oster * from {Rod,Wnd,Rop} nodes, and raidPtr
723 1.3 oster */
724 1.3 oster if (numParityNodes == 2) {
725 1.3 oster /* double-xor case */
726 1.3 oster for (i = 0; i < numParityNodes; i++) {
727 1.3 oster /* note: no wakeup func for xor */
728 1.3 oster rf_InitNode(&xorNodes[i], rf_wait, RF_FALSE, func, undoFunc, NULL,
729 1.3 oster 1, (numDataNodes + numParityNodes), 7, 1, dag_h, name, allocList);
730 1.3 oster xorNodes[i].flags |= RF_DAGNODE_FLAG_YIELD;
731 1.3 oster xorNodes[i].params[0] = readDataNodes[i].params[0];
732 1.3 oster xorNodes[i].params[1] = readDataNodes[i].params[1];
733 1.3 oster xorNodes[i].params[2] = readParityNodes[i].params[0];
734 1.3 oster xorNodes[i].params[3] = readParityNodes[i].params[1];
735 1.3 oster xorNodes[i].params[4] = writeDataNodes[i].params[0];
736 1.3 oster xorNodes[i].params[5] = writeDataNodes[i].params[1];
737 1.3 oster xorNodes[i].params[6].p = raidPtr;
738 1.3 oster /* use old parity buf as target buf */
739 1.3 oster xorNodes[i].results[0] = readParityNodes[i].params[1].p;
740 1.3 oster if (nfaults == 2) {
741 1.3 oster /* note: no wakeup func for qor */
742 1.3 oster rf_InitNode(&qNodes[i], rf_wait, RF_FALSE, qfunc, undoFunc, NULL, 1,
743 1.3 oster (numDataNodes + numParityNodes), 7, 1, dag_h, qname, allocList);
744 1.3 oster qNodes[i].params[0] = readDataNodes[i].params[0];
745 1.3 oster qNodes[i].params[1] = readDataNodes[i].params[1];
746 1.3 oster qNodes[i].params[2] = readQNodes[i].params[0];
747 1.3 oster qNodes[i].params[3] = readQNodes[i].params[1];
748 1.3 oster qNodes[i].params[4] = writeDataNodes[i].params[0];
749 1.3 oster qNodes[i].params[5] = writeDataNodes[i].params[1];
750 1.3 oster qNodes[i].params[6].p = raidPtr;
751 1.3 oster /* use old Q buf as target buf */
752 1.3 oster qNodes[i].results[0] = readQNodes[i].params[1].p;
753 1.3 oster }
754 1.3 oster }
755 1.3 oster } else {
756 1.3 oster /* there is only one xor node in this case */
757 1.3 oster rf_InitNode(&xorNodes[0], rf_wait, RF_FALSE, func, undoFunc, NULL, 1,
758 1.3 oster (numDataNodes + numParityNodes),
759 1.3 oster (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h, name, allocList);
760 1.3 oster xorNodes[0].flags |= RF_DAGNODE_FLAG_YIELD;
761 1.3 oster for (i = 0; i < numDataNodes + 1; i++) {
762 1.3 oster /* set up params related to Rod and Rop nodes */
763 1.3 oster xorNodes[0].params[2 * i + 0] = readDataNodes[i].params[0]; /* pda */
764 1.3 oster xorNodes[0].params[2 * i + 1] = readDataNodes[i].params[1]; /* buffer ptr */
765 1.3 oster }
766 1.3 oster for (i = 0; i < numDataNodes; i++) {
767 1.3 oster /* set up params related to Wnd and Wnp nodes */
768 1.3 oster xorNodes[0].params[2 * (numDataNodes + 1 + i) + 0] = /* pda */
769 1.3 oster writeDataNodes[i].params[0];
770 1.3 oster xorNodes[0].params[2 * (numDataNodes + 1 + i) + 1] = /* buffer ptr */
771 1.3 oster writeDataNodes[i].params[1];
772 1.3 oster }
773 1.3 oster /* xor node needs to get at RAID information */
774 1.3 oster xorNodes[0].params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr;
775 1.3 oster xorNodes[0].results[0] = readParityNodes[0].params[1].p;
776 1.3 oster if (nfaults == 2) {
777 1.3 oster rf_InitNode(&qNodes[0], rf_wait, RF_FALSE, qfunc, undoFunc, NULL, 1,
778 1.3 oster (numDataNodes + numParityNodes),
779 1.3 oster (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h,
780 1.3 oster qname, allocList);
781 1.3 oster for (i = 0; i < numDataNodes; i++) {
782 1.3 oster /* set up params related to Rod */
783 1.3 oster qNodes[0].params[2 * i + 0] = readDataNodes[i].params[0]; /* pda */
784 1.3 oster qNodes[0].params[2 * i + 1] = readDataNodes[i].params[1]; /* buffer ptr */
785 1.3 oster }
786 1.3 oster /* and read old q */
787 1.3 oster qNodes[0].params[2 * numDataNodes + 0] = /* pda */
788 1.3 oster readQNodes[0].params[0];
789 1.3 oster qNodes[0].params[2 * numDataNodes + 1] = /* buffer ptr */
790 1.3 oster readQNodes[0].params[1];
791 1.3 oster for (i = 0; i < numDataNodes; i++) {
792 1.3 oster /* set up params related to Wnd nodes */
793 1.3 oster qNodes[0].params[2 * (numDataNodes + 1 + i) + 0] = /* pda */
794 1.3 oster writeDataNodes[i].params[0];
795 1.3 oster qNodes[0].params[2 * (numDataNodes + 1 + i) + 1] = /* buffer ptr */
796 1.3 oster writeDataNodes[i].params[1];
797 1.3 oster }
798 1.3 oster /* xor node needs to get at RAID information */
799 1.3 oster qNodes[0].params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr;
800 1.3 oster qNodes[0].results[0] = readQNodes[0].params[1].p;
801 1.3 oster }
802 1.3 oster }
803 1.3 oster
804 1.3 oster /* initialize nodes which write new parity (Wnp) */
805 1.3 oster pda = asmap->parityInfo;
806 1.3 oster for (i = 0; i < numParityNodes; i++) {
807 1.3 oster rf_InitNode(&writeParityNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc,
808 1.3 oster rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
809 1.3 oster "Wnp", allocList);
810 1.3 oster RF_ASSERT(pda != NULL);
811 1.3 oster writeParityNodes[i].params[0].p = pda; /* param 1 (bufPtr)
812 1.3 oster * filled in by xor node */
813 1.3 oster writeParityNodes[i].params[1].p = xorNodes[i].results[0]; /* buffer pointer for
814 1.3 oster * parity write
815 1.3 oster * operation */
816 1.3 oster writeParityNodes[i].params[2].v = parityStripeID;
817 1.3 oster writeParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
818 1.3 oster 0, 0, which_ru);
819 1.3 oster if (lu_flag) {
820 1.3 oster /* initialize node to unlock the disk queue */
821 1.3 oster rf_InitNode(&unlockParityNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc,
822 1.3 oster rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h,
823 1.3 oster "Unp", allocList);
824 1.3 oster unlockParityNodes[i].params[0].p = pda; /* physical disk addr
825 1.3 oster * desc */
826 1.3 oster unlockParityNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
827 1.3 oster 0, lu_flag, which_ru);
828 1.3 oster }
829 1.3 oster pda = pda->next;
830 1.3 oster }
831 1.3 oster
832 1.3 oster /* initialize nodes which write new Q (Wnq) */
833 1.3 oster if (nfaults == 2) {
834 1.3 oster pda = asmap->qInfo;
835 1.3 oster for (i = 0; i < numParityNodes; i++) {
836 1.3 oster rf_InitNode(&writeQNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc,
837 1.3 oster rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
838 1.3 oster "Wnq", allocList);
839 1.3 oster RF_ASSERT(pda != NULL);
840 1.3 oster writeQNodes[i].params[0].p = pda; /* param 1 (bufPtr)
841 1.3 oster * filled in by xor node */
842 1.3 oster writeQNodes[i].params[1].p = qNodes[i].results[0]; /* buffer pointer for
843 1.3 oster * parity write
844 1.3 oster * operation */
845 1.3 oster writeQNodes[i].params[2].v = parityStripeID;
846 1.3 oster writeQNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
847 1.3 oster 0, 0, which_ru);
848 1.3 oster if (lu_flag) {
849 1.3 oster /* initialize node to unlock the disk queue */
850 1.3 oster rf_InitNode(&unlockQNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc,
851 1.3 oster rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h,
852 1.3 oster "Unq", allocList);
853 1.3 oster unlockQNodes[i].params[0].p = pda; /* physical disk addr
854 1.3 oster * desc */
855 1.3 oster unlockQNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
856 1.3 oster 0, lu_flag, which_ru);
857 1.3 oster }
858 1.3 oster pda = pda->next;
859 1.3 oster }
860 1.3 oster }
861 1.3 oster /*
862 1.3 oster * Step 4. connect the nodes.
863 1.3 oster */
864 1.3 oster
865 1.3 oster /* connect header to block node */
866 1.3 oster dag_h->succedents[0] = blockNode;
867 1.3 oster
868 1.3 oster /* connect block node to read old data nodes */
869 1.3 oster RF_ASSERT(blockNode->numSuccedents == (numDataNodes + (numParityNodes * nfaults)));
870 1.3 oster for (i = 0; i < numDataNodes; i++) {
871 1.3 oster blockNode->succedents[i] = &readDataNodes[i];
872 1.3 oster RF_ASSERT(readDataNodes[i].numAntecedents == 1);
873 1.3 oster readDataNodes[i].antecedents[0] = blockNode;
874 1.3 oster readDataNodes[i].antType[0] = rf_control;
875 1.3 oster }
876 1.3 oster
877 1.3 oster /* connect block node to read old parity nodes */
878 1.3 oster for (i = 0; i < numParityNodes; i++) {
879 1.3 oster blockNode->succedents[numDataNodes + i] = &readParityNodes[i];
880 1.3 oster RF_ASSERT(readParityNodes[i].numAntecedents == 1);
881 1.3 oster readParityNodes[i].antecedents[0] = blockNode;
882 1.3 oster readParityNodes[i].antType[0] = rf_control;
883 1.3 oster }
884 1.3 oster
885 1.3 oster /* connect block node to read old Q nodes */
886 1.3 oster if (nfaults == 2) {
887 1.3 oster for (i = 0; i < numParityNodes; i++) {
888 1.3 oster blockNode->succedents[numDataNodes + numParityNodes + i] = &readQNodes[i];
889 1.3 oster RF_ASSERT(readQNodes[i].numAntecedents == 1);
890 1.3 oster readQNodes[i].antecedents[0] = blockNode;
891 1.3 oster readQNodes[i].antType[0] = rf_control;
892 1.3 oster }
893 1.3 oster }
894 1.3 oster /* connect read old data nodes to xor nodes */
895 1.3 oster for (i = 0; i < numDataNodes; i++) {
896 1.3 oster RF_ASSERT(readDataNodes[i].numSuccedents == (nfaults * numParityNodes));
897 1.3 oster for (j = 0; j < numParityNodes; j++) {
898 1.3 oster RF_ASSERT(xorNodes[j].numAntecedents == numDataNodes + numParityNodes);
899 1.3 oster readDataNodes[i].succedents[j] = &xorNodes[j];
900 1.3 oster xorNodes[j].antecedents[i] = &readDataNodes[i];
901 1.3 oster xorNodes[j].antType[i] = rf_trueData;
902 1.3 oster }
903 1.3 oster }
904 1.3 oster
905 1.3 oster /* connect read old data nodes to q nodes */
906 1.3 oster if (nfaults == 2) {
907 1.3 oster for (i = 0; i < numDataNodes; i++) {
908 1.3 oster for (j = 0; j < numParityNodes; j++) {
909 1.3 oster RF_ASSERT(qNodes[j].numAntecedents == numDataNodes + numParityNodes);
910 1.3 oster readDataNodes[i].succedents[numParityNodes + j] = &qNodes[j];
911 1.3 oster qNodes[j].antecedents[i] = &readDataNodes[i];
912 1.3 oster qNodes[j].antType[i] = rf_trueData;
913 1.3 oster }
914 1.3 oster }
915 1.3 oster }
916 1.3 oster /* connect read old parity nodes to xor nodes */
917 1.3 oster for (i = 0; i < numParityNodes; i++) {
918 1.3 oster RF_ASSERT(readParityNodes[i].numSuccedents == numParityNodes);
919 1.3 oster for (j = 0; j < numParityNodes; j++) {
920 1.3 oster readParityNodes[i].succedents[j] = &xorNodes[j];
921 1.3 oster xorNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
922 1.3 oster xorNodes[j].antType[numDataNodes + i] = rf_trueData;
923 1.3 oster }
924 1.3 oster }
925 1.3 oster
926 1.3 oster /* connect read old q nodes to q nodes */
927 1.3 oster if (nfaults == 2) {
928 1.3 oster for (i = 0; i < numParityNodes; i++) {
929 1.3 oster RF_ASSERT(readParityNodes[i].numSuccedents == numParityNodes);
930 1.3 oster for (j = 0; j < numParityNodes; j++) {
931 1.3 oster readQNodes[i].succedents[j] = &qNodes[j];
932 1.3 oster qNodes[j].antecedents[numDataNodes + i] = &readQNodes[i];
933 1.3 oster qNodes[j].antType[numDataNodes + i] = rf_trueData;
934 1.3 oster }
935 1.3 oster }
936 1.3 oster }
937 1.3 oster /* connect xor nodes to commit node */
938 1.3 oster RF_ASSERT(commitNode->numAntecedents == (nfaults * numParityNodes));
939 1.3 oster for (i = 0; i < numParityNodes; i++) {
940 1.3 oster RF_ASSERT(xorNodes[i].numSuccedents == 1);
941 1.3 oster xorNodes[i].succedents[0] = commitNode;
942 1.3 oster commitNode->antecedents[i] = &xorNodes[i];
943 1.3 oster commitNode->antType[i] = rf_control;
944 1.3 oster }
945 1.3 oster
946 1.3 oster /* connect q nodes to commit node */
947 1.3 oster if (nfaults == 2) {
948 1.3 oster for (i = 0; i < numParityNodes; i++) {
949 1.3 oster RF_ASSERT(qNodes[i].numSuccedents == 1);
950 1.3 oster qNodes[i].succedents[0] = commitNode;
951 1.3 oster commitNode->antecedents[i + numParityNodes] = &qNodes[i];
952 1.3 oster commitNode->antType[i + numParityNodes] = rf_control;
953 1.3 oster }
954 1.3 oster }
955 1.3 oster /* connect commit node to write nodes */
956 1.3 oster RF_ASSERT(commitNode->numSuccedents == (numDataNodes + (nfaults * numParityNodes)));
957 1.3 oster for (i = 0; i < numDataNodes; i++) {
958 1.3 oster RF_ASSERT(writeDataNodes[i].numAntecedents == 1);
959 1.3 oster commitNode->succedents[i] = &writeDataNodes[i];
960 1.3 oster writeDataNodes[i].antecedents[0] = commitNode;
961 1.3 oster writeDataNodes[i].antType[0] = rf_trueData;
962 1.3 oster }
963 1.3 oster for (i = 0; i < numParityNodes; i++) {
964 1.3 oster RF_ASSERT(writeParityNodes[i].numAntecedents == 1);
965 1.3 oster commitNode->succedents[i + numDataNodes] = &writeParityNodes[i];
966 1.3 oster writeParityNodes[i].antecedents[0] = commitNode;
967 1.3 oster writeParityNodes[i].antType[0] = rf_trueData;
968 1.3 oster }
969 1.3 oster if (nfaults == 2) {
970 1.3 oster for (i = 0; i < numParityNodes; i++) {
971 1.3 oster RF_ASSERT(writeQNodes[i].numAntecedents == 1);
972 1.3 oster commitNode->succedents[i + numDataNodes + numParityNodes] = &writeQNodes[i];
973 1.3 oster writeQNodes[i].antecedents[0] = commitNode;
974 1.3 oster writeQNodes[i].antType[0] = rf_trueData;
975 1.3 oster }
976 1.3 oster }
977 1.3 oster RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
978 1.3 oster RF_ASSERT(termNode->numSuccedents == 0);
979 1.3 oster for (i = 0; i < numDataNodes; i++) {
980 1.3 oster if (lu_flag) {
981 1.3 oster /* connect write new data nodes to unlock nodes */
982 1.3 oster RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
983 1.3 oster RF_ASSERT(unlockDataNodes[i].numAntecedents == 1);
984 1.3 oster writeDataNodes[i].succedents[0] = &unlockDataNodes[i];
985 1.3 oster unlockDataNodes[i].antecedents[0] = &writeDataNodes[i];
986 1.3 oster unlockDataNodes[i].antType[0] = rf_control;
987 1.3 oster
988 1.3 oster /* connect unlock nodes to term node */
989 1.3 oster RF_ASSERT(unlockDataNodes[i].numSuccedents == 1);
990 1.3 oster unlockDataNodes[i].succedents[0] = termNode;
991 1.3 oster termNode->antecedents[i] = &unlockDataNodes[i];
992 1.3 oster termNode->antType[i] = rf_control;
993 1.3 oster } else {
994 1.3 oster /* connect write new data nodes to term node */
995 1.3 oster RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
996 1.3 oster RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
997 1.3 oster writeDataNodes[i].succedents[0] = termNode;
998 1.3 oster termNode->antecedents[i] = &writeDataNodes[i];
999 1.3 oster termNode->antType[i] = rf_control;
1000 1.3 oster }
1001 1.3 oster }
1002 1.3 oster
1003 1.3 oster for (i = 0; i < numParityNodes; i++) {
1004 1.3 oster if (lu_flag) {
1005 1.3 oster /* connect write new parity nodes to unlock nodes */
1006 1.3 oster RF_ASSERT(writeParityNodes[i].numSuccedents == 1);
1007 1.3 oster RF_ASSERT(unlockParityNodes[i].numAntecedents == 1);
1008 1.3 oster writeParityNodes[i].succedents[0] = &unlockParityNodes[i];
1009 1.3 oster unlockParityNodes[i].antecedents[0] = &writeParityNodes[i];
1010 1.3 oster unlockParityNodes[i].antType[0] = rf_control;
1011 1.3 oster
1012 1.3 oster /* connect unlock nodes to term node */
1013 1.3 oster RF_ASSERT(unlockParityNodes[i].numSuccedents == 1);
1014 1.3 oster unlockParityNodes[i].succedents[0] = termNode;
1015 1.3 oster termNode->antecedents[numDataNodes + i] = &unlockParityNodes[i];
1016 1.3 oster termNode->antType[numDataNodes + i] = rf_control;
1017 1.3 oster } else {
1018 1.3 oster RF_ASSERT(writeParityNodes[i].numSuccedents == 1);
1019 1.3 oster writeParityNodes[i].succedents[0] = termNode;
1020 1.3 oster termNode->antecedents[numDataNodes + i] = &writeParityNodes[i];
1021 1.3 oster termNode->antType[numDataNodes + i] = rf_control;
1022 1.3 oster }
1023 1.3 oster }
1024 1.3 oster
1025 1.3 oster if (nfaults == 2) {
1026 1.3 oster for (i = 0; i < numParityNodes; i++) {
1027 1.3 oster if (lu_flag) {
1028 1.3 oster /* connect write new Q nodes to unlock nodes */
1029 1.3 oster RF_ASSERT(writeQNodes[i].numSuccedents == 1);
1030 1.3 oster RF_ASSERT(unlockQNodes[i].numAntecedents == 1);
1031 1.3 oster writeQNodes[i].succedents[0] = &unlockQNodes[i];
1032 1.3 oster unlockQNodes[i].antecedents[0] = &writeQNodes[i];
1033 1.3 oster unlockQNodes[i].antType[0] = rf_control;
1034 1.3 oster
1035 1.3 oster /* connect unlock nodes to unblock node */
1036 1.3 oster RF_ASSERT(unlockQNodes[i].numSuccedents == 1);
1037 1.3 oster unlockQNodes[i].succedents[0] = termNode;
1038 1.3 oster termNode->antecedents[numDataNodes + numParityNodes + i] = &unlockQNodes[i];
1039 1.3 oster termNode->antType[numDataNodes + numParityNodes + i] = rf_control;
1040 1.3 oster } else {
1041 1.3 oster RF_ASSERT(writeQNodes[i].numSuccedents == 1);
1042 1.3 oster writeQNodes[i].succedents[0] = termNode;
1043 1.3 oster termNode->antecedents[numDataNodes + numParityNodes + i] = &writeQNodes[i];
1044 1.3 oster termNode->antType[numDataNodes + numParityNodes + i] = rf_control;
1045 1.3 oster }
1046 1.3 oster }
1047 1.3 oster }
1048 1.1 oster }
1049 1.1 oster
1050 1.1 oster
1051 1.1 oster /******************************************************************************
1052 1.1 oster * create a write graph (fault-free or degraded) for RAID level 1
1053 1.1 oster *
1054 1.1 oster * Hdr -> Commit -> Wpd -> Nil -> Trm
1055 1.1 oster * -> Wsd ->
1056 1.1 oster *
1057 1.1 oster * The "Wpd" node writes data to the primary copy in the mirror pair
1058 1.1 oster * The "Wsd" node writes data to the secondary copy in the mirror pair
1059 1.1 oster *
1060 1.1 oster * Parameters: raidPtr - description of the physical array
1061 1.1 oster * asmap - logical & physical addresses for this access
1062 1.1 oster * bp - buffer ptr (holds write data)
1063 1.3 oster * flags - general flags (e.g. disk locking)
1064 1.1 oster * allocList - list of memory allocated in DAG creation
1065 1.1 oster *****************************************************************************/
1066 1.1 oster
1067 1.3 oster void
1068 1.3 oster rf_CreateRaidOneWriteDAG(
1069 1.3 oster RF_Raid_t * raidPtr,
1070 1.3 oster RF_AccessStripeMap_t * asmap,
1071 1.3 oster RF_DagHeader_t * dag_h,
1072 1.3 oster void *bp,
1073 1.3 oster RF_RaidAccessFlags_t flags,
1074 1.3 oster RF_AllocListElem_t * allocList)
1075 1.1 oster {
1076 1.3 oster RF_DagNode_t *unblockNode, *termNode, *commitNode;
1077 1.3 oster RF_DagNode_t *nodes, *wndNode, *wmirNode;
1078 1.3 oster int nWndNodes, nWmirNodes, i;
1079 1.3 oster RF_ReconUnitNum_t which_ru;
1080 1.3 oster RF_PhysDiskAddr_t *pda, *pdaP;
1081 1.3 oster RF_StripeNum_t parityStripeID;
1082 1.3 oster
1083 1.3 oster parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
1084 1.3 oster asmap->raidAddress, &which_ru);
1085 1.3 oster if (rf_dagDebug) {
1086 1.3 oster printf("[Creating RAID level 1 write DAG]\n");
1087 1.3 oster }
1088 1.3 oster dag_h->creator = "RaidOneWriteDAG";
1089 1.3 oster
1090 1.3 oster /* 2 implies access not SU aligned */
1091 1.3 oster nWmirNodes = (asmap->parityInfo->next) ? 2 : 1;
1092 1.3 oster nWndNodes = (asmap->physInfo->next) ? 2 : 1;
1093 1.3 oster
1094 1.3 oster /* alloc the Wnd nodes and the Wmir node */
1095 1.3 oster if (asmap->numDataFailed == 1)
1096 1.3 oster nWndNodes--;
1097 1.3 oster if (asmap->numParityFailed == 1)
1098 1.3 oster nWmirNodes--;
1099 1.3 oster
1100 1.3 oster /* total number of nodes = nWndNodes + nWmirNodes + (commit + unblock
1101 1.3 oster * + terminator) */
1102 1.3 oster RF_CallocAndAdd(nodes, nWndNodes + nWmirNodes + 3, sizeof(RF_DagNode_t),
1103 1.3 oster (RF_DagNode_t *), allocList);
1104 1.3 oster i = 0;
1105 1.3 oster wndNode = &nodes[i];
1106 1.3 oster i += nWndNodes;
1107 1.3 oster wmirNode = &nodes[i];
1108 1.3 oster i += nWmirNodes;
1109 1.3 oster commitNode = &nodes[i];
1110 1.3 oster i += 1;
1111 1.3 oster unblockNode = &nodes[i];
1112 1.3 oster i += 1;
1113 1.3 oster termNode = &nodes[i];
1114 1.3 oster i += 1;
1115 1.3 oster RF_ASSERT(i == (nWndNodes + nWmirNodes + 3));
1116 1.3 oster
1117 1.3 oster /* this dag can commit immediately */
1118 1.3 oster dag_h->numCommitNodes = 1;
1119 1.3 oster dag_h->numCommits = 0;
1120 1.3 oster dag_h->numSuccedents = 1;
1121 1.3 oster
1122 1.3 oster /* initialize the commit, unblock, and term nodes */
1123 1.3 oster rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
1124 1.3 oster NULL, (nWndNodes + nWmirNodes), 0, 0, 0, dag_h, "Cmt", allocList);
1125 1.3 oster rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
1126 1.3 oster NULL, 1, (nWndNodes + nWmirNodes), 0, 0, dag_h, "Nil", allocList);
1127 1.3 oster rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
1128 1.3 oster NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
1129 1.3 oster
1130 1.3 oster /* initialize the wnd nodes */
1131 1.3 oster if (nWndNodes > 0) {
1132 1.3 oster pda = asmap->physInfo;
1133 1.3 oster for (i = 0; i < nWndNodes; i++) {
1134 1.3 oster rf_InitNode(&wndNode[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
1135 1.3 oster rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wpd", allocList);
1136 1.3 oster RF_ASSERT(pda != NULL);
1137 1.3 oster wndNode[i].params[0].p = pda;
1138 1.3 oster wndNode[i].params[1].p = pda->bufPtr;
1139 1.3 oster wndNode[i].params[2].v = parityStripeID;
1140 1.3 oster wndNode[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
1141 1.3 oster pda = pda->next;
1142 1.3 oster }
1143 1.3 oster RF_ASSERT(pda == NULL);
1144 1.3 oster }
1145 1.3 oster /* initialize the mirror nodes */
1146 1.3 oster if (nWmirNodes > 0) {
1147 1.3 oster pda = asmap->physInfo;
1148 1.3 oster pdaP = asmap->parityInfo;
1149 1.3 oster for (i = 0; i < nWmirNodes; i++) {
1150 1.3 oster rf_InitNode(&wmirNode[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
1151 1.3 oster rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wsd", allocList);
1152 1.3 oster RF_ASSERT(pda != NULL);
1153 1.3 oster wmirNode[i].params[0].p = pdaP;
1154 1.3 oster wmirNode[i].params[1].p = pda->bufPtr;
1155 1.3 oster wmirNode[i].params[2].v = parityStripeID;
1156 1.3 oster wmirNode[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
1157 1.3 oster pda = pda->next;
1158 1.3 oster pdaP = pdaP->next;
1159 1.3 oster }
1160 1.3 oster RF_ASSERT(pda == NULL);
1161 1.3 oster RF_ASSERT(pdaP == NULL);
1162 1.3 oster }
1163 1.3 oster /* link the header node to the commit node */
1164 1.3 oster RF_ASSERT(dag_h->numSuccedents == 1);
1165 1.3 oster RF_ASSERT(commitNode->numAntecedents == 0);
1166 1.3 oster dag_h->succedents[0] = commitNode;
1167 1.3 oster
1168 1.3 oster /* link the commit node to the write nodes */
1169 1.3 oster RF_ASSERT(commitNode->numSuccedents == (nWndNodes + nWmirNodes));
1170 1.3 oster for (i = 0; i < nWndNodes; i++) {
1171 1.3 oster RF_ASSERT(wndNode[i].numAntecedents == 1);
1172 1.3 oster commitNode->succedents[i] = &wndNode[i];
1173 1.3 oster wndNode[i].antecedents[0] = commitNode;
1174 1.3 oster wndNode[i].antType[0] = rf_control;
1175 1.3 oster }
1176 1.3 oster for (i = 0; i < nWmirNodes; i++) {
1177 1.3 oster RF_ASSERT(wmirNode[i].numAntecedents == 1);
1178 1.3 oster commitNode->succedents[i + nWndNodes] = &wmirNode[i];
1179 1.3 oster wmirNode[i].antecedents[0] = commitNode;
1180 1.3 oster wmirNode[i].antType[0] = rf_control;
1181 1.3 oster }
1182 1.3 oster
1183 1.3 oster /* link the write nodes to the unblock node */
1184 1.3 oster RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nWmirNodes));
1185 1.3 oster for (i = 0; i < nWndNodes; i++) {
1186 1.3 oster RF_ASSERT(wndNode[i].numSuccedents == 1);
1187 1.3 oster wndNode[i].succedents[0] = unblockNode;
1188 1.3 oster unblockNode->antecedents[i] = &wndNode[i];
1189 1.3 oster unblockNode->antType[i] = rf_control;
1190 1.3 oster }
1191 1.3 oster for (i = 0; i < nWmirNodes; i++) {
1192 1.3 oster RF_ASSERT(wmirNode[i].numSuccedents == 1);
1193 1.3 oster wmirNode[i].succedents[0] = unblockNode;
1194 1.3 oster unblockNode->antecedents[i + nWndNodes] = &wmirNode[i];
1195 1.3 oster unblockNode->antType[i + nWndNodes] = rf_control;
1196 1.3 oster }
1197 1.3 oster
1198 1.3 oster /* link the unblock node to the term node */
1199 1.3 oster RF_ASSERT(unblockNode->numSuccedents == 1);
1200 1.3 oster RF_ASSERT(termNode->numAntecedents == 1);
1201 1.3 oster RF_ASSERT(termNode->numSuccedents == 0);
1202 1.3 oster unblockNode->succedents[0] = termNode;
1203 1.3 oster termNode->antecedents[0] = unblockNode;
1204 1.3 oster termNode->antType[0] = rf_control;
1205 1.1 oster }
1206 1.1 oster
1207 1.1 oster
1208 1.1 oster
1209 1.1 oster /* DAGs which have no commit points.
1210 1.1 oster *
1211 1.1 oster * The following DAGs are used in forward and backward error recovery experiments.
1212 1.1 oster * They are identical to the DAGs above this comment with the exception that
1213 1.1 oster * the commit points have been removed.
1214 1.1 oster */
1215 1.1 oster
1216 1.1 oster
1217 1.1 oster
1218 1.3 oster void
1219 1.3 oster rf_CommonCreateLargeWriteDAGFwd(
1220 1.3 oster RF_Raid_t * raidPtr,
1221 1.3 oster RF_AccessStripeMap_t * asmap,
1222 1.3 oster RF_DagHeader_t * dag_h,
1223 1.3 oster void *bp,
1224 1.3 oster RF_RaidAccessFlags_t flags,
1225 1.3 oster RF_AllocListElem_t * allocList,
1226 1.3 oster int nfaults,
1227 1.3 oster int (*redFunc) (RF_DagNode_t *),
1228 1.3 oster int allowBufferRecycle)
1229 1.1 oster {
1230 1.3 oster RF_DagNode_t *nodes, *wndNodes, *rodNodes, *xorNode, *wnpNode;
1231 1.3 oster RF_DagNode_t *wnqNode, *blockNode, *syncNode, *termNode;
1232 1.3 oster int nWndNodes, nRodNodes, i, nodeNum, asmNum;
1233 1.3 oster RF_AccessStripeMapHeader_t *new_asm_h[2];
1234 1.3 oster RF_StripeNum_t parityStripeID;
1235 1.3 oster char *sosBuffer, *eosBuffer;
1236 1.3 oster RF_ReconUnitNum_t which_ru;
1237 1.3 oster RF_RaidLayout_t *layoutPtr;
1238 1.3 oster RF_PhysDiskAddr_t *pda;
1239 1.3 oster
1240 1.3 oster layoutPtr = &(raidPtr->Layout);
1241 1.3 oster parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru);
1242 1.3 oster
1243 1.3 oster if (rf_dagDebug)
1244 1.3 oster printf("[Creating large-write DAG]\n");
1245 1.3 oster dag_h->creator = "LargeWriteDAGFwd";
1246 1.3 oster
1247 1.3 oster dag_h->numCommitNodes = 0;
1248 1.3 oster dag_h->numCommits = 0;
1249 1.3 oster dag_h->numSuccedents = 1;
1250 1.3 oster
1251 1.3 oster /* alloc the nodes: Wnd, xor, commit, block, term, and Wnp */
1252 1.3 oster nWndNodes = asmap->numStripeUnitsAccessed;
1253 1.3 oster RF_CallocAndAdd(nodes, nWndNodes + 4 + nfaults, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
1254 1.3 oster i = 0;
1255 1.3 oster wndNodes = &nodes[i];
1256 1.3 oster i += nWndNodes;
1257 1.3 oster xorNode = &nodes[i];
1258 1.3 oster i += 1;
1259 1.3 oster wnpNode = &nodes[i];
1260 1.3 oster i += 1;
1261 1.3 oster blockNode = &nodes[i];
1262 1.3 oster i += 1;
1263 1.3 oster syncNode = &nodes[i];
1264 1.3 oster i += 1;
1265 1.3 oster termNode = &nodes[i];
1266 1.3 oster i += 1;
1267 1.3 oster if (nfaults == 2) {
1268 1.3 oster wnqNode = &nodes[i];
1269 1.3 oster i += 1;
1270 1.3 oster } else {
1271 1.3 oster wnqNode = NULL;
1272 1.3 oster }
1273 1.3 oster rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h, new_asm_h, &nRodNodes, &sosBuffer, &eosBuffer, allocList);
1274 1.3 oster if (nRodNodes > 0) {
1275 1.3 oster RF_CallocAndAdd(rodNodes, nRodNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
1276 1.3 oster } else {
1277 1.3 oster rodNodes = NULL;
1278 1.3 oster }
1279 1.3 oster
1280 1.3 oster /* begin node initialization */
1281 1.3 oster if (nRodNodes > 0) {
1282 1.3 oster rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nRodNodes, 0, 0, 0, dag_h, "Nil", allocList);
1283 1.3 oster rf_InitNode(syncNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nWndNodes + 1, nRodNodes, 0, 0, dag_h, "Nil", allocList);
1284 1.3 oster } else {
1285 1.3 oster rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, 0, 0, 0, dag_h, "Nil", allocList);
1286 1.3 oster rf_InitNode(syncNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nWndNodes + 1, 1, 0, 0, dag_h, "Nil", allocList);
1287 1.3 oster }
1288 1.3 oster
1289 1.3 oster rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, nWndNodes + nfaults, 0, 0, dag_h, "Trm", allocList);
1290 1.3 oster
1291 1.3 oster /* initialize the Rod nodes */
1292 1.3 oster for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) {
1293 1.3 oster if (new_asm_h[asmNum]) {
1294 1.3 oster pda = new_asm_h[asmNum]->stripeMap->physInfo;
1295 1.3 oster while (pda) {
1296 1.3 oster rf_InitNode(&rodNodes[nodeNum], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rod", allocList);
1297 1.3 oster rodNodes[nodeNum].params[0].p = pda;
1298 1.3 oster rodNodes[nodeNum].params[1].p = pda->bufPtr;
1299 1.3 oster rodNodes[nodeNum].params[2].v = parityStripeID;
1300 1.3 oster rodNodes[nodeNum].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
1301 1.3 oster nodeNum++;
1302 1.3 oster pda = pda->next;
1303 1.3 oster }
1304 1.3 oster }
1305 1.3 oster }
1306 1.3 oster RF_ASSERT(nodeNum == nRodNodes);
1307 1.3 oster
1308 1.3 oster /* initialize the wnd nodes */
1309 1.3 oster pda = asmap->physInfo;
1310 1.3 oster for (i = 0; i < nWndNodes; i++) {
1311 1.3 oster rf_InitNode(&wndNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
1312 1.3 oster RF_ASSERT(pda != NULL);
1313 1.3 oster wndNodes[i].params[0].p = pda;
1314 1.3 oster wndNodes[i].params[1].p = pda->bufPtr;
1315 1.3 oster wndNodes[i].params[2].v = parityStripeID;
1316 1.3 oster wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
1317 1.3 oster pda = pda->next;
1318 1.3 oster }
1319 1.3 oster
1320 1.3 oster /* initialize the redundancy node */
1321 1.3 oster rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1, nfaults, 2 * (nWndNodes + nRodNodes) + 1, nfaults, dag_h, "Xr ", allocList);
1322 1.3 oster xorNode->flags |= RF_DAGNODE_FLAG_YIELD;
1323 1.3 oster for (i = 0; i < nWndNodes; i++) {
1324 1.3 oster xorNode->params[2 * i + 0] = wndNodes[i].params[0]; /* pda */
1325 1.3 oster xorNode->params[2 * i + 1] = wndNodes[i].params[1]; /* buf ptr */
1326 1.3 oster }
1327 1.3 oster for (i = 0; i < nRodNodes; i++) {
1328 1.3 oster xorNode->params[2 * (nWndNodes + i) + 0] = rodNodes[i].params[0]; /* pda */
1329 1.3 oster xorNode->params[2 * (nWndNodes + i) + 1] = rodNodes[i].params[1]; /* buf ptr */
1330 1.3 oster }
1331 1.3 oster xorNode->params[2 * (nWndNodes + nRodNodes)].p = raidPtr; /* xor node needs to get
1332 1.3 oster * at RAID information */
1333 1.3 oster
1334 1.3 oster /* look for an Rod node that reads a complete SU. If none, alloc a
1335 1.3 oster * buffer to receive the parity info. Note that we can't use a new
1336 1.3 oster * data buffer because it will not have gotten written when the xor
1337 1.3 oster * occurs. */
1338 1.3 oster if (allowBufferRecycle) {
1339 1.3 oster for (i = 0; i < nRodNodes; i++)
1340 1.3 oster if (((RF_PhysDiskAddr_t *) rodNodes[i].params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit)
1341 1.3 oster break;
1342 1.3 oster }
1343 1.3 oster if ((!allowBufferRecycle) || (i == nRodNodes)) {
1344 1.3 oster RF_CallocAndAdd(xorNode->results[0], 1, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), (void *), allocList);
1345 1.3 oster } else
1346 1.3 oster xorNode->results[0] = rodNodes[i].params[1].p;
1347 1.3 oster
1348 1.3 oster /* initialize the Wnp node */
1349 1.3 oster rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnp", allocList);
1350 1.3 oster wnpNode->params[0].p = asmap->parityInfo;
1351 1.3 oster wnpNode->params[1].p = xorNode->results[0];
1352 1.3 oster wnpNode->params[2].v = parityStripeID;
1353 1.3 oster wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
1354 1.3 oster RF_ASSERT(asmap->parityInfo->next == NULL); /* parityInfo must
1355 1.3 oster * describe entire
1356 1.3 oster * parity unit */
1357 1.3 oster
1358 1.3 oster if (nfaults == 2) {
1359 1.3 oster /* we never try to recycle a buffer for the Q calculation in
1360 1.3 oster * addition to the parity. This would cause two buffers to get
1361 1.3 oster * smashed during the P and Q calculation, guaranteeing one
1362 1.3 oster * would be wrong. */
1363 1.3 oster RF_CallocAndAdd(xorNode->results[1], 1, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), (void *), allocList);
1364 1.3 oster rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnq", allocList);
1365 1.3 oster wnqNode->params[0].p = asmap->qInfo;
1366 1.3 oster wnqNode->params[1].p = xorNode->results[1];
1367 1.3 oster wnqNode->params[2].v = parityStripeID;
1368 1.3 oster wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
1369 1.3 oster RF_ASSERT(asmap->parityInfo->next == NULL); /* parityInfo must
1370 1.3 oster * describe entire
1371 1.3 oster * parity unit */
1372 1.3 oster }
1373 1.3 oster /* connect nodes to form graph */
1374 1.3 oster
1375 1.3 oster /* connect dag header to block node */
1376 1.3 oster RF_ASSERT(blockNode->numAntecedents == 0);
1377 1.3 oster dag_h->succedents[0] = blockNode;
1378 1.3 oster
1379 1.3 oster if (nRodNodes > 0) {
1380 1.3 oster /* connect the block node to the Rod nodes */
1381 1.3 oster RF_ASSERT(blockNode->numSuccedents == nRodNodes);
1382 1.3 oster RF_ASSERT(syncNode->numAntecedents == nRodNodes);
1383 1.3 oster for (i = 0; i < nRodNodes; i++) {
1384 1.3 oster RF_ASSERT(rodNodes[i].numAntecedents == 1);
1385 1.3 oster blockNode->succedents[i] = &rodNodes[i];
1386 1.3 oster rodNodes[i].antecedents[0] = blockNode;
1387 1.3 oster rodNodes[i].antType[0] = rf_control;
1388 1.3 oster
1389 1.3 oster /* connect the Rod nodes to the Nil node */
1390 1.3 oster RF_ASSERT(rodNodes[i].numSuccedents == 1);
1391 1.3 oster rodNodes[i].succedents[0] = syncNode;
1392 1.3 oster syncNode->antecedents[i] = &rodNodes[i];
1393 1.3 oster syncNode->antType[i] = rf_trueData;
1394 1.3 oster }
1395 1.3 oster } else {
1396 1.3 oster /* connect the block node to the Nil node */
1397 1.3 oster RF_ASSERT(blockNode->numSuccedents == 1);
1398 1.3 oster RF_ASSERT(syncNode->numAntecedents == 1);
1399 1.3 oster blockNode->succedents[0] = syncNode;
1400 1.3 oster syncNode->antecedents[0] = blockNode;
1401 1.3 oster syncNode->antType[0] = rf_control;
1402 1.3 oster }
1403 1.3 oster
1404 1.3 oster /* connect the sync node to the Wnd nodes */
1405 1.3 oster RF_ASSERT(syncNode->numSuccedents == (1 + nWndNodes));
1406 1.3 oster for (i = 0; i < nWndNodes; i++) {
1407 1.3 oster RF_ASSERT(wndNodes->numAntecedents == 1);
1408 1.3 oster syncNode->succedents[i] = &wndNodes[i];
1409 1.3 oster wndNodes[i].antecedents[0] = syncNode;
1410 1.3 oster wndNodes[i].antType[0] = rf_control;
1411 1.3 oster }
1412 1.3 oster
1413 1.3 oster /* connect the sync node to the Xor node */
1414 1.3 oster RF_ASSERT(xorNode->numAntecedents == 1);
1415 1.3 oster syncNode->succedents[nWndNodes] = xorNode;
1416 1.3 oster xorNode->antecedents[0] = syncNode;
1417 1.3 oster xorNode->antType[0] = rf_control;
1418 1.3 oster
1419 1.3 oster /* connect the xor node to the write parity node */
1420 1.3 oster RF_ASSERT(xorNode->numSuccedents == nfaults);
1421 1.3 oster RF_ASSERT(wnpNode->numAntecedents == 1);
1422 1.3 oster xorNode->succedents[0] = wnpNode;
1423 1.3 oster wnpNode->antecedents[0] = xorNode;
1424 1.3 oster wnpNode->antType[0] = rf_trueData;
1425 1.3 oster if (nfaults == 2) {
1426 1.3 oster RF_ASSERT(wnqNode->numAntecedents == 1);
1427 1.3 oster xorNode->succedents[1] = wnqNode;
1428 1.3 oster wnqNode->antecedents[0] = xorNode;
1429 1.3 oster wnqNode->antType[0] = rf_trueData;
1430 1.3 oster }
1431 1.3 oster /* connect the write nodes to the term node */
1432 1.3 oster RF_ASSERT(termNode->numAntecedents == nWndNodes + nfaults);
1433 1.3 oster RF_ASSERT(termNode->numSuccedents == 0);
1434 1.3 oster for (i = 0; i < nWndNodes; i++) {
1435 1.3 oster RF_ASSERT(wndNodes->numSuccedents == 1);
1436 1.3 oster wndNodes[i].succedents[0] = termNode;
1437 1.3 oster termNode->antecedents[i] = &wndNodes[i];
1438 1.3 oster termNode->antType[i] = rf_control;
1439 1.3 oster }
1440 1.3 oster RF_ASSERT(wnpNode->numSuccedents == 1);
1441 1.3 oster wnpNode->succedents[0] = termNode;
1442 1.3 oster termNode->antecedents[nWndNodes] = wnpNode;
1443 1.3 oster termNode->antType[nWndNodes] = rf_control;
1444 1.3 oster if (nfaults == 2) {
1445 1.3 oster RF_ASSERT(wnqNode->numSuccedents == 1);
1446 1.3 oster wnqNode->succedents[0] = termNode;
1447 1.3 oster termNode->antecedents[nWndNodes + 1] = wnqNode;
1448 1.3 oster termNode->antType[nWndNodes + 1] = rf_control;
1449 1.3 oster }
1450 1.1 oster }
1451 1.1 oster
1452 1.1 oster
1453 1.1 oster /******************************************************************************
1454 1.1 oster *
1455 1.1 oster * creates a DAG to perform a small-write operation (either raid 5 or pq),
1456 1.1 oster * which is as follows:
1457 1.1 oster *
1458 1.1 oster * Hdr -> Nil -> Rop - Xor - Wnp [Unp] -- Trm
1459 1.1 oster * \- Rod X- Wnd [Und] -------/
1460 1.1 oster * [\- Rod X- Wnd [Und] ------/]
1461 1.1 oster * [\- Roq - Q --> Wnq [Unq]-/]
1462 1.1 oster *
1463 1.1 oster * Rop = read old parity
1464 1.1 oster * Rod = read old data
1465 1.1 oster * Roq = read old "q"
1466 1.1 oster * Cmt = commit node
1467 1.1 oster * Und = unlock data disk
1468 1.1 oster * Unp = unlock parity disk
1469 1.1 oster * Unq = unlock q disk
1470 1.1 oster * Wnp = write new parity
1471 1.1 oster * Wnd = write new data
1472 1.1 oster * Wnq = write new "q"
1473 1.1 oster * [ ] denotes optional segments in the graph
1474 1.1 oster *
1475 1.1 oster * Parameters: raidPtr - description of the physical array
1476 1.1 oster * asmap - logical & physical addresses for this access
1477 1.1 oster * bp - buffer ptr (holds write data)
1478 1.3 oster * flags - general flags (e.g. disk locking)
1479 1.1 oster * allocList - list of memory allocated in DAG creation
1480 1.1 oster * pfuncs - list of parity generating functions
1481 1.1 oster * qfuncs - list of q generating functions
1482 1.1 oster *
1483 1.1 oster * A null qfuncs indicates single fault tolerant
1484 1.1 oster *****************************************************************************/
1485 1.1 oster
1486 1.3 oster void
1487 1.3 oster rf_CommonCreateSmallWriteDAGFwd(
1488 1.3 oster RF_Raid_t * raidPtr,
1489 1.3 oster RF_AccessStripeMap_t * asmap,
1490 1.3 oster RF_DagHeader_t * dag_h,
1491 1.3 oster void *bp,
1492 1.3 oster RF_RaidAccessFlags_t flags,
1493 1.3 oster RF_AllocListElem_t * allocList,
1494 1.3 oster RF_RedFuncs_t * pfuncs,
1495 1.3 oster RF_RedFuncs_t * qfuncs)
1496 1.1 oster {
1497 1.3 oster RF_DagNode_t *readDataNodes, *readParityNodes, *readQNodes, *termNode;
1498 1.3 oster RF_DagNode_t *unlockDataNodes, *unlockParityNodes, *unlockQNodes;
1499 1.3 oster RF_DagNode_t *xorNodes, *qNodes, *blockNode, *nodes;
1500 1.3 oster RF_DagNode_t *writeDataNodes, *writeParityNodes, *writeQNodes;
1501 1.3 oster int i, j, nNodes, totalNumNodes, lu_flag;
1502 1.3 oster RF_ReconUnitNum_t which_ru;
1503 1.3 oster int (*func) (RF_DagNode_t *), (*undoFunc) (RF_DagNode_t *);
1504 1.3 oster int (*qfunc) (RF_DagNode_t *);
1505 1.3 oster int numDataNodes, numParityNodes;
1506 1.3 oster RF_StripeNum_t parityStripeID;
1507 1.3 oster RF_PhysDiskAddr_t *pda;
1508 1.3 oster char *name, *qname;
1509 1.3 oster long nfaults;
1510 1.3 oster
1511 1.3 oster nfaults = qfuncs ? 2 : 1;
1512 1.3 oster lu_flag = (rf_enableAtomicRMW) ? 1 : 0; /* lock/unlock flag */
1513 1.3 oster
1514 1.3 oster parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru);
1515 1.3 oster pda = asmap->physInfo;
1516 1.3 oster numDataNodes = asmap->numStripeUnitsAccessed;
1517 1.3 oster numParityNodes = (asmap->parityInfo->next) ? 2 : 1;
1518 1.3 oster
1519 1.3 oster if (rf_dagDebug)
1520 1.3 oster printf("[Creating small-write DAG]\n");
1521 1.3 oster RF_ASSERT(numDataNodes > 0);
1522 1.3 oster dag_h->creator = "SmallWriteDAGFwd";
1523 1.3 oster
1524 1.3 oster dag_h->numCommitNodes = 0;
1525 1.3 oster dag_h->numCommits = 0;
1526 1.3 oster dag_h->numSuccedents = 1;
1527 1.3 oster
1528 1.3 oster qfunc = NULL;
1529 1.3 oster qname = NULL;
1530 1.3 oster
1531 1.3 oster /* DAG creation occurs in four steps: 1. count the number of nodes in
1532 1.3 oster * the DAG 2. create the nodes 3. initialize the nodes 4. connect the
1533 1.3 oster * nodes */
1534 1.3 oster
1535 1.3 oster /* Step 1. compute number of nodes in the graph */
1536 1.3 oster
1537 1.3 oster /* number of nodes: a read and write for each data unit a redundancy
1538 1.3 oster * computation node for each parity node (nfaults * nparity) a read
1539 1.3 oster * and write for each parity unit a block node a terminate node if
1540 1.3 oster * atomic RMW an unlock node for each data unit, redundancy unit */
1541 1.3 oster totalNumNodes = (2 * numDataNodes) + (nfaults * numParityNodes) + (nfaults * 2 * numParityNodes) + 2;
1542 1.3 oster if (lu_flag)
1543 1.3 oster totalNumNodes += (numDataNodes + (nfaults * numParityNodes));
1544 1.3 oster
1545 1.3 oster
1546 1.3 oster /* Step 2. create the nodes */
1547 1.3 oster RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
1548 1.3 oster i = 0;
1549 1.3 oster blockNode = &nodes[i];
1550 1.3 oster i += 1;
1551 1.3 oster readDataNodes = &nodes[i];
1552 1.3 oster i += numDataNodes;
1553 1.3 oster readParityNodes = &nodes[i];
1554 1.3 oster i += numParityNodes;
1555 1.3 oster writeDataNodes = &nodes[i];
1556 1.3 oster i += numDataNodes;
1557 1.3 oster writeParityNodes = &nodes[i];
1558 1.3 oster i += numParityNodes;
1559 1.3 oster xorNodes = &nodes[i];
1560 1.3 oster i += numParityNodes;
1561 1.3 oster termNode = &nodes[i];
1562 1.3 oster i += 1;
1563 1.3 oster if (lu_flag) {
1564 1.3 oster unlockDataNodes = &nodes[i];
1565 1.3 oster i += numDataNodes;
1566 1.3 oster unlockParityNodes = &nodes[i];
1567 1.3 oster i += numParityNodes;
1568 1.3 oster } else {
1569 1.3 oster unlockDataNodes = unlockParityNodes = NULL;
1570 1.3 oster }
1571 1.3 oster if (nfaults == 2) {
1572 1.3 oster readQNodes = &nodes[i];
1573 1.3 oster i += numParityNodes;
1574 1.3 oster writeQNodes = &nodes[i];
1575 1.3 oster i += numParityNodes;
1576 1.3 oster qNodes = &nodes[i];
1577 1.3 oster i += numParityNodes;
1578 1.3 oster if (lu_flag) {
1579 1.3 oster unlockQNodes = &nodes[i];
1580 1.3 oster i += numParityNodes;
1581 1.3 oster } else {
1582 1.3 oster unlockQNodes = NULL;
1583 1.3 oster }
1584 1.3 oster } else {
1585 1.3 oster readQNodes = writeQNodes = qNodes = unlockQNodes = NULL;
1586 1.3 oster }
1587 1.3 oster RF_ASSERT(i == totalNumNodes);
1588 1.1 oster
1589 1.3 oster /* Step 3. initialize the nodes */
1590 1.3 oster /* initialize block node (Nil) */
1591 1.3 oster nNodes = numDataNodes + (nfaults * numParityNodes);
1592 1.3 oster rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0, dag_h, "Nil", allocList);
1593 1.3 oster
1594 1.3 oster /* initialize terminate node (Trm) */
1595 1.3 oster rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, nNodes, 0, 0, dag_h, "Trm", allocList);
1596 1.3 oster
1597 1.3 oster /* initialize nodes which read old data (Rod) */
1598 1.3 oster for (i = 0; i < numDataNodes; i++) {
1599 1.3 oster rf_InitNode(&readDataNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, (numParityNodes * nfaults) + 1, 1, 4, 0, dag_h, "Rod", allocList);
1600 1.3 oster RF_ASSERT(pda != NULL);
1601 1.3 oster readDataNodes[i].params[0].p = pda; /* physical disk addr
1602 1.3 oster * desc */
1603 1.3 oster readDataNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, allocList); /* buffer to hold old
1604 1.3 oster * data */
1605 1.3 oster readDataNodes[i].params[2].v = parityStripeID;
1606 1.3 oster readDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, lu_flag, 0, which_ru);
1607 1.3 oster pda = pda->next;
1608 1.3 oster for (j = 0; j < readDataNodes[i].numSuccedents; j++)
1609 1.3 oster readDataNodes[i].propList[j] = NULL;
1610 1.3 oster }
1611 1.3 oster
1612 1.3 oster /* initialize nodes which read old parity (Rop) */
1613 1.3 oster pda = asmap->parityInfo;
1614 1.3 oster i = 0;
1615 1.3 oster for (i = 0; i < numParityNodes; i++) {
1616 1.3 oster RF_ASSERT(pda != NULL);
1617 1.3 oster rf_InitNode(&readParityNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, numParityNodes, 1, 4, 0, dag_h, "Rop", allocList);
1618 1.3 oster readParityNodes[i].params[0].p = pda;
1619 1.3 oster readParityNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, allocList); /* buffer to hold old
1620 1.3 oster * parity */
1621 1.3 oster readParityNodes[i].params[2].v = parityStripeID;
1622 1.3 oster readParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, lu_flag, 0, which_ru);
1623 1.3 oster for (j = 0; j < readParityNodes[i].numSuccedents; j++)
1624 1.3 oster readParityNodes[i].propList[0] = NULL;
1625 1.3 oster pda = pda->next;
1626 1.3 oster }
1627 1.3 oster
1628 1.3 oster /* initialize nodes which read old Q (Roq) */
1629 1.3 oster if (nfaults == 2) {
1630 1.3 oster pda = asmap->qInfo;
1631 1.3 oster for (i = 0; i < numParityNodes; i++) {
1632 1.3 oster RF_ASSERT(pda != NULL);
1633 1.3 oster rf_InitNode(&readQNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, numParityNodes, 1, 4, 0, dag_h, "Roq", allocList);
1634 1.3 oster readQNodes[i].params[0].p = pda;
1635 1.3 oster readQNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, allocList); /* buffer to hold old Q */
1636 1.3 oster readQNodes[i].params[2].v = parityStripeID;
1637 1.3 oster readQNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, lu_flag, 0, which_ru);
1638 1.3 oster for (j = 0; j < readQNodes[i].numSuccedents; j++)
1639 1.3 oster readQNodes[i].propList[0] = NULL;
1640 1.3 oster pda = pda->next;
1641 1.3 oster }
1642 1.3 oster }
1643 1.3 oster /* initialize nodes which write new data (Wnd) */
1644 1.3 oster pda = asmap->physInfo;
1645 1.3 oster for (i = 0; i < numDataNodes; i++) {
1646 1.3 oster RF_ASSERT(pda != NULL);
1647 1.3 oster rf_InitNode(&writeDataNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
1648 1.3 oster writeDataNodes[i].params[0].p = pda; /* physical disk addr
1649 1.3 oster * desc */
1650 1.3 oster writeDataNodes[i].params[1].p = pda->bufPtr; /* buffer holding new
1651 1.3 oster * data to be written */
1652 1.3 oster writeDataNodes[i].params[2].v = parityStripeID;
1653 1.3 oster writeDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
1654 1.3 oster
1655 1.3 oster if (lu_flag) {
1656 1.3 oster /* initialize node to unlock the disk queue */
1657 1.3 oster rf_InitNode(&unlockDataNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc, rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Und", allocList);
1658 1.3 oster unlockDataNodes[i].params[0].p = pda; /* physical disk addr
1659 1.3 oster * desc */
1660 1.3 oster unlockDataNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, lu_flag, which_ru);
1661 1.3 oster }
1662 1.3 oster pda = pda->next;
1663 1.3 oster }
1664 1.3 oster
1665 1.3 oster
1666 1.3 oster /* initialize nodes which compute new parity and Q */
1667 1.3 oster /* we use the simple XOR func in the double-XOR case, and when we're
1668 1.3 oster * accessing only a portion of one stripe unit. the distinction
1669 1.3 oster * between the two is that the regular XOR func assumes that the
1670 1.3 oster * targbuf is a full SU in size, and examines the pda associated with
1671 1.3 oster * the buffer to decide where within the buffer to XOR the data,
1672 1.3 oster * whereas the simple XOR func just XORs the data into the start of
1673 1.3 oster * the buffer. */
1674 1.3 oster if ((numParityNodes == 2) || ((numDataNodes == 1) && (asmap->totalSectorsAccessed < raidPtr->Layout.sectorsPerStripeUnit))) {
1675 1.3 oster func = pfuncs->simple;
1676 1.3 oster undoFunc = rf_NullNodeUndoFunc;
1677 1.3 oster name = pfuncs->SimpleName;
1678 1.3 oster if (qfuncs) {
1679 1.3 oster qfunc = qfuncs->simple;
1680 1.3 oster qname = qfuncs->SimpleName;
1681 1.3 oster }
1682 1.3 oster } else {
1683 1.3 oster func = pfuncs->regular;
1684 1.3 oster undoFunc = rf_NullNodeUndoFunc;
1685 1.3 oster name = pfuncs->RegularName;
1686 1.3 oster if (qfuncs) {
1687 1.3 oster qfunc = qfuncs->regular;
1688 1.3 oster qname = qfuncs->RegularName;
1689 1.3 oster }
1690 1.3 oster }
1691 1.3 oster /* initialize the xor nodes: params are {pda,buf} from {Rod,Wnd,Rop}
1692 1.3 oster * nodes, and raidPtr */
1693 1.3 oster if (numParityNodes == 2) { /* double-xor case */
1694 1.3 oster for (i = 0; i < numParityNodes; i++) {
1695 1.3 oster rf_InitNode(&xorNodes[i], rf_wait, RF_FALSE, func, undoFunc, NULL, numParityNodes, numParityNodes + numDataNodes, 7, 1, dag_h, name, allocList); /* no wakeup func for
1696 1.3 oster * xor */
1697 1.3 oster xorNodes[i].flags |= RF_DAGNODE_FLAG_YIELD;
1698 1.3 oster xorNodes[i].params[0] = readDataNodes[i].params[0];
1699 1.3 oster xorNodes[i].params[1] = readDataNodes[i].params[1];
1700 1.3 oster xorNodes[i].params[2] = readParityNodes[i].params[0];
1701 1.3 oster xorNodes[i].params[3] = readParityNodes[i].params[1];
1702 1.3 oster xorNodes[i].params[4] = writeDataNodes[i].params[0];
1703 1.3 oster xorNodes[i].params[5] = writeDataNodes[i].params[1];
1704 1.3 oster xorNodes[i].params[6].p = raidPtr;
1705 1.3 oster xorNodes[i].results[0] = readParityNodes[i].params[1].p; /* use old parity buf as
1706 1.3 oster * target buf */
1707 1.3 oster if (nfaults == 2) {
1708 1.3 oster rf_InitNode(&qNodes[i], rf_wait, RF_FALSE, qfunc, undoFunc, NULL, numParityNodes, numParityNodes + numDataNodes, 7, 1, dag_h, qname, allocList); /* no wakeup func for
1709 1.3 oster * xor */
1710 1.3 oster qNodes[i].params[0] = readDataNodes[i].params[0];
1711 1.3 oster qNodes[i].params[1] = readDataNodes[i].params[1];
1712 1.3 oster qNodes[i].params[2] = readQNodes[i].params[0];
1713 1.3 oster qNodes[i].params[3] = readQNodes[i].params[1];
1714 1.3 oster qNodes[i].params[4] = writeDataNodes[i].params[0];
1715 1.3 oster qNodes[i].params[5] = writeDataNodes[i].params[1];
1716 1.3 oster qNodes[i].params[6].p = raidPtr;
1717 1.3 oster qNodes[i].results[0] = readQNodes[i].params[1].p; /* use old Q buf as
1718 1.3 oster * target buf */
1719 1.3 oster }
1720 1.3 oster }
1721 1.3 oster } else {
1722 1.3 oster /* there is only one xor node in this case */
1723 1.3 oster rf_InitNode(&xorNodes[0], rf_wait, RF_FALSE, func, undoFunc, NULL, numParityNodes, numParityNodes + numDataNodes, (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h, name, allocList);
1724 1.3 oster xorNodes[0].flags |= RF_DAGNODE_FLAG_YIELD;
1725 1.3 oster for (i = 0; i < numDataNodes + 1; i++) {
1726 1.3 oster /* set up params related to Rod and Rop nodes */
1727 1.3 oster xorNodes[0].params[2 * i + 0] = readDataNodes[i].params[0]; /* pda */
1728 1.3 oster xorNodes[0].params[2 * i + 1] = readDataNodes[i].params[1]; /* buffer pointer */
1729 1.3 oster }
1730 1.3 oster for (i = 0; i < numDataNodes; i++) {
1731 1.3 oster /* set up params related to Wnd and Wnp nodes */
1732 1.3 oster xorNodes[0].params[2 * (numDataNodes + 1 + i) + 0] = writeDataNodes[i].params[0]; /* pda */
1733 1.3 oster xorNodes[0].params[2 * (numDataNodes + 1 + i) + 1] = writeDataNodes[i].params[1]; /* buffer pointer */
1734 1.3 oster }
1735 1.3 oster xorNodes[0].params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr; /* xor node needs to get
1736 1.3 oster * at RAID information */
1737 1.3 oster xorNodes[0].results[0] = readParityNodes[0].params[1].p;
1738 1.3 oster if (nfaults == 2) {
1739 1.3 oster rf_InitNode(&qNodes[0], rf_wait, RF_FALSE, qfunc, undoFunc, NULL, numParityNodes, numParityNodes + numDataNodes, (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h, qname, allocList);
1740 1.3 oster for (i = 0; i < numDataNodes; i++) {
1741 1.3 oster /* set up params related to Rod */
1742 1.3 oster qNodes[0].params[2 * i + 0] = readDataNodes[i].params[0]; /* pda */
1743 1.3 oster qNodes[0].params[2 * i + 1] = readDataNodes[i].params[1]; /* buffer pointer */
1744 1.3 oster }
1745 1.3 oster /* and read old q */
1746 1.3 oster qNodes[0].params[2 * numDataNodes + 0] = readQNodes[0].params[0]; /* pda */
1747 1.3 oster qNodes[0].params[2 * numDataNodes + 1] = readQNodes[0].params[1]; /* buffer pointer */
1748 1.3 oster for (i = 0; i < numDataNodes; i++) {
1749 1.3 oster /* set up params related to Wnd nodes */
1750 1.3 oster qNodes[0].params[2 * (numDataNodes + 1 + i) + 0] = writeDataNodes[i].params[0]; /* pda */
1751 1.3 oster qNodes[0].params[2 * (numDataNodes + 1 + i) + 1] = writeDataNodes[i].params[1]; /* buffer pointer */
1752 1.3 oster }
1753 1.3 oster qNodes[0].params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr; /* xor node needs to get
1754 1.3 oster * at RAID information */
1755 1.3 oster qNodes[0].results[0] = readQNodes[0].params[1].p;
1756 1.3 oster }
1757 1.3 oster }
1758 1.3 oster
1759 1.3 oster /* initialize nodes which write new parity (Wnp) */
1760 1.3 oster pda = asmap->parityInfo;
1761 1.3 oster for (i = 0; i < numParityNodes; i++) {
1762 1.3 oster rf_InitNode(&writeParityNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, numParityNodes, 4, 0, dag_h, "Wnp", allocList);
1763 1.3 oster RF_ASSERT(pda != NULL);
1764 1.3 oster writeParityNodes[i].params[0].p = pda; /* param 1 (bufPtr)
1765 1.3 oster * filled in by xor node */
1766 1.3 oster writeParityNodes[i].params[1].p = xorNodes[i].results[0]; /* buffer pointer for
1767 1.3 oster * parity write
1768 1.3 oster * operation */
1769 1.3 oster writeParityNodes[i].params[2].v = parityStripeID;
1770 1.3 oster writeParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
1771 1.3 oster
1772 1.3 oster if (lu_flag) {
1773 1.3 oster /* initialize node to unlock the disk queue */
1774 1.3 oster rf_InitNode(&unlockParityNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc, rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Unp", allocList);
1775 1.3 oster unlockParityNodes[i].params[0].p = pda; /* physical disk addr
1776 1.3 oster * desc */
1777 1.3 oster unlockParityNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, lu_flag, which_ru);
1778 1.3 oster }
1779 1.3 oster pda = pda->next;
1780 1.3 oster }
1781 1.3 oster
1782 1.3 oster /* initialize nodes which write new Q (Wnq) */
1783 1.3 oster if (nfaults == 2) {
1784 1.3 oster pda = asmap->qInfo;
1785 1.3 oster for (i = 0; i < numParityNodes; i++) {
1786 1.3 oster rf_InitNode(&writeQNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, numParityNodes, 4, 0, dag_h, "Wnq", allocList);
1787 1.3 oster RF_ASSERT(pda != NULL);
1788 1.3 oster writeQNodes[i].params[0].p = pda; /* param 1 (bufPtr)
1789 1.3 oster * filled in by xor node */
1790 1.3 oster writeQNodes[i].params[1].p = qNodes[i].results[0]; /* buffer pointer for
1791 1.3 oster * parity write
1792 1.3 oster * operation */
1793 1.3 oster writeQNodes[i].params[2].v = parityStripeID;
1794 1.3 oster writeQNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
1795 1.3 oster
1796 1.3 oster if (lu_flag) {
1797 1.3 oster /* initialize node to unlock the disk queue */
1798 1.3 oster rf_InitNode(&unlockQNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc, rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Unq", allocList);
1799 1.3 oster unlockQNodes[i].params[0].p = pda; /* physical disk addr
1800 1.3 oster * desc */
1801 1.3 oster unlockQNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, lu_flag, which_ru);
1802 1.3 oster }
1803 1.3 oster pda = pda->next;
1804 1.3 oster }
1805 1.3 oster }
1806 1.3 oster /* Step 4. connect the nodes */
1807 1.3 oster
1808 1.3 oster /* connect header to block node */
1809 1.3 oster dag_h->succedents[0] = blockNode;
1810 1.3 oster
1811 1.3 oster /* connect block node to read old data nodes */
1812 1.3 oster RF_ASSERT(blockNode->numSuccedents == (numDataNodes + (numParityNodes * nfaults)));
1813 1.3 oster for (i = 0; i < numDataNodes; i++) {
1814 1.3 oster blockNode->succedents[i] = &readDataNodes[i];
1815 1.3 oster RF_ASSERT(readDataNodes[i].numAntecedents == 1);
1816 1.3 oster readDataNodes[i].antecedents[0] = blockNode;
1817 1.3 oster readDataNodes[i].antType[0] = rf_control;
1818 1.3 oster }
1819 1.3 oster
1820 1.3 oster /* connect block node to read old parity nodes */
1821 1.3 oster for (i = 0; i < numParityNodes; i++) {
1822 1.3 oster blockNode->succedents[numDataNodes + i] = &readParityNodes[i];
1823 1.3 oster RF_ASSERT(readParityNodes[i].numAntecedents == 1);
1824 1.3 oster readParityNodes[i].antecedents[0] = blockNode;
1825 1.3 oster readParityNodes[i].antType[0] = rf_control;
1826 1.3 oster }
1827 1.3 oster
1828 1.3 oster /* connect block node to read old Q nodes */
1829 1.3 oster if (nfaults == 2)
1830 1.3 oster for (i = 0; i < numParityNodes; i++) {
1831 1.3 oster blockNode->succedents[numDataNodes + numParityNodes + i] = &readQNodes[i];
1832 1.3 oster RF_ASSERT(readQNodes[i].numAntecedents == 1);
1833 1.3 oster readQNodes[i].antecedents[0] = blockNode;
1834 1.3 oster readQNodes[i].antType[0] = rf_control;
1835 1.3 oster }
1836 1.3 oster
1837 1.3 oster /* connect read old data nodes to write new data nodes */
1838 1.3 oster for (i = 0; i < numDataNodes; i++) {
1839 1.3 oster RF_ASSERT(readDataNodes[i].numSuccedents == ((nfaults * numParityNodes) + 1));
1840 1.3 oster RF_ASSERT(writeDataNodes[i].numAntecedents == 1);
1841 1.3 oster readDataNodes[i].succedents[0] = &writeDataNodes[i];
1842 1.3 oster writeDataNodes[i].antecedents[0] = &readDataNodes[i];
1843 1.3 oster writeDataNodes[i].antType[0] = rf_antiData;
1844 1.3 oster }
1845 1.3 oster
1846 1.3 oster /* connect read old data nodes to xor nodes */
1847 1.3 oster for (i = 0; i < numDataNodes; i++) {
1848 1.3 oster for (j = 0; j < numParityNodes; j++) {
1849 1.3 oster RF_ASSERT(xorNodes[j].numAntecedents == numDataNodes + numParityNodes);
1850 1.3 oster readDataNodes[i].succedents[1 + j] = &xorNodes[j];
1851 1.3 oster xorNodes[j].antecedents[i] = &readDataNodes[i];
1852 1.3 oster xorNodes[j].antType[i] = rf_trueData;
1853 1.3 oster }
1854 1.3 oster }
1855 1.3 oster
1856 1.3 oster /* connect read old data nodes to q nodes */
1857 1.3 oster if (nfaults == 2)
1858 1.3 oster for (i = 0; i < numDataNodes; i++)
1859 1.3 oster for (j = 0; j < numParityNodes; j++) {
1860 1.3 oster RF_ASSERT(qNodes[j].numAntecedents == numDataNodes + numParityNodes);
1861 1.3 oster readDataNodes[i].succedents[1 + numParityNodes + j] = &qNodes[j];
1862 1.3 oster qNodes[j].antecedents[i] = &readDataNodes[i];
1863 1.3 oster qNodes[j].antType[i] = rf_trueData;
1864 1.3 oster }
1865 1.3 oster
1866 1.3 oster /* connect read old parity nodes to xor nodes */
1867 1.3 oster for (i = 0; i < numParityNodes; i++) {
1868 1.3 oster for (j = 0; j < numParityNodes; j++) {
1869 1.3 oster RF_ASSERT(readParityNodes[i].numSuccedents == numParityNodes);
1870 1.3 oster readParityNodes[i].succedents[j] = &xorNodes[j];
1871 1.3 oster xorNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
1872 1.3 oster xorNodes[j].antType[numDataNodes + i] = rf_trueData;
1873 1.3 oster }
1874 1.3 oster }
1875 1.3 oster
1876 1.3 oster /* connect read old q nodes to q nodes */
1877 1.3 oster if (nfaults == 2)
1878 1.3 oster for (i = 0; i < numParityNodes; i++) {
1879 1.3 oster for (j = 0; j < numParityNodes; j++) {
1880 1.3 oster RF_ASSERT(readQNodes[i].numSuccedents == numParityNodes);
1881 1.3 oster readQNodes[i].succedents[j] = &qNodes[j];
1882 1.3 oster qNodes[j].antecedents[numDataNodes + i] = &readQNodes[i];
1883 1.3 oster qNodes[j].antType[numDataNodes + i] = rf_trueData;
1884 1.3 oster }
1885 1.3 oster }
1886 1.3 oster
1887 1.3 oster /* connect xor nodes to the write new parity nodes */
1888 1.3 oster for (i = 0; i < numParityNodes; i++) {
1889 1.3 oster RF_ASSERT(writeParityNodes[i].numAntecedents == numParityNodes);
1890 1.3 oster for (j = 0; j < numParityNodes; j++) {
1891 1.3 oster RF_ASSERT(xorNodes[j].numSuccedents == numParityNodes);
1892 1.3 oster xorNodes[i].succedents[j] = &writeParityNodes[j];
1893 1.3 oster writeParityNodes[j].antecedents[i] = &xorNodes[i];
1894 1.3 oster writeParityNodes[j].antType[i] = rf_trueData;
1895 1.3 oster }
1896 1.3 oster }
1897 1.3 oster
1898 1.3 oster /* connect q nodes to the write new q nodes */
1899 1.3 oster if (nfaults == 2)
1900 1.3 oster for (i = 0; i < numParityNodes; i++) {
1901 1.3 oster RF_ASSERT(writeQNodes[i].numAntecedents == numParityNodes);
1902 1.3 oster for (j = 0; j < numParityNodes; j++) {
1903 1.3 oster RF_ASSERT(qNodes[j].numSuccedents == 1);
1904 1.3 oster qNodes[i].succedents[j] = &writeQNodes[j];
1905 1.3 oster writeQNodes[j].antecedents[i] = &qNodes[i];
1906 1.3 oster writeQNodes[j].antType[i] = rf_trueData;
1907 1.3 oster }
1908 1.3 oster }
1909 1.3 oster
1910 1.3 oster RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
1911 1.3 oster RF_ASSERT(termNode->numSuccedents == 0);
1912 1.3 oster for (i = 0; i < numDataNodes; i++) {
1913 1.3 oster if (lu_flag) {
1914 1.3 oster /* connect write new data nodes to unlock nodes */
1915 1.3 oster RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
1916 1.3 oster RF_ASSERT(unlockDataNodes[i].numAntecedents == 1);
1917 1.3 oster writeDataNodes[i].succedents[0] = &unlockDataNodes[i];
1918 1.3 oster unlockDataNodes[i].antecedents[0] = &writeDataNodes[i];
1919 1.3 oster unlockDataNodes[i].antType[0] = rf_control;
1920 1.3 oster
1921 1.3 oster /* connect unlock nodes to term node */
1922 1.3 oster RF_ASSERT(unlockDataNodes[i].numSuccedents == 1);
1923 1.3 oster unlockDataNodes[i].succedents[0] = termNode;
1924 1.3 oster termNode->antecedents[i] = &unlockDataNodes[i];
1925 1.3 oster termNode->antType[i] = rf_control;
1926 1.3 oster } else {
1927 1.3 oster /* connect write new data nodes to term node */
1928 1.3 oster RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
1929 1.3 oster RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
1930 1.3 oster writeDataNodes[i].succedents[0] = termNode;
1931 1.3 oster termNode->antecedents[i] = &writeDataNodes[i];
1932 1.3 oster termNode->antType[i] = rf_control;
1933 1.3 oster }
1934 1.3 oster }
1935 1.3 oster
1936 1.3 oster for (i = 0; i < numParityNodes; i++) {
1937 1.3 oster if (lu_flag) {
1938 1.3 oster /* connect write new parity nodes to unlock nodes */
1939 1.3 oster RF_ASSERT(writeParityNodes[i].numSuccedents == 1);
1940 1.3 oster RF_ASSERT(unlockParityNodes[i].numAntecedents == 1);
1941 1.3 oster writeParityNodes[i].succedents[0] = &unlockParityNodes[i];
1942 1.3 oster unlockParityNodes[i].antecedents[0] = &writeParityNodes[i];
1943 1.3 oster unlockParityNodes[i].antType[0] = rf_control;
1944 1.3 oster
1945 1.3 oster /* connect unlock nodes to term node */
1946 1.3 oster RF_ASSERT(unlockParityNodes[i].numSuccedents == 1);
1947 1.3 oster unlockParityNodes[i].succedents[0] = termNode;
1948 1.3 oster termNode->antecedents[numDataNodes + i] = &unlockParityNodes[i];
1949 1.3 oster termNode->antType[numDataNodes + i] = rf_control;
1950 1.3 oster } else {
1951 1.3 oster RF_ASSERT(writeParityNodes[i].numSuccedents == 1);
1952 1.3 oster writeParityNodes[i].succedents[0] = termNode;
1953 1.3 oster termNode->antecedents[numDataNodes + i] = &writeParityNodes[i];
1954 1.3 oster termNode->antType[numDataNodes + i] = rf_control;
1955 1.3 oster }
1956 1.3 oster }
1957 1.3 oster
1958 1.3 oster if (nfaults == 2)
1959 1.3 oster for (i = 0; i < numParityNodes; i++) {
1960 1.3 oster if (lu_flag) {
1961 1.3 oster /* connect write new Q nodes to unlock nodes */
1962 1.3 oster RF_ASSERT(writeQNodes[i].numSuccedents == 1);
1963 1.3 oster RF_ASSERT(unlockQNodes[i].numAntecedents == 1);
1964 1.3 oster writeQNodes[i].succedents[0] = &unlockQNodes[i];
1965 1.3 oster unlockQNodes[i].antecedents[0] = &writeQNodes[i];
1966 1.3 oster unlockQNodes[i].antType[0] = rf_control;
1967 1.3 oster
1968 1.3 oster /* connect unlock nodes to unblock node */
1969 1.3 oster RF_ASSERT(unlockQNodes[i].numSuccedents == 1);
1970 1.3 oster unlockQNodes[i].succedents[0] = termNode;
1971 1.3 oster termNode->antecedents[numDataNodes + numParityNodes + i] = &unlockQNodes[i];
1972 1.3 oster termNode->antType[numDataNodes + numParityNodes + i] = rf_control;
1973 1.3 oster } else {
1974 1.3 oster RF_ASSERT(writeQNodes[i].numSuccedents == 1);
1975 1.3 oster writeQNodes[i].succedents[0] = termNode;
1976 1.3 oster termNode->antecedents[numDataNodes + numParityNodes + i] = &writeQNodes[i];
1977 1.3 oster termNode->antType[numDataNodes + numParityNodes + i] = rf_control;
1978 1.3 oster }
1979 1.3 oster }
1980 1.1 oster }
1981 1.1 oster
1982 1.1 oster
1983 1.1 oster
1984 1.1 oster /******************************************************************************
1985 1.1 oster * create a write graph (fault-free or degraded) for RAID level 1
1986 1.1 oster *
1987 1.1 oster * Hdr Nil -> Wpd -> Nil -> Trm
1988 1.1 oster * Nil -> Wsd ->
1989 1.1 oster *
1990 1.1 oster * The "Wpd" node writes data to the primary copy in the mirror pair
1991 1.1 oster * The "Wsd" node writes data to the secondary copy in the mirror pair
1992 1.1 oster *
1993 1.1 oster * Parameters: raidPtr - description of the physical array
1994 1.1 oster * asmap - logical & physical addresses for this access
1995 1.1 oster * bp - buffer ptr (holds write data)
1996 1.3 oster * flags - general flags (e.g. disk locking)
1997 1.1 oster * allocList - list of memory allocated in DAG creation
1998 1.1 oster *****************************************************************************/
1999 1.1 oster
2000 1.3 oster void
2001 1.3 oster rf_CreateRaidOneWriteDAGFwd(
2002 1.3 oster RF_Raid_t * raidPtr,
2003 1.3 oster RF_AccessStripeMap_t * asmap,
2004 1.3 oster RF_DagHeader_t * dag_h,
2005 1.3 oster void *bp,
2006 1.3 oster RF_RaidAccessFlags_t flags,
2007 1.3 oster RF_AllocListElem_t * allocList)
2008 1.1 oster {
2009 1.3 oster RF_DagNode_t *blockNode, *unblockNode, *termNode;
2010 1.3 oster RF_DagNode_t *nodes, *wndNode, *wmirNode;
2011 1.3 oster int nWndNodes, nWmirNodes, i;
2012 1.3 oster RF_ReconUnitNum_t which_ru;
2013 1.3 oster RF_PhysDiskAddr_t *pda, *pdaP;
2014 1.3 oster RF_StripeNum_t parityStripeID;
2015 1.3 oster
2016 1.3 oster parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
2017 1.3 oster asmap->raidAddress, &which_ru);
2018 1.3 oster if (rf_dagDebug) {
2019 1.3 oster printf("[Creating RAID level 1 write DAG]\n");
2020 1.3 oster }
2021 1.3 oster nWmirNodes = (asmap->parityInfo->next) ? 2 : 1; /* 2 implies access not
2022 1.3 oster * SU aligned */
2023 1.3 oster nWndNodes = (asmap->physInfo->next) ? 2 : 1;
2024 1.3 oster
2025 1.3 oster /* alloc the Wnd nodes and the Wmir node */
2026 1.3 oster if (asmap->numDataFailed == 1)
2027 1.3 oster nWndNodes--;
2028 1.3 oster if (asmap->numParityFailed == 1)
2029 1.3 oster nWmirNodes--;
2030 1.3 oster
2031 1.3 oster /* total number of nodes = nWndNodes + nWmirNodes + (block + unblock +
2032 1.3 oster * terminator) */
2033 1.3 oster RF_CallocAndAdd(nodes, nWndNodes + nWmirNodes + 3, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
2034 1.3 oster i = 0;
2035 1.3 oster wndNode = &nodes[i];
2036 1.3 oster i += nWndNodes;
2037 1.3 oster wmirNode = &nodes[i];
2038 1.3 oster i += nWmirNodes;
2039 1.3 oster blockNode = &nodes[i];
2040 1.3 oster i += 1;
2041 1.3 oster unblockNode = &nodes[i];
2042 1.3 oster i += 1;
2043 1.3 oster termNode = &nodes[i];
2044 1.3 oster i += 1;
2045 1.3 oster RF_ASSERT(i == (nWndNodes + nWmirNodes + 3));
2046 1.3 oster
2047 1.3 oster /* this dag can commit immediately */
2048 1.3 oster dag_h->numCommitNodes = 0;
2049 1.3 oster dag_h->numCommits = 0;
2050 1.3 oster dag_h->numSuccedents = 1;
2051 1.3 oster
2052 1.3 oster /* initialize the unblock and term nodes */
2053 1.3 oster rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, (nWndNodes + nWmirNodes), 0, 0, 0, dag_h, "Nil", allocList);
2054 1.3 oster rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, (nWndNodes + nWmirNodes), 0, 0, dag_h, "Nil", allocList);
2055 1.3 oster rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
2056 1.3 oster
2057 1.3 oster /* initialize the wnd nodes */
2058 1.3 oster if (nWndNodes > 0) {
2059 1.3 oster pda = asmap->physInfo;
2060 1.3 oster for (i = 0; i < nWndNodes; i++) {
2061 1.3 oster rf_InitNode(&wndNode[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wpd", allocList);
2062 1.3 oster RF_ASSERT(pda != NULL);
2063 1.3 oster wndNode[i].params[0].p = pda;
2064 1.3 oster wndNode[i].params[1].p = pda->bufPtr;
2065 1.3 oster wndNode[i].params[2].v = parityStripeID;
2066 1.3 oster wndNode[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
2067 1.3 oster pda = pda->next;
2068 1.3 oster }
2069 1.3 oster RF_ASSERT(pda == NULL);
2070 1.3 oster }
2071 1.3 oster /* initialize the mirror nodes */
2072 1.3 oster if (nWmirNodes > 0) {
2073 1.3 oster pda = asmap->physInfo;
2074 1.3 oster pdaP = asmap->parityInfo;
2075 1.3 oster for (i = 0; i < nWmirNodes; i++) {
2076 1.3 oster rf_InitNode(&wmirNode[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wsd", allocList);
2077 1.3 oster RF_ASSERT(pda != NULL);
2078 1.3 oster wmirNode[i].params[0].p = pdaP;
2079 1.3 oster wmirNode[i].params[1].p = pda->bufPtr;
2080 1.3 oster wmirNode[i].params[2].v = parityStripeID;
2081 1.3 oster wmirNode[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
2082 1.3 oster pda = pda->next;
2083 1.3 oster pdaP = pdaP->next;
2084 1.3 oster }
2085 1.3 oster RF_ASSERT(pda == NULL);
2086 1.3 oster RF_ASSERT(pdaP == NULL);
2087 1.3 oster }
2088 1.3 oster /* link the header node to the block node */
2089 1.3 oster RF_ASSERT(dag_h->numSuccedents == 1);
2090 1.3 oster RF_ASSERT(blockNode->numAntecedents == 0);
2091 1.3 oster dag_h->succedents[0] = blockNode;
2092 1.3 oster
2093 1.3 oster /* link the block node to the write nodes */
2094 1.3 oster RF_ASSERT(blockNode->numSuccedents == (nWndNodes + nWmirNodes));
2095 1.3 oster for (i = 0; i < nWndNodes; i++) {
2096 1.3 oster RF_ASSERT(wndNode[i].numAntecedents == 1);
2097 1.3 oster blockNode->succedents[i] = &wndNode[i];
2098 1.3 oster wndNode[i].antecedents[0] = blockNode;
2099 1.3 oster wndNode[i].antType[0] = rf_control;
2100 1.3 oster }
2101 1.3 oster for (i = 0; i < nWmirNodes; i++) {
2102 1.3 oster RF_ASSERT(wmirNode[i].numAntecedents == 1);
2103 1.3 oster blockNode->succedents[i + nWndNodes] = &wmirNode[i];
2104 1.3 oster wmirNode[i].antecedents[0] = blockNode;
2105 1.3 oster wmirNode[i].antType[0] = rf_control;
2106 1.3 oster }
2107 1.3 oster
2108 1.3 oster /* link the write nodes to the unblock node */
2109 1.3 oster RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nWmirNodes));
2110 1.3 oster for (i = 0; i < nWndNodes; i++) {
2111 1.3 oster RF_ASSERT(wndNode[i].numSuccedents == 1);
2112 1.3 oster wndNode[i].succedents[0] = unblockNode;
2113 1.3 oster unblockNode->antecedents[i] = &wndNode[i];
2114 1.3 oster unblockNode->antType[i] = rf_control;
2115 1.3 oster }
2116 1.3 oster for (i = 0; i < nWmirNodes; i++) {
2117 1.3 oster RF_ASSERT(wmirNode[i].numSuccedents == 1);
2118 1.3 oster wmirNode[i].succedents[0] = unblockNode;
2119 1.3 oster unblockNode->antecedents[i + nWndNodes] = &wmirNode[i];
2120 1.3 oster unblockNode->antType[i + nWndNodes] = rf_control;
2121 1.3 oster }
2122 1.3 oster
2123 1.3 oster /* link the unblock node to the term node */
2124 1.3 oster RF_ASSERT(unblockNode->numSuccedents == 1);
2125 1.3 oster RF_ASSERT(termNode->numAntecedents == 1);
2126 1.3 oster RF_ASSERT(termNode->numSuccedents == 0);
2127 1.3 oster unblockNode->succedents[0] = termNode;
2128 1.3 oster termNode->antecedents[0] = unblockNode;
2129 1.3 oster termNode->antType[0] = rf_control;
2130 1.1 oster
2131 1.3 oster return;
2132 1.1 oster }
2133