rf_dagffwr.c revision 1.19 1 1.19 oster /* $NetBSD: rf_dagffwr.c,v 1.19 2004/03/05 03:22:05 oster Exp $ */
2 1.1 oster /*
3 1.1 oster * Copyright (c) 1995 Carnegie-Mellon University.
4 1.1 oster * All rights reserved.
5 1.1 oster *
6 1.1 oster * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
7 1.1 oster *
8 1.1 oster * Permission to use, copy, modify and distribute this software and
9 1.1 oster * its documentation is hereby granted, provided that both the copyright
10 1.1 oster * notice and this permission notice appear in all copies of the
11 1.1 oster * software, derivative works or modified versions, and any portions
12 1.1 oster * thereof, and that both notices appear in supporting documentation.
13 1.1 oster *
14 1.1 oster * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 1.1 oster * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 1.1 oster * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 1.1 oster *
18 1.1 oster * Carnegie Mellon requests users of this software to return to
19 1.1 oster *
20 1.1 oster * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 1.1 oster * School of Computer Science
22 1.1 oster * Carnegie Mellon University
23 1.1 oster * Pittsburgh PA 15213-3890
24 1.1 oster *
25 1.1 oster * any improvements or extensions that they make and grant Carnegie the
26 1.1 oster * rights to redistribute these changes.
27 1.1 oster */
28 1.1 oster
29 1.1 oster /*
30 1.1 oster * rf_dagffwr.c
31 1.1 oster *
32 1.1 oster * code for creating fault-free DAGs
33 1.1 oster *
34 1.1 oster */
35 1.7 lukem
36 1.7 lukem #include <sys/cdefs.h>
37 1.19 oster __KERNEL_RCSID(0, "$NetBSD: rf_dagffwr.c,v 1.19 2004/03/05 03:22:05 oster Exp $");
38 1.1 oster
39 1.6 oster #include <dev/raidframe/raidframevar.h>
40 1.6 oster
41 1.1 oster #include "rf_raid.h"
42 1.1 oster #include "rf_dag.h"
43 1.1 oster #include "rf_dagutils.h"
44 1.1 oster #include "rf_dagfuncs.h"
45 1.1 oster #include "rf_debugMem.h"
46 1.1 oster #include "rf_dagffrd.h"
47 1.1 oster #include "rf_general.h"
48 1.1 oster #include "rf_dagffwr.h"
49 1.1 oster
50 1.1 oster /******************************************************************************
51 1.1 oster *
52 1.1 oster * General comments on DAG creation:
53 1.3 oster *
54 1.1 oster * All DAGs in this file use roll-away error recovery. Each DAG has a single
55 1.1 oster * commit node, usually called "Cmt." If an error occurs before the Cmt node
56 1.1 oster * is reached, the execution engine will halt forward execution and work
57 1.1 oster * backward through the graph, executing the undo functions. Assuming that
58 1.1 oster * each node in the graph prior to the Cmt node is undoable and atomic - or -
59 1.1 oster * does not make changes to permanent state, the graph will fail atomically.
60 1.1 oster * If an error occurs after the Cmt node executes, the engine will roll-forward
61 1.1 oster * through the graph, blindly executing nodes until it reaches the end.
62 1.1 oster * If a graph reaches the end, it is assumed to have completed successfully.
63 1.1 oster *
64 1.1 oster * A graph has only 1 Cmt node.
65 1.1 oster *
66 1.1 oster */
67 1.1 oster
68 1.1 oster
69 1.1 oster /******************************************************************************
70 1.1 oster *
71 1.1 oster * The following wrappers map the standard DAG creation interface to the
72 1.1 oster * DAG creation routines. Additionally, these wrappers enable experimentation
73 1.1 oster * with new DAG structures by providing an extra level of indirection, allowing
74 1.1 oster * the DAG creation routines to be replaced at this single point.
75 1.1 oster */
76 1.1 oster
77 1.1 oster
78 1.3 oster void
79 1.13 oster rf_CreateNonRedundantWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
80 1.13 oster RF_DagHeader_t *dag_h, void *bp,
81 1.13 oster RF_RaidAccessFlags_t flags,
82 1.13 oster RF_AllocListElem_t *allocList,
83 1.13 oster RF_IoType_t type)
84 1.1 oster {
85 1.3 oster rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
86 1.14 oster RF_IO_TYPE_WRITE);
87 1.1 oster }
88 1.1 oster
89 1.3 oster void
90 1.13 oster rf_CreateRAID0WriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
91 1.13 oster RF_DagHeader_t *dag_h, void *bp,
92 1.13 oster RF_RaidAccessFlags_t flags,
93 1.13 oster RF_AllocListElem_t *allocList,
94 1.13 oster RF_IoType_t type)
95 1.1 oster {
96 1.3 oster rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
97 1.14 oster RF_IO_TYPE_WRITE);
98 1.1 oster }
99 1.1 oster
100 1.3 oster void
101 1.13 oster rf_CreateSmallWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
102 1.13 oster RF_DagHeader_t *dag_h, void *bp,
103 1.13 oster RF_RaidAccessFlags_t flags,
104 1.13 oster RF_AllocListElem_t *allocList)
105 1.1 oster {
106 1.3 oster /* "normal" rollaway */
107 1.14 oster rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags,
108 1.14 oster allocList, &rf_xorFuncs, NULL);
109 1.1 oster }
110 1.1 oster
111 1.3 oster void
112 1.13 oster rf_CreateLargeWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
113 1.13 oster RF_DagHeader_t *dag_h, void *bp,
114 1.13 oster RF_RaidAccessFlags_t flags,
115 1.13 oster RF_AllocListElem_t *allocList)
116 1.1 oster {
117 1.3 oster /* "normal" rollaway */
118 1.14 oster rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags,
119 1.14 oster allocList, 1, rf_RegularXorFunc, RF_TRUE);
120 1.1 oster }
121 1.1 oster
122 1.1 oster
123 1.1 oster /******************************************************************************
124 1.1 oster *
125 1.1 oster * DAG creation code begins here
126 1.1 oster */
127 1.1 oster
128 1.1 oster
129 1.1 oster /******************************************************************************
130 1.1 oster *
131 1.1 oster * creates a DAG to perform a large-write operation:
132 1.1 oster *
133 1.1 oster * / Rod \ / Wnd \
134 1.1 oster * H -- block- Rod - Xor - Cmt - Wnd --- T
135 1.1 oster * \ Rod / \ Wnp /
136 1.1 oster * \[Wnq]/
137 1.1 oster *
138 1.1 oster * The XOR node also does the Q calculation in the P+Q architecture.
139 1.1 oster * All nodes before the commit node (Cmt) are assumed to be atomic and
140 1.1 oster * undoable - or - they make no changes to permanent state.
141 1.1 oster *
142 1.1 oster * Rod = read old data
143 1.1 oster * Cmt = commit node
144 1.1 oster * Wnp = write new parity
145 1.1 oster * Wnd = write new data
146 1.1 oster * Wnq = write new "q"
147 1.1 oster * [] denotes optional segments in the graph
148 1.1 oster *
149 1.1 oster * Parameters: raidPtr - description of the physical array
150 1.1 oster * asmap - logical & physical addresses for this access
151 1.1 oster * bp - buffer ptr (holds write data)
152 1.3 oster * flags - general flags (e.g. disk locking)
153 1.1 oster * allocList - list of memory allocated in DAG creation
154 1.1 oster * nfaults - number of faults array can tolerate
155 1.1 oster * (equal to # redundancy units in stripe)
156 1.1 oster * redFunc - redundancy generating function (executed by the Xor node)
157 1.1 oster *
158 1.1 oster *****************************************************************************/
159 1.1 oster
160 1.3 oster void
161 1.13 oster rf_CommonCreateLargeWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
162 1.13 oster RF_DagHeader_t *dag_h, void *bp,
163 1.13 oster RF_RaidAccessFlags_t flags,
164 1.13 oster RF_AllocListElem_t *allocList,
165 1.13 oster int nfaults, int (*redFunc) (RF_DagNode_t *),
166 1.13 oster int allowBufferRecycle)
167 1.1 oster {
168 1.3 oster RF_DagNode_t *nodes, *wndNodes, *rodNodes, *xorNode, *wnpNode;
169 1.3 oster RF_DagNode_t *wnqNode, *blockNode, *commitNode, *termNode;
170 1.3 oster int nWndNodes, nRodNodes, i, nodeNum, asmNum;
171 1.3 oster RF_AccessStripeMapHeader_t *new_asm_h[2];
172 1.3 oster RF_StripeNum_t parityStripeID;
173 1.3 oster char *sosBuffer, *eosBuffer;
174 1.3 oster RF_ReconUnitNum_t which_ru;
175 1.3 oster RF_RaidLayout_t *layoutPtr;
176 1.3 oster RF_PhysDiskAddr_t *pda;
177 1.3 oster
178 1.3 oster layoutPtr = &(raidPtr->Layout);
179 1.14 oster parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr,
180 1.14 oster asmap->raidAddress,
181 1.14 oster &which_ru);
182 1.3 oster
183 1.19 oster #if RF_DEBUG_DAG
184 1.3 oster if (rf_dagDebug) {
185 1.3 oster printf("[Creating large-write DAG]\n");
186 1.3 oster }
187 1.19 oster #endif
188 1.3 oster dag_h->creator = "LargeWriteDAG";
189 1.3 oster
190 1.3 oster dag_h->numCommitNodes = 1;
191 1.3 oster dag_h->numCommits = 0;
192 1.3 oster dag_h->numSuccedents = 1;
193 1.3 oster
194 1.3 oster /* alloc the nodes: Wnd, xor, commit, block, term, and Wnp */
195 1.3 oster nWndNodes = asmap->numStripeUnitsAccessed;
196 1.12 oster RF_MallocAndAdd(nodes,
197 1.12 oster (nWndNodes + 4 + nfaults) * sizeof(RF_DagNode_t),
198 1.12 oster (RF_DagNode_t *), allocList);
199 1.3 oster i = 0;
200 1.3 oster wndNodes = &nodes[i];
201 1.3 oster i += nWndNodes;
202 1.3 oster xorNode = &nodes[i];
203 1.3 oster i += 1;
204 1.3 oster wnpNode = &nodes[i];
205 1.3 oster i += 1;
206 1.3 oster blockNode = &nodes[i];
207 1.3 oster i += 1;
208 1.3 oster commitNode = &nodes[i];
209 1.3 oster i += 1;
210 1.3 oster termNode = &nodes[i];
211 1.3 oster i += 1;
212 1.3 oster if (nfaults == 2) {
213 1.3 oster wnqNode = &nodes[i];
214 1.3 oster i += 1;
215 1.3 oster } else {
216 1.3 oster wnqNode = NULL;
217 1.3 oster }
218 1.14 oster rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h,
219 1.14 oster new_asm_h, &nRodNodes, &sosBuffer,
220 1.14 oster &eosBuffer, allocList);
221 1.3 oster if (nRodNodes > 0) {
222 1.12 oster RF_MallocAndAdd(rodNodes, nRodNodes * sizeof(RF_DagNode_t),
223 1.12 oster (RF_DagNode_t *), allocList);
224 1.3 oster } else {
225 1.3 oster rodNodes = NULL;
226 1.3 oster }
227 1.3 oster
228 1.3 oster /* begin node initialization */
229 1.3 oster if (nRodNodes > 0) {
230 1.14 oster rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
231 1.14 oster rf_NullNodeUndoFunc, NULL, nRodNodes, 0, 0, 0,
232 1.14 oster dag_h, "Nil", allocList);
233 1.3 oster } else {
234 1.14 oster rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
235 1.14 oster rf_NullNodeUndoFunc, NULL, 1, 0, 0, 0,
236 1.14 oster dag_h, "Nil", allocList);
237 1.3 oster }
238 1.3 oster
239 1.14 oster rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
240 1.14 oster rf_NullNodeUndoFunc, NULL, nWndNodes + nfaults, 1, 0, 0,
241 1.14 oster dag_h, "Cmt", allocList);
242 1.14 oster rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
243 1.14 oster rf_TerminateUndoFunc, NULL, 0, nWndNodes + nfaults, 0, 0,
244 1.14 oster dag_h, "Trm", allocList);
245 1.3 oster
246 1.3 oster /* initialize the Rod nodes */
247 1.3 oster for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) {
248 1.3 oster if (new_asm_h[asmNum]) {
249 1.3 oster pda = new_asm_h[asmNum]->stripeMap->physInfo;
250 1.3 oster while (pda) {
251 1.14 oster rf_InitNode(&rodNodes[nodeNum], rf_wait,
252 1.14 oster RF_FALSE, rf_DiskReadFunc,
253 1.14 oster rf_DiskReadUndoFunc,
254 1.14 oster rf_GenericWakeupFunc,
255 1.14 oster 1, 1, 4, 0, dag_h,
256 1.14 oster "Rod", allocList);
257 1.3 oster rodNodes[nodeNum].params[0].p = pda;
258 1.3 oster rodNodes[nodeNum].params[1].p = pda->bufPtr;
259 1.3 oster rodNodes[nodeNum].params[2].v = parityStripeID;
260 1.3 oster rodNodes[nodeNum].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
261 1.17 oster which_ru);
262 1.3 oster nodeNum++;
263 1.3 oster pda = pda->next;
264 1.3 oster }
265 1.3 oster }
266 1.3 oster }
267 1.3 oster RF_ASSERT(nodeNum == nRodNodes);
268 1.3 oster
269 1.3 oster /* initialize the wnd nodes */
270 1.3 oster pda = asmap->physInfo;
271 1.3 oster for (i = 0; i < nWndNodes; i++) {
272 1.14 oster rf_InitNode(&wndNodes[i], rf_wait, RF_FALSE,
273 1.14 oster rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
274 1.14 oster rf_GenericWakeupFunc, 1, 1, 4, 0,
275 1.14 oster dag_h, "Wnd", allocList);
276 1.3 oster RF_ASSERT(pda != NULL);
277 1.3 oster wndNodes[i].params[0].p = pda;
278 1.3 oster wndNodes[i].params[1].p = pda->bufPtr;
279 1.3 oster wndNodes[i].params[2].v = parityStripeID;
280 1.17 oster wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
281 1.3 oster pda = pda->next;
282 1.3 oster }
283 1.3 oster
284 1.3 oster /* initialize the redundancy node */
285 1.3 oster if (nRodNodes > 0) {
286 1.14 oster rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc,
287 1.14 oster rf_NullNodeUndoFunc, NULL, 1,
288 1.14 oster nRodNodes, 2 * (nWndNodes + nRodNodes) + 1,
289 1.14 oster nfaults, dag_h, "Xr ", allocList);
290 1.3 oster } else {
291 1.14 oster rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc,
292 1.14 oster rf_NullNodeUndoFunc, NULL, 1,
293 1.14 oster 1, 2 * (nWndNodes + nRodNodes) + 1,
294 1.14 oster nfaults, dag_h, "Xr ", allocList);
295 1.3 oster }
296 1.3 oster xorNode->flags |= RF_DAGNODE_FLAG_YIELD;
297 1.3 oster for (i = 0; i < nWndNodes; i++) {
298 1.14 oster /* pda */
299 1.14 oster xorNode->params[2 * i + 0] = wndNodes[i].params[0];
300 1.14 oster /* buf ptr */
301 1.14 oster xorNode->params[2 * i + 1] = wndNodes[i].params[1];
302 1.3 oster }
303 1.3 oster for (i = 0; i < nRodNodes; i++) {
304 1.14 oster /* pda */
305 1.14 oster xorNode->params[2 * (nWndNodes + i) + 0] = rodNodes[i].params[0];
306 1.14 oster /* buf ptr */
307 1.14 oster xorNode->params[2 * (nWndNodes + i) + 1] = rodNodes[i].params[1];
308 1.3 oster }
309 1.3 oster /* xor node needs to get at RAID information */
310 1.3 oster xorNode->params[2 * (nWndNodes + nRodNodes)].p = raidPtr;
311 1.3 oster
312 1.3 oster /*
313 1.14 oster * Look for an Rod node that reads a complete SU. If none,
314 1.14 oster * alloc a buffer to receive the parity info. Note that we
315 1.14 oster * can't use a new data buffer because it will not have gotten
316 1.14 oster * written when the xor occurs. */
317 1.3 oster if (allowBufferRecycle) {
318 1.3 oster for (i = 0; i < nRodNodes; i++) {
319 1.3 oster if (((RF_PhysDiskAddr_t *) rodNodes[i].params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit)
320 1.3 oster break;
321 1.3 oster }
322 1.3 oster }
323 1.3 oster if ((!allowBufferRecycle) || (i == nRodNodes)) {
324 1.12 oster RF_MallocAndAdd(xorNode->results[0],
325 1.12 oster rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit),
326 1.12 oster (void *), allocList);
327 1.3 oster } else {
328 1.3 oster xorNode->results[0] = rodNodes[i].params[1].p;
329 1.3 oster }
330 1.3 oster
331 1.3 oster /* initialize the Wnp node */
332 1.14 oster rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc,
333 1.14 oster rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0,
334 1.14 oster dag_h, "Wnp", allocList);
335 1.3 oster wnpNode->params[0].p = asmap->parityInfo;
336 1.3 oster wnpNode->params[1].p = xorNode->results[0];
337 1.3 oster wnpNode->params[2].v = parityStripeID;
338 1.17 oster wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
339 1.3 oster /* parityInfo must describe entire parity unit */
340 1.3 oster RF_ASSERT(asmap->parityInfo->next == NULL);
341 1.3 oster
342 1.3 oster if (nfaults == 2) {
343 1.3 oster /*
344 1.3 oster * We never try to recycle a buffer for the Q calcuation
345 1.3 oster * in addition to the parity. This would cause two buffers
346 1.3 oster * to get smashed during the P and Q calculation, guaranteeing
347 1.3 oster * one would be wrong.
348 1.3 oster */
349 1.12 oster RF_MallocAndAdd(xorNode->results[1],
350 1.12 oster rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit),
351 1.12 oster (void *), allocList);
352 1.14 oster rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc,
353 1.14 oster rf_DiskWriteUndoFunc, rf_GenericWakeupFunc,
354 1.14 oster 1, 1, 4, 0, dag_h, "Wnq", allocList);
355 1.3 oster wnqNode->params[0].p = asmap->qInfo;
356 1.3 oster wnqNode->params[1].p = xorNode->results[1];
357 1.3 oster wnqNode->params[2].v = parityStripeID;
358 1.17 oster wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
359 1.3 oster /* parityInfo must describe entire parity unit */
360 1.3 oster RF_ASSERT(asmap->parityInfo->next == NULL);
361 1.3 oster }
362 1.3 oster /*
363 1.3 oster * Connect nodes to form graph.
364 1.3 oster */
365 1.3 oster
366 1.3 oster /* connect dag header to block node */
367 1.3 oster RF_ASSERT(blockNode->numAntecedents == 0);
368 1.3 oster dag_h->succedents[0] = blockNode;
369 1.3 oster
370 1.3 oster if (nRodNodes > 0) {
371 1.3 oster /* connect the block node to the Rod nodes */
372 1.3 oster RF_ASSERT(blockNode->numSuccedents == nRodNodes);
373 1.3 oster RF_ASSERT(xorNode->numAntecedents == nRodNodes);
374 1.3 oster for (i = 0; i < nRodNodes; i++) {
375 1.3 oster RF_ASSERT(rodNodes[i].numAntecedents == 1);
376 1.3 oster blockNode->succedents[i] = &rodNodes[i];
377 1.3 oster rodNodes[i].antecedents[0] = blockNode;
378 1.3 oster rodNodes[i].antType[0] = rf_control;
379 1.3 oster
380 1.3 oster /* connect the Rod nodes to the Xor node */
381 1.3 oster RF_ASSERT(rodNodes[i].numSuccedents == 1);
382 1.3 oster rodNodes[i].succedents[0] = xorNode;
383 1.3 oster xorNode->antecedents[i] = &rodNodes[i];
384 1.3 oster xorNode->antType[i] = rf_trueData;
385 1.3 oster }
386 1.3 oster } else {
387 1.3 oster /* connect the block node to the Xor node */
388 1.3 oster RF_ASSERT(blockNode->numSuccedents == 1);
389 1.3 oster RF_ASSERT(xorNode->numAntecedents == 1);
390 1.3 oster blockNode->succedents[0] = xorNode;
391 1.3 oster xorNode->antecedents[0] = blockNode;
392 1.3 oster xorNode->antType[0] = rf_control;
393 1.3 oster }
394 1.3 oster
395 1.3 oster /* connect the xor node to the commit node */
396 1.3 oster RF_ASSERT(xorNode->numSuccedents == 1);
397 1.3 oster RF_ASSERT(commitNode->numAntecedents == 1);
398 1.3 oster xorNode->succedents[0] = commitNode;
399 1.3 oster commitNode->antecedents[0] = xorNode;
400 1.3 oster commitNode->antType[0] = rf_control;
401 1.3 oster
402 1.3 oster /* connect the commit node to the write nodes */
403 1.3 oster RF_ASSERT(commitNode->numSuccedents == nWndNodes + nfaults);
404 1.3 oster for (i = 0; i < nWndNodes; i++) {
405 1.3 oster RF_ASSERT(wndNodes->numAntecedents == 1);
406 1.3 oster commitNode->succedents[i] = &wndNodes[i];
407 1.3 oster wndNodes[i].antecedents[0] = commitNode;
408 1.3 oster wndNodes[i].antType[0] = rf_control;
409 1.3 oster }
410 1.3 oster RF_ASSERT(wnpNode->numAntecedents == 1);
411 1.3 oster commitNode->succedents[nWndNodes] = wnpNode;
412 1.3 oster wnpNode->antecedents[0] = commitNode;
413 1.3 oster wnpNode->antType[0] = rf_trueData;
414 1.3 oster if (nfaults == 2) {
415 1.3 oster RF_ASSERT(wnqNode->numAntecedents == 1);
416 1.3 oster commitNode->succedents[nWndNodes + 1] = wnqNode;
417 1.3 oster wnqNode->antecedents[0] = commitNode;
418 1.3 oster wnqNode->antType[0] = rf_trueData;
419 1.3 oster }
420 1.3 oster /* connect the write nodes to the term node */
421 1.3 oster RF_ASSERT(termNode->numAntecedents == nWndNodes + nfaults);
422 1.3 oster RF_ASSERT(termNode->numSuccedents == 0);
423 1.3 oster for (i = 0; i < nWndNodes; i++) {
424 1.3 oster RF_ASSERT(wndNodes->numSuccedents == 1);
425 1.3 oster wndNodes[i].succedents[0] = termNode;
426 1.3 oster termNode->antecedents[i] = &wndNodes[i];
427 1.3 oster termNode->antType[i] = rf_control;
428 1.3 oster }
429 1.3 oster RF_ASSERT(wnpNode->numSuccedents == 1);
430 1.3 oster wnpNode->succedents[0] = termNode;
431 1.3 oster termNode->antecedents[nWndNodes] = wnpNode;
432 1.3 oster termNode->antType[nWndNodes] = rf_control;
433 1.3 oster if (nfaults == 2) {
434 1.3 oster RF_ASSERT(wnqNode->numSuccedents == 1);
435 1.3 oster wnqNode->succedents[0] = termNode;
436 1.3 oster termNode->antecedents[nWndNodes + 1] = wnqNode;
437 1.3 oster termNode->antType[nWndNodes + 1] = rf_control;
438 1.3 oster }
439 1.1 oster }
440 1.1 oster /******************************************************************************
441 1.1 oster *
442 1.1 oster * creates a DAG to perform a small-write operation (either raid 5 or pq),
443 1.1 oster * which is as follows:
444 1.1 oster *
445 1.1 oster * Hdr -> Nil -> Rop -> Xor -> Cmt ----> Wnp [Unp] --> Trm
446 1.1 oster * \- Rod X / \----> Wnd [Und]-/
447 1.1 oster * [\- Rod X / \---> Wnd [Und]-/]
448 1.1 oster * [\- Roq -> Q / \--> Wnq [Unq]-/]
449 1.1 oster *
450 1.1 oster * Rop = read old parity
451 1.1 oster * Rod = read old data
452 1.1 oster * Roq = read old "q"
453 1.1 oster * Cmt = commit node
454 1.1 oster * Und = unlock data disk
455 1.1 oster * Unp = unlock parity disk
456 1.1 oster * Unq = unlock q disk
457 1.1 oster * Wnp = write new parity
458 1.1 oster * Wnd = write new data
459 1.1 oster * Wnq = write new "q"
460 1.1 oster * [ ] denotes optional segments in the graph
461 1.1 oster *
462 1.1 oster * Parameters: raidPtr - description of the physical array
463 1.1 oster * asmap - logical & physical addresses for this access
464 1.1 oster * bp - buffer ptr (holds write data)
465 1.3 oster * flags - general flags (e.g. disk locking)
466 1.1 oster * allocList - list of memory allocated in DAG creation
467 1.1 oster * pfuncs - list of parity generating functions
468 1.1 oster * qfuncs - list of q generating functions
469 1.1 oster *
470 1.1 oster * A null qfuncs indicates single fault tolerant
471 1.1 oster *****************************************************************************/
472 1.1 oster
473 1.3 oster void
474 1.13 oster rf_CommonCreateSmallWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
475 1.13 oster RF_DagHeader_t *dag_h, void *bp,
476 1.13 oster RF_RaidAccessFlags_t flags,
477 1.13 oster RF_AllocListElem_t *allocList,
478 1.13 oster const RF_RedFuncs_t *pfuncs,
479 1.13 oster const RF_RedFuncs_t *qfuncs)
480 1.1 oster {
481 1.3 oster RF_DagNode_t *readDataNodes, *readParityNodes, *readQNodes, *termNode;
482 1.3 oster RF_DagNode_t *xorNodes, *qNodes, *blockNode, *commitNode, *nodes;
483 1.3 oster RF_DagNode_t *writeDataNodes, *writeParityNodes, *writeQNodes;
484 1.16 oster int i, j, nNodes, totalNumNodes;
485 1.3 oster RF_ReconUnitNum_t which_ru;
486 1.3 oster int (*func) (RF_DagNode_t *), (*undoFunc) (RF_DagNode_t *);
487 1.3 oster int (*qfunc) (RF_DagNode_t *);
488 1.3 oster int numDataNodes, numParityNodes;
489 1.3 oster RF_StripeNum_t parityStripeID;
490 1.3 oster RF_PhysDiskAddr_t *pda;
491 1.3 oster char *name, *qname;
492 1.3 oster long nfaults;
493 1.3 oster
494 1.3 oster nfaults = qfuncs ? 2 : 1;
495 1.3 oster
496 1.3 oster parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
497 1.3 oster asmap->raidAddress, &which_ru);
498 1.3 oster pda = asmap->physInfo;
499 1.3 oster numDataNodes = asmap->numStripeUnitsAccessed;
500 1.3 oster numParityNodes = (asmap->parityInfo->next) ? 2 : 1;
501 1.3 oster
502 1.19 oster #if RF_DEBUG_DAG
503 1.3 oster if (rf_dagDebug) {
504 1.3 oster printf("[Creating small-write DAG]\n");
505 1.3 oster }
506 1.19 oster #endif
507 1.3 oster RF_ASSERT(numDataNodes > 0);
508 1.3 oster dag_h->creator = "SmallWriteDAG";
509 1.3 oster
510 1.3 oster dag_h->numCommitNodes = 1;
511 1.3 oster dag_h->numCommits = 0;
512 1.3 oster dag_h->numSuccedents = 1;
513 1.3 oster
514 1.3 oster /*
515 1.3 oster * DAG creation occurs in four steps:
516 1.3 oster * 1. count the number of nodes in the DAG
517 1.3 oster * 2. create the nodes
518 1.3 oster * 3. initialize the nodes
519 1.3 oster * 4. connect the nodes
520 1.3 oster */
521 1.3 oster
522 1.3 oster /*
523 1.3 oster * Step 1. compute number of nodes in the graph
524 1.3 oster */
525 1.3 oster
526 1.14 oster /* number of nodes: a read and write for each data unit a
527 1.14 oster * redundancy computation node for each parity node (nfaults *
528 1.14 oster * nparity) a read and write for each parity unit a block and
529 1.14 oster * commit node (2) a terminate node if atomic RMW an unlock
530 1.14 oster * node for each data unit, redundancy unit */
531 1.3 oster totalNumNodes = (2 * numDataNodes) + (nfaults * numParityNodes)
532 1.3 oster + (nfaults * 2 * numParityNodes) + 3;
533 1.3 oster /*
534 1.3 oster * Step 2. create the nodes
535 1.3 oster */
536 1.12 oster RF_MallocAndAdd(nodes, totalNumNodes * sizeof(RF_DagNode_t),
537 1.12 oster (RF_DagNode_t *), allocList);
538 1.3 oster i = 0;
539 1.3 oster blockNode = &nodes[i];
540 1.3 oster i += 1;
541 1.3 oster commitNode = &nodes[i];
542 1.3 oster i += 1;
543 1.3 oster readDataNodes = &nodes[i];
544 1.3 oster i += numDataNodes;
545 1.3 oster readParityNodes = &nodes[i];
546 1.3 oster i += numParityNodes;
547 1.3 oster writeDataNodes = &nodes[i];
548 1.3 oster i += numDataNodes;
549 1.3 oster writeParityNodes = &nodes[i];
550 1.3 oster i += numParityNodes;
551 1.3 oster xorNodes = &nodes[i];
552 1.3 oster i += numParityNodes;
553 1.3 oster termNode = &nodes[i];
554 1.3 oster i += 1;
555 1.16 oster
556 1.3 oster if (nfaults == 2) {
557 1.3 oster readQNodes = &nodes[i];
558 1.3 oster i += numParityNodes;
559 1.3 oster writeQNodes = &nodes[i];
560 1.3 oster i += numParityNodes;
561 1.3 oster qNodes = &nodes[i];
562 1.3 oster i += numParityNodes;
563 1.3 oster } else {
564 1.18 oster readQNodes = writeQNodes = qNodes = NULL;
565 1.3 oster }
566 1.3 oster RF_ASSERT(i == totalNumNodes);
567 1.3 oster
568 1.3 oster /*
569 1.3 oster * Step 3. initialize the nodes
570 1.3 oster */
571 1.3 oster /* initialize block node (Nil) */
572 1.3 oster nNodes = numDataNodes + (nfaults * numParityNodes);
573 1.14 oster rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
574 1.14 oster rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0,
575 1.14 oster dag_h, "Nil", allocList);
576 1.3 oster
577 1.3 oster /* initialize commit node (Cmt) */
578 1.14 oster rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
579 1.14 oster rf_NullNodeUndoFunc, NULL, nNodes,
580 1.14 oster (nfaults * numParityNodes), 0, 0, dag_h, "Cmt", allocList);
581 1.3 oster
582 1.3 oster /* initialize terminate node (Trm) */
583 1.14 oster rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
584 1.14 oster rf_TerminateUndoFunc, NULL, 0, nNodes, 0, 0,
585 1.14 oster dag_h, "Trm", allocList);
586 1.3 oster
587 1.3 oster /* initialize nodes which read old data (Rod) */
588 1.3 oster for (i = 0; i < numDataNodes; i++) {
589 1.14 oster rf_InitNode(&readDataNodes[i], rf_wait, RF_FALSE,
590 1.14 oster rf_DiskReadFunc, rf_DiskReadUndoFunc,
591 1.14 oster rf_GenericWakeupFunc, (nfaults * numParityNodes),
592 1.14 oster 1, 4, 0, dag_h, "Rod", allocList);
593 1.3 oster RF_ASSERT(pda != NULL);
594 1.3 oster /* physical disk addr desc */
595 1.3 oster readDataNodes[i].params[0].p = pda;
596 1.3 oster /* buffer to hold old data */
597 1.3 oster readDataNodes[i].params[1].p = rf_AllocBuffer(raidPtr,
598 1.3 oster dag_h, pda, allocList);
599 1.3 oster readDataNodes[i].params[2].v = parityStripeID;
600 1.3 oster readDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
601 1.17 oster which_ru);
602 1.3 oster pda = pda->next;
603 1.3 oster for (j = 0; j < readDataNodes[i].numSuccedents; j++) {
604 1.3 oster readDataNodes[i].propList[j] = NULL;
605 1.3 oster }
606 1.3 oster }
607 1.3 oster
608 1.3 oster /* initialize nodes which read old parity (Rop) */
609 1.3 oster pda = asmap->parityInfo;
610 1.3 oster i = 0;
611 1.3 oster for (i = 0; i < numParityNodes; i++) {
612 1.3 oster RF_ASSERT(pda != NULL);
613 1.14 oster rf_InitNode(&readParityNodes[i], rf_wait, RF_FALSE,
614 1.14 oster rf_DiskReadFunc, rf_DiskReadUndoFunc,
615 1.14 oster rf_GenericWakeupFunc, numParityNodes, 1, 4, 0,
616 1.14 oster dag_h, "Rop", allocList);
617 1.3 oster readParityNodes[i].params[0].p = pda;
618 1.3 oster /* buffer to hold old parity */
619 1.3 oster readParityNodes[i].params[1].p = rf_AllocBuffer(raidPtr,
620 1.3 oster dag_h, pda, allocList);
621 1.3 oster readParityNodes[i].params[2].v = parityStripeID;
622 1.3 oster readParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
623 1.17 oster which_ru);
624 1.3 oster pda = pda->next;
625 1.3 oster for (j = 0; j < readParityNodes[i].numSuccedents; j++) {
626 1.3 oster readParityNodes[i].propList[0] = NULL;
627 1.3 oster }
628 1.3 oster }
629 1.3 oster
630 1.3 oster /* initialize nodes which read old Q (Roq) */
631 1.3 oster if (nfaults == 2) {
632 1.3 oster pda = asmap->qInfo;
633 1.3 oster for (i = 0; i < numParityNodes; i++) {
634 1.3 oster RF_ASSERT(pda != NULL);
635 1.14 oster rf_InitNode(&readQNodes[i], rf_wait, RF_FALSE,
636 1.14 oster rf_DiskReadFunc, rf_DiskReadUndoFunc,
637 1.14 oster rf_GenericWakeupFunc, numParityNodes,
638 1.14 oster 1, 4, 0, dag_h, "Roq", allocList);
639 1.3 oster readQNodes[i].params[0].p = pda;
640 1.3 oster /* buffer to hold old Q */
641 1.14 oster readQNodes[i].params[1].p = rf_AllocBuffer(raidPtr,
642 1.14 oster dag_h, pda,
643 1.14 oster allocList);
644 1.3 oster readQNodes[i].params[2].v = parityStripeID;
645 1.3 oster readQNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
646 1.17 oster which_ru);
647 1.3 oster pda = pda->next;
648 1.3 oster for (j = 0; j < readQNodes[i].numSuccedents; j++) {
649 1.3 oster readQNodes[i].propList[0] = NULL;
650 1.3 oster }
651 1.3 oster }
652 1.3 oster }
653 1.3 oster /* initialize nodes which write new data (Wnd) */
654 1.3 oster pda = asmap->physInfo;
655 1.3 oster for (i = 0; i < numDataNodes; i++) {
656 1.3 oster RF_ASSERT(pda != NULL);
657 1.14 oster rf_InitNode(&writeDataNodes[i], rf_wait, RF_FALSE,
658 1.14 oster rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
659 1.14 oster rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
660 1.14 oster "Wnd", allocList);
661 1.3 oster /* physical disk addr desc */
662 1.3 oster writeDataNodes[i].params[0].p = pda;
663 1.3 oster /* buffer holding new data to be written */
664 1.3 oster writeDataNodes[i].params[1].p = pda->bufPtr;
665 1.3 oster writeDataNodes[i].params[2].v = parityStripeID;
666 1.3 oster writeDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
667 1.17 oster which_ru);
668 1.3 oster pda = pda->next;
669 1.3 oster }
670 1.3 oster
671 1.3 oster /*
672 1.3 oster * Initialize nodes which compute new parity and Q.
673 1.3 oster */
674 1.3 oster /*
675 1.3 oster * We use the simple XOR func in the double-XOR case, and when
676 1.14 oster * we're accessing only a portion of one stripe unit. The
677 1.14 oster * distinction between the two is that the regular XOR func
678 1.14 oster * assumes that the targbuf is a full SU in size, and examines
679 1.14 oster * the pda associated with the buffer to decide where within
680 1.14 oster * the buffer to XOR the data, whereas the simple XOR func
681 1.14 oster * just XORs the data into the start of the buffer. */
682 1.3 oster if ((numParityNodes == 2) || ((numDataNodes == 1)
683 1.14 oster && (asmap->totalSectorsAccessed <
684 1.14 oster raidPtr->Layout.sectorsPerStripeUnit))) {
685 1.3 oster func = pfuncs->simple;
686 1.3 oster undoFunc = rf_NullNodeUndoFunc;
687 1.3 oster name = pfuncs->SimpleName;
688 1.3 oster if (qfuncs) {
689 1.3 oster qfunc = qfuncs->simple;
690 1.3 oster qname = qfuncs->SimpleName;
691 1.3 oster } else {
692 1.3 oster qfunc = NULL;
693 1.3 oster qname = NULL;
694 1.3 oster }
695 1.3 oster } else {
696 1.3 oster func = pfuncs->regular;
697 1.3 oster undoFunc = rf_NullNodeUndoFunc;
698 1.3 oster name = pfuncs->RegularName;
699 1.3 oster if (qfuncs) {
700 1.3 oster qfunc = qfuncs->regular;
701 1.3 oster qname = qfuncs->RegularName;
702 1.3 oster } else {
703 1.3 oster qfunc = NULL;
704 1.3 oster qname = NULL;
705 1.3 oster }
706 1.3 oster }
707 1.3 oster /*
708 1.3 oster * Initialize the xor nodes: params are {pda,buf}
709 1.3 oster * from {Rod,Wnd,Rop} nodes, and raidPtr
710 1.3 oster */
711 1.3 oster if (numParityNodes == 2) {
712 1.3 oster /* double-xor case */
713 1.3 oster for (i = 0; i < numParityNodes; i++) {
714 1.3 oster /* note: no wakeup func for xor */
715 1.14 oster rf_InitNode(&xorNodes[i], rf_wait, RF_FALSE, func,
716 1.14 oster undoFunc, NULL, 1,
717 1.14 oster (numDataNodes + numParityNodes),
718 1.14 oster 7, 1, dag_h, name, allocList);
719 1.3 oster xorNodes[i].flags |= RF_DAGNODE_FLAG_YIELD;
720 1.3 oster xorNodes[i].params[0] = readDataNodes[i].params[0];
721 1.3 oster xorNodes[i].params[1] = readDataNodes[i].params[1];
722 1.3 oster xorNodes[i].params[2] = readParityNodes[i].params[0];
723 1.3 oster xorNodes[i].params[3] = readParityNodes[i].params[1];
724 1.3 oster xorNodes[i].params[4] = writeDataNodes[i].params[0];
725 1.3 oster xorNodes[i].params[5] = writeDataNodes[i].params[1];
726 1.3 oster xorNodes[i].params[6].p = raidPtr;
727 1.3 oster /* use old parity buf as target buf */
728 1.3 oster xorNodes[i].results[0] = readParityNodes[i].params[1].p;
729 1.3 oster if (nfaults == 2) {
730 1.3 oster /* note: no wakeup func for qor */
731 1.14 oster rf_InitNode(&qNodes[i], rf_wait, RF_FALSE,
732 1.14 oster qfunc, undoFunc, NULL, 1,
733 1.14 oster (numDataNodes + numParityNodes),
734 1.14 oster 7, 1, dag_h, qname, allocList);
735 1.3 oster qNodes[i].params[0] = readDataNodes[i].params[0];
736 1.3 oster qNodes[i].params[1] = readDataNodes[i].params[1];
737 1.3 oster qNodes[i].params[2] = readQNodes[i].params[0];
738 1.3 oster qNodes[i].params[3] = readQNodes[i].params[1];
739 1.3 oster qNodes[i].params[4] = writeDataNodes[i].params[0];
740 1.3 oster qNodes[i].params[5] = writeDataNodes[i].params[1];
741 1.3 oster qNodes[i].params[6].p = raidPtr;
742 1.3 oster /* use old Q buf as target buf */
743 1.3 oster qNodes[i].results[0] = readQNodes[i].params[1].p;
744 1.3 oster }
745 1.3 oster }
746 1.3 oster } else {
747 1.3 oster /* there is only one xor node in this case */
748 1.14 oster rf_InitNode(&xorNodes[0], rf_wait, RF_FALSE, func,
749 1.14 oster undoFunc, NULL, 1, (numDataNodes + numParityNodes),
750 1.14 oster (2 * (numDataNodes + numDataNodes + 1) + 1), 1,
751 1.14 oster dag_h, name, allocList);
752 1.3 oster xorNodes[0].flags |= RF_DAGNODE_FLAG_YIELD;
753 1.3 oster for (i = 0; i < numDataNodes + 1; i++) {
754 1.3 oster /* set up params related to Rod and Rop nodes */
755 1.3 oster xorNodes[0].params[2 * i + 0] = readDataNodes[i].params[0]; /* pda */
756 1.3 oster xorNodes[0].params[2 * i + 1] = readDataNodes[i].params[1]; /* buffer ptr */
757 1.3 oster }
758 1.3 oster for (i = 0; i < numDataNodes; i++) {
759 1.3 oster /* set up params related to Wnd and Wnp nodes */
760 1.3 oster xorNodes[0].params[2 * (numDataNodes + 1 + i) + 0] = /* pda */
761 1.3 oster writeDataNodes[i].params[0];
762 1.3 oster xorNodes[0].params[2 * (numDataNodes + 1 + i) + 1] = /* buffer ptr */
763 1.3 oster writeDataNodes[i].params[1];
764 1.3 oster }
765 1.3 oster /* xor node needs to get at RAID information */
766 1.3 oster xorNodes[0].params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr;
767 1.3 oster xorNodes[0].results[0] = readParityNodes[0].params[1].p;
768 1.3 oster if (nfaults == 2) {
769 1.14 oster rf_InitNode(&qNodes[0], rf_wait, RF_FALSE, qfunc,
770 1.14 oster undoFunc, NULL, 1,
771 1.14 oster (numDataNodes + numParityNodes),
772 1.14 oster (2 * (numDataNodes + numDataNodes + 1) + 1), 1,
773 1.14 oster dag_h, qname, allocList);
774 1.3 oster for (i = 0; i < numDataNodes; i++) {
775 1.3 oster /* set up params related to Rod */
776 1.3 oster qNodes[0].params[2 * i + 0] = readDataNodes[i].params[0]; /* pda */
777 1.3 oster qNodes[0].params[2 * i + 1] = readDataNodes[i].params[1]; /* buffer ptr */
778 1.3 oster }
779 1.3 oster /* and read old q */
780 1.3 oster qNodes[0].params[2 * numDataNodes + 0] = /* pda */
781 1.3 oster readQNodes[0].params[0];
782 1.3 oster qNodes[0].params[2 * numDataNodes + 1] = /* buffer ptr */
783 1.3 oster readQNodes[0].params[1];
784 1.3 oster for (i = 0; i < numDataNodes; i++) {
785 1.3 oster /* set up params related to Wnd nodes */
786 1.3 oster qNodes[0].params[2 * (numDataNodes + 1 + i) + 0] = /* pda */
787 1.3 oster writeDataNodes[i].params[0];
788 1.3 oster qNodes[0].params[2 * (numDataNodes + 1 + i) + 1] = /* buffer ptr */
789 1.3 oster writeDataNodes[i].params[1];
790 1.3 oster }
791 1.3 oster /* xor node needs to get at RAID information */
792 1.3 oster qNodes[0].params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr;
793 1.3 oster qNodes[0].results[0] = readQNodes[0].params[1].p;
794 1.3 oster }
795 1.3 oster }
796 1.3 oster
797 1.3 oster /* initialize nodes which write new parity (Wnp) */
798 1.3 oster pda = asmap->parityInfo;
799 1.3 oster for (i = 0; i < numParityNodes; i++) {
800 1.14 oster rf_InitNode(&writeParityNodes[i], rf_wait, RF_FALSE,
801 1.14 oster rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
802 1.14 oster rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
803 1.14 oster "Wnp", allocList);
804 1.3 oster RF_ASSERT(pda != NULL);
805 1.3 oster writeParityNodes[i].params[0].p = pda; /* param 1 (bufPtr)
806 1.3 oster * filled in by xor node */
807 1.3 oster writeParityNodes[i].params[1].p = xorNodes[i].results[0]; /* buffer pointer for
808 1.3 oster * parity write
809 1.3 oster * operation */
810 1.3 oster writeParityNodes[i].params[2].v = parityStripeID;
811 1.3 oster writeParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
812 1.17 oster which_ru);
813 1.3 oster pda = pda->next;
814 1.3 oster }
815 1.3 oster
816 1.3 oster /* initialize nodes which write new Q (Wnq) */
817 1.3 oster if (nfaults == 2) {
818 1.3 oster pda = asmap->qInfo;
819 1.3 oster for (i = 0; i < numParityNodes; i++) {
820 1.14 oster rf_InitNode(&writeQNodes[i], rf_wait, RF_FALSE,
821 1.14 oster rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
822 1.14 oster rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
823 1.14 oster "Wnq", allocList);
824 1.3 oster RF_ASSERT(pda != NULL);
825 1.3 oster writeQNodes[i].params[0].p = pda; /* param 1 (bufPtr)
826 1.3 oster * filled in by xor node */
827 1.3 oster writeQNodes[i].params[1].p = qNodes[i].results[0]; /* buffer pointer for
828 1.3 oster * parity write
829 1.3 oster * operation */
830 1.3 oster writeQNodes[i].params[2].v = parityStripeID;
831 1.3 oster writeQNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
832 1.17 oster which_ru);
833 1.3 oster pda = pda->next;
834 1.3 oster }
835 1.3 oster }
836 1.3 oster /*
837 1.3 oster * Step 4. connect the nodes.
838 1.3 oster */
839 1.3 oster
840 1.3 oster /* connect header to block node */
841 1.3 oster dag_h->succedents[0] = blockNode;
842 1.3 oster
843 1.3 oster /* connect block node to read old data nodes */
844 1.3 oster RF_ASSERT(blockNode->numSuccedents == (numDataNodes + (numParityNodes * nfaults)));
845 1.3 oster for (i = 0; i < numDataNodes; i++) {
846 1.3 oster blockNode->succedents[i] = &readDataNodes[i];
847 1.3 oster RF_ASSERT(readDataNodes[i].numAntecedents == 1);
848 1.3 oster readDataNodes[i].antecedents[0] = blockNode;
849 1.3 oster readDataNodes[i].antType[0] = rf_control;
850 1.3 oster }
851 1.3 oster
852 1.3 oster /* connect block node to read old parity nodes */
853 1.3 oster for (i = 0; i < numParityNodes; i++) {
854 1.3 oster blockNode->succedents[numDataNodes + i] = &readParityNodes[i];
855 1.3 oster RF_ASSERT(readParityNodes[i].numAntecedents == 1);
856 1.3 oster readParityNodes[i].antecedents[0] = blockNode;
857 1.3 oster readParityNodes[i].antType[0] = rf_control;
858 1.3 oster }
859 1.3 oster
860 1.3 oster /* connect block node to read old Q nodes */
861 1.3 oster if (nfaults == 2) {
862 1.3 oster for (i = 0; i < numParityNodes; i++) {
863 1.3 oster blockNode->succedents[numDataNodes + numParityNodes + i] = &readQNodes[i];
864 1.3 oster RF_ASSERT(readQNodes[i].numAntecedents == 1);
865 1.3 oster readQNodes[i].antecedents[0] = blockNode;
866 1.3 oster readQNodes[i].antType[0] = rf_control;
867 1.3 oster }
868 1.3 oster }
869 1.3 oster /* connect read old data nodes to xor nodes */
870 1.3 oster for (i = 0; i < numDataNodes; i++) {
871 1.3 oster RF_ASSERT(readDataNodes[i].numSuccedents == (nfaults * numParityNodes));
872 1.3 oster for (j = 0; j < numParityNodes; j++) {
873 1.3 oster RF_ASSERT(xorNodes[j].numAntecedents == numDataNodes + numParityNodes);
874 1.3 oster readDataNodes[i].succedents[j] = &xorNodes[j];
875 1.3 oster xorNodes[j].antecedents[i] = &readDataNodes[i];
876 1.3 oster xorNodes[j].antType[i] = rf_trueData;
877 1.3 oster }
878 1.3 oster }
879 1.3 oster
880 1.3 oster /* connect read old data nodes to q nodes */
881 1.3 oster if (nfaults == 2) {
882 1.3 oster for (i = 0; i < numDataNodes; i++) {
883 1.3 oster for (j = 0; j < numParityNodes; j++) {
884 1.3 oster RF_ASSERT(qNodes[j].numAntecedents == numDataNodes + numParityNodes);
885 1.3 oster readDataNodes[i].succedents[numParityNodes + j] = &qNodes[j];
886 1.3 oster qNodes[j].antecedents[i] = &readDataNodes[i];
887 1.3 oster qNodes[j].antType[i] = rf_trueData;
888 1.3 oster }
889 1.3 oster }
890 1.3 oster }
891 1.3 oster /* connect read old parity nodes to xor nodes */
892 1.3 oster for (i = 0; i < numParityNodes; i++) {
893 1.3 oster RF_ASSERT(readParityNodes[i].numSuccedents == numParityNodes);
894 1.3 oster for (j = 0; j < numParityNodes; j++) {
895 1.3 oster readParityNodes[i].succedents[j] = &xorNodes[j];
896 1.3 oster xorNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
897 1.3 oster xorNodes[j].antType[numDataNodes + i] = rf_trueData;
898 1.3 oster }
899 1.3 oster }
900 1.3 oster
901 1.3 oster /* connect read old q nodes to q nodes */
902 1.3 oster if (nfaults == 2) {
903 1.3 oster for (i = 0; i < numParityNodes; i++) {
904 1.3 oster RF_ASSERT(readParityNodes[i].numSuccedents == numParityNodes);
905 1.3 oster for (j = 0; j < numParityNodes; j++) {
906 1.3 oster readQNodes[i].succedents[j] = &qNodes[j];
907 1.3 oster qNodes[j].antecedents[numDataNodes + i] = &readQNodes[i];
908 1.3 oster qNodes[j].antType[numDataNodes + i] = rf_trueData;
909 1.3 oster }
910 1.3 oster }
911 1.3 oster }
912 1.3 oster /* connect xor nodes to commit node */
913 1.3 oster RF_ASSERT(commitNode->numAntecedents == (nfaults * numParityNodes));
914 1.3 oster for (i = 0; i < numParityNodes; i++) {
915 1.3 oster RF_ASSERT(xorNodes[i].numSuccedents == 1);
916 1.3 oster xorNodes[i].succedents[0] = commitNode;
917 1.3 oster commitNode->antecedents[i] = &xorNodes[i];
918 1.3 oster commitNode->antType[i] = rf_control;
919 1.3 oster }
920 1.3 oster
921 1.3 oster /* connect q nodes to commit node */
922 1.3 oster if (nfaults == 2) {
923 1.3 oster for (i = 0; i < numParityNodes; i++) {
924 1.3 oster RF_ASSERT(qNodes[i].numSuccedents == 1);
925 1.3 oster qNodes[i].succedents[0] = commitNode;
926 1.3 oster commitNode->antecedents[i + numParityNodes] = &qNodes[i];
927 1.3 oster commitNode->antType[i + numParityNodes] = rf_control;
928 1.3 oster }
929 1.3 oster }
930 1.3 oster /* connect commit node to write nodes */
931 1.3 oster RF_ASSERT(commitNode->numSuccedents == (numDataNodes + (nfaults * numParityNodes)));
932 1.3 oster for (i = 0; i < numDataNodes; i++) {
933 1.3 oster RF_ASSERT(writeDataNodes[i].numAntecedents == 1);
934 1.3 oster commitNode->succedents[i] = &writeDataNodes[i];
935 1.3 oster writeDataNodes[i].antecedents[0] = commitNode;
936 1.3 oster writeDataNodes[i].antType[0] = rf_trueData;
937 1.3 oster }
938 1.3 oster for (i = 0; i < numParityNodes; i++) {
939 1.3 oster RF_ASSERT(writeParityNodes[i].numAntecedents == 1);
940 1.3 oster commitNode->succedents[i + numDataNodes] = &writeParityNodes[i];
941 1.3 oster writeParityNodes[i].antecedents[0] = commitNode;
942 1.3 oster writeParityNodes[i].antType[0] = rf_trueData;
943 1.3 oster }
944 1.3 oster if (nfaults == 2) {
945 1.3 oster for (i = 0; i < numParityNodes; i++) {
946 1.3 oster RF_ASSERT(writeQNodes[i].numAntecedents == 1);
947 1.3 oster commitNode->succedents[i + numDataNodes + numParityNodes] = &writeQNodes[i];
948 1.3 oster writeQNodes[i].antecedents[0] = commitNode;
949 1.3 oster writeQNodes[i].antType[0] = rf_trueData;
950 1.3 oster }
951 1.3 oster }
952 1.3 oster RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
953 1.3 oster RF_ASSERT(termNode->numSuccedents == 0);
954 1.3 oster for (i = 0; i < numDataNodes; i++) {
955 1.16 oster /* connect write new data nodes to term node */
956 1.16 oster RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
957 1.16 oster RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
958 1.16 oster writeDataNodes[i].succedents[0] = termNode;
959 1.16 oster termNode->antecedents[i] = &writeDataNodes[i];
960 1.16 oster termNode->antType[i] = rf_control;
961 1.3 oster }
962 1.3 oster
963 1.3 oster for (i = 0; i < numParityNodes; i++) {
964 1.16 oster RF_ASSERT(writeParityNodes[i].numSuccedents == 1);
965 1.16 oster writeParityNodes[i].succedents[0] = termNode;
966 1.16 oster termNode->antecedents[numDataNodes + i] = &writeParityNodes[i];
967 1.16 oster termNode->antType[numDataNodes + i] = rf_control;
968 1.3 oster }
969 1.3 oster
970 1.3 oster if (nfaults == 2) {
971 1.3 oster for (i = 0; i < numParityNodes; i++) {
972 1.16 oster RF_ASSERT(writeQNodes[i].numSuccedents == 1);
973 1.16 oster writeQNodes[i].succedents[0] = termNode;
974 1.16 oster termNode->antecedents[numDataNodes + numParityNodes + i] = &writeQNodes[i];
975 1.16 oster termNode->antType[numDataNodes + numParityNodes + i] = rf_control;
976 1.3 oster }
977 1.3 oster }
978 1.1 oster }
979 1.1 oster
980 1.1 oster
981 1.1 oster /******************************************************************************
982 1.1 oster * create a write graph (fault-free or degraded) for RAID level 1
983 1.1 oster *
984 1.1 oster * Hdr -> Commit -> Wpd -> Nil -> Trm
985 1.1 oster * -> Wsd ->
986 1.1 oster *
987 1.1 oster * The "Wpd" node writes data to the primary copy in the mirror pair
988 1.1 oster * The "Wsd" node writes data to the secondary copy in the mirror pair
989 1.1 oster *
990 1.1 oster * Parameters: raidPtr - description of the physical array
991 1.1 oster * asmap - logical & physical addresses for this access
992 1.1 oster * bp - buffer ptr (holds write data)
993 1.3 oster * flags - general flags (e.g. disk locking)
994 1.1 oster * allocList - list of memory allocated in DAG creation
995 1.1 oster *****************************************************************************/
996 1.1 oster
997 1.3 oster void
998 1.13 oster rf_CreateRaidOneWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
999 1.13 oster RF_DagHeader_t *dag_h, void *bp,
1000 1.13 oster RF_RaidAccessFlags_t flags,
1001 1.13 oster RF_AllocListElem_t *allocList)
1002 1.1 oster {
1003 1.3 oster RF_DagNode_t *unblockNode, *termNode, *commitNode;
1004 1.3 oster RF_DagNode_t *nodes, *wndNode, *wmirNode;
1005 1.3 oster int nWndNodes, nWmirNodes, i;
1006 1.3 oster RF_ReconUnitNum_t which_ru;
1007 1.3 oster RF_PhysDiskAddr_t *pda, *pdaP;
1008 1.3 oster RF_StripeNum_t parityStripeID;
1009 1.3 oster
1010 1.3 oster parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
1011 1.3 oster asmap->raidAddress, &which_ru);
1012 1.19 oster #if RF_DEBUG_DAG
1013 1.3 oster if (rf_dagDebug) {
1014 1.3 oster printf("[Creating RAID level 1 write DAG]\n");
1015 1.3 oster }
1016 1.19 oster #endif
1017 1.3 oster dag_h->creator = "RaidOneWriteDAG";
1018 1.3 oster
1019 1.3 oster /* 2 implies access not SU aligned */
1020 1.3 oster nWmirNodes = (asmap->parityInfo->next) ? 2 : 1;
1021 1.3 oster nWndNodes = (asmap->physInfo->next) ? 2 : 1;
1022 1.3 oster
1023 1.3 oster /* alloc the Wnd nodes and the Wmir node */
1024 1.3 oster if (asmap->numDataFailed == 1)
1025 1.3 oster nWndNodes--;
1026 1.3 oster if (asmap->numParityFailed == 1)
1027 1.3 oster nWmirNodes--;
1028 1.3 oster
1029 1.3 oster /* total number of nodes = nWndNodes + nWmirNodes + (commit + unblock
1030 1.3 oster * + terminator) */
1031 1.12 oster RF_MallocAndAdd(nodes,
1032 1.12 oster (nWndNodes + nWmirNodes + 3) * sizeof(RF_DagNode_t),
1033 1.12 oster (RF_DagNode_t *), allocList);
1034 1.3 oster i = 0;
1035 1.3 oster wndNode = &nodes[i];
1036 1.3 oster i += nWndNodes;
1037 1.3 oster wmirNode = &nodes[i];
1038 1.3 oster i += nWmirNodes;
1039 1.3 oster commitNode = &nodes[i];
1040 1.3 oster i += 1;
1041 1.3 oster unblockNode = &nodes[i];
1042 1.3 oster i += 1;
1043 1.3 oster termNode = &nodes[i];
1044 1.3 oster i += 1;
1045 1.3 oster RF_ASSERT(i == (nWndNodes + nWmirNodes + 3));
1046 1.3 oster
1047 1.3 oster /* this dag can commit immediately */
1048 1.3 oster dag_h->numCommitNodes = 1;
1049 1.3 oster dag_h->numCommits = 0;
1050 1.3 oster dag_h->numSuccedents = 1;
1051 1.3 oster
1052 1.3 oster /* initialize the commit, unblock, and term nodes */
1053 1.14 oster rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
1054 1.14 oster rf_NullNodeUndoFunc, NULL, (nWndNodes + nWmirNodes),
1055 1.14 oster 0, 0, 0, dag_h, "Cmt", allocList);
1056 1.14 oster rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
1057 1.14 oster rf_NullNodeUndoFunc, NULL, 1, (nWndNodes + nWmirNodes),
1058 1.14 oster 0, 0, dag_h, "Nil", allocList);
1059 1.14 oster rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
1060 1.14 oster rf_TerminateUndoFunc, NULL, 0, 1, 0, 0,
1061 1.14 oster dag_h, "Trm", allocList);
1062 1.3 oster
1063 1.3 oster /* initialize the wnd nodes */
1064 1.3 oster if (nWndNodes > 0) {
1065 1.3 oster pda = asmap->physInfo;
1066 1.3 oster for (i = 0; i < nWndNodes; i++) {
1067 1.14 oster rf_InitNode(&wndNode[i], rf_wait, RF_FALSE,
1068 1.14 oster rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
1069 1.14 oster rf_GenericWakeupFunc, 1, 1, 4, 0,
1070 1.14 oster dag_h, "Wpd", allocList);
1071 1.3 oster RF_ASSERT(pda != NULL);
1072 1.3 oster wndNode[i].params[0].p = pda;
1073 1.3 oster wndNode[i].params[1].p = pda->bufPtr;
1074 1.3 oster wndNode[i].params[2].v = parityStripeID;
1075 1.17 oster wndNode[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
1076 1.3 oster pda = pda->next;
1077 1.3 oster }
1078 1.3 oster RF_ASSERT(pda == NULL);
1079 1.3 oster }
1080 1.3 oster /* initialize the mirror nodes */
1081 1.3 oster if (nWmirNodes > 0) {
1082 1.3 oster pda = asmap->physInfo;
1083 1.3 oster pdaP = asmap->parityInfo;
1084 1.3 oster for (i = 0; i < nWmirNodes; i++) {
1085 1.14 oster rf_InitNode(&wmirNode[i], rf_wait, RF_FALSE,
1086 1.14 oster rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
1087 1.14 oster rf_GenericWakeupFunc, 1, 1, 4, 0,
1088 1.14 oster dag_h, "Wsd", allocList);
1089 1.3 oster RF_ASSERT(pda != NULL);
1090 1.3 oster wmirNode[i].params[0].p = pdaP;
1091 1.3 oster wmirNode[i].params[1].p = pda->bufPtr;
1092 1.3 oster wmirNode[i].params[2].v = parityStripeID;
1093 1.17 oster wmirNode[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
1094 1.3 oster pda = pda->next;
1095 1.3 oster pdaP = pdaP->next;
1096 1.3 oster }
1097 1.3 oster RF_ASSERT(pda == NULL);
1098 1.3 oster RF_ASSERT(pdaP == NULL);
1099 1.3 oster }
1100 1.3 oster /* link the header node to the commit node */
1101 1.3 oster RF_ASSERT(dag_h->numSuccedents == 1);
1102 1.3 oster RF_ASSERT(commitNode->numAntecedents == 0);
1103 1.3 oster dag_h->succedents[0] = commitNode;
1104 1.3 oster
1105 1.3 oster /* link the commit node to the write nodes */
1106 1.3 oster RF_ASSERT(commitNode->numSuccedents == (nWndNodes + nWmirNodes));
1107 1.3 oster for (i = 0; i < nWndNodes; i++) {
1108 1.3 oster RF_ASSERT(wndNode[i].numAntecedents == 1);
1109 1.3 oster commitNode->succedents[i] = &wndNode[i];
1110 1.3 oster wndNode[i].antecedents[0] = commitNode;
1111 1.3 oster wndNode[i].antType[0] = rf_control;
1112 1.3 oster }
1113 1.3 oster for (i = 0; i < nWmirNodes; i++) {
1114 1.3 oster RF_ASSERT(wmirNode[i].numAntecedents == 1);
1115 1.3 oster commitNode->succedents[i + nWndNodes] = &wmirNode[i];
1116 1.3 oster wmirNode[i].antecedents[0] = commitNode;
1117 1.3 oster wmirNode[i].antType[0] = rf_control;
1118 1.3 oster }
1119 1.3 oster
1120 1.3 oster /* link the write nodes to the unblock node */
1121 1.3 oster RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nWmirNodes));
1122 1.3 oster for (i = 0; i < nWndNodes; i++) {
1123 1.3 oster RF_ASSERT(wndNode[i].numSuccedents == 1);
1124 1.3 oster wndNode[i].succedents[0] = unblockNode;
1125 1.3 oster unblockNode->antecedents[i] = &wndNode[i];
1126 1.3 oster unblockNode->antType[i] = rf_control;
1127 1.3 oster }
1128 1.3 oster for (i = 0; i < nWmirNodes; i++) {
1129 1.3 oster RF_ASSERT(wmirNode[i].numSuccedents == 1);
1130 1.3 oster wmirNode[i].succedents[0] = unblockNode;
1131 1.3 oster unblockNode->antecedents[i + nWndNodes] = &wmirNode[i];
1132 1.3 oster unblockNode->antType[i + nWndNodes] = rf_control;
1133 1.3 oster }
1134 1.3 oster
1135 1.3 oster /* link the unblock node to the term node */
1136 1.3 oster RF_ASSERT(unblockNode->numSuccedents == 1);
1137 1.3 oster RF_ASSERT(termNode->numAntecedents == 1);
1138 1.3 oster RF_ASSERT(termNode->numSuccedents == 0);
1139 1.3 oster unblockNode->succedents[0] = termNode;
1140 1.3 oster termNode->antecedents[0] = unblockNode;
1141 1.3 oster termNode->antType[0] = rf_control;
1142 1.1 oster }
1143