rf_dagffwr.c revision 1.21 1 1.21 oster /* $NetBSD: rf_dagffwr.c,v 1.21 2004/03/06 23:52:20 oster Exp $ */
2 1.1 oster /*
3 1.1 oster * Copyright (c) 1995 Carnegie-Mellon University.
4 1.1 oster * All rights reserved.
5 1.1 oster *
6 1.1 oster * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
7 1.1 oster *
8 1.1 oster * Permission to use, copy, modify and distribute this software and
9 1.1 oster * its documentation is hereby granted, provided that both the copyright
10 1.1 oster * notice and this permission notice appear in all copies of the
11 1.1 oster * software, derivative works or modified versions, and any portions
12 1.1 oster * thereof, and that both notices appear in supporting documentation.
13 1.1 oster *
14 1.1 oster * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 1.1 oster * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 1.1 oster * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 1.1 oster *
18 1.1 oster * Carnegie Mellon requests users of this software to return to
19 1.1 oster *
20 1.1 oster * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 1.1 oster * School of Computer Science
22 1.1 oster * Carnegie Mellon University
23 1.1 oster * Pittsburgh PA 15213-3890
24 1.1 oster *
25 1.1 oster * any improvements or extensions that they make and grant Carnegie the
26 1.1 oster * rights to redistribute these changes.
27 1.1 oster */
28 1.1 oster
29 1.1 oster /*
30 1.1 oster * rf_dagffwr.c
31 1.1 oster *
32 1.1 oster * code for creating fault-free DAGs
33 1.1 oster *
34 1.1 oster */
35 1.7 lukem
36 1.7 lukem #include <sys/cdefs.h>
37 1.21 oster __KERNEL_RCSID(0, "$NetBSD: rf_dagffwr.c,v 1.21 2004/03/06 23:52:20 oster Exp $");
38 1.1 oster
39 1.6 oster #include <dev/raidframe/raidframevar.h>
40 1.6 oster
41 1.1 oster #include "rf_raid.h"
42 1.1 oster #include "rf_dag.h"
43 1.1 oster #include "rf_dagutils.h"
44 1.1 oster #include "rf_dagfuncs.h"
45 1.1 oster #include "rf_debugMem.h"
46 1.1 oster #include "rf_dagffrd.h"
47 1.1 oster #include "rf_general.h"
48 1.1 oster #include "rf_dagffwr.h"
49 1.1 oster
50 1.1 oster /******************************************************************************
51 1.1 oster *
52 1.1 oster * General comments on DAG creation:
53 1.3 oster *
54 1.1 oster * All DAGs in this file use roll-away error recovery. Each DAG has a single
55 1.1 oster * commit node, usually called "Cmt." If an error occurs before the Cmt node
56 1.1 oster * is reached, the execution engine will halt forward execution and work
57 1.1 oster * backward through the graph, executing the undo functions. Assuming that
58 1.1 oster * each node in the graph prior to the Cmt node is undoable and atomic - or -
59 1.1 oster * does not make changes to permanent state, the graph will fail atomically.
60 1.1 oster * If an error occurs after the Cmt node executes, the engine will roll-forward
61 1.1 oster * through the graph, blindly executing nodes until it reaches the end.
62 1.1 oster * If a graph reaches the end, it is assumed to have completed successfully.
63 1.1 oster *
64 1.1 oster * A graph has only 1 Cmt node.
65 1.1 oster *
66 1.1 oster */
67 1.1 oster
68 1.1 oster
69 1.1 oster /******************************************************************************
70 1.1 oster *
71 1.1 oster * The following wrappers map the standard DAG creation interface to the
72 1.1 oster * DAG creation routines. Additionally, these wrappers enable experimentation
73 1.1 oster * with new DAG structures by providing an extra level of indirection, allowing
74 1.1 oster * the DAG creation routines to be replaced at this single point.
75 1.1 oster */
76 1.1 oster
77 1.1 oster
78 1.3 oster void
79 1.13 oster rf_CreateNonRedundantWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
80 1.13 oster RF_DagHeader_t *dag_h, void *bp,
81 1.13 oster RF_RaidAccessFlags_t flags,
82 1.13 oster RF_AllocListElem_t *allocList,
83 1.13 oster RF_IoType_t type)
84 1.1 oster {
85 1.3 oster rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
86 1.14 oster RF_IO_TYPE_WRITE);
87 1.1 oster }
88 1.1 oster
89 1.3 oster void
90 1.13 oster rf_CreateRAID0WriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
91 1.13 oster RF_DagHeader_t *dag_h, void *bp,
92 1.13 oster RF_RaidAccessFlags_t flags,
93 1.13 oster RF_AllocListElem_t *allocList,
94 1.13 oster RF_IoType_t type)
95 1.1 oster {
96 1.3 oster rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
97 1.14 oster RF_IO_TYPE_WRITE);
98 1.1 oster }
99 1.1 oster
100 1.3 oster void
101 1.13 oster rf_CreateSmallWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
102 1.13 oster RF_DagHeader_t *dag_h, void *bp,
103 1.13 oster RF_RaidAccessFlags_t flags,
104 1.13 oster RF_AllocListElem_t *allocList)
105 1.1 oster {
106 1.3 oster /* "normal" rollaway */
107 1.14 oster rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags,
108 1.14 oster allocList, &rf_xorFuncs, NULL);
109 1.1 oster }
110 1.1 oster
111 1.3 oster void
112 1.13 oster rf_CreateLargeWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
113 1.13 oster RF_DagHeader_t *dag_h, void *bp,
114 1.13 oster RF_RaidAccessFlags_t flags,
115 1.13 oster RF_AllocListElem_t *allocList)
116 1.1 oster {
117 1.3 oster /* "normal" rollaway */
118 1.14 oster rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags,
119 1.14 oster allocList, 1, rf_RegularXorFunc, RF_TRUE);
120 1.1 oster }
121 1.1 oster
122 1.1 oster
123 1.1 oster /******************************************************************************
124 1.1 oster *
125 1.1 oster * DAG creation code begins here
126 1.1 oster */
127 1.1 oster
128 1.1 oster
129 1.1 oster /******************************************************************************
130 1.1 oster *
131 1.1 oster * creates a DAG to perform a large-write operation:
132 1.1 oster *
133 1.1 oster * / Rod \ / Wnd \
134 1.1 oster * H -- block- Rod - Xor - Cmt - Wnd --- T
135 1.1 oster * \ Rod / \ Wnp /
136 1.1 oster * \[Wnq]/
137 1.1 oster *
138 1.1 oster * The XOR node also does the Q calculation in the P+Q architecture.
139 1.1 oster * All nodes are before the commit node (Cmt) are assumed to be atomic and
140 1.1 oster * undoable - or - they make no changes to permanent state.
141 1.1 oster *
142 1.1 oster * Rod = read old data
143 1.1 oster * Cmt = commit node
144 1.1 oster * Wnp = write new parity
145 1.1 oster * Wnd = write new data
146 1.1 oster * Wnq = write new "q"
147 1.1 oster * [] denotes optional segments in the graph
148 1.1 oster *
149 1.1 oster * Parameters: raidPtr - description of the physical array
150 1.1 oster * asmap - logical & physical addresses for this access
151 1.1 oster * bp - buffer ptr (holds write data)
152 1.3 oster * flags - general flags (e.g. disk locking)
153 1.1 oster * allocList - list of memory allocated in DAG creation
154 1.1 oster * nfaults - number of faults array can tolerate
155 1.1 oster * (equal to # redundancy units in stripe)
156 1.1 oster * redfuncs - list of redundancy generating functions
157 1.1 oster *
158 1.1 oster *****************************************************************************/
159 1.1 oster
160 1.3 oster void
161 1.13 oster rf_CommonCreateLargeWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
162 1.13 oster RF_DagHeader_t *dag_h, void *bp,
163 1.13 oster RF_RaidAccessFlags_t flags,
164 1.13 oster RF_AllocListElem_t *allocList,
165 1.13 oster int nfaults, int (*redFunc) (RF_DagNode_t *),
166 1.13 oster int allowBufferRecycle)
167 1.1 oster {
168 1.3 oster RF_DagNode_t *nodes, *wndNodes, *rodNodes, *xorNode, *wnpNode;
169 1.3 oster RF_DagNode_t *wnqNode, *blockNode, *commitNode, *termNode;
170 1.3 oster int nWndNodes, nRodNodes, i, nodeNum, asmNum;
171 1.3 oster RF_AccessStripeMapHeader_t *new_asm_h[2];
172 1.3 oster RF_StripeNum_t parityStripeID;
173 1.3 oster char *sosBuffer, *eosBuffer;
174 1.3 oster RF_ReconUnitNum_t which_ru;
175 1.3 oster RF_RaidLayout_t *layoutPtr;
176 1.3 oster RF_PhysDiskAddr_t *pda;
177 1.3 oster
178 1.3 oster layoutPtr = &(raidPtr->Layout);
179 1.14 oster parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr,
180 1.14 oster asmap->raidAddress,
181 1.14 oster &which_ru);
182 1.3 oster
183 1.19 oster #if RF_DEBUG_DAG
184 1.3 oster if (rf_dagDebug) {
185 1.3 oster printf("[Creating large-write DAG]\n");
186 1.3 oster }
187 1.19 oster #endif
188 1.3 oster dag_h->creator = "LargeWriteDAG";
189 1.3 oster
190 1.3 oster dag_h->numCommitNodes = 1;
191 1.3 oster dag_h->numCommits = 0;
192 1.3 oster dag_h->numSuccedents = 1;
193 1.3 oster
194 1.3 oster /* alloc the nodes: Wnd, xor, commit, block, term, and Wnp */
195 1.3 oster nWndNodes = asmap->numStripeUnitsAccessed;
196 1.12 oster RF_MallocAndAdd(nodes,
197 1.12 oster (nWndNodes + 4 + nfaults) * sizeof(RF_DagNode_t),
198 1.12 oster (RF_DagNode_t *), allocList);
199 1.3 oster i = 0;
200 1.3 oster wndNodes = &nodes[i];
201 1.3 oster i += nWndNodes;
202 1.3 oster xorNode = &nodes[i];
203 1.3 oster i += 1;
204 1.3 oster wnpNode = &nodes[i];
205 1.3 oster i += 1;
206 1.3 oster blockNode = &nodes[i];
207 1.3 oster i += 1;
208 1.3 oster commitNode = &nodes[i];
209 1.3 oster i += 1;
210 1.3 oster termNode = &nodes[i];
211 1.3 oster i += 1;
212 1.20 oster #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
213 1.3 oster if (nfaults == 2) {
214 1.3 oster wnqNode = &nodes[i];
215 1.3 oster i += 1;
216 1.3 oster } else {
217 1.20 oster #endif
218 1.3 oster wnqNode = NULL;
219 1.20 oster #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
220 1.3 oster }
221 1.20 oster #endif
222 1.14 oster rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h,
223 1.14 oster new_asm_h, &nRodNodes, &sosBuffer,
224 1.14 oster &eosBuffer, allocList);
225 1.3 oster if (nRodNodes > 0) {
226 1.12 oster RF_MallocAndAdd(rodNodes, nRodNodes * sizeof(RF_DagNode_t),
227 1.12 oster (RF_DagNode_t *), allocList);
228 1.3 oster } else {
229 1.3 oster rodNodes = NULL;
230 1.3 oster }
231 1.3 oster
232 1.3 oster /* begin node initialization */
233 1.3 oster if (nRodNodes > 0) {
234 1.14 oster rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
235 1.14 oster rf_NullNodeUndoFunc, NULL, nRodNodes, 0, 0, 0,
236 1.14 oster dag_h, "Nil", allocList);
237 1.3 oster } else {
238 1.14 oster rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
239 1.14 oster rf_NullNodeUndoFunc, NULL, 1, 0, 0, 0,
240 1.14 oster dag_h, "Nil", allocList);
241 1.3 oster }
242 1.3 oster
243 1.14 oster rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
244 1.14 oster rf_NullNodeUndoFunc, NULL, nWndNodes + nfaults, 1, 0, 0,
245 1.14 oster dag_h, "Cmt", allocList);
246 1.14 oster rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
247 1.14 oster rf_TerminateUndoFunc, NULL, 0, nWndNodes + nfaults, 0, 0,
248 1.14 oster dag_h, "Trm", allocList);
249 1.3 oster
250 1.3 oster /* initialize the Rod nodes */
251 1.3 oster for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) {
252 1.3 oster if (new_asm_h[asmNum]) {
253 1.3 oster pda = new_asm_h[asmNum]->stripeMap->physInfo;
254 1.3 oster while (pda) {
255 1.14 oster rf_InitNode(&rodNodes[nodeNum], rf_wait,
256 1.14 oster RF_FALSE, rf_DiskReadFunc,
257 1.14 oster rf_DiskReadUndoFunc,
258 1.14 oster rf_GenericWakeupFunc,
259 1.14 oster 1, 1, 4, 0, dag_h,
260 1.14 oster "Rod", allocList);
261 1.3 oster rodNodes[nodeNum].params[0].p = pda;
262 1.3 oster rodNodes[nodeNum].params[1].p = pda->bufPtr;
263 1.3 oster rodNodes[nodeNum].params[2].v = parityStripeID;
264 1.3 oster rodNodes[nodeNum].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
265 1.17 oster which_ru);
266 1.3 oster nodeNum++;
267 1.3 oster pda = pda->next;
268 1.3 oster }
269 1.3 oster }
270 1.3 oster }
271 1.3 oster RF_ASSERT(nodeNum == nRodNodes);
272 1.3 oster
273 1.3 oster /* initialize the wnd nodes */
274 1.3 oster pda = asmap->physInfo;
275 1.3 oster for (i = 0; i < nWndNodes; i++) {
276 1.14 oster rf_InitNode(&wndNodes[i], rf_wait, RF_FALSE,
277 1.14 oster rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
278 1.14 oster rf_GenericWakeupFunc, 1, 1, 4, 0,
279 1.14 oster dag_h, "Wnd", allocList);
280 1.3 oster RF_ASSERT(pda != NULL);
281 1.3 oster wndNodes[i].params[0].p = pda;
282 1.3 oster wndNodes[i].params[1].p = pda->bufPtr;
283 1.3 oster wndNodes[i].params[2].v = parityStripeID;
284 1.17 oster wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
285 1.3 oster pda = pda->next;
286 1.3 oster }
287 1.3 oster
288 1.3 oster /* initialize the redundancy node */
289 1.3 oster if (nRodNodes > 0) {
290 1.14 oster rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc,
291 1.14 oster rf_NullNodeUndoFunc, NULL, 1,
292 1.14 oster nRodNodes, 2 * (nWndNodes + nRodNodes) + 1,
293 1.14 oster nfaults, dag_h, "Xr ", allocList);
294 1.3 oster } else {
295 1.14 oster rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc,
296 1.14 oster rf_NullNodeUndoFunc, NULL, 1,
297 1.14 oster 1, 2 * (nWndNodes + nRodNodes) + 1,
298 1.14 oster nfaults, dag_h, "Xr ", allocList);
299 1.3 oster }
300 1.3 oster xorNode->flags |= RF_DAGNODE_FLAG_YIELD;
301 1.3 oster for (i = 0; i < nWndNodes; i++) {
302 1.14 oster /* pda */
303 1.14 oster xorNode->params[2 * i + 0] = wndNodes[i].params[0];
304 1.14 oster /* buf ptr */
305 1.14 oster xorNode->params[2 * i + 1] = wndNodes[i].params[1];
306 1.3 oster }
307 1.3 oster for (i = 0; i < nRodNodes; i++) {
308 1.14 oster /* pda */
309 1.14 oster xorNode->params[2 * (nWndNodes + i) + 0] = rodNodes[i].params[0];
310 1.14 oster /* buf ptr */
311 1.14 oster xorNode->params[2 * (nWndNodes + i) + 1] = rodNodes[i].params[1];
312 1.3 oster }
313 1.3 oster /* xor node needs to get at RAID information */
314 1.3 oster xorNode->params[2 * (nWndNodes + nRodNodes)].p = raidPtr;
315 1.3 oster
316 1.3 oster /*
317 1.14 oster * Look for an Rod node that reads a complete SU. If none,
318 1.14 oster * alloc a buffer to receive the parity info. Note that we
319 1.14 oster * can't use a new data buffer because it will not have gotten
320 1.14 oster * written when the xor occurs. */
321 1.3 oster if (allowBufferRecycle) {
322 1.3 oster for (i = 0; i < nRodNodes; i++) {
323 1.3 oster if (((RF_PhysDiskAddr_t *) rodNodes[i].params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit)
324 1.3 oster break;
325 1.3 oster }
326 1.3 oster }
327 1.3 oster if ((!allowBufferRecycle) || (i == nRodNodes)) {
328 1.12 oster RF_MallocAndAdd(xorNode->results[0],
329 1.12 oster rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit),
330 1.12 oster (void *), allocList);
331 1.3 oster } else {
332 1.3 oster xorNode->results[0] = rodNodes[i].params[1].p;
333 1.3 oster }
334 1.3 oster
335 1.3 oster /* initialize the Wnp node */
336 1.14 oster rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc,
337 1.14 oster rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0,
338 1.14 oster dag_h, "Wnp", allocList);
339 1.3 oster wnpNode->params[0].p = asmap->parityInfo;
340 1.3 oster wnpNode->params[1].p = xorNode->results[0];
341 1.3 oster wnpNode->params[2].v = parityStripeID;
342 1.17 oster wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
343 1.3 oster /* parityInfo must describe entire parity unit */
344 1.3 oster RF_ASSERT(asmap->parityInfo->next == NULL);
345 1.3 oster
346 1.20 oster #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
347 1.3 oster if (nfaults == 2) {
348 1.3 oster /*
349 1.3 oster * We never try to recycle a buffer for the Q calcuation
350 1.3 oster * in addition to the parity. This would cause two buffers
351 1.3 oster * to get smashed during the P and Q calculation, guaranteeing
352 1.3 oster * one would be wrong.
353 1.3 oster */
354 1.12 oster RF_MallocAndAdd(xorNode->results[1],
355 1.12 oster rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit),
356 1.12 oster (void *), allocList);
357 1.14 oster rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc,
358 1.14 oster rf_DiskWriteUndoFunc, rf_GenericWakeupFunc,
359 1.14 oster 1, 1, 4, 0, dag_h, "Wnq", allocList);
360 1.3 oster wnqNode->params[0].p = asmap->qInfo;
361 1.3 oster wnqNode->params[1].p = xorNode->results[1];
362 1.3 oster wnqNode->params[2].v = parityStripeID;
363 1.17 oster wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
364 1.3 oster /* parityInfo must describe entire parity unit */
365 1.3 oster RF_ASSERT(asmap->parityInfo->next == NULL);
366 1.3 oster }
367 1.20 oster #endif
368 1.3 oster /*
369 1.3 oster * Connect nodes to form graph.
370 1.3 oster */
371 1.3 oster
372 1.3 oster /* connect dag header to block node */
373 1.3 oster RF_ASSERT(blockNode->numAntecedents == 0);
374 1.3 oster dag_h->succedents[0] = blockNode;
375 1.3 oster
376 1.3 oster if (nRodNodes > 0) {
377 1.3 oster /* connect the block node to the Rod nodes */
378 1.3 oster RF_ASSERT(blockNode->numSuccedents == nRodNodes);
379 1.3 oster RF_ASSERT(xorNode->numAntecedents == nRodNodes);
380 1.3 oster for (i = 0; i < nRodNodes; i++) {
381 1.3 oster RF_ASSERT(rodNodes[i].numAntecedents == 1);
382 1.3 oster blockNode->succedents[i] = &rodNodes[i];
383 1.3 oster rodNodes[i].antecedents[0] = blockNode;
384 1.3 oster rodNodes[i].antType[0] = rf_control;
385 1.3 oster
386 1.3 oster /* connect the Rod nodes to the Xor node */
387 1.3 oster RF_ASSERT(rodNodes[i].numSuccedents == 1);
388 1.3 oster rodNodes[i].succedents[0] = xorNode;
389 1.3 oster xorNode->antecedents[i] = &rodNodes[i];
390 1.3 oster xorNode->antType[i] = rf_trueData;
391 1.3 oster }
392 1.3 oster } else {
393 1.3 oster /* connect the block node to the Xor node */
394 1.3 oster RF_ASSERT(blockNode->numSuccedents == 1);
395 1.3 oster RF_ASSERT(xorNode->numAntecedents == 1);
396 1.3 oster blockNode->succedents[0] = xorNode;
397 1.3 oster xorNode->antecedents[0] = blockNode;
398 1.3 oster xorNode->antType[0] = rf_control;
399 1.3 oster }
400 1.3 oster
401 1.3 oster /* connect the xor node to the commit node */
402 1.3 oster RF_ASSERT(xorNode->numSuccedents == 1);
403 1.3 oster RF_ASSERT(commitNode->numAntecedents == 1);
404 1.3 oster xorNode->succedents[0] = commitNode;
405 1.3 oster commitNode->antecedents[0] = xorNode;
406 1.3 oster commitNode->antType[0] = rf_control;
407 1.3 oster
408 1.3 oster /* connect the commit node to the write nodes */
409 1.3 oster RF_ASSERT(commitNode->numSuccedents == nWndNodes + nfaults);
410 1.3 oster for (i = 0; i < nWndNodes; i++) {
411 1.3 oster RF_ASSERT(wndNodes->numAntecedents == 1);
412 1.3 oster commitNode->succedents[i] = &wndNodes[i];
413 1.3 oster wndNodes[i].antecedents[0] = commitNode;
414 1.3 oster wndNodes[i].antType[0] = rf_control;
415 1.3 oster }
416 1.3 oster RF_ASSERT(wnpNode->numAntecedents == 1);
417 1.3 oster commitNode->succedents[nWndNodes] = wnpNode;
418 1.3 oster wnpNode->antecedents[0] = commitNode;
419 1.3 oster wnpNode->antType[0] = rf_trueData;
420 1.20 oster #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
421 1.3 oster if (nfaults == 2) {
422 1.3 oster RF_ASSERT(wnqNode->numAntecedents == 1);
423 1.3 oster commitNode->succedents[nWndNodes + 1] = wnqNode;
424 1.3 oster wnqNode->antecedents[0] = commitNode;
425 1.3 oster wnqNode->antType[0] = rf_trueData;
426 1.3 oster }
427 1.20 oster #endif
428 1.3 oster /* connect the write nodes to the term node */
429 1.3 oster RF_ASSERT(termNode->numAntecedents == nWndNodes + nfaults);
430 1.3 oster RF_ASSERT(termNode->numSuccedents == 0);
431 1.3 oster for (i = 0; i < nWndNodes; i++) {
432 1.3 oster RF_ASSERT(wndNodes->numSuccedents == 1);
433 1.3 oster wndNodes[i].succedents[0] = termNode;
434 1.3 oster termNode->antecedents[i] = &wndNodes[i];
435 1.3 oster termNode->antType[i] = rf_control;
436 1.3 oster }
437 1.3 oster RF_ASSERT(wnpNode->numSuccedents == 1);
438 1.3 oster wnpNode->succedents[0] = termNode;
439 1.3 oster termNode->antecedents[nWndNodes] = wnpNode;
440 1.3 oster termNode->antType[nWndNodes] = rf_control;
441 1.20 oster #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
442 1.3 oster if (nfaults == 2) {
443 1.3 oster RF_ASSERT(wnqNode->numSuccedents == 1);
444 1.3 oster wnqNode->succedents[0] = termNode;
445 1.3 oster termNode->antecedents[nWndNodes + 1] = wnqNode;
446 1.3 oster termNode->antType[nWndNodes + 1] = rf_control;
447 1.3 oster }
448 1.20 oster #endif
449 1.1 oster }
450 1.1 oster /******************************************************************************
451 1.1 oster *
452 1.1 oster * creates a DAG to perform a small-write operation (either raid 5 or pq),
453 1.1 oster * which is as follows:
454 1.1 oster *
455 1.1 oster * Hdr -> Nil -> Rop -> Xor -> Cmt ----> Wnp [Unp] --> Trm
456 1.1 oster * \- Rod X / \----> Wnd [Und]-/
457 1.1 oster * [\- Rod X / \---> Wnd [Und]-/]
458 1.1 oster * [\- Roq -> Q / \--> Wnq [Unq]-/]
459 1.1 oster *
460 1.1 oster * Rop = read old parity
461 1.1 oster * Rod = read old data
462 1.1 oster * Roq = read old "q"
463 1.1 oster * Cmt = commit node
464 1.1 oster * Und = unlock data disk
465 1.1 oster * Unp = unlock parity disk
466 1.1 oster * Unq = unlock q disk
467 1.1 oster * Wnp = write new parity
468 1.1 oster * Wnd = write new data
469 1.1 oster * Wnq = write new "q"
470 1.1 oster * [ ] denotes optional segments in the graph
471 1.1 oster *
472 1.1 oster * Parameters: raidPtr - description of the physical array
473 1.1 oster * asmap - logical & physical addresses for this access
474 1.1 oster * bp - buffer ptr (holds write data)
475 1.3 oster * flags - general flags (e.g. disk locking)
476 1.1 oster * allocList - list of memory allocated in DAG creation
477 1.1 oster * pfuncs - list of parity generating functions
478 1.1 oster * qfuncs - list of q generating functions
479 1.1 oster *
480 1.1 oster * A null qfuncs indicates single fault tolerant
481 1.1 oster *****************************************************************************/
482 1.1 oster
483 1.3 oster void
484 1.13 oster rf_CommonCreateSmallWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
485 1.13 oster RF_DagHeader_t *dag_h, void *bp,
486 1.13 oster RF_RaidAccessFlags_t flags,
487 1.13 oster RF_AllocListElem_t *allocList,
488 1.13 oster const RF_RedFuncs_t *pfuncs,
489 1.13 oster const RF_RedFuncs_t *qfuncs)
490 1.1 oster {
491 1.3 oster RF_DagNode_t *readDataNodes, *readParityNodes, *readQNodes, *termNode;
492 1.3 oster RF_DagNode_t *xorNodes, *qNodes, *blockNode, *commitNode, *nodes;
493 1.3 oster RF_DagNode_t *writeDataNodes, *writeParityNodes, *writeQNodes;
494 1.16 oster int i, j, nNodes, totalNumNodes;
495 1.3 oster RF_ReconUnitNum_t which_ru;
496 1.3 oster int (*func) (RF_DagNode_t *), (*undoFunc) (RF_DagNode_t *);
497 1.3 oster int (*qfunc) (RF_DagNode_t *);
498 1.3 oster int numDataNodes, numParityNodes;
499 1.3 oster RF_StripeNum_t parityStripeID;
500 1.3 oster RF_PhysDiskAddr_t *pda;
501 1.3 oster char *name, *qname;
502 1.3 oster long nfaults;
503 1.3 oster
504 1.3 oster nfaults = qfuncs ? 2 : 1;
505 1.3 oster
506 1.3 oster parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
507 1.3 oster asmap->raidAddress, &which_ru);
508 1.3 oster pda = asmap->physInfo;
509 1.3 oster numDataNodes = asmap->numStripeUnitsAccessed;
510 1.3 oster numParityNodes = (asmap->parityInfo->next) ? 2 : 1;
511 1.3 oster
512 1.19 oster #if RF_DEBUG_DAG
513 1.3 oster if (rf_dagDebug) {
514 1.3 oster printf("[Creating small-write DAG]\n");
515 1.3 oster }
516 1.19 oster #endif
517 1.3 oster RF_ASSERT(numDataNodes > 0);
518 1.3 oster dag_h->creator = "SmallWriteDAG";
519 1.3 oster
520 1.3 oster dag_h->numCommitNodes = 1;
521 1.3 oster dag_h->numCommits = 0;
522 1.3 oster dag_h->numSuccedents = 1;
523 1.3 oster
524 1.3 oster /*
525 1.3 oster * DAG creation occurs in four steps:
526 1.3 oster * 1. count the number of nodes in the DAG
527 1.3 oster * 2. create the nodes
528 1.3 oster * 3. initialize the nodes
529 1.3 oster * 4. connect the nodes
530 1.3 oster */
531 1.3 oster
532 1.3 oster /*
533 1.3 oster * Step 1. compute number of nodes in the graph
534 1.3 oster */
535 1.3 oster
536 1.14 oster /* number of nodes: a read and write for each data unit a
537 1.14 oster * redundancy computation node for each parity node (nfaults *
538 1.14 oster * nparity) a read and write for each parity unit a block and
539 1.14 oster * commit node (2) a terminate node if atomic RMW an unlock
540 1.14 oster * node for each data unit, redundancy unit */
541 1.3 oster totalNumNodes = (2 * numDataNodes) + (nfaults * numParityNodes)
542 1.3 oster + (nfaults * 2 * numParityNodes) + 3;
543 1.3 oster /*
544 1.3 oster * Step 2. create the nodes
545 1.3 oster */
546 1.12 oster RF_MallocAndAdd(nodes, totalNumNodes * sizeof(RF_DagNode_t),
547 1.12 oster (RF_DagNode_t *), allocList);
548 1.3 oster i = 0;
549 1.3 oster blockNode = &nodes[i];
550 1.3 oster i += 1;
551 1.3 oster commitNode = &nodes[i];
552 1.3 oster i += 1;
553 1.3 oster readDataNodes = &nodes[i];
554 1.3 oster i += numDataNodes;
555 1.3 oster readParityNodes = &nodes[i];
556 1.3 oster i += numParityNodes;
557 1.3 oster writeDataNodes = &nodes[i];
558 1.3 oster i += numDataNodes;
559 1.3 oster writeParityNodes = &nodes[i];
560 1.3 oster i += numParityNodes;
561 1.3 oster xorNodes = &nodes[i];
562 1.3 oster i += numParityNodes;
563 1.3 oster termNode = &nodes[i];
564 1.3 oster i += 1;
565 1.16 oster
566 1.20 oster #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
567 1.3 oster if (nfaults == 2) {
568 1.3 oster readQNodes = &nodes[i];
569 1.3 oster i += numParityNodes;
570 1.3 oster writeQNodes = &nodes[i];
571 1.3 oster i += numParityNodes;
572 1.3 oster qNodes = &nodes[i];
573 1.3 oster i += numParityNodes;
574 1.3 oster } else {
575 1.20 oster #endif
576 1.18 oster readQNodes = writeQNodes = qNodes = NULL;
577 1.20 oster #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
578 1.3 oster }
579 1.20 oster #endif
580 1.3 oster RF_ASSERT(i == totalNumNodes);
581 1.3 oster
582 1.3 oster /*
583 1.3 oster * Step 3. initialize the nodes
584 1.3 oster */
585 1.3 oster /* initialize block node (Nil) */
586 1.3 oster nNodes = numDataNodes + (nfaults * numParityNodes);
587 1.14 oster rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
588 1.14 oster rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0,
589 1.14 oster dag_h, "Nil", allocList);
590 1.3 oster
591 1.3 oster /* initialize commit node (Cmt) */
592 1.14 oster rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
593 1.14 oster rf_NullNodeUndoFunc, NULL, nNodes,
594 1.14 oster (nfaults * numParityNodes), 0, 0, dag_h, "Cmt", allocList);
595 1.3 oster
596 1.3 oster /* initialize terminate node (Trm) */
597 1.14 oster rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
598 1.14 oster rf_TerminateUndoFunc, NULL, 0, nNodes, 0, 0,
599 1.14 oster dag_h, "Trm", allocList);
600 1.3 oster
601 1.3 oster /* initialize nodes which read old data (Rod) */
602 1.3 oster for (i = 0; i < numDataNodes; i++) {
603 1.14 oster rf_InitNode(&readDataNodes[i], rf_wait, RF_FALSE,
604 1.14 oster rf_DiskReadFunc, rf_DiskReadUndoFunc,
605 1.14 oster rf_GenericWakeupFunc, (nfaults * numParityNodes),
606 1.14 oster 1, 4, 0, dag_h, "Rod", allocList);
607 1.3 oster RF_ASSERT(pda != NULL);
608 1.3 oster /* physical disk addr desc */
609 1.3 oster readDataNodes[i].params[0].p = pda;
610 1.3 oster /* buffer to hold old data */
611 1.21 oster readDataNodes[i].params[1].p = rf_AllocBuffer(raidPtr, pda, allocList);
612 1.3 oster readDataNodes[i].params[2].v = parityStripeID;
613 1.3 oster readDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
614 1.17 oster which_ru);
615 1.3 oster pda = pda->next;
616 1.3 oster for (j = 0; j < readDataNodes[i].numSuccedents; j++) {
617 1.3 oster readDataNodes[i].propList[j] = NULL;
618 1.3 oster }
619 1.3 oster }
620 1.3 oster
621 1.3 oster /* initialize nodes which read old parity (Rop) */
622 1.3 oster pda = asmap->parityInfo;
623 1.3 oster i = 0;
624 1.3 oster for (i = 0; i < numParityNodes; i++) {
625 1.3 oster RF_ASSERT(pda != NULL);
626 1.14 oster rf_InitNode(&readParityNodes[i], rf_wait, RF_FALSE,
627 1.14 oster rf_DiskReadFunc, rf_DiskReadUndoFunc,
628 1.14 oster rf_GenericWakeupFunc, numParityNodes, 1, 4, 0,
629 1.14 oster dag_h, "Rop", allocList);
630 1.3 oster readParityNodes[i].params[0].p = pda;
631 1.3 oster /* buffer to hold old parity */
632 1.21 oster readParityNodes[i].params[1].p = rf_AllocBuffer(raidPtr, pda, allocList);
633 1.3 oster readParityNodes[i].params[2].v = parityStripeID;
634 1.3 oster readParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
635 1.17 oster which_ru);
636 1.3 oster pda = pda->next;
637 1.3 oster for (j = 0; j < readParityNodes[i].numSuccedents; j++) {
638 1.3 oster readParityNodes[i].propList[0] = NULL;
639 1.3 oster }
640 1.3 oster }
641 1.3 oster
642 1.20 oster #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
643 1.3 oster /* initialize nodes which read old Q (Roq) */
644 1.3 oster if (nfaults == 2) {
645 1.3 oster pda = asmap->qInfo;
646 1.3 oster for (i = 0; i < numParityNodes; i++) {
647 1.3 oster RF_ASSERT(pda != NULL);
648 1.14 oster rf_InitNode(&readQNodes[i], rf_wait, RF_FALSE,
649 1.14 oster rf_DiskReadFunc, rf_DiskReadUndoFunc,
650 1.14 oster rf_GenericWakeupFunc, numParityNodes,
651 1.14 oster 1, 4, 0, dag_h, "Roq", allocList);
652 1.3 oster readQNodes[i].params[0].p = pda;
653 1.3 oster /* buffer to hold old Q */
654 1.21 oster readQNodes[i].params[1].p = rf_AllocBuffer(raidPtr, pda, allocList);
655 1.3 oster readQNodes[i].params[2].v = parityStripeID;
656 1.3 oster readQNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
657 1.17 oster which_ru);
658 1.3 oster pda = pda->next;
659 1.3 oster for (j = 0; j < readQNodes[i].numSuccedents; j++) {
660 1.3 oster readQNodes[i].propList[0] = NULL;
661 1.3 oster }
662 1.3 oster }
663 1.3 oster }
664 1.20 oster #endif
665 1.3 oster /* initialize nodes which write new data (Wnd) */
666 1.3 oster pda = asmap->physInfo;
667 1.3 oster for (i = 0; i < numDataNodes; i++) {
668 1.3 oster RF_ASSERT(pda != NULL);
669 1.14 oster rf_InitNode(&writeDataNodes[i], rf_wait, RF_FALSE,
670 1.14 oster rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
671 1.14 oster rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
672 1.14 oster "Wnd", allocList);
673 1.3 oster /* physical disk addr desc */
674 1.3 oster writeDataNodes[i].params[0].p = pda;
675 1.3 oster /* buffer holding new data to be written */
676 1.3 oster writeDataNodes[i].params[1].p = pda->bufPtr;
677 1.3 oster writeDataNodes[i].params[2].v = parityStripeID;
678 1.3 oster writeDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
679 1.17 oster which_ru);
680 1.3 oster pda = pda->next;
681 1.3 oster }
682 1.3 oster
683 1.3 oster /*
684 1.3 oster * Initialize nodes which compute new parity and Q.
685 1.3 oster */
686 1.3 oster /*
687 1.3 oster * We use the simple XOR func in the double-XOR case, and when
688 1.14 oster * we're accessing only a portion of one stripe unit. The
689 1.14 oster * distinction between the two is that the regular XOR func
690 1.14 oster * assumes that the targbuf is a full SU in size, and examines
691 1.14 oster * the pda associated with the buffer to decide where within
692 1.14 oster * the buffer to XOR the data, whereas the simple XOR func
693 1.14 oster * just XORs the data into the start of the buffer. */
694 1.3 oster if ((numParityNodes == 2) || ((numDataNodes == 1)
695 1.14 oster && (asmap->totalSectorsAccessed <
696 1.14 oster raidPtr->Layout.sectorsPerStripeUnit))) {
697 1.3 oster func = pfuncs->simple;
698 1.3 oster undoFunc = rf_NullNodeUndoFunc;
699 1.3 oster name = pfuncs->SimpleName;
700 1.3 oster if (qfuncs) {
701 1.3 oster qfunc = qfuncs->simple;
702 1.3 oster qname = qfuncs->SimpleName;
703 1.3 oster } else {
704 1.3 oster qfunc = NULL;
705 1.3 oster qname = NULL;
706 1.3 oster }
707 1.3 oster } else {
708 1.3 oster func = pfuncs->regular;
709 1.3 oster undoFunc = rf_NullNodeUndoFunc;
710 1.3 oster name = pfuncs->RegularName;
711 1.3 oster if (qfuncs) {
712 1.3 oster qfunc = qfuncs->regular;
713 1.3 oster qname = qfuncs->RegularName;
714 1.3 oster } else {
715 1.3 oster qfunc = NULL;
716 1.3 oster qname = NULL;
717 1.3 oster }
718 1.3 oster }
719 1.3 oster /*
720 1.3 oster * Initialize the xor nodes: params are {pda,buf}
721 1.3 oster * from {Rod,Wnd,Rop} nodes, and raidPtr
722 1.3 oster */
723 1.3 oster if (numParityNodes == 2) {
724 1.3 oster /* double-xor case */
725 1.3 oster for (i = 0; i < numParityNodes; i++) {
726 1.3 oster /* note: no wakeup func for xor */
727 1.14 oster rf_InitNode(&xorNodes[i], rf_wait, RF_FALSE, func,
728 1.14 oster undoFunc, NULL, 1,
729 1.14 oster (numDataNodes + numParityNodes),
730 1.14 oster 7, 1, dag_h, name, allocList);
731 1.3 oster xorNodes[i].flags |= RF_DAGNODE_FLAG_YIELD;
732 1.3 oster xorNodes[i].params[0] = readDataNodes[i].params[0];
733 1.3 oster xorNodes[i].params[1] = readDataNodes[i].params[1];
734 1.3 oster xorNodes[i].params[2] = readParityNodes[i].params[0];
735 1.3 oster xorNodes[i].params[3] = readParityNodes[i].params[1];
736 1.3 oster xorNodes[i].params[4] = writeDataNodes[i].params[0];
737 1.3 oster xorNodes[i].params[5] = writeDataNodes[i].params[1];
738 1.3 oster xorNodes[i].params[6].p = raidPtr;
739 1.3 oster /* use old parity buf as target buf */
740 1.3 oster xorNodes[i].results[0] = readParityNodes[i].params[1].p;
741 1.20 oster #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
742 1.3 oster if (nfaults == 2) {
743 1.3 oster /* note: no wakeup func for qor */
744 1.14 oster rf_InitNode(&qNodes[i], rf_wait, RF_FALSE,
745 1.14 oster qfunc, undoFunc, NULL, 1,
746 1.14 oster (numDataNodes + numParityNodes),
747 1.14 oster 7, 1, dag_h, qname, allocList);
748 1.3 oster qNodes[i].params[0] = readDataNodes[i].params[0];
749 1.3 oster qNodes[i].params[1] = readDataNodes[i].params[1];
750 1.3 oster qNodes[i].params[2] = readQNodes[i].params[0];
751 1.3 oster qNodes[i].params[3] = readQNodes[i].params[1];
752 1.3 oster qNodes[i].params[4] = writeDataNodes[i].params[0];
753 1.3 oster qNodes[i].params[5] = writeDataNodes[i].params[1];
754 1.3 oster qNodes[i].params[6].p = raidPtr;
755 1.3 oster /* use old Q buf as target buf */
756 1.3 oster qNodes[i].results[0] = readQNodes[i].params[1].p;
757 1.3 oster }
758 1.20 oster #endif
759 1.3 oster }
760 1.3 oster } else {
761 1.3 oster /* there is only one xor node in this case */
762 1.14 oster rf_InitNode(&xorNodes[0], rf_wait, RF_FALSE, func,
763 1.14 oster undoFunc, NULL, 1, (numDataNodes + numParityNodes),
764 1.14 oster (2 * (numDataNodes + numDataNodes + 1) + 1), 1,
765 1.14 oster dag_h, name, allocList);
766 1.3 oster xorNodes[0].flags |= RF_DAGNODE_FLAG_YIELD;
767 1.3 oster for (i = 0; i < numDataNodes + 1; i++) {
768 1.3 oster /* set up params related to Rod and Rop nodes */
769 1.3 oster xorNodes[0].params[2 * i + 0] = readDataNodes[i].params[0]; /* pda */
770 1.3 oster xorNodes[0].params[2 * i + 1] = readDataNodes[i].params[1]; /* buffer ptr */
771 1.3 oster }
772 1.3 oster for (i = 0; i < numDataNodes; i++) {
773 1.3 oster /* set up params related to Wnd and Wnp nodes */
774 1.3 oster xorNodes[0].params[2 * (numDataNodes + 1 + i) + 0] = /* pda */
775 1.3 oster writeDataNodes[i].params[0];
776 1.3 oster xorNodes[0].params[2 * (numDataNodes + 1 + i) + 1] = /* buffer ptr */
777 1.3 oster writeDataNodes[i].params[1];
778 1.3 oster }
779 1.3 oster /* xor node needs to get at RAID information */
780 1.3 oster xorNodes[0].params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr;
781 1.3 oster xorNodes[0].results[0] = readParityNodes[0].params[1].p;
782 1.20 oster #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
783 1.3 oster if (nfaults == 2) {
784 1.14 oster rf_InitNode(&qNodes[0], rf_wait, RF_FALSE, qfunc,
785 1.14 oster undoFunc, NULL, 1,
786 1.14 oster (numDataNodes + numParityNodes),
787 1.14 oster (2 * (numDataNodes + numDataNodes + 1) + 1), 1,
788 1.14 oster dag_h, qname, allocList);
789 1.3 oster for (i = 0; i < numDataNodes; i++) {
790 1.3 oster /* set up params related to Rod */
791 1.3 oster qNodes[0].params[2 * i + 0] = readDataNodes[i].params[0]; /* pda */
792 1.3 oster qNodes[0].params[2 * i + 1] = readDataNodes[i].params[1]; /* buffer ptr */
793 1.3 oster }
794 1.3 oster /* and read old q */
795 1.3 oster qNodes[0].params[2 * numDataNodes + 0] = /* pda */
796 1.3 oster readQNodes[0].params[0];
797 1.3 oster qNodes[0].params[2 * numDataNodes + 1] = /* buffer ptr */
798 1.3 oster readQNodes[0].params[1];
799 1.3 oster for (i = 0; i < numDataNodes; i++) {
800 1.3 oster /* set up params related to Wnd nodes */
801 1.3 oster qNodes[0].params[2 * (numDataNodes + 1 + i) + 0] = /* pda */
802 1.3 oster writeDataNodes[i].params[0];
803 1.3 oster qNodes[0].params[2 * (numDataNodes + 1 + i) + 1] = /* buffer ptr */
804 1.3 oster writeDataNodes[i].params[1];
805 1.3 oster }
806 1.3 oster /* xor node needs to get at RAID information */
807 1.3 oster qNodes[0].params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr;
808 1.3 oster qNodes[0].results[0] = readQNodes[0].params[1].p;
809 1.3 oster }
810 1.20 oster #endif
811 1.3 oster }
812 1.3 oster
813 1.3 oster /* initialize nodes which write new parity (Wnp) */
814 1.3 oster pda = asmap->parityInfo;
815 1.3 oster for (i = 0; i < numParityNodes; i++) {
816 1.14 oster rf_InitNode(&writeParityNodes[i], rf_wait, RF_FALSE,
817 1.14 oster rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
818 1.14 oster rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
819 1.14 oster "Wnp", allocList);
820 1.3 oster RF_ASSERT(pda != NULL);
821 1.3 oster writeParityNodes[i].params[0].p = pda; /* param 1 (bufPtr)
822 1.3 oster * filled in by xor node */
823 1.3 oster writeParityNodes[i].params[1].p = xorNodes[i].results[0]; /* buffer pointer for
824 1.3 oster * parity write
825 1.3 oster * operation */
826 1.3 oster writeParityNodes[i].params[2].v = parityStripeID;
827 1.3 oster writeParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
828 1.17 oster which_ru);
829 1.3 oster pda = pda->next;
830 1.3 oster }
831 1.3 oster
832 1.20 oster #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
833 1.3 oster /* initialize nodes which write new Q (Wnq) */
834 1.3 oster if (nfaults == 2) {
835 1.3 oster pda = asmap->qInfo;
836 1.3 oster for (i = 0; i < numParityNodes; i++) {
837 1.14 oster rf_InitNode(&writeQNodes[i], rf_wait, RF_FALSE,
838 1.14 oster rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
839 1.14 oster rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
840 1.14 oster "Wnq", allocList);
841 1.3 oster RF_ASSERT(pda != NULL);
842 1.3 oster writeQNodes[i].params[0].p = pda; /* param 1 (bufPtr)
843 1.3 oster * filled in by xor node */
844 1.3 oster writeQNodes[i].params[1].p = qNodes[i].results[0]; /* buffer pointer for
845 1.3 oster * parity write
846 1.3 oster * operation */
847 1.3 oster writeQNodes[i].params[2].v = parityStripeID;
848 1.3 oster writeQNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
849 1.17 oster which_ru);
850 1.3 oster pda = pda->next;
851 1.3 oster }
852 1.3 oster }
853 1.20 oster #endif
854 1.3 oster /*
855 1.3 oster * Step 4. connect the nodes.
856 1.3 oster */
857 1.3 oster
858 1.3 oster /* connect header to block node */
859 1.3 oster dag_h->succedents[0] = blockNode;
860 1.3 oster
861 1.3 oster /* connect block node to read old data nodes */
862 1.3 oster RF_ASSERT(blockNode->numSuccedents == (numDataNodes + (numParityNodes * nfaults)));
863 1.3 oster for (i = 0; i < numDataNodes; i++) {
864 1.3 oster blockNode->succedents[i] = &readDataNodes[i];
865 1.3 oster RF_ASSERT(readDataNodes[i].numAntecedents == 1);
866 1.3 oster readDataNodes[i].antecedents[0] = blockNode;
867 1.3 oster readDataNodes[i].antType[0] = rf_control;
868 1.3 oster }
869 1.3 oster
870 1.3 oster /* connect block node to read old parity nodes */
871 1.3 oster for (i = 0; i < numParityNodes; i++) {
872 1.3 oster blockNode->succedents[numDataNodes + i] = &readParityNodes[i];
873 1.3 oster RF_ASSERT(readParityNodes[i].numAntecedents == 1);
874 1.3 oster readParityNodes[i].antecedents[0] = blockNode;
875 1.3 oster readParityNodes[i].antType[0] = rf_control;
876 1.3 oster }
877 1.3 oster
878 1.20 oster #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
879 1.3 oster /* connect block node to read old Q nodes */
880 1.3 oster if (nfaults == 2) {
881 1.3 oster for (i = 0; i < numParityNodes; i++) {
882 1.3 oster blockNode->succedents[numDataNodes + numParityNodes + i] = &readQNodes[i];
883 1.3 oster RF_ASSERT(readQNodes[i].numAntecedents == 1);
884 1.3 oster readQNodes[i].antecedents[0] = blockNode;
885 1.3 oster readQNodes[i].antType[0] = rf_control;
886 1.3 oster }
887 1.3 oster }
888 1.20 oster #endif
889 1.3 oster /* connect read old data nodes to xor nodes */
890 1.3 oster for (i = 0; i < numDataNodes; i++) {
891 1.3 oster RF_ASSERT(readDataNodes[i].numSuccedents == (nfaults * numParityNodes));
892 1.3 oster for (j = 0; j < numParityNodes; j++) {
893 1.3 oster RF_ASSERT(xorNodes[j].numAntecedents == numDataNodes + numParityNodes);
894 1.3 oster readDataNodes[i].succedents[j] = &xorNodes[j];
895 1.3 oster xorNodes[j].antecedents[i] = &readDataNodes[i];
896 1.3 oster xorNodes[j].antType[i] = rf_trueData;
897 1.3 oster }
898 1.3 oster }
899 1.3 oster
900 1.20 oster #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
901 1.3 oster /* connect read old data nodes to q nodes */
902 1.3 oster if (nfaults == 2) {
903 1.3 oster for (i = 0; i < numDataNodes; i++) {
904 1.3 oster for (j = 0; j < numParityNodes; j++) {
905 1.3 oster RF_ASSERT(qNodes[j].numAntecedents == numDataNodes + numParityNodes);
906 1.3 oster readDataNodes[i].succedents[numParityNodes + j] = &qNodes[j];
907 1.3 oster qNodes[j].antecedents[i] = &readDataNodes[i];
908 1.3 oster qNodes[j].antType[i] = rf_trueData;
909 1.3 oster }
910 1.3 oster }
911 1.3 oster }
912 1.20 oster #endif
913 1.3 oster /* connect read old parity nodes to xor nodes */
914 1.3 oster for (i = 0; i < numParityNodes; i++) {
915 1.3 oster RF_ASSERT(readParityNodes[i].numSuccedents == numParityNodes);
916 1.3 oster for (j = 0; j < numParityNodes; j++) {
917 1.3 oster readParityNodes[i].succedents[j] = &xorNodes[j];
918 1.3 oster xorNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
919 1.3 oster xorNodes[j].antType[numDataNodes + i] = rf_trueData;
920 1.3 oster }
921 1.3 oster }
922 1.3 oster
923 1.20 oster #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
924 1.3 oster /* connect read old q nodes to q nodes */
925 1.3 oster if (nfaults == 2) {
926 1.3 oster for (i = 0; i < numParityNodes; i++) {
927 1.3 oster RF_ASSERT(readParityNodes[i].numSuccedents == numParityNodes);
928 1.3 oster for (j = 0; j < numParityNodes; j++) {
929 1.3 oster readQNodes[i].succedents[j] = &qNodes[j];
930 1.3 oster qNodes[j].antecedents[numDataNodes + i] = &readQNodes[i];
931 1.3 oster qNodes[j].antType[numDataNodes + i] = rf_trueData;
932 1.3 oster }
933 1.3 oster }
934 1.3 oster }
935 1.20 oster #endif
936 1.3 oster /* connect xor nodes to commit node */
937 1.3 oster RF_ASSERT(commitNode->numAntecedents == (nfaults * numParityNodes));
938 1.3 oster for (i = 0; i < numParityNodes; i++) {
939 1.3 oster RF_ASSERT(xorNodes[i].numSuccedents == 1);
940 1.3 oster xorNodes[i].succedents[0] = commitNode;
941 1.3 oster commitNode->antecedents[i] = &xorNodes[i];
942 1.3 oster commitNode->antType[i] = rf_control;
943 1.3 oster }
944 1.3 oster
945 1.20 oster #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
946 1.3 oster /* connect q nodes to commit node */
947 1.3 oster if (nfaults == 2) {
948 1.3 oster for (i = 0; i < numParityNodes; i++) {
949 1.3 oster RF_ASSERT(qNodes[i].numSuccedents == 1);
950 1.3 oster qNodes[i].succedents[0] = commitNode;
951 1.3 oster commitNode->antecedents[i + numParityNodes] = &qNodes[i];
952 1.3 oster commitNode->antType[i + numParityNodes] = rf_control;
953 1.3 oster }
954 1.3 oster }
955 1.20 oster #endif
956 1.3 oster /* connect commit node to write nodes */
957 1.3 oster RF_ASSERT(commitNode->numSuccedents == (numDataNodes + (nfaults * numParityNodes)));
958 1.3 oster for (i = 0; i < numDataNodes; i++) {
959 1.3 oster RF_ASSERT(writeDataNodes[i].numAntecedents == 1);
960 1.3 oster commitNode->succedents[i] = &writeDataNodes[i];
961 1.3 oster writeDataNodes[i].antecedents[0] = commitNode;
962 1.3 oster writeDataNodes[i].antType[0] = rf_trueData;
963 1.3 oster }
964 1.3 oster for (i = 0; i < numParityNodes; i++) {
965 1.3 oster RF_ASSERT(writeParityNodes[i].numAntecedents == 1);
966 1.3 oster commitNode->succedents[i + numDataNodes] = &writeParityNodes[i];
967 1.3 oster writeParityNodes[i].antecedents[0] = commitNode;
968 1.3 oster writeParityNodes[i].antType[0] = rf_trueData;
969 1.3 oster }
970 1.20 oster #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
971 1.3 oster if (nfaults == 2) {
972 1.3 oster for (i = 0; i < numParityNodes; i++) {
973 1.3 oster RF_ASSERT(writeQNodes[i].numAntecedents == 1);
974 1.3 oster commitNode->succedents[i + numDataNodes + numParityNodes] = &writeQNodes[i];
975 1.3 oster writeQNodes[i].antecedents[0] = commitNode;
976 1.3 oster writeQNodes[i].antType[0] = rf_trueData;
977 1.3 oster }
978 1.3 oster }
979 1.20 oster #endif
980 1.3 oster RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
981 1.3 oster RF_ASSERT(termNode->numSuccedents == 0);
982 1.3 oster for (i = 0; i < numDataNodes; i++) {
983 1.16 oster /* connect write new data nodes to term node */
984 1.16 oster RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
985 1.16 oster RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
986 1.16 oster writeDataNodes[i].succedents[0] = termNode;
987 1.16 oster termNode->antecedents[i] = &writeDataNodes[i];
988 1.16 oster termNode->antType[i] = rf_control;
989 1.3 oster }
990 1.3 oster
991 1.3 oster for (i = 0; i < numParityNodes; i++) {
992 1.16 oster RF_ASSERT(writeParityNodes[i].numSuccedents == 1);
993 1.16 oster writeParityNodes[i].succedents[0] = termNode;
994 1.16 oster termNode->antecedents[numDataNodes + i] = &writeParityNodes[i];
995 1.16 oster termNode->antType[numDataNodes + i] = rf_control;
996 1.3 oster }
997 1.3 oster
998 1.20 oster #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
999 1.3 oster if (nfaults == 2) {
1000 1.3 oster for (i = 0; i < numParityNodes; i++) {
1001 1.16 oster RF_ASSERT(writeQNodes[i].numSuccedents == 1);
1002 1.16 oster writeQNodes[i].succedents[0] = termNode;
1003 1.16 oster termNode->antecedents[numDataNodes + numParityNodes + i] = &writeQNodes[i];
1004 1.16 oster termNode->antType[numDataNodes + numParityNodes + i] = rf_control;
1005 1.3 oster }
1006 1.3 oster }
1007 1.20 oster #endif
1008 1.1 oster }
1009 1.1 oster
1010 1.1 oster
1011 1.1 oster /******************************************************************************
1012 1.1 oster * create a write graph (fault-free or degraded) for RAID level 1
1013 1.1 oster *
1014 1.1 oster * Hdr -> Commit -> Wpd -> Nil -> Trm
1015 1.1 oster * -> Wsd ->
1016 1.1 oster *
1017 1.1 oster * The "Wpd" node writes data to the primary copy in the mirror pair
1018 1.1 oster * The "Wsd" node writes data to the secondary copy in the mirror pair
1019 1.1 oster *
1020 1.1 oster * Parameters: raidPtr - description of the physical array
1021 1.1 oster * asmap - logical & physical addresses for this access
1022 1.1 oster * bp - buffer ptr (holds write data)
1023 1.3 oster * flags - general flags (e.g. disk locking)
1024 1.1 oster * allocList - list of memory allocated in DAG creation
1025 1.1 oster *****************************************************************************/
1026 1.1 oster
1027 1.3 oster void
1028 1.13 oster rf_CreateRaidOneWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
1029 1.13 oster RF_DagHeader_t *dag_h, void *bp,
1030 1.13 oster RF_RaidAccessFlags_t flags,
1031 1.13 oster RF_AllocListElem_t *allocList)
1032 1.1 oster {
1033 1.3 oster RF_DagNode_t *unblockNode, *termNode, *commitNode;
1034 1.3 oster RF_DagNode_t *nodes, *wndNode, *wmirNode;
1035 1.3 oster int nWndNodes, nWmirNodes, i;
1036 1.3 oster RF_ReconUnitNum_t which_ru;
1037 1.3 oster RF_PhysDiskAddr_t *pda, *pdaP;
1038 1.3 oster RF_StripeNum_t parityStripeID;
1039 1.3 oster
1040 1.3 oster parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
1041 1.3 oster asmap->raidAddress, &which_ru);
1042 1.19 oster #if RF_DEBUG_DAG
1043 1.3 oster if (rf_dagDebug) {
1044 1.3 oster printf("[Creating RAID level 1 write DAG]\n");
1045 1.3 oster }
1046 1.19 oster #endif
1047 1.3 oster dag_h->creator = "RaidOneWriteDAG";
1048 1.3 oster
1049 1.3 oster /* 2 implies access not SU aligned */
1050 1.3 oster nWmirNodes = (asmap->parityInfo->next) ? 2 : 1;
1051 1.3 oster nWndNodes = (asmap->physInfo->next) ? 2 : 1;
1052 1.3 oster
1053 1.3 oster /* alloc the Wnd nodes and the Wmir node */
1054 1.3 oster if (asmap->numDataFailed == 1)
1055 1.3 oster nWndNodes--;
1056 1.3 oster if (asmap->numParityFailed == 1)
1057 1.3 oster nWmirNodes--;
1058 1.3 oster
1059 1.3 oster /* total number of nodes = nWndNodes + nWmirNodes + (commit + unblock
1060 1.3 oster * + terminator) */
1061 1.12 oster RF_MallocAndAdd(nodes,
1062 1.12 oster (nWndNodes + nWmirNodes + 3) * sizeof(RF_DagNode_t),
1063 1.12 oster (RF_DagNode_t *), allocList);
1064 1.3 oster i = 0;
1065 1.3 oster wndNode = &nodes[i];
1066 1.3 oster i += nWndNodes;
1067 1.3 oster wmirNode = &nodes[i];
1068 1.3 oster i += nWmirNodes;
1069 1.3 oster commitNode = &nodes[i];
1070 1.3 oster i += 1;
1071 1.3 oster unblockNode = &nodes[i];
1072 1.3 oster i += 1;
1073 1.3 oster termNode = &nodes[i];
1074 1.3 oster i += 1;
1075 1.3 oster RF_ASSERT(i == (nWndNodes + nWmirNodes + 3));
1076 1.3 oster
1077 1.3 oster /* this dag can commit immediately */
1078 1.3 oster dag_h->numCommitNodes = 1;
1079 1.3 oster dag_h->numCommits = 0;
1080 1.3 oster dag_h->numSuccedents = 1;
1081 1.3 oster
1082 1.3 oster /* initialize the commit, unblock, and term nodes */
1083 1.14 oster rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
1084 1.14 oster rf_NullNodeUndoFunc, NULL, (nWndNodes + nWmirNodes),
1085 1.14 oster 0, 0, 0, dag_h, "Cmt", allocList);
1086 1.14 oster rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
1087 1.14 oster rf_NullNodeUndoFunc, NULL, 1, (nWndNodes + nWmirNodes),
1088 1.14 oster 0, 0, dag_h, "Nil", allocList);
1089 1.14 oster rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
1090 1.14 oster rf_TerminateUndoFunc, NULL, 0, 1, 0, 0,
1091 1.14 oster dag_h, "Trm", allocList);
1092 1.3 oster
1093 1.3 oster /* initialize the wnd nodes */
1094 1.3 oster if (nWndNodes > 0) {
1095 1.3 oster pda = asmap->physInfo;
1096 1.3 oster for (i = 0; i < nWndNodes; i++) {
1097 1.14 oster rf_InitNode(&wndNode[i], rf_wait, RF_FALSE,
1098 1.14 oster rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
1099 1.14 oster rf_GenericWakeupFunc, 1, 1, 4, 0,
1100 1.14 oster dag_h, "Wpd", allocList);
1101 1.3 oster RF_ASSERT(pda != NULL);
1102 1.3 oster wndNode[i].params[0].p = pda;
1103 1.3 oster wndNode[i].params[1].p = pda->bufPtr;
1104 1.3 oster wndNode[i].params[2].v = parityStripeID;
1105 1.17 oster wndNode[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
1106 1.3 oster pda = pda->next;
1107 1.3 oster }
1108 1.3 oster RF_ASSERT(pda == NULL);
1109 1.3 oster }
1110 1.3 oster /* initialize the mirror nodes */
1111 1.3 oster if (nWmirNodes > 0) {
1112 1.3 oster pda = asmap->physInfo;
1113 1.3 oster pdaP = asmap->parityInfo;
1114 1.3 oster for (i = 0; i < nWmirNodes; i++) {
1115 1.14 oster rf_InitNode(&wmirNode[i], rf_wait, RF_FALSE,
1116 1.14 oster rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
1117 1.14 oster rf_GenericWakeupFunc, 1, 1, 4, 0,
1118 1.14 oster dag_h, "Wsd", allocList);
1119 1.3 oster RF_ASSERT(pda != NULL);
1120 1.3 oster wmirNode[i].params[0].p = pdaP;
1121 1.3 oster wmirNode[i].params[1].p = pda->bufPtr;
1122 1.3 oster wmirNode[i].params[2].v = parityStripeID;
1123 1.17 oster wmirNode[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
1124 1.3 oster pda = pda->next;
1125 1.3 oster pdaP = pdaP->next;
1126 1.3 oster }
1127 1.3 oster RF_ASSERT(pda == NULL);
1128 1.3 oster RF_ASSERT(pdaP == NULL);
1129 1.3 oster }
1130 1.3 oster /* link the header node to the commit node */
1131 1.3 oster RF_ASSERT(dag_h->numSuccedents == 1);
1132 1.3 oster RF_ASSERT(commitNode->numAntecedents == 0);
1133 1.3 oster dag_h->succedents[0] = commitNode;
1134 1.3 oster
1135 1.3 oster /* link the commit node to the write nodes */
1136 1.3 oster RF_ASSERT(commitNode->numSuccedents == (nWndNodes + nWmirNodes));
1137 1.3 oster for (i = 0; i < nWndNodes; i++) {
1138 1.3 oster RF_ASSERT(wndNode[i].numAntecedents == 1);
1139 1.3 oster commitNode->succedents[i] = &wndNode[i];
1140 1.3 oster wndNode[i].antecedents[0] = commitNode;
1141 1.3 oster wndNode[i].antType[0] = rf_control;
1142 1.3 oster }
1143 1.3 oster for (i = 0; i < nWmirNodes; i++) {
1144 1.3 oster RF_ASSERT(wmirNode[i].numAntecedents == 1);
1145 1.3 oster commitNode->succedents[i + nWndNodes] = &wmirNode[i];
1146 1.3 oster wmirNode[i].antecedents[0] = commitNode;
1147 1.3 oster wmirNode[i].antType[0] = rf_control;
1148 1.3 oster }
1149 1.3 oster
1150 1.3 oster /* link the write nodes to the unblock node */
1151 1.3 oster RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nWmirNodes));
1152 1.3 oster for (i = 0; i < nWndNodes; i++) {
1153 1.3 oster RF_ASSERT(wndNode[i].numSuccedents == 1);
1154 1.3 oster wndNode[i].succedents[0] = unblockNode;
1155 1.3 oster unblockNode->antecedents[i] = &wndNode[i];
1156 1.3 oster unblockNode->antType[i] = rf_control;
1157 1.3 oster }
1158 1.3 oster for (i = 0; i < nWmirNodes; i++) {
1159 1.3 oster RF_ASSERT(wmirNode[i].numSuccedents == 1);
1160 1.3 oster wmirNode[i].succedents[0] = unblockNode;
1161 1.3 oster unblockNode->antecedents[i + nWndNodes] = &wmirNode[i];
1162 1.3 oster unblockNode->antType[i + nWndNodes] = rf_control;
1163 1.3 oster }
1164 1.3 oster
1165 1.3 oster /* link the unblock node to the term node */
1166 1.3 oster RF_ASSERT(unblockNode->numSuccedents == 1);
1167 1.3 oster RF_ASSERT(termNode->numAntecedents == 1);
1168 1.3 oster RF_ASSERT(termNode->numSuccedents == 0);
1169 1.3 oster unblockNode->succedents[0] = termNode;
1170 1.3 oster termNode->antecedents[0] = unblockNode;
1171 1.3 oster termNode->antType[0] = rf_control;
1172 1.1 oster }
1173