/*	$NetBSD: rf_parityloggingdags.c,v 1.18 2006/11/16 01:33:23 christos Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: William V. Courtright II
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * DAGs specific to parity logging are created here.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_parityloggingdags.c,v 1.18 2006/11/16 01:33:23 christos Exp $");

#include "rf_archs.h"
#include "opt_raid_diagnostic.h"

#if RF_INCLUDE_PARITYLOGGING > 0

#include <dev/raidframe/raidframevar.h>

#include "rf_raid.h"
#include "rf_dag.h"
#include "rf_dagutils.h"
#include "rf_dagfuncs.h"
#include "rf_debugMem.h"
#include "rf_paritylog.h"
#include "rf_general.h"

#include "rf_parityloggingdags.h"

/******************************************************************************
 *
 * creates a DAG to perform a large-write operation:
 *
 *          / Rod \             / Wnd \
 * H -- NIL - Rod - NIL - Wnd ------- NIL - T
 *          \ Rod /             \ Xor - Lpo /
 *
 * The writes are not done until the reads complete because if they were
 * done in parallel, a failure on one of the reads could leave the parity
 * in an inconsistent state, so that the retry with a new DAG would produce
 * erroneous parity.
 *
 * Note: this DAG has the nasty property that none of the buffers allocated
 * for reading old data can be freed until the XOR node fires.  Need to fix
 * this.
 *
 * The last two arguments are the number of faults tolerated and the
 * function for the redundancy calculation.  The undo for the redundancy
 * calculation is assumed to be null.
 *
 *****************************************************************************/
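
/*
 * Example usage: the single-fault-tolerant wrapper
 * rf_CreateParityLoggingLargeWriteDAG() at the bottom of this file invokes
 * this function with nfaults == 1 and redFunc == rf_RegularXorFunc.
 */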

void
rf_CommonCreateParityLoggingLargeWriteDAG(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList,
    int nfaults,
    int (*redFunc) (RF_DagNode_t *))
{
	RF_DagNode_t *nodes, *wndNodes, *rodNodes = NULL, *syncNode, *xorNode,
	    *lpoNode, *blockNode, *unblockNode, *termNode;
	int nWndNodes, nRodNodes, i;
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_AccessStripeMapHeader_t *new_asm_h[2];
	int nodeNum, asmNum;
	RF_ReconUnitNum_t which_ru;
	char *sosBuffer, *eosBuffer;
	RF_PhysDiskAddr_t *pda;
	RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru);

	if (rf_dagDebug)
		printf("[Creating parity-logging large-write DAG]\n");
	RF_ASSERT(nfaults == 1);	/* this arch is only single-fault
					 * tolerant */
	dag_h->creator = "ParityLoggingLargeWriteDAG";

	/* alloc the Wnd nodes plus the Xor, Lpo, block, sync, unblock and
	 * terminator nodes */
	nWndNodes = asmap->numStripeUnitsAccessed;
	RF_MallocAndAdd(nodes, (nWndNodes + 6) * sizeof(RF_DagNode_t),
	    (RF_DagNode_t *), allocList);
	i = 0;
	wndNodes = &nodes[i];
	i += nWndNodes;
	xorNode = &nodes[i];
	i += 1;
	lpoNode = &nodes[i];
	i += 1;
	blockNode = &nodes[i];
	i += 1;
	syncNode = &nodes[i];
	i += 1;
	unblockNode = &nodes[i];
	i += 1;
	termNode = &nodes[i];
	i += 1;
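	/*
	 * For example, with nWndNodes == 3 the carving above uses all
	 * (nWndNodes + 6) == 9 slots: nodes[0..2] are the Wnd nodes and
	 * nodes[3..8] are the Xor, Lpo, block, sync, unblock and terminator
	 * nodes, in that order.
	 */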
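	/*
	 * The nWndNodes Wnd nodes plus the Xor node are the commit nodes:
	 * they are the only nodes created below with the commit flag (the
	 * third rf_InitNode() argument) set to RF_TRUE.
	 */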
	dag_h->numCommitNodes = nWndNodes + 1;
	dag_h->numCommits = 0;
	dag_h->numSuccedents = 1;

	rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h, new_asm_h, &nRodNodes, &sosBuffer, &eosBuffer, allocList);
	if (nRodNodes > 0)
		RF_MallocAndAdd(rodNodes, nRodNodes * sizeof(RF_DagNode_t),
		    (RF_DagNode_t *), allocList);

	/* begin node initialization */
	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nRodNodes + 1, 0, 0, 0, dag_h, "Nil", allocList);
	rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nWndNodes + 1, 0, 0, dag_h, "Nil", allocList);
	rf_InitNode(syncNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nWndNodes + 1, nRodNodes + 1, 0, 0, dag_h, "Nil", allocList);
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);

	/* initialize the Rod nodes */
	for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) {
		if (new_asm_h[asmNum]) {
			pda = new_asm_h[asmNum]->stripeMap->physInfo;
			while (pda) {
				rf_InitNode(&rodNodes[nodeNum], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rod", allocList);
				rodNodes[nodeNum].params[0].p = pda;
				rodNodes[nodeNum].params[1].p = pda->bufPtr;
				rodNodes[nodeNum].params[2].v = parityStripeID;
				rodNodes[nodeNum].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
				nodeNum++;
				pda = pda->next;
			}
		}
	}
	RF_ASSERT(nodeNum == nRodNodes);

	/* initialize the Wnd nodes */
	pda = asmap->physInfo;
	for (i = 0; i < nWndNodes; i++) {
		rf_InitNode(&wndNodes[i], rf_wait, RF_TRUE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
		RF_ASSERT(pda != NULL);
		wndNodes[i].params[0].p = pda;
		wndNodes[i].params[1].p = pda->bufPtr;
		wndNodes[i].params[2].v = parityStripeID;
		wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
		pda = pda->next;
	}

	/* initialize the redundancy node */
	rf_InitNode(xorNode, rf_wait, RF_TRUE, redFunc, rf_NullNodeUndoFunc, NULL, 1, 1, 2 * (nWndNodes + nRodNodes) + 1, 1, dag_h, "Xr ", allocList);
	xorNode->flags |= RF_DAGNODE_FLAG_YIELD;
	for (i = 0; i < nWndNodes; i++) {
		xorNode->params[2 * i + 0] = wndNodes[i].params[0];	/* pda */
		xorNode->params[2 * i + 1] = wndNodes[i].params[1];	/* buf ptr */
	}
	for (i = 0; i < nRodNodes; i++) {
		xorNode->params[2 * (nWndNodes + i) + 0] = rodNodes[i].params[0];	/* pda */
		xorNode->params[2 * (nWndNodes + i) + 1] = rodNodes[i].params[1];	/* buf ptr */
	}
	xorNode->params[2 * (nWndNodes + nRodNodes)].p = raidPtr;	/* xor node needs to get
									 * at RAID information */
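	/*
	 * Worked example of the params[] layout: with nWndNodes == 2 and
	 * nRodNodes == 1 the Xor node sees {Wpda0, Wbuf0, Wpda1, Wbuf1,
	 * Rpda0, Rbuf0, raidPtr}, i.e. 2 * (nWndNodes + nRodNodes) + 1 == 7
	 * entries, matching the parameter count passed to rf_InitNode()
	 * above.
	 */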

	/* look for an Rod node that reads a complete SU.  If none, alloc a
	 * buffer to receive the parity info.  Note that we can't use a new
	 * data buffer because it will not have gotten written when the xor
	 * occurs. */
	for (i = 0; i < nRodNodes; i++)
		if (((RF_PhysDiskAddr_t *) rodNodes[i].params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit)
			break;
	if (i == nRodNodes) {
		RF_MallocAndAdd(xorNode->results[0],
		    rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), (void *), allocList);
	} else {
		xorNode->results[0] = rodNodes[i].params[1].p;
	}

	/* initialize the Lpo node */
	rf_InitNode(lpoNode, rf_wait, RF_FALSE, rf_ParityLogOverwriteFunc, rf_ParityLogOverwriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Lpo", allocList);

	lpoNode->params[0].p = asmap->parityInfo;
	lpoNode->params[1].p = xorNode->results[0];
	RF_ASSERT(asmap->parityInfo->next == NULL);	/* parityInfo must
							 * describe entire
							 * parity unit */

	/* connect nodes to form graph */

	/* connect dag header to block node */
	RF_ASSERT(dag_h->numSuccedents == 1);
	RF_ASSERT(blockNode->numAntecedents == 0);
	dag_h->succedents[0] = blockNode;

	/* connect the block node to the Rod nodes */
	RF_ASSERT(blockNode->numSuccedents == nRodNodes + 1);
	for (i = 0; i < nRodNodes; i++) {
		RF_ASSERT(rodNodes[i].numAntecedents == 1);
		blockNode->succedents[i] = &rodNodes[i];
		rodNodes[i].antecedents[0] = blockNode;
		rodNodes[i].antType[0] = rf_control;
	}

	/* connect the block node to the sync node */
	/* necessary if nRodNodes == 0 */
	RF_ASSERT(syncNode->numAntecedents == nRodNodes + 1);
	blockNode->succedents[nRodNodes] = syncNode;
	syncNode->antecedents[0] = blockNode;
	syncNode->antType[0] = rf_control;

	/* connect the Rod nodes to the sync node */
	for (i = 0; i < nRodNodes; i++) {
		rodNodes[i].succedents[0] = syncNode;
		syncNode->antecedents[1 + i] = &rodNodes[i];
		syncNode->antType[1 + i] = rf_control;
	}

	/* connect the sync node to the xor node */
	RF_ASSERT(syncNode->numSuccedents == nWndNodes + 1);
	RF_ASSERT(xorNode->numAntecedents == 1);
	syncNode->succedents[0] = xorNode;
	xorNode->antecedents[0] = syncNode;
	xorNode->antType[0] = rf_trueData;	/* carry forward from sync */

	/* connect the sync node to the Wnd nodes */
	for (i = 0; i < nWndNodes; i++) {
		RF_ASSERT(wndNodes[i].numAntecedents == 1);
		syncNode->succedents[1 + i] = &wndNodes[i];
		wndNodes[i].antecedents[0] = syncNode;
		wndNodes[i].antType[0] = rf_control;
	}

	/* connect the xor node to the Lpo node */
	RF_ASSERT(xorNode->numSuccedents == 1);
	RF_ASSERT(lpoNode->numAntecedents == 1);
	xorNode->succedents[0] = lpoNode;
	lpoNode->antecedents[0] = xorNode;
	lpoNode->antType[0] = rf_trueData;

	/* connect the Wnd nodes to the unblock node */
	RF_ASSERT(unblockNode->numAntecedents == nWndNodes + 1);
	for (i = 0; i < nWndNodes; i++) {
		RF_ASSERT(wndNodes[i].numSuccedents == 1);
		wndNodes[i].succedents[0] = unblockNode;
		unblockNode->antecedents[i] = &wndNodes[i];
		unblockNode->antType[i] = rf_control;
	}

	/* connect the Lpo node to the unblock node */
	RF_ASSERT(lpoNode->numSuccedents == 1);
	lpoNode->succedents[0] = unblockNode;
	unblockNode->antecedents[nWndNodes] = lpoNode;
	unblockNode->antType[nWndNodes] = rf_control;

	/* connect unblock node to terminator */
	RF_ASSERT(unblockNode->numSuccedents == 1);
	RF_ASSERT(termNode->numAntecedents == 1);
	RF_ASSERT(termNode->numSuccedents == 0);
	unblockNode->succedents[0] = termNode;
	termNode->antecedents[0] = unblockNode;
	termNode->antType[0] = rf_control;
}


/******************************************************************************
 *
 * creates a DAG to perform a small-write operation (either RAID 5 or P+Q),
 * which is as follows:
 *
 *                            Header
 *                               |
 *                             Block
 *                          / |  ...  \   \
 *                         /  |        \   \
 *                       Rod Rod      Rod  Rop
 *                       | \ /| \    / |   \/ |
 *                       |  | |   \ /  |   /\ |
 *                      Wnd Wnd      Wnd     X
 *                       |    \       /      |
 *                       |     \     /       |
 *                        \     \   /       Lpo
 *                         \     \ /        /
 *                          +-> Unblock <--+
 *                                 |
 *                                 T
 *
 *
 * R = Read, W = Write, X = Xor, o = old, n = new, d = data, p = parity.
 * When the access spans a stripe unit boundary and is less than one SU in
 * size, there will be two Rop -- X -- Wnp branches.  I call this the
 * "double-XOR" case.
 * The second output from each Rod node goes to the X node.  In the
 * double-XOR case, there are exactly 2 Rod nodes, and each sends one output
 * to one X node.
 * There is one Rod -- Wnd -- T branch for each stripe unit being updated.
 *
 * The block and unblock nodes are unused.  See comment above
 * CreateFaultFreeReadDAG.
 *
 * Note: this DAG ignores all the optimizations related to making the RMWs
 * atomic.  It also has the nasty property that none of the buffers allocated
 * for reading old data & parity can be freed until the XOR node fires.
 * Need to fix this.
 *
 * A null qfuncs indicates single-fault tolerance.
 *****************************************************************************/

void
rf_CommonCreateParityLoggingSmallWriteDAG(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList,
    const RF_RedFuncs_t * pfuncs,
    const RF_RedFuncs_t * qfuncs)
{
	RF_DagNode_t *xorNodes, *blockNode, *unblockNode, *nodes;
	RF_DagNode_t *readDataNodes, *readParityNodes;
	RF_DagNode_t *writeDataNodes, *lpuNodes;
	RF_DagNode_t *termNode;
	RF_PhysDiskAddr_t *pda = asmap->physInfo;
	int numDataNodes = asmap->numStripeUnitsAccessed;
	int numParityNodes = (asmap->parityInfo->next) ? 2 : 1;
	int i, j, nNodes, totalNumNodes;
	RF_ReconUnitNum_t which_ru;
	int (*func) (RF_DagNode_t * node), (*undoFunc) (RF_DagNode_t * node);
	int (*qfunc) (RF_DagNode_t * node);
	const char *name, *qname;
	RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru);
#ifdef RAID_DIAGNOSTIC
	long nfaults = qfuncs ? 2 : 1;
#endif /* RAID_DIAGNOSTIC */

	if (rf_dagDebug)
		printf("[Creating parity-logging small-write DAG]\n");
	RF_ASSERT(numDataNodes > 0);
	RF_ASSERT(nfaults == 1);
	dag_h->creator = "ParityLoggingSmallWriteDAG";

	/* DAG creation occurs in four steps: 1. count the number of nodes
	 * in the DAG, 2. create the nodes, 3. initialize the nodes, and
	 * 4. connect the nodes. */

	/* Step 1. compute number of nodes in the graph */

	/* Number of nodes: a read and a write for each data unit, a
	 * redundancy computation node for each parity node, a read and an
	 * Lpu for each parity unit, a block and an unblock node (2), and a
	 * terminator node.  If the RMW were atomic, an unlock node would
	 * also be needed for each data unit and redundancy unit. */
	totalNumNodes = (2 * numDataNodes) + numParityNodes + (2 * numParityNodes) + 3;
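	/*
	 * For example, a small write touching two data units under a single
	 * parity unit gives numDataNodes == 2 and numParityNodes == 1, so
	 * totalNumNodes == 4 + 1 + 2 + 3 == 10.
	 */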

	nNodes = numDataNodes + numParityNodes;

	dag_h->numCommitNodes = numDataNodes + numParityNodes;
	dag_h->numCommits = 0;
	dag_h->numSuccedents = 1;

	/* Step 2. create the nodes */
	RF_MallocAndAdd(nodes, totalNumNodes * sizeof(RF_DagNode_t),
	    (RF_DagNode_t *), allocList);
	i = 0;
	blockNode = &nodes[i];
	i += 1;
	unblockNode = &nodes[i];
	i += 1;
	readDataNodes = &nodes[i];
	i += numDataNodes;
	readParityNodes = &nodes[i];
	i += numParityNodes;
	writeDataNodes = &nodes[i];
	i += numDataNodes;
	lpuNodes = &nodes[i];
	i += numParityNodes;
	xorNodes = &nodes[i];
	i += numParityNodes;
	termNode = &nodes[i];
	i += 1;

	RF_ASSERT(i == totalNumNodes);

	/* Step 3. initialize the nodes */
	/* initialize block node (Nil) */
	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0, dag_h, "Nil", allocList);

	/* initialize unblock node (Nil) */
	rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nNodes, 0, 0, dag_h, "Nil", allocList);

	/* initialize terminator node (Trm) */
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);

	/* initialize nodes which read old data (Rod) */
	for (i = 0; i < numDataNodes; i++) {
		rf_InitNode(&readDataNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, nNodes, 1, 4, 0, dag_h, "Rod", allocList);
		RF_ASSERT(pda != NULL);
		readDataNodes[i].params[0].p = pda;	/* physical disk addr
							 * desc */
		readDataNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda->numSector << raidPtr->logBytesPerSector);	/* buffer to hold old data */
		readDataNodes[i].params[2].v = parityStripeID;
		readDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
		pda = pda->next;
		readDataNodes[i].propList[0] = NULL;
		readDataNodes[i].propList[1] = NULL;
	}

	/* initialize nodes which read old parity (Rop) */
	pda = asmap->parityInfo;
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(pda != NULL);
		rf_InitNode(&readParityNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, nNodes, 1, 4, 0, dag_h, "Rop", allocList);
		readParityNodes[i].params[0].p = pda;
		readParityNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda->numSector << raidPtr->logBytesPerSector);	/* buffer to hold old parity */
		readParityNodes[i].params[2].v = parityStripeID;
		readParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
		readParityNodes[i].propList[0] = NULL;
		pda = pda->next;
	}

	/* initialize nodes which write new data (Wnd) */
	pda = asmap->physInfo;
	for (i = 0; i < numDataNodes; i++) {
		RF_ASSERT(pda != NULL);
		rf_InitNode(&writeDataNodes[i], rf_wait, RF_TRUE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, nNodes, 4, 0, dag_h, "Wnd", allocList);
		writeDataNodes[i].params[0].p = pda;	/* physical disk addr
							 * desc */
		writeDataNodes[i].params[1].p = pda->bufPtr;	/* buffer holding new
								 * data to be written */
		writeDataNodes[i].params[2].v = parityStripeID;
		writeDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);

		pda = pda->next;
	}

	/* initialize nodes which compute new parity */
	/* we use the simple XOR func in the double-XOR case, and when we're
	 * accessing only a portion of one stripe unit.  the distinction
	 * between the two is that the regular XOR func assumes that the
	 * targbuf is a full SU in size, and examines the pda associated with
	 * the buffer to decide where within the buffer to XOR the data,
	 * whereas the simple XOR func just XORs the data into the start of
	 * the buffer. */
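	/*
	 * The double-XOR case (numParityNodes == 2) arises when
	 * asmap->parityInfo has two entries, i.e. when the access spans a
	 * stripe unit boundary as described in the header comment above.
	 */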
	if ((numParityNodes == 2) || ((numDataNodes == 1) && (asmap->totalSectorsAccessed < raidPtr->Layout.sectorsPerStripeUnit))) {
		func = pfuncs->simple;
		undoFunc = rf_NullNodeUndoFunc;
		name = pfuncs->SimpleName;
		if (qfuncs) {
			qfunc = qfuncs->simple;
			qname = qfuncs->SimpleName;
		}
	} else {
		func = pfuncs->regular;
		undoFunc = rf_NullNodeUndoFunc;
		name = pfuncs->RegularName;
		if (qfuncs) {
			qfunc = qfuncs->regular;
			qname = qfuncs->RegularName;
		}
	}
	/* initialize the xor nodes: params are {pda,buf} from {Rod,Wnd,Rop}
	 * nodes, and raidPtr */
	if (numParityNodes == 2) {	/* double-xor case */
		for (i = 0; i < numParityNodes; i++) {
			rf_InitNode(&xorNodes[i], rf_wait, RF_TRUE, func, undoFunc, NULL, 1, nNodes, 7, 1, dag_h, name, allocList);	/* no wakeup func for
														 * xor */
			xorNodes[i].flags |= RF_DAGNODE_FLAG_YIELD;
			xorNodes[i].params[0] = readDataNodes[i].params[0];
			xorNodes[i].params[1] = readDataNodes[i].params[1];
			xorNodes[i].params[2] = readParityNodes[i].params[0];
			xorNodes[i].params[3] = readParityNodes[i].params[1];
			xorNodes[i].params[4] = writeDataNodes[i].params[0];
			xorNodes[i].params[5] = writeDataNodes[i].params[1];
			xorNodes[i].params[6].p = raidPtr;
			xorNodes[i].results[0] = readParityNodes[i].params[1].p;	/* use old parity buf
											 * as target buf */
		}
	} else {
		/* there is only one xor node in this case */
		rf_InitNode(&xorNodes[0], rf_wait, RF_TRUE, func, undoFunc, NULL, 1, nNodes, (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h, name, allocList);
		xorNodes[0].flags |= RF_DAGNODE_FLAG_YIELD;
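		/*
		 * The loop below runs to numDataNodes + 1 so that its final
		 * iteration picks up the Rop node's {pda, buffer} pair as
		 * well: readParityNodes immediately follows readDataNodes
		 * in the contiguous nodes[] allocation, so
		 * readDataNodes[numDataNodes] is readParityNodes[0].
		 */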
		for (i = 0; i < numDataNodes + 1; i++) {
			/* set up params related to Rod and Rop nodes */
			xorNodes[0].params[2 * i + 0] = readDataNodes[i].params[0];	/* pda */
			xorNodes[0].params[2 * i + 1] = readDataNodes[i].params[1];	/* buffer pointer */
		}
		for (i = 0; i < numDataNodes; i++) {
			/* set up params related to Wnd and Wnp nodes */
			xorNodes[0].params[2 * (numDataNodes + 1 + i) + 0] = writeDataNodes[i].params[0];	/* pda */
			xorNodes[0].params[2 * (numDataNodes + 1 + i) + 1] = writeDataNodes[i].params[1];	/* buffer pointer */
		}
		xorNodes[0].params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr;	/* xor node needs to get
											 * at RAID information */
		xorNodes[0].results[0] = readParityNodes[0].params[1].p;
	}

	/* initialize the log node(s) */
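	/*
	 * The Lpu nodes hand the XOR output to the parity log via
	 * rf_ParityLogUpdateFunc; contrast the large-write DAG above, which
	 * records a full overwrite of the parity unit through
	 * rf_ParityLogOverwriteFunc (Lpo).
	 */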
	pda = asmap->parityInfo;
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(pda);
		rf_InitNode(&lpuNodes[i], rf_wait, RF_FALSE, rf_ParityLogUpdateFunc, rf_ParityLogUpdateUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Lpu", allocList);
		lpuNodes[i].params[0].p = pda;	/* PhysDiskAddr of parity */
		lpuNodes[i].params[1].p = xorNodes[i].results[0];	/* buffer pointer to
									 * parity */
		pda = pda->next;
	}

	/* Step 4. connect the nodes */

	/* connect header to block node */
	RF_ASSERT(dag_h->numSuccedents == 1);
	RF_ASSERT(blockNode->numAntecedents == 0);
	dag_h->succedents[0] = blockNode;

	/* connect block node to read old data nodes */
	RF_ASSERT(blockNode->numSuccedents == (numDataNodes + numParityNodes));
	for (i = 0; i < numDataNodes; i++) {
		blockNode->succedents[i] = &readDataNodes[i];
		RF_ASSERT(readDataNodes[i].numAntecedents == 1);
		readDataNodes[i].antecedents[0] = blockNode;
		readDataNodes[i].antType[0] = rf_control;
	}

	/* connect block node to read old parity nodes */
	for (i = 0; i < numParityNodes; i++) {
		blockNode->succedents[numDataNodes + i] = &readParityNodes[i];
		RF_ASSERT(readParityNodes[i].numAntecedents == 1);
		readParityNodes[i].antecedents[0] = blockNode;
		readParityNodes[i].antType[0] = rf_control;
	}

	/* connect read old data nodes to write new data nodes */
	for (i = 0; i < numDataNodes; i++) {
		RF_ASSERT(readDataNodes[i].numSuccedents == numDataNodes + numParityNodes);
		for (j = 0; j < numDataNodes; j++) {
			RF_ASSERT(writeDataNodes[j].numAntecedents == numDataNodes + numParityNodes);
			readDataNodes[i].succedents[j] = &writeDataNodes[j];
			writeDataNodes[j].antecedents[i] = &readDataNodes[i];
			if (i == j)
				writeDataNodes[j].antType[i] = rf_antiData;
			else
				writeDataNodes[j].antType[i] = rf_control;
		}
	}

	/* connect read old data nodes to xor nodes */
	for (i = 0; i < numDataNodes; i++)
		for (j = 0; j < numParityNodes; j++) {
			RF_ASSERT(xorNodes[j].numAntecedents == numDataNodes + numParityNodes);
			readDataNodes[i].succedents[numDataNodes + j] = &xorNodes[j];
			xorNodes[j].antecedents[i] = &readDataNodes[i];
			xorNodes[j].antType[i] = rf_trueData;
		}

	/* connect read old parity nodes to write new data nodes */
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(readParityNodes[i].numSuccedents == numDataNodes + numParityNodes);
		for (j = 0; j < numDataNodes; j++) {
			readParityNodes[i].succedents[j] = &writeDataNodes[j];
			writeDataNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
			writeDataNodes[j].antType[numDataNodes + i] = rf_control;
		}
	}

	/* connect read old parity nodes to xor nodes */
	for (i = 0; i < numParityNodes; i++)
		for (j = 0; j < numParityNodes; j++) {
			readParityNodes[i].succedents[numDataNodes + j] = &xorNodes[j];
			xorNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
			xorNodes[j].antType[numDataNodes + i] = rf_trueData;
		}

	/* connect xor nodes to Lpu nodes */
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(xorNodes[i].numSuccedents == 1);
		RF_ASSERT(lpuNodes[i].numAntecedents == 1);
		xorNodes[i].succedents[0] = &lpuNodes[i];
		lpuNodes[i].antecedents[0] = &xorNodes[i];
		lpuNodes[i].antType[0] = rf_trueData;
	}

	/* connect write new data nodes to unblock node */
	for (i = 0; i < numDataNodes; i++) {
		RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
		RF_ASSERT(unblockNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
		writeDataNodes[i].succedents[0] = unblockNode;
		unblockNode->antecedents[i] = &writeDataNodes[i];
		unblockNode->antType[i] = rf_control;
	}

	/* connect Lpu nodes to unblock node */
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(lpuNodes[i].numSuccedents == 1);
		lpuNodes[i].succedents[0] = unblockNode;
		unblockNode->antecedents[numDataNodes + i] = &lpuNodes[i];
		unblockNode->antType[numDataNodes + i] = rf_control;
	}

	/* connect unblock node to terminator */
	RF_ASSERT(unblockNode->numSuccedents == 1);
	RF_ASSERT(termNode->numAntecedents == 1);
	RF_ASSERT(termNode->numSuccedents == 0);
	unblockNode->succedents[0] = termNode;
	termNode->antecedents[0] = unblockNode;
	termNode->antType[0] = rf_control;
}


void
rf_CreateParityLoggingSmallWriteDAG(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList,
    const RF_RedFuncs_t * pfuncs,
    const RF_RedFuncs_t * qfuncs)
{
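	/*
	 * Note that this wrapper ignores its pfuncs and qfuncs arguments:
	 * parity logging is single-fault tolerant here, so the common
	 * routine is always handed the XOR functions and a null qfuncs.
	 */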
	dag_h->creator = "ParityLoggingSmallWriteDAG";
	rf_CommonCreateParityLoggingSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_xorFuncs, NULL);
}


void
rf_CreateParityLoggingLargeWriteDAG(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList,
    int nfaults,
    int (*redFunc) (RF_DagNode_t *))
{
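	/*
	 * Likewise, this wrapper ignores its nfaults and redFunc arguments
	 * and hard-codes single-fault tolerance with rf_RegularXorFunc.
	 */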
	dag_h->creator = "ParityLoggingLargeWriteDAG";
	rf_CommonCreateParityLoggingLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 1, rf_RegularXorFunc);
}
#endif				/* RF_INCLUDE_PARITYLOGGING > 0 */