/*	$NetBSD: rf_parityloggingdags.c,v 1.9 2003/06/23 11:02:01 martin Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: William V. Courtright II
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * DAGs specific to parity logging are created here.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_parityloggingdags.c,v 1.9 2003/06/23 11:02:01 martin Exp $");

#include "rf_archs.h"
#include "opt_raid_diagnostic.h"

#if RF_INCLUDE_PARITYLOGGING > 0

#include <dev/raidframe/raidframevar.h>

#include "rf_raid.h"
#include "rf_dag.h"
#include "rf_dagutils.h"
#include "rf_dagfuncs.h"
#include "rf_debugMem.h"
#include "rf_paritylog.h"
#include "rf_general.h"

#include "rf_parityloggingdags.h"

/******************************************************************************
 *
 * creates a DAG to perform a large-write operation:
 *
 *            / Rod \            / Wnd \
 * H -- NIL -- Rod --- NIL --- Wnd ------- NIL -- T
 *            \ Rod /            \ Xor -- Lpo /
 *
 * The writes are not done until the reads complete because, if they were
 * done in parallel, a failure on one of the reads could leave the parity
 * in an inconsistent state, so that a retry with a new DAG would produce
 * erroneous parity.
 *
 * Note: this DAG has the nasty property that none of the buffers allocated
 * for reading old data can be freed until the XOR node fires.  Need to fix
 * this.
 *
 * The last two arguments are the number of faults tolerated and the
 * function for the redundancy calculation.  The undo for the redundancy
 * calculation is assumed to be null.
 *
 *****************************************************************************/
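
/*
 * Example (editor's sketch of the code below): a large write covering
 * three of five stripe units gives nWndNodes == 3 and nRodNodes == 2.
 * The block node fans out to the two Rods, the sync node gates the three
 * Wnds plus the Xor, the Xor feeds the Lpo, and the three Wnds plus the
 * Lpo fan in to the unblock node, which fires the terminator.  The Lpo
 * ("log parity overwrite") node does not write the parity disk in place;
 * it hands the complete new parity image to the parity log.
 */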

void
rf_CommonCreateParityLoggingLargeWriteDAG(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList,
    int nfaults,
    int (*redFunc) (RF_DagNode_t *))
{
        RF_DagNode_t *nodes, *wndNodes, *rodNodes = NULL, *syncNode, *xorNode,
            *lpoNode, *blockNode, *unblockNode, *termNode;
        int nWndNodes, nRodNodes, i;
        RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
        RF_AccessStripeMapHeader_t *new_asm_h[2];
        int nodeNum, asmNum;
        RF_ReconUnitNum_t which_ru;
        char *sosBuffer, *eosBuffer;
        RF_PhysDiskAddr_t *pda;
        RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru);

        if (rf_dagDebug)
                printf("[Creating parity-logging large-write DAG]\n");
        RF_ASSERT(nfaults == 1);	/* this arch only single fault tolerant */
        dag_h->creator = "ParityLoggingLargeWriteDAG";

        /* alloc the Wnd nodes, the xor node, and the Lpo node */
        nWndNodes = asmap->numStripeUnitsAccessed;
        RF_CallocAndAdd(nodes, nWndNodes + 6, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
        i = 0;
        wndNodes = &nodes[i];
        i += nWndNodes;
        xorNode = &nodes[i];
        i += 1;
        lpoNode = &nodes[i];
        i += 1;
        blockNode = &nodes[i];
        i += 1;
        syncNode = &nodes[i];
        i += 1;
        unblockNode = &nodes[i];
        i += 1;
        termNode = &nodes[i];
        i += 1;
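
        /*
         * Layout note: the node array is carved up sequentially, so the
         * six fixed nodes (Xor, Lpo, block, sync, unblock, Trm) account
         * for the "+ 6" in the allocation above; any new node type must
         * be added in both places.
         */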

        dag_h->numCommitNodes = nWndNodes + 1;
        dag_h->numCommits = 0;
        dag_h->numSuccedents = 1;

        rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h, new_asm_h, &nRodNodes, &sosBuffer, &eosBuffer, allocList);
        if (nRodNodes > 0)
                RF_CallocAndAdd(rodNodes, nRodNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);

        /* begin node initialization */
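        /*
         * For reference, the argument order used throughout this file is:
         * rf_InitNode(node, initialState, commit, doFunc, undoFunc,
         * wakeFunc, nSuccedents, nAntecedents, nParams, nResults,
         * dag_h, name, allocList).
         */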
        rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nRodNodes + 1, 0, 0, 0, dag_h, "Nil", allocList);
        rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nWndNodes + 1, 0, 0, dag_h, "Nil", allocList);
        rf_InitNode(syncNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nWndNodes + 1, nRodNodes + 1, 0, 0, dag_h, "Nil", allocList);
        rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);

        /* initialize the Rod nodes */
        for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) {
                if (new_asm_h[asmNum]) {
                        pda = new_asm_h[asmNum]->stripeMap->physInfo;
                        while (pda) {
                                rf_InitNode(&rodNodes[nodeNum], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rod", allocList);
                                rodNodes[nodeNum].params[0].p = pda;
                                rodNodes[nodeNum].params[1].p = pda->bufPtr;
                                rodNodes[nodeNum].params[2].v = parityStripeID;
                                rodNodes[nodeNum].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
                                nodeNum++;
                                pda = pda->next;
                        }
                }
        }
        RF_ASSERT(nodeNum == nRodNodes);

        /* initialize the Wnd nodes */
        pda = asmap->physInfo;
        for (i = 0; i < nWndNodes; i++) {
                rf_InitNode(&wndNodes[i], rf_wait, RF_TRUE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
                RF_ASSERT(pda != NULL);
                wndNodes[i].params[0].p = pda;
                wndNodes[i].params[1].p = pda->bufPtr;
                wndNodes[i].params[2].v = parityStripeID;
                wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
                pda = pda->next;
        }

        /* initialize the redundancy node */
        rf_InitNode(xorNode, rf_wait, RF_TRUE, redFunc, rf_NullNodeUndoFunc, NULL, 1, 1, 2 * (nWndNodes + nRodNodes) + 1, 1, dag_h, "Xr ", allocList);
        xorNode->flags |= RF_DAGNODE_FLAG_YIELD;
        for (i = 0; i < nWndNodes; i++) {
                xorNode->params[2 * i + 0] = wndNodes[i].params[0];	/* pda */
                xorNode->params[2 * i + 1] = wndNodes[i].params[1];	/* buf ptr */
        }
        for (i = 0; i < nRodNodes; i++) {
                xorNode->params[2 * (nWndNodes + i) + 0] = rodNodes[i].params[0];	/* pda */
                xorNode->params[2 * (nWndNodes + i) + 1] = rodNodes[i].params[1];	/* buf ptr */
        }
        xorNode->params[2 * (nWndNodes + nRodNodes)].p = raidPtr;	/* xor node needs to get
									 * at RAID information */

        /* look for a Rod node that reads a complete SU.  If none, alloc a
         * buffer to receive the parity info.  Note that we can't use a new
         * data buffer because it will not have gotten written when the xor
         * occurs. */
        for (i = 0; i < nRodNodes; i++)
                if (((RF_PhysDiskAddr_t *) rodNodes[i].params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit)
                        break;
        if (i == nRodNodes) {
                RF_CallocAndAdd(xorNode->results[0], 1, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), (void *), allocList);
        } else {
                xorNode->results[0] = rodNodes[i].params[1].p;
        }

        /* initialize the Lpo node */
        rf_InitNode(lpoNode, rf_wait, RF_FALSE, rf_ParityLogOverwriteFunc, rf_ParityLogOverwriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Lpo", allocList);

        lpoNode->params[0].p = asmap->parityInfo;
        lpoNode->params[1].p = xorNode->results[0];
        RF_ASSERT(asmap->parityInfo->next == NULL);	/* parityInfo must
							 * describe entire
							 * parity unit */

        /* connect nodes to form graph */

        /* connect dag header to block node */
        RF_ASSERT(dag_h->numSuccedents == 1);
        RF_ASSERT(blockNode->numAntecedents == 0);
        dag_h->succedents[0] = blockNode;

        /* connect the block node to the Rod nodes */
        RF_ASSERT(blockNode->numSuccedents == nRodNodes + 1);
        for (i = 0; i < nRodNodes; i++) {
                RF_ASSERT(rodNodes[i].numAntecedents == 1);
                blockNode->succedents[i] = &rodNodes[i];
                rodNodes[i].antecedents[0] = blockNode;
                rodNodes[i].antType[0] = rf_control;
        }

        /* connect the block node to the sync node */
        /* necessary if nRodNodes == 0 */
        RF_ASSERT(syncNode->numAntecedents == nRodNodes + 1);
        blockNode->succedents[nRodNodes] = syncNode;
        syncNode->antecedents[0] = blockNode;
        syncNode->antType[0] = rf_control;

        /* connect the Rod nodes to the sync node */
        for (i = 0; i < nRodNodes; i++) {
                rodNodes[i].succedents[0] = syncNode;
                syncNode->antecedents[1 + i] = &rodNodes[i];
                syncNode->antType[1 + i] = rf_control;
        }

        /* connect the sync node to the xor node */
        RF_ASSERT(syncNode->numSuccedents == nWndNodes + 1);
        RF_ASSERT(xorNode->numAntecedents == 1);
        syncNode->succedents[0] = xorNode;
        xorNode->antecedents[0] = syncNode;
        xorNode->antType[0] = rf_trueData;	/* carry forward from sync */

        /* connect the sync node to the Wnd nodes */
        for (i = 0; i < nWndNodes; i++) {
                RF_ASSERT(wndNodes[i].numAntecedents == 1);
                syncNode->succedents[1 + i] = &wndNodes[i];
                wndNodes[i].antecedents[0] = syncNode;
                wndNodes[i].antType[0] = rf_control;
        }

        /* connect the xor node to the Lpo node */
        RF_ASSERT(xorNode->numSuccedents == 1);
        RF_ASSERT(lpoNode->numAntecedents == 1);
        xorNode->succedents[0] = lpoNode;
        lpoNode->antecedents[0] = xorNode;
        lpoNode->antType[0] = rf_trueData;

        /* connect the Wnd nodes to the unblock node */
        RF_ASSERT(unblockNode->numAntecedents == nWndNodes + 1);
        for (i = 0; i < nWndNodes; i++) {
                RF_ASSERT(wndNodes[i].numSuccedents == 1);
                wndNodes[i].succedents[0] = unblockNode;
                unblockNode->antecedents[i] = &wndNodes[i];
                unblockNode->antType[i] = rf_control;
        }

        /* connect the Lpo node to the unblock node */
        RF_ASSERT(lpoNode->numSuccedents == 1);
        lpoNode->succedents[0] = unblockNode;
        unblockNode->antecedents[nWndNodes] = lpoNode;
        unblockNode->antType[nWndNodes] = rf_control;

        /* connect unblock node to terminator */
        RF_ASSERT(unblockNode->numSuccedents == 1);
        RF_ASSERT(termNode->numAntecedents == 1);
        RF_ASSERT(termNode->numSuccedents == 0);
        unblockNode->succedents[0] = termNode;
        termNode->antecedents[0] = unblockNode;
        termNode->antType[0] = rf_control;
}



/******************************************************************************
 *
 * creates a DAG to perform a small-write operation (either RAID 5 or PQ),
 * which is as follows:
 *
 *                        Header
 *                          |
 *                        Block
 *                      / |  ... \   \
 *                     /  |       \   \
 *                   Rod Rod     Rod   Rop
 *                   | \ /| \   / | \ / |
 *                   |  | |    /\ |  X  |
 *                  Wnd Wnd   Wnd |  |  |
 *                   |    \   /   |  |  |
 *                   |     \ /    |  | Lpu
 *                    \     \    /   | /
 *                     \     \  /    |/
 *                      +-> Unblock <-+
 *                          |
 *                          T
 *
 *
 * R = Read, W = Write, X = Xor, o = old, n = new, d = data, p = parity.
 * When the access spans a stripe unit boundary and is less than one SU in
 * size, there will be two Rop -- X -- Lpu branches (Rop -- X -- Wnp in the
 * non-logging version of this DAG).  I call this the "double-XOR" case.
 * The second output from each Rod node goes to the X node.  In the
 * double-XOR case, there are exactly 2 Rod nodes, and each sends one output
 * to one X node.
 * There is one Rod -- Wnd -- T branch for each stripe unit being updated.
 *
 * The block and unblock nodes are unused.  See comment above
 * CreateFaultFreeReadDAG.
 *
 * Note: this DAG ignores all the optimizations related to making the RMWs
 * atomic.  It also has the nasty property that none of the buffers allocated
 * for reading old data & parity can be freed until the XOR node fires.
 * Need to fix this.
 *
 * A null qfuncs indicates single fault tolerant.
 *****************************************************************************/
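
/*
 * Example (editor's sketch, assuming 16-sector stripe units): an 8-sector
 * write covering sectors 12-19 of a stripe spans a SU boundary while
 * touching less than one full SU on each side.  asmap->parityInfo then
 * holds two discontiguous parity ranges (numParityNodes == 2), and the
 * code below builds two Rop -- X -- Lpu branches: the double-XOR case
 * described above.
 */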
314 1.1 oster
315 1.3 oster void
316 1.3 oster rf_CommonCreateParityLoggingSmallWriteDAG(
317 1.3 oster RF_Raid_t * raidPtr,
318 1.3 oster RF_AccessStripeMap_t * asmap,
319 1.3 oster RF_DagHeader_t * dag_h,
320 1.3 oster void *bp,
321 1.3 oster RF_RaidAccessFlags_t flags,
322 1.3 oster RF_AllocListElem_t * allocList,
323 1.3 oster RF_RedFuncs_t * pfuncs,
324 1.3 oster RF_RedFuncs_t * qfuncs)
325 1.1 oster {
326 1.3 oster RF_DagNode_t *xorNodes, *blockNode, *unblockNode, *nodes;
327 1.3 oster RF_DagNode_t *readDataNodes, *readParityNodes;
328 1.3 oster RF_DagNode_t *writeDataNodes, *lpuNodes;
329 1.3 oster RF_DagNode_t *unlockDataNodes = NULL, *termNode;
330 1.3 oster RF_PhysDiskAddr_t *pda = asmap->physInfo;
331 1.3 oster int numDataNodes = asmap->numStripeUnitsAccessed;
332 1.3 oster int numParityNodes = (asmap->parityInfo->next) ? 2 : 1;
333 1.3 oster int i, j, nNodes, totalNumNodes;
334 1.3 oster RF_ReconUnitNum_t which_ru;
335 1.3 oster int (*func) (RF_DagNode_t * node), (*undoFunc) (RF_DagNode_t * node);
336 1.3 oster int (*qfunc) (RF_DagNode_t * node);
337 1.3 oster char *name, *qname;
338 1.3 oster RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru);
339 1.5 thorpej #ifdef RAID_DIAGNOSTIC
340 1.3 oster long nfaults = qfuncs ? 2 : 1;
341 1.5 thorpej #endif /* RAID_DIAGNOSTIC */
342 1.3 oster int lu_flag = (rf_enableAtomicRMW) ? 1 : 0; /* lock/unlock flag */
343 1.3 oster
344 1.3 oster if (rf_dagDebug)
345 1.3 oster printf("[Creating parity-logging small-write DAG]\n");
346 1.3 oster RF_ASSERT(numDataNodes > 0);
347 1.3 oster RF_ASSERT(nfaults == 1);
348 1.3 oster dag_h->creator = "ParityLoggingSmallWriteDAG";
349 1.3 oster

        /* DAG creation occurs in four steps:
         * 1. count the number of nodes in the DAG
         * 2. create the nodes
         * 3. initialize the nodes
         * 4. connect the nodes */

        /* Step 1. compute number of nodes in the graph */

        /* number of nodes: a read and write for each data unit, a
         * redundancy computation node for each parity node, a read and Lpu
         * for each parity unit, a block and unblock node (2), a terminator
         * node, and, if atomic RMW, an unlock node for each data unit */
        totalNumNodes = (2 * numDataNodes) + numParityNodes + (2 * numParityNodes) + 3;
        if (lu_flag)
                totalNumNodes += numDataNodes;
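
        /*
         * Example (worked from the formula above): a write confined to
         * part of one stripe unit has numDataNodes == 1 and
         * numParityNodes == 1, so totalNumNodes == 2 + 1 + 2 + 3 == 8
         * (Rod, Wnd, Xor, Rop, Lpu, block, unblock, Trm), plus one
         * unlock node when lu_flag is set.
         */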

        nNodes = numDataNodes + numParityNodes;

        dag_h->numCommitNodes = numDataNodes + numParityNodes;
        dag_h->numCommits = 0;
        dag_h->numSuccedents = 1;

        /* Step 2. create the nodes */
        RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
        i = 0;
        blockNode = &nodes[i];
        i += 1;
        unblockNode = &nodes[i];
        i += 1;
        readDataNodes = &nodes[i];
        i += numDataNodes;
        readParityNodes = &nodes[i];
        i += numParityNodes;
        writeDataNodes = &nodes[i];
        i += numDataNodes;
        lpuNodes = &nodes[i];
        i += numParityNodes;
        xorNodes = &nodes[i];
        i += numParityNodes;
        termNode = &nodes[i];
        i += 1;
        if (lu_flag) {
                unlockDataNodes = &nodes[i];
                i += numDataNodes;
        }
        RF_ASSERT(i == totalNumNodes);

        /* Step 3. initialize the nodes */
        /* initialize block node (Nil) */
        rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0, dag_h, "Nil", allocList);

        /* initialize unblock node (Nil) */
        rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nNodes, 0, 0, dag_h, "Nil", allocList);

        /* initialize terminator node (Trm) */
        rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);

        /* initialize nodes which read old data (Rod) */
        for (i = 0; i < numDataNodes; i++) {
                rf_InitNode(&readDataNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, nNodes, 1, 4, 0, dag_h, "Rod", allocList);
                RF_ASSERT(pda != NULL);
                readDataNodes[i].params[0].p = pda;	/* physical disk addr
							 * desc */
                readDataNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, allocList);	/* buffer to hold old
												 * data */
                readDataNodes[i].params[2].v = parityStripeID;
                readDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, lu_flag, 0, which_ru);
                pda = pda->next;
                readDataNodes[i].propList[0] = NULL;
                readDataNodes[i].propList[1] = NULL;
        }

        /* initialize nodes which read old parity (Rop) */
        pda = asmap->parityInfo;
        for (i = 0; i < numParityNodes; i++) {
                RF_ASSERT(pda != NULL);
                rf_InitNode(&readParityNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, nNodes, 1, 4, 0, dag_h, "Rop", allocList);
                readParityNodes[i].params[0].p = pda;
                readParityNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, allocList);	/* buffer to hold old
													 * parity */
                readParityNodes[i].params[2].v = parityStripeID;
                readParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
                readParityNodes[i].propList[0] = NULL;
                pda = pda->next;
        }

        /* initialize nodes which write new data (Wnd) */
        pda = asmap->physInfo;
        for (i = 0; i < numDataNodes; i++) {
                RF_ASSERT(pda != NULL);
                rf_InitNode(&writeDataNodes[i], rf_wait, RF_TRUE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, nNodes, 4, 0, dag_h, "Wnd", allocList);
                writeDataNodes[i].params[0].p = pda;	/* physical disk addr
							 * desc */
                writeDataNodes[i].params[1].p = pda->bufPtr;	/* buffer holding new
								 * data to be written */
                writeDataNodes[i].params[2].v = parityStripeID;
                writeDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);

                if (lu_flag) {
                        /* initialize node to unlock the disk queue */
                        rf_InitNode(&unlockDataNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc, rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Und", allocList);
                        unlockDataNodes[i].params[0].p = pda;	/* physical disk addr
								 * desc */
                        unlockDataNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, lu_flag, which_ru);
                }
                pda = pda->next;
        }


        /* initialize nodes which compute new parity */
        /* we use the simple XOR func in the double-XOR case, and when we're
         * accessing only a portion of one stripe unit.  the distinction
         * between the two is that the regular XOR func assumes that the
         * targbuf is a full SU in size, and examines the pda associated with
         * the buffer to decide where within the buffer to XOR the data,
         * whereas the simple XOR func just XORs the data into the start of
         * the buffer. */
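        /*
         * Example (editor's sketch, assuming 16-sector stripe units): a
         * 4-sector write inside a single SU (numDataNodes == 1,
         * totalSectorsAccessed == 4 < 16) takes the simple branch below;
         * a write covering one full SU takes the regular branch and XORs
         * at the proper offset within a full-SU target buffer.
         */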
        if ((numParityNodes == 2) || ((numDataNodes == 1) && (asmap->totalSectorsAccessed < raidPtr->Layout.sectorsPerStripeUnit))) {
                func = pfuncs->simple;
                undoFunc = rf_NullNodeUndoFunc;
                name = pfuncs->SimpleName;
                if (qfuncs) {
                        qfunc = qfuncs->simple;
                        qname = qfuncs->SimpleName;
                }
        } else {
                func = pfuncs->regular;
                undoFunc = rf_NullNodeUndoFunc;
                name = pfuncs->RegularName;
                if (qfuncs) {
                        qfunc = qfuncs->regular;
                        qname = qfuncs->RegularName;
                }
        }
        /* initialize the xor nodes: params are {pda,buf} from {Rod,Wnd,Rop}
         * nodes, and raidPtr */
        if (numParityNodes == 2) {	/* double-xor case */
                for (i = 0; i < numParityNodes; i++) {
                        rf_InitNode(&xorNodes[i], rf_wait, RF_TRUE, func, undoFunc, NULL, 1, nNodes, 7, 1, dag_h, name, allocList);	/* no wakeup func for
																	 * xor */
                        xorNodes[i].flags |= RF_DAGNODE_FLAG_YIELD;
                        xorNodes[i].params[0] = readDataNodes[i].params[0];
                        xorNodes[i].params[1] = readDataNodes[i].params[1];
                        xorNodes[i].params[2] = readParityNodes[i].params[0];
                        xorNodes[i].params[3] = readParityNodes[i].params[1];
                        xorNodes[i].params[4] = writeDataNodes[i].params[0];
                        xorNodes[i].params[5] = writeDataNodes[i].params[1];
                        xorNodes[i].params[6].p = raidPtr;
                        xorNodes[i].results[0] = readParityNodes[i].params[1].p;	/* use old parity buf as
											 * target buf */
                }
        } else {
                /* there is only one xor node in this case */
                rf_InitNode(&xorNodes[0], rf_wait, RF_TRUE, func, undoFunc, NULL, 1, nNodes, (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h, name, allocList);
                xorNodes[0].flags |= RF_DAGNODE_FLAG_YIELD;
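                /*
                 * Note: the following loop runs i up to numDataNodes, one
                 * past the last Rod node.  Because readParityNodes
                 * immediately follows readDataNodes in the nodes[] array
                 * (see Step 2 above), readDataNodes[numDataNodes] is
                 * readParityNodes[0], so the final iteration picks up the
                 * Rop node's {pda, buf} pair.
                 */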
                for (i = 0; i < numDataNodes + 1; i++) {
                        /* set up params related to Rod and Rop nodes */
                        xorNodes[0].params[2 * i + 0] = readDataNodes[i].params[0];	/* pda */
                        xorNodes[0].params[2 * i + 1] = readDataNodes[i].params[1];	/* buffer pointer */
                }
                for (i = 0; i < numDataNodes; i++) {
                        /* set up params related to Wnd and Wnp nodes */
                        xorNodes[0].params[2 * (numDataNodes + 1 + i) + 0] = writeDataNodes[i].params[0];	/* pda */
                        xorNodes[0].params[2 * (numDataNodes + 1 + i) + 1] = writeDataNodes[i].params[1];	/* buffer pointer */
                }
                xorNodes[0].params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr;	/* xor node needs to get
											 * at RAID information */
                xorNodes[0].results[0] = readParityNodes[0].params[1].p;
        }

        /* initialize the log node(s) */
        pda = asmap->parityInfo;
        for (i = 0; i < numParityNodes; i++) {
                RF_ASSERT(pda);
                rf_InitNode(&lpuNodes[i], rf_wait, RF_FALSE, rf_ParityLogUpdateFunc, rf_ParityLogUpdateUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Lpu", allocList);
                lpuNodes[i].params[0].p = pda;	/* PhysDiskAddr of parity */
                lpuNodes[i].params[1].p = xorNodes[i].results[0];	/* buffer pointer to
									 * parity */
                pda = pda->next;
        }


        /* Step 4. connect the nodes */

        /* connect header to block node */
        RF_ASSERT(dag_h->numSuccedents == 1);
        RF_ASSERT(blockNode->numAntecedents == 0);
        dag_h->succedents[0] = blockNode;

        /* connect block node to read old data nodes */
        RF_ASSERT(blockNode->numSuccedents == (numDataNodes + numParityNodes));
        for (i = 0; i < numDataNodes; i++) {
                blockNode->succedents[i] = &readDataNodes[i];
                RF_ASSERT(readDataNodes[i].numAntecedents == 1);
                readDataNodes[i].antecedents[0] = blockNode;
                readDataNodes[i].antType[0] = rf_control;
        }

        /* connect block node to read old parity nodes */
        for (i = 0; i < numParityNodes; i++) {
                blockNode->succedents[numDataNodes + i] = &readParityNodes[i];
                RF_ASSERT(readParityNodes[i].numAntecedents == 1);
                readParityNodes[i].antecedents[0] = blockNode;
                readParityNodes[i].antType[0] = rf_control;
        }

        /* connect read old data nodes to write new data nodes */
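        /*
         * The i == j edge below is tagged rf_antiData: Wnd[i] must not
         * overwrite a stripe unit before Rod[i] has captured the old
         * contents, but no data flows along the edge.  The cross edges
         * (i != j) are pure control dependences that keep all writes
         * behind all reads.
         */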
        for (i = 0; i < numDataNodes; i++) {
                RF_ASSERT(readDataNodes[i].numSuccedents == numDataNodes + numParityNodes);
                for (j = 0; j < numDataNodes; j++) {
                        RF_ASSERT(writeDataNodes[j].numAntecedents == numDataNodes + numParityNodes);
                        readDataNodes[i].succedents[j] = &writeDataNodes[j];
                        writeDataNodes[j].antecedents[i] = &readDataNodes[i];
                        if (i == j)
                                writeDataNodes[j].antType[i] = rf_antiData;
                        else
                                writeDataNodes[j].antType[i] = rf_control;
                }
        }

        /* connect read old data nodes to xor nodes */
        for (i = 0; i < numDataNodes; i++)
                for (j = 0; j < numParityNodes; j++) {
                        RF_ASSERT(xorNodes[j].numAntecedents == numDataNodes + numParityNodes);
                        readDataNodes[i].succedents[numDataNodes + j] = &xorNodes[j];
                        xorNodes[j].antecedents[i] = &readDataNodes[i];
                        xorNodes[j].antType[i] = rf_trueData;
                }

        /* connect read old parity nodes to write new data nodes */
        for (i = 0; i < numParityNodes; i++) {
                RF_ASSERT(readParityNodes[i].numSuccedents == numDataNodes + numParityNodes);
                for (j = 0; j < numDataNodes; j++) {
                        readParityNodes[i].succedents[j] = &writeDataNodes[j];
                        writeDataNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
                        writeDataNodes[j].antType[numDataNodes + i] = rf_control;
                }
        }

        /* connect read old parity nodes to xor nodes */
        for (i = 0; i < numParityNodes; i++)
                for (j = 0; j < numParityNodes; j++) {
                        readParityNodes[i].succedents[numDataNodes + j] = &xorNodes[j];
                        xorNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
                        xorNodes[j].antType[numDataNodes + i] = rf_trueData;
                }

        /* connect xor nodes to parity log update (Lpu) nodes */
        for (i = 0; i < numParityNodes; i++) {
                RF_ASSERT(xorNodes[i].numSuccedents == 1);
                RF_ASSERT(lpuNodes[i].numAntecedents == 1);
                xorNodes[i].succedents[0] = &lpuNodes[i];
                lpuNodes[i].antecedents[0] = &xorNodes[i];
                lpuNodes[i].antType[0] = rf_trueData;
        }

        for (i = 0; i < numDataNodes; i++) {
                if (lu_flag) {
                        /* connect write new data nodes to unlock nodes */
                        RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
                        RF_ASSERT(unlockDataNodes[i].numAntecedents == 1);
                        writeDataNodes[i].succedents[0] = &unlockDataNodes[i];
                        unlockDataNodes[i].antecedents[0] = &writeDataNodes[i];
                        unlockDataNodes[i].antType[0] = rf_control;

                        /* connect unlock nodes to unblock node */
                        RF_ASSERT(unlockDataNodes[i].numSuccedents == 1);
                        RF_ASSERT(unblockNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
                        unlockDataNodes[i].succedents[0] = unblockNode;
                        unblockNode->antecedents[i] = &unlockDataNodes[i];
                        unblockNode->antType[i] = rf_control;
                } else {
                        /* connect write new data nodes to unblock node */
                        RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
                        RF_ASSERT(unblockNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
                        writeDataNodes[i].succedents[0] = unblockNode;
                        unblockNode->antecedents[i] = &writeDataNodes[i];
                        unblockNode->antType[i] = rf_control;
                }
        }

        /* connect parity log update (Lpu) nodes to unblock node */
        for (i = 0; i < numParityNodes; i++) {
                RF_ASSERT(lpuNodes[i].numSuccedents == 1);
                lpuNodes[i].succedents[0] = unblockNode;
                unblockNode->antecedents[numDataNodes + i] = &lpuNodes[i];
                unblockNode->antType[numDataNodes + i] = rf_control;
        }

        /* connect unblock node to terminator */
        RF_ASSERT(unblockNode->numSuccedents == 1);
        RF_ASSERT(termNode->numAntecedents == 1);
        RF_ASSERT(termNode->numSuccedents == 0);
        unblockNode->succedents[0] = termNode;
        termNode->antecedents[0] = unblockNode;
        termNode->antType[0] = rf_control;
}


void
rf_CreateParityLoggingSmallWriteDAG(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList,
    RF_RedFuncs_t * pfuncs,
    RF_RedFuncs_t * qfuncs)
{
        dag_h->creator = "ParityLoggingSmallWriteDAG";
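        /*
         * Parity logging is single-fault tolerant, so the caller's pfuncs
         * and qfuncs are ignored: plain XOR (rf_xorFuncs) and a NULL q
         * function are hardwired here.
         */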
        rf_CommonCreateParityLoggingSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_xorFuncs, NULL);
}


void
rf_CreateParityLoggingLargeWriteDAG(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList,
    int nfaults,
    int (*redFunc) (RF_DagNode_t *))
{
        dag_h->creator = "ParityLoggingLargeWriteDAG";
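        /*
         * As above, the nfaults and redFunc arguments are ignored: the
         * common routine is always invoked single-fault tolerant with
         * the regular XOR function.
         */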
        rf_CommonCreateParityLoggingLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 1, rf_RegularXorFunc);
}
#endif	/* RF_INCLUDE_PARITYLOGGING > 0 */