/*	$NetBSD: rf_parityloggingdags.c,v 1.5 2001/09/01 23:50:44 thorpej Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: William V. Courtright II
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

#include "rf_archs.h"

#if RF_INCLUDE_PARITYLOGGING > 0

/*
 * DAGs specific to parity logging are created here.
 */

#include "rf_types.h"
#include "rf_raid.h"
#include "rf_dag.h"
#include "rf_dagutils.h"
#include "rf_dagfuncs.h"
#include "rf_debugMem.h"
#include "rf_paritylog.h"
#include "rf_memchunk.h"
#include "rf_general.h"

#include "rf_parityloggingdags.h"

/******************************************************************************
 *
 * creates a DAG to perform a large-write operation:
 *
 *              / Rod \             / Wnd \
 * H -- NIL -- Rod -- NIL -- Wnd ------ NIL -- T
 *              \ Rod /             \ Xor - Lpo /
 *
 * The writes are not done until the reads complete because, if they were
 * done in parallel, a failure on one of the reads could leave the parity in
 * an inconsistent state, so that a retry with a new DAG would produce
 * erroneous parity.
 *
 * Note: this DAG has the nasty property that none of the buffers allocated
 * for reading old data can be freed until the XOR node fires.  Need to fix
 * this.
 *
 * The last two arguments are the number of faults tolerated and the
 * function used for the redundancy calculation.  The undo for the
 * redundancy calculation is assumed to be null.
 *
 *****************************************************************************/

void
rf_CommonCreateParityLoggingLargeWriteDAG(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList,
    int nfaults,
    int (*redFunc) (RF_DagNode_t *))
{
	RF_DagNode_t *nodes, *wndNodes, *rodNodes = NULL, *syncNode, *xorNode,
	       *lpoNode, *blockNode, *unblockNode, *termNode;
	int     nWndNodes, nRodNodes, i;
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_AccessStripeMapHeader_t *new_asm_h[2];
	int     nodeNum, asmNum;
	RF_ReconUnitNum_t which_ru;
	char   *sosBuffer, *eosBuffer;
	RF_PhysDiskAddr_t *pda;
	RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru);

	if (rf_dagDebug)
		printf("[Creating parity-logging large-write DAG]\n");
	RF_ASSERT(nfaults == 1);	/* this architecture is only
					 * single-fault tolerant */
	dag_h->creator = "ParityLoggingLargeWriteDAG";

	/* alloc the Wnd nodes, the xor node, and the Lpo node */
	nWndNodes = asmap->numStripeUnitsAccessed;
	RF_CallocAndAdd(nodes, nWndNodes + 6, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
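	/* carve the single allocation into the individual node groups */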
	i = 0;
	wndNodes = &nodes[i];
	i += nWndNodes;
	xorNode = &nodes[i];
	i += 1;
	lpoNode = &nodes[i];
	i += 1;
	blockNode = &nodes[i];
	i += 1;
	syncNode = &nodes[i];
	i += 1;
	unblockNode = &nodes[i];
	i += 1;
	termNode = &nodes[i];
	i += 1;

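	/* the nWndNodes Wnd nodes plus the Xor node are the commit points
	 * of this DAG (they are initialized with the commit flag set below) */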
	dag_h->numCommitNodes = nWndNodes + 1;
	dag_h->numCommits = 0;
	dag_h->numSuccedents = 1;

	rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h, new_asm_h, &nRodNodes, &sosBuffer, &eosBuffer, allocList);
	if (nRodNodes > 0)
		RF_CallocAndAdd(rodNodes, nRodNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);

	/* begin node initialization */
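	/* (the numeric rf_InitNode arguments are, in order: number of
	 * succedents, number of antecedents, number of params, and number
	 * of results) */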
	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nRodNodes + 1, 0, 0, 0, dag_h, "Nil", allocList);
	rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nWndNodes + 1, 0, 0, dag_h, "Nil", allocList);
	rf_InitNode(syncNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nWndNodes + 1, nRodNodes + 1, 0, 0, dag_h, "Nil", allocList);
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);

	/* initialize the Rod nodes */
	for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) {
		if (new_asm_h[asmNum]) {
			pda = new_asm_h[asmNum]->stripeMap->physInfo;
			while (pda) {
				rf_InitNode(&rodNodes[nodeNum], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rod", allocList);
				rodNodes[nodeNum].params[0].p = pda;
				rodNodes[nodeNum].params[1].p = pda->bufPtr;
				rodNodes[nodeNum].params[2].v = parityStripeID;
				rodNodes[nodeNum].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
				nodeNum++;
				pda = pda->next;
			}
		}
	}
	RF_ASSERT(nodeNum == nRodNodes);

	/* initialize the Wnd nodes */
	pda = asmap->physInfo;
	for (i = 0; i < nWndNodes; i++) {
		rf_InitNode(&wndNodes[i], rf_wait, RF_TRUE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
		RF_ASSERT(pda != NULL);
		wndNodes[i].params[0].p = pda;
		wndNodes[i].params[1].p = pda->bufPtr;
		wndNodes[i].params[2].v = parityStripeID;
		wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
		pda = pda->next;
	}

	/* initialize the redundancy node */
	rf_InitNode(xorNode, rf_wait, RF_TRUE, redFunc, rf_NullNodeUndoFunc, NULL, 1, 1, 2 * (nWndNodes + nRodNodes) + 1, 1, dag_h, "Xr ", allocList);
	xorNode->flags |= RF_DAGNODE_FLAG_YIELD;
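	/* xorNode params: a {pda, buf} pair for each Wnd and Rod node,
	 * followed by a pointer to the RAID structure */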
	for (i = 0; i < nWndNodes; i++) {
		xorNode->params[2 * i + 0] = wndNodes[i].params[0];	/* pda */
		xorNode->params[2 * i + 1] = wndNodes[i].params[1];	/* buf ptr */
	}
	for (i = 0; i < nRodNodes; i++) {
		xorNode->params[2 * (nWndNodes + i) + 0] = rodNodes[i].params[0];	/* pda */
		xorNode->params[2 * (nWndNodes + i) + 1] = rodNodes[i].params[1];	/* buf ptr */
	}
	xorNode->params[2 * (nWndNodes + nRodNodes)].p = raidPtr;	/* xor node needs to get
									 * at RAID information */

	/* look for an Rod node that reads a complete SU.  If none, alloc a
	 * buffer to receive the parity info.  Note that we can't use a new
	 * data buffer because it will not have gotten written when the xor
	 * occurs. */
	for (i = 0; i < nRodNodes; i++)
		if (((RF_PhysDiskAddr_t *) rodNodes[i].params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit)
			break;
	if (i == nRodNodes) {
		RF_CallocAndAdd(xorNode->results[0], 1, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), (void *), allocList);
	} else {
		xorNode->results[0] = rodNodes[i].params[1].p;
	}

	/* initialize the Lpo node */
	rf_InitNode(lpoNode, rf_wait, RF_FALSE, rf_ParityLogOverwriteFunc, rf_ParityLogOverwriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Lpo", allocList);

	lpoNode->params[0].p = asmap->parityInfo;
	lpoNode->params[1].p = xorNode->results[0];
	RF_ASSERT(asmap->parityInfo->next == NULL);	/* parityInfo must
							 * describe entire
							 * parity unit */

	/* connect nodes to form graph */

	/* connect dag header to block node */
	RF_ASSERT(dag_h->numSuccedents == 1);
	RF_ASSERT(blockNode->numAntecedents == 0);
	dag_h->succedents[0] = blockNode;

	/* connect the block node to the Rod nodes */
	RF_ASSERT(blockNode->numSuccedents == nRodNodes + 1);
	for (i = 0; i < nRodNodes; i++) {
		RF_ASSERT(rodNodes[i].numAntecedents == 1);
		blockNode->succedents[i] = &rodNodes[i];
		rodNodes[i].antecedents[0] = blockNode;
		rodNodes[i].antType[0] = rf_control;
	}

	/* connect the block node to the sync node */
	/* necessary if nRodNodes == 0 */
	RF_ASSERT(syncNode->numAntecedents == nRodNodes + 1);
	blockNode->succedents[nRodNodes] = syncNode;
	syncNode->antecedents[0] = blockNode;
	syncNode->antType[0] = rf_control;

	/* connect the Rod nodes to the syncNode */
	for (i = 0; i < nRodNodes; i++) {
		rodNodes[i].succedents[0] = syncNode;
		syncNode->antecedents[1 + i] = &rodNodes[i];
		syncNode->antType[1 + i] = rf_control;
	}

	/* connect the sync node to the xor node */
	RF_ASSERT(syncNode->numSuccedents == nWndNodes + 1);
	RF_ASSERT(xorNode->numAntecedents == 1);
	syncNode->succedents[0] = xorNode;
	xorNode->antecedents[0] = syncNode;
	xorNode->antType[0] = rf_trueData;	/* carry forward from sync */

	/* connect the sync node to the Wnd nodes */
	for (i = 0; i < nWndNodes; i++) {
		RF_ASSERT(wndNodes[i].numAntecedents == 1);
		syncNode->succedents[1 + i] = &wndNodes[i];
		wndNodes[i].antecedents[0] = syncNode;
		wndNodes[i].antType[0] = rf_control;
	}

	/* connect the xor node to the Lpo node */
	RF_ASSERT(xorNode->numSuccedents == 1);
	RF_ASSERT(lpoNode->numAntecedents == 1);
	xorNode->succedents[0] = lpoNode;
	lpoNode->antecedents[0] = xorNode;
	lpoNode->antType[0] = rf_trueData;

	/* connect the Wnd nodes to the unblock node */
	RF_ASSERT(unblockNode->numAntecedents == nWndNodes + 1);
	for (i = 0; i < nWndNodes; i++) {
		RF_ASSERT(wndNodes[i].numSuccedents == 1);
		wndNodes[i].succedents[0] = unblockNode;
		unblockNode->antecedents[i] = &wndNodes[i];
		unblockNode->antType[i] = rf_control;
	}

	/* connect the Lpo node to the unblock node */
	RF_ASSERT(lpoNode->numSuccedents == 1);
	lpoNode->succedents[0] = unblockNode;
	unblockNode->antecedents[nWndNodes] = lpoNode;
	unblockNode->antType[nWndNodes] = rf_control;

	/* connect unblock node to terminator */
	RF_ASSERT(unblockNode->numSuccedents == 1);
	RF_ASSERT(termNode->numAntecedents == 1);
	RF_ASSERT(termNode->numSuccedents == 0);
	unblockNode->succedents[0] = termNode;
	termNode->antecedents[0] = unblockNode;
	termNode->antType[0] = rf_control;
}




/******************************************************************************
 *
 * creates a DAG to perform a small-write operation (either raid 5 or pq),
 * which is as follows:
 *
 *                            Header
 *                              |
 *                            Block
 *                          / |  ... \   \
 *                         /  |       \   \
 *                       Rod  Rod     Rod  Rop
 *                       | \ /| \    / |  \/ |
 *                       |  |  |    /\        |
 *                      Wnd  Wnd   Wnd        X
 *                       |    \     /         |
 *                       |     \   /          |
 *                        \     \ /          Lpo
 *                         \     \ /         /
 *                          +-> Unblock <---+
 *                                |
 *                                T
 *
 *
 * R = Read, W = Write, X = Xor, o = old, n = new, d = data, p = parity.
 * When the access spans a stripe unit boundary and is less than one SU in
 * size, there will be two Rop -- X -- Wnp branches.  I call this the
 * "double-XOR" case.
 * The second output from each Rod node goes to the X node.  In the
 * double-XOR case, there are exactly 2 Rod nodes, and each sends one output
 * to one X node.
 * There is one Rod -- Wnd -- T branch for each stripe unit being updated.
 *
 * The block and unblock nodes are unused.  See comment above
 * CreateFaultFreeReadDAG.
 *
 * Note: this DAG ignores all the optimizations related to making the RMWs
 * atomic.  It also has the nasty property that none of the buffers allocated
 * for reading old data & parity can be freed until the XOR node fires.
 * Need to fix this.
 *
 * A null qfuncs indicates single fault tolerance.
 *****************************************************************************/

void
rf_CommonCreateParityLoggingSmallWriteDAG(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList,
    RF_RedFuncs_t * pfuncs,
    RF_RedFuncs_t * qfuncs)
{
	RF_DagNode_t *xorNodes, *blockNode, *unblockNode, *nodes;
	RF_DagNode_t *readDataNodes, *readParityNodes;
	RF_DagNode_t *writeDataNodes, *lpuNodes;
	RF_DagNode_t *unlockDataNodes = NULL, *termNode;
	RF_PhysDiskAddr_t *pda = asmap->physInfo;
	int     numDataNodes = asmap->numStripeUnitsAccessed;
	int     numParityNodes = (asmap->parityInfo->next) ? 2 : 1;
	int     i, j, nNodes, totalNumNodes;
	RF_ReconUnitNum_t which_ru;
	int     (*func) (RF_DagNode_t * node), (*undoFunc) (RF_DagNode_t * node);
	int     (*qfunc) (RF_DagNode_t * node);
	char   *name, *qname;
	RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru);
#ifdef RAID_DIAGNOSTIC
	long    nfaults = qfuncs ? 2 : 1;
#endif				/* RAID_DIAGNOSTIC */
	int     lu_flag = (rf_enableAtomicRMW) ? 1 : 0;	/* lock/unlock flag */

	if (rf_dagDebug)
		printf("[Creating parity-logging small-write DAG]\n");
	RF_ASSERT(numDataNodes > 0);
	RF_ASSERT(nfaults == 1);
	dag_h->creator = "ParityLoggingSmallWriteDAG";

	/* DAG creation occurs in four steps: 1. count the number of nodes in
	 * the DAG, 2. create the nodes, 3. initialize the nodes, 4. connect
	 * the nodes */

	/* Step 1. compute number of nodes in the graph */

	/* number of nodes: a read and write for each data unit, a redundancy
	 * computation node for each parity node, a read and Lpu for each
	 * parity unit, a block and unblock node (2), a terminator node, and,
	 * if atomic RMW, an unlock node for each data unit and redundancy
	 * unit */
	totalNumNodes = (2 * numDataNodes) + numParityNodes + (2 * numParityNodes) + 3;
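	/* (in order: Rod + Wnd per data unit, one Xor per parity node,
	 * Rop + Lpu per parity unit, and the block, unblock and term nodes) */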
	if (lu_flag)
		totalNumNodes += numDataNodes;

	nNodes = numDataNodes + numParityNodes;
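	/* nNodes is the fan-out of the block node and the fan-in of the
	 * unblock node, and likewise the fan-out/fan-in of each read/write
	 * node (see the rf_InitNode calls below) */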

	dag_h->numCommitNodes = numDataNodes + numParityNodes;
	dag_h->numCommits = 0;
	dag_h->numSuccedents = 1;

	/* Step 2. create the nodes */
	RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
	i = 0;
	blockNode = &nodes[i];
	i += 1;
	unblockNode = &nodes[i];
	i += 1;
	readDataNodes = &nodes[i];
	i += numDataNodes;
	readParityNodes = &nodes[i];
	i += numParityNodes;
	writeDataNodes = &nodes[i];
	i += numDataNodes;
	lpuNodes = &nodes[i];
	i += numParityNodes;
	xorNodes = &nodes[i];
	i += numParityNodes;
	termNode = &nodes[i];
	i += 1;
	if (lu_flag) {
		unlockDataNodes = &nodes[i];
		i += numDataNodes;
	}
	RF_ASSERT(i == totalNumNodes);

	/* Step 3. initialize the nodes */
	/* initialize block node (Nil) */
	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0, dag_h, "Nil", allocList);

	/* initialize unblock node (Nil) */
	rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nNodes, 0, 0, dag_h, "Nil", allocList);

	/* initialize terminator node (Trm) */
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);

	/* initialize nodes which read old data (Rod) */
	for (i = 0; i < numDataNodes; i++) {
		rf_InitNode(&readDataNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, nNodes, 1, 4, 0, dag_h, "Rod", allocList);
		RF_ASSERT(pda != NULL);
		readDataNodes[i].params[0].p = pda;	/* physical disk addr
							 * desc */
		readDataNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, allocList);	/* buffer to hold old
												 * data */
		readDataNodes[i].params[2].v = parityStripeID;
		readDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, lu_flag, 0, which_ru);
		pda = pda->next;
		readDataNodes[i].propList[0] = NULL;
		readDataNodes[i].propList[1] = NULL;
	}

	/* initialize nodes which read old parity (Rop) */
	pda = asmap->parityInfo;
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(pda != NULL);
		rf_InitNode(&readParityNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, nNodes, 1, 4, 0, dag_h, "Rop", allocList);
		readParityNodes[i].params[0].p = pda;
		readParityNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, allocList);	/* buffer to hold old
												 * parity */
		readParityNodes[i].params[2].v = parityStripeID;
		readParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
		readParityNodes[i].propList[0] = NULL;
		pda = pda->next;
	}

	/* initialize nodes which write new data (Wnd) */
	pda = asmap->physInfo;
	for (i = 0; i < numDataNodes; i++) {
		RF_ASSERT(pda != NULL);
		rf_InitNode(&writeDataNodes[i], rf_wait, RF_TRUE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, nNodes, 4, 0, dag_h, "Wnd", allocList);
		writeDataNodes[i].params[0].p = pda;	/* physical disk addr
							 * desc */
		writeDataNodes[i].params[1].p = pda->bufPtr;	/* buffer holding new
								 * data to be written */
		writeDataNodes[i].params[2].v = parityStripeID;
		writeDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);

		if (lu_flag) {
			/* initialize node to unlock the disk queue */
			rf_InitNode(&unlockDataNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc, rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Und", allocList);
			unlockDataNodes[i].params[0].p = pda;	/* physical disk addr
								 * desc */
			unlockDataNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, lu_flag, which_ru);
		}
		pda = pda->next;
	}


	/* initialize nodes which compute new parity */
	/* we use the simple XOR func in the double-XOR case, and when we're
	 * accessing only a portion of one stripe unit.  the distinction
	 * between the two is that the regular XOR func assumes that the
	 * targbuf is a full SU in size, and examines the pda associated with
	 * the buffer to decide where within the buffer to XOR the data,
	 * whereas the simple XOR func just XORs the data into the start of
	 * the buffer. */
	if ((numParityNodes == 2) || ((numDataNodes == 1) && (asmap->totalSectorsAccessed < raidPtr->Layout.sectorsPerStripeUnit))) {
		func = pfuncs->simple;
		undoFunc = rf_NullNodeUndoFunc;
		name = pfuncs->SimpleName;
		if (qfuncs) {
			qfunc = qfuncs->simple;
			qname = qfuncs->SimpleName;
		}
	} else {
		func = pfuncs->regular;
		undoFunc = rf_NullNodeUndoFunc;
		name = pfuncs->RegularName;
		if (qfuncs) {
			qfunc = qfuncs->regular;
			qname = qfuncs->RegularName;
		}
	}
	/* initialize the xor nodes: params are {pda,buf} from {Rod,Wnd,Rop}
	 * nodes, and raidPtr */
	if (numParityNodes == 2) {	/* double-xor case */
		for (i = 0; i < numParityNodes; i++) {
			rf_InitNode(&xorNodes[i], rf_wait, RF_TRUE, func, undoFunc, NULL, 1, nNodes, 7, 1, dag_h, name, allocList);	/* no wakeup func for
																 * xor */
			xorNodes[i].flags |= RF_DAGNODE_FLAG_YIELD;
			xorNodes[i].params[0] = readDataNodes[i].params[0];
			xorNodes[i].params[1] = readDataNodes[i].params[1];
			xorNodes[i].params[2] = readParityNodes[i].params[0];
			xorNodes[i].params[3] = readParityNodes[i].params[1];
			xorNodes[i].params[4] = writeDataNodes[i].params[0];
			xorNodes[i].params[5] = writeDataNodes[i].params[1];
			xorNodes[i].params[6].p = raidPtr;
			xorNodes[i].results[0] = readParityNodes[i].params[1].p;	/* use old parity buf as
											 * target buf */
		}
	} else {
		/* there is only one xor node in this case */
		rf_InitNode(&xorNodes[0], rf_wait, RF_TRUE, func, undoFunc, NULL, 1, nNodes, (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h, name, allocList);
		xorNodes[0].flags |= RF_DAGNODE_FLAG_YIELD;
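		/* note: the first loop below strides one element past the
		 * end of readDataNodes; this relies on readParityNodes being
		 * carved immediately after readDataNodes in the nodes array,
		 * so readDataNodes[numDataNodes] is readParityNodes[0] (the
		 * Rop node) */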
		for (i = 0; i < numDataNodes + 1; i++) {
			/* set up params related to Rod and Rop nodes */
			xorNodes[0].params[2 * i + 0] = readDataNodes[i].params[0];	/* pda */
			xorNodes[0].params[2 * i + 1] = readDataNodes[i].params[1];	/* buffer pointer */
		}
		for (i = 0; i < numDataNodes; i++) {
			/* set up params related to Wnd and Wnp nodes */
			xorNodes[0].params[2 * (numDataNodes + 1 + i) + 0] = writeDataNodes[i].params[0];	/* pda */
			xorNodes[0].params[2 * (numDataNodes + 1 + i) + 1] = writeDataNodes[i].params[1];	/* buffer pointer */
		}
		xorNodes[0].params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr;	/* xor node needs to get
											 * at RAID information */
		xorNodes[0].results[0] = readParityNodes[0].params[1].p;	/* use old parity buf as
										 * target buf */
	}

	/* initialize the log node(s) */
	pda = asmap->parityInfo;
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(pda);
		rf_InitNode(&lpuNodes[i], rf_wait, RF_FALSE, rf_ParityLogUpdateFunc, rf_ParityLogUpdateUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Lpu", allocList);
		lpuNodes[i].params[0].p = pda;	/* PhysDiskAddr of parity */
		lpuNodes[i].params[1].p = xorNodes[i].results[0];	/* buffer pointer to
									 * parity */
		pda = pda->next;
	}


	/* Step 4. connect the nodes */

	/* connect header to block node */
	RF_ASSERT(dag_h->numSuccedents == 1);
	RF_ASSERT(blockNode->numAntecedents == 0);
	dag_h->succedents[0] = blockNode;

	/* connect block node to read old data nodes */
	RF_ASSERT(blockNode->numSuccedents == (numDataNodes + numParityNodes));
	for (i = 0; i < numDataNodes; i++) {
		blockNode->succedents[i] = &readDataNodes[i];
		RF_ASSERT(readDataNodes[i].numAntecedents == 1);
		readDataNodes[i].antecedents[0] = blockNode;
		readDataNodes[i].antType[0] = rf_control;
	}

	/* connect block node to read old parity nodes */
	for (i = 0; i < numParityNodes; i++) {
		blockNode->succedents[numDataNodes + i] = &readParityNodes[i];
		RF_ASSERT(readParityNodes[i].numAntecedents == 1);
		readParityNodes[i].antecedents[0] = blockNode;
		readParityNodes[i].antType[0] = rf_control;
	}

	/* connect read old data nodes to write new data nodes */
	for (i = 0; i < numDataNodes; i++) {
		RF_ASSERT(readDataNodes[i].numSuccedents == numDataNodes + numParityNodes);
		for (j = 0; j < numDataNodes; j++) {
			RF_ASSERT(writeDataNodes[j].numAntecedents == numDataNodes + numParityNodes);
			readDataNodes[i].succedents[j] = &writeDataNodes[j];
			writeDataNodes[j].antecedents[i] = &readDataNodes[i];
			if (i == j)
				writeDataNodes[j].antType[i] = rf_antiData;	/* anti-dependence: the
										 * write overwrites the
										 * sectors its own Rod
										 * just read */
			else
				writeDataNodes[j].antType[i] = rf_control;
		}
	}

	/* connect read old data nodes to xor nodes */
	for (i = 0; i < numDataNodes; i++)
		for (j = 0; j < numParityNodes; j++) {
			RF_ASSERT(xorNodes[j].numAntecedents == numDataNodes + numParityNodes);
			readDataNodes[i].succedents[numDataNodes + j] = &xorNodes[j];
			xorNodes[j].antecedents[i] = &readDataNodes[i];
			xorNodes[j].antType[i] = rf_trueData;	/* the xor consumes the
								 * buffer filled by this
								 * read */
		}

	/* connect read old parity nodes to write new data nodes */
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(readParityNodes[i].numSuccedents == numDataNodes + numParityNodes);
		for (j = 0; j < numDataNodes; j++) {
			readParityNodes[i].succedents[j] = &writeDataNodes[j];
			writeDataNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
			writeDataNodes[j].antType[numDataNodes + i] = rf_control;
		}
	}

	/* connect read old parity nodes to xor nodes */
	for (i = 0; i < numParityNodes; i++)
		for (j = 0; j < numParityNodes; j++) {
			readParityNodes[i].succedents[numDataNodes + j] = &xorNodes[j];
			xorNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
			xorNodes[j].antType[numDataNodes + i] = rf_trueData;
		}

	/* connect xor nodes to parity log update (Lpu) nodes */
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(xorNodes[i].numSuccedents == 1);
		RF_ASSERT(lpuNodes[i].numAntecedents == 1);
		xorNodes[i].succedents[0] = &lpuNodes[i];
		lpuNodes[i].antecedents[0] = &xorNodes[i];
		lpuNodes[i].antType[0] = rf_trueData;
	}

	for (i = 0; i < numDataNodes; i++) {
		if (lu_flag) {
			/* connect write new data nodes to unlock nodes */
			RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
			RF_ASSERT(unlockDataNodes[i].numAntecedents == 1);
			writeDataNodes[i].succedents[0] = &unlockDataNodes[i];
			unlockDataNodes[i].antecedents[0] = &writeDataNodes[i];
			unlockDataNodes[i].antType[0] = rf_control;

			/* connect unlock nodes to unblock node */
			RF_ASSERT(unlockDataNodes[i].numSuccedents == 1);
			RF_ASSERT(unblockNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
			unlockDataNodes[i].succedents[0] = unblockNode;
			unblockNode->antecedents[i] = &unlockDataNodes[i];
			unblockNode->antType[i] = rf_control;
		} else {
			/* connect write new data nodes to unblock node */
			RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
			RF_ASSERT(unblockNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
			writeDataNodes[i].succedents[0] = unblockNode;
			unblockNode->antecedents[i] = &writeDataNodes[i];
			unblockNode->antType[i] = rf_control;
		}
	}
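	/* (unblock antecedent slots [0 .. numDataNodes) were filled by the
	 * data path above; slots [numDataNodes ..) are filled by the Lpu
	 * connections below) */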

	/* connect parity log update nodes to unblock node */
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(lpuNodes[i].numSuccedents == 1);
		lpuNodes[i].succedents[0] = unblockNode;
		unblockNode->antecedents[numDataNodes + i] = &lpuNodes[i];
		unblockNode->antType[numDataNodes + i] = rf_control;
	}

	/* connect unblock node to terminator */
	RF_ASSERT(unblockNode->numSuccedents == 1);
	RF_ASSERT(termNode->numAntecedents == 1);
	RF_ASSERT(termNode->numSuccedents == 0);
	unblockNode->succedents[0] = termNode;
	termNode->antecedents[0] = unblockNode;
	termNode->antType[0] = rf_control;
}


void
rf_CreateParityLoggingSmallWriteDAG(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList,
    RF_RedFuncs_t * pfuncs,
    RF_RedFuncs_t * qfuncs)
{
	dag_h->creator = "ParityLoggingSmallWriteDAG";
	rf_CommonCreateParityLoggingSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_xorFuncs, NULL);
}


void
rf_CreateParityLoggingLargeWriteDAG(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList,
    int nfaults,
    int (*redFunc) (RF_DagNode_t *))
{
	dag_h->creator = "ParityLoggingLargeWriteDAG";
	rf_CommonCreateParityLoggingLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 1, rf_RegularXorFunc);
}
#endif				/* RF_INCLUDE_PARITYLOGGING > 0 */