rf_parityloggingdags.c revision 1.3 1 1.3 oster /* $NetBSD: rf_parityloggingdags.c,v 1.3 1999/02/05 00:06:14 oster Exp $ */
2 1.1 oster /*
3 1.1 oster * Copyright (c) 1995 Carnegie-Mellon University.
4 1.1 oster * All rights reserved.
5 1.1 oster *
6 1.1 oster * Author: William V. Courtright II
7 1.1 oster *
8 1.1 oster * Permission to use, copy, modify and distribute this software and
9 1.1 oster * its documentation is hereby granted, provided that both the copyright
10 1.1 oster * notice and this permission notice appear in all copies of the
11 1.1 oster * software, derivative works or modified versions, and any portions
12 1.1 oster * thereof, and that both notices appear in supporting documentation.
13 1.1 oster *
14 1.1 oster * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 1.1 oster * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 1.1 oster * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 1.1 oster *
18 1.1 oster * Carnegie Mellon requests users of this software to return to
19 1.1 oster *
20 1.1 oster * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 1.1 oster * School of Computer Science
22 1.1 oster * Carnegie Mellon University
23 1.1 oster * Pittsburgh PA 15213-3890
24 1.1 oster *
25 1.1 oster * any improvements or extensions that they make and grant Carnegie the
26 1.1 oster * rights to redistribute these changes.
27 1.1 oster */
28 1.1 oster
29 1.1 oster #include "rf_archs.h"
30 1.1 oster
31 1.1 oster #if RF_INCLUDE_PARITYLOGGING > 0
32 1.1 oster
33 1.1 oster /*
34 1.1 oster DAGs specific to parity logging are created here
35 1.1 oster */
36 1.1 oster
37 1.1 oster #include "rf_types.h"
38 1.1 oster #include "rf_raid.h"
39 1.1 oster #include "rf_dag.h"
40 1.1 oster #include "rf_dagutils.h"
41 1.1 oster #include "rf_dagfuncs.h"
42 1.1 oster #include "rf_threadid.h"
43 1.1 oster #include "rf_debugMem.h"
44 1.1 oster #include "rf_paritylog.h"
45 1.1 oster #include "rf_memchunk.h"
46 1.1 oster #include "rf_general.h"
47 1.1 oster
48 1.1 oster #include "rf_parityloggingdags.h"
49 1.1 oster
50 1.1 oster /******************************************************************************
51 1.1 oster *
52 1.1 oster * creates a DAG to perform a large-write operation:
53 1.1 oster *
54 1.1 oster * / Rod \ / Wnd \
55 1.1 oster * H -- NIL- Rod - NIL - Wnd ------ NIL - T
56 1.1 oster * \ Rod / \ Xor - Lpo /
57 1.1 oster *
58 1.1 oster * The writes are not done until the reads complete because if they were done in
59 1.1 oster * parallel, a failure on one of the reads could leave the parity in an inconsistent
60 1.1 oster * state, so that the retry with a new DAG would produce erroneous parity.
61 1.1 oster *
62 1.1 oster * Note: this DAG has the nasty property that none of the buffers allocated for reading
63 1.1 oster * old data can be freed until the XOR node fires. Need to fix this.
64 1.1 oster *
65 1.1 oster * The last two arguments are the number of faults tolerated, and function for the
66 1.1 oster * redundancy calculation. The undo for the redundancy calc is assumed to be null
67 1.1 oster *
68 1.1 oster *****************************************************************************/
69 1.1 oster
70 1.3 oster void
71 1.3 oster rf_CommonCreateParityLoggingLargeWriteDAG(
72 1.3 oster RF_Raid_t * raidPtr,
73 1.3 oster RF_AccessStripeMap_t * asmap,
74 1.3 oster RF_DagHeader_t * dag_h,
75 1.3 oster void *bp,
76 1.3 oster RF_RaidAccessFlags_t flags,
77 1.3 oster RF_AllocListElem_t * allocList,
78 1.3 oster int nfaults,
79 1.3 oster int (*redFunc) (RF_DagNode_t *))
80 1.1 oster {
81 1.3 oster RF_DagNode_t *nodes, *wndNodes, *rodNodes = NULL, *syncNode, *xorNode,
82 1.3 oster *lpoNode, *blockNode, *unblockNode, *termNode;
83 1.3 oster int nWndNodes, nRodNodes, i;
84 1.3 oster RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
85 1.3 oster RF_AccessStripeMapHeader_t *new_asm_h[2];
86 1.3 oster int nodeNum, asmNum;
87 1.3 oster RF_ReconUnitNum_t which_ru;
88 1.3 oster char *sosBuffer, *eosBuffer;
89 1.3 oster RF_PhysDiskAddr_t *pda;
90 1.3 oster RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru);
91 1.3 oster
92 1.3 oster if (rf_dagDebug)
93 1.3 oster printf("[Creating parity-logging large-write DAG]\n");
94 1.3 oster RF_ASSERT(nfaults == 1);/* this arch only single fault tolerant */
95 1.3 oster dag_h->creator = "ParityLoggingLargeWriteDAG";
96 1.3 oster
97 1.3 oster /* alloc the Wnd nodes, the xor node, and the Lpo node */
98 1.3 oster nWndNodes = asmap->numStripeUnitsAccessed;
99 1.3 oster RF_CallocAndAdd(nodes, nWndNodes + 6, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
100 1.3 oster i = 0;
101 1.3 oster wndNodes = &nodes[i];
102 1.3 oster i += nWndNodes;
103 1.3 oster xorNode = &nodes[i];
104 1.3 oster i += 1;
105 1.3 oster lpoNode = &nodes[i];
106 1.3 oster i += 1;
107 1.3 oster blockNode = &nodes[i];
108 1.3 oster i += 1;
109 1.3 oster syncNode = &nodes[i];
110 1.3 oster i += 1;
111 1.3 oster unblockNode = &nodes[i];
112 1.3 oster i += 1;
113 1.3 oster termNode = &nodes[i];
114 1.3 oster i += 1;
115 1.3 oster
116 1.3 oster dag_h->numCommitNodes = nWndNodes + 1;
117 1.3 oster dag_h->numCommits = 0;
118 1.3 oster dag_h->numSuccedents = 1;
119 1.3 oster
120 1.3 oster rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h, new_asm_h, &nRodNodes, &sosBuffer, &eosBuffer, allocList);
121 1.3 oster if (nRodNodes > 0)
122 1.3 oster RF_CallocAndAdd(rodNodes, nRodNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
123 1.3 oster
124 1.3 oster /* begin node initialization */
125 1.3 oster rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nRodNodes + 1, 0, 0, 0, dag_h, "Nil", allocList);
126 1.3 oster rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nWndNodes + 1, 0, 0, dag_h, "Nil", allocList);
127 1.3 oster rf_InitNode(syncNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nWndNodes + 1, nRodNodes + 1, 0, 0, dag_h, "Nil", allocList);
128 1.3 oster rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
129 1.3 oster
130 1.3 oster /* initialize the Rod nodes */
131 1.3 oster for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) {
132 1.3 oster if (new_asm_h[asmNum]) {
133 1.3 oster pda = new_asm_h[asmNum]->stripeMap->physInfo;
134 1.3 oster while (pda) {
135 1.3 oster rf_InitNode(&rodNodes[nodeNum], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rod", allocList);
136 1.3 oster rodNodes[nodeNum].params[0].p = pda;
137 1.3 oster rodNodes[nodeNum].params[1].p = pda->bufPtr;
138 1.3 oster rodNodes[nodeNum].params[2].v = parityStripeID;
139 1.3 oster rodNodes[nodeNum].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
140 1.3 oster nodeNum++;
141 1.3 oster pda = pda->next;
142 1.3 oster }
143 1.3 oster }
144 1.3 oster }
145 1.3 oster RF_ASSERT(nodeNum == nRodNodes);
146 1.3 oster
147 1.3 oster /* initialize the wnd nodes */
148 1.3 oster pda = asmap->physInfo;
149 1.3 oster for (i = 0; i < nWndNodes; i++) {
150 1.3 oster rf_InitNode(&wndNodes[i], rf_wait, RF_TRUE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
151 1.3 oster RF_ASSERT(pda != NULL);
152 1.3 oster wndNodes[i].params[0].p = pda;
153 1.3 oster wndNodes[i].params[1].p = pda->bufPtr;
154 1.3 oster wndNodes[i].params[2].v = parityStripeID;
155 1.3 oster wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
156 1.3 oster pda = pda->next;
157 1.3 oster }
158 1.3 oster
159 1.3 oster /* initialize the redundancy node */
160 1.3 oster rf_InitNode(xorNode, rf_wait, RF_TRUE, redFunc, rf_NullNodeUndoFunc, NULL, 1, 1, 2 * (nWndNodes + nRodNodes) + 1, 1, dag_h, "Xr ", allocList);
161 1.3 oster xorNode->flags |= RF_DAGNODE_FLAG_YIELD;
162 1.3 oster for (i = 0; i < nWndNodes; i++) {
163 1.3 oster xorNode->params[2 * i + 0] = wndNodes[i].params[0]; /* pda */
164 1.3 oster xorNode->params[2 * i + 1] = wndNodes[i].params[1]; /* buf ptr */
165 1.3 oster }
166 1.3 oster for (i = 0; i < nRodNodes; i++) {
167 1.3 oster xorNode->params[2 * (nWndNodes + i) + 0] = rodNodes[i].params[0]; /* pda */
168 1.3 oster xorNode->params[2 * (nWndNodes + i) + 1] = rodNodes[i].params[1]; /* buf ptr */
169 1.3 oster }
170 1.3 oster xorNode->params[2 * (nWndNodes + nRodNodes)].p = raidPtr; /* xor node needs to get
171 1.3 oster * at RAID information */
172 1.3 oster
173 1.3 oster /* look for an Rod node that reads a complete SU. If none, alloc a
174 1.3 oster * buffer to receive the parity info. Note that we can't use a new
175 1.3 oster * data buffer because it will not have gotten written when the xor
176 1.3 oster * occurs. */
177 1.3 oster for (i = 0; i < nRodNodes; i++)
178 1.3 oster if (((RF_PhysDiskAddr_t *) rodNodes[i].params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit)
179 1.3 oster break;
180 1.3 oster if (i == nRodNodes) {
181 1.3 oster RF_CallocAndAdd(xorNode->results[0], 1, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), (void *), allocList);
182 1.3 oster } else {
183 1.3 oster xorNode->results[0] = rodNodes[i].params[1].p;
184 1.3 oster }
185 1.3 oster
186 1.3 oster /* initialize the Lpo node */
187 1.3 oster rf_InitNode(lpoNode, rf_wait, RF_FALSE, rf_ParityLogOverwriteFunc, rf_ParityLogOverwriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Lpo", allocList);
188 1.3 oster
189 1.3 oster lpoNode->params[0].p = asmap->parityInfo;
190 1.3 oster lpoNode->params[1].p = xorNode->results[0];
191 1.3 oster RF_ASSERT(asmap->parityInfo->next == NULL); /* parityInfo must
192 1.3 oster * describe entire
193 1.3 oster * parity unit */
194 1.3 oster
195 1.3 oster /* connect nodes to form graph */
196 1.3 oster
197 1.3 oster /* connect dag header to block node */
198 1.3 oster RF_ASSERT(dag_h->numSuccedents == 1);
199 1.3 oster RF_ASSERT(blockNode->numAntecedents == 0);
200 1.3 oster dag_h->succedents[0] = blockNode;
201 1.3 oster
202 1.3 oster /* connect the block node to the Rod nodes */
203 1.3 oster RF_ASSERT(blockNode->numSuccedents == nRodNodes + 1);
204 1.3 oster for (i = 0; i < nRodNodes; i++) {
205 1.3 oster RF_ASSERT(rodNodes[i].numAntecedents == 1);
206 1.3 oster blockNode->succedents[i] = &rodNodes[i];
207 1.3 oster rodNodes[i].antecedents[0] = blockNode;
208 1.3 oster rodNodes[i].antType[0] = rf_control;
209 1.3 oster }
210 1.3 oster
211 1.3 oster /* connect the block node to the sync node */
212 1.3 oster /* necessary if nRodNodes == 0 */
213 1.3 oster RF_ASSERT(syncNode->numAntecedents == nRodNodes + 1);
214 1.3 oster blockNode->succedents[nRodNodes] = syncNode;
215 1.3 oster syncNode->antecedents[0] = blockNode;
216 1.3 oster syncNode->antType[0] = rf_control;
217 1.3 oster
218 1.3 oster /* connect the Rod nodes to the syncNode */
219 1.3 oster for (i = 0; i < nRodNodes; i++) {
220 1.3 oster rodNodes[i].succedents[0] = syncNode;
221 1.3 oster syncNode->antecedents[1 + i] = &rodNodes[i];
222 1.3 oster syncNode->antType[1 + i] = rf_control;
223 1.3 oster }
224 1.3 oster
225 1.3 oster /* connect the sync node to the xor node */
226 1.3 oster RF_ASSERT(syncNode->numSuccedents == nWndNodes + 1);
227 1.3 oster RF_ASSERT(xorNode->numAntecedents == 1);
228 1.3 oster syncNode->succedents[0] = xorNode;
229 1.3 oster xorNode->antecedents[0] = syncNode;
230 1.3 oster xorNode->antType[0] = rf_trueData; /* carry forward from sync */
231 1.3 oster
232 1.3 oster /* connect the sync node to the Wnd nodes */
233 1.3 oster for (i = 0; i < nWndNodes; i++) {
234 1.3 oster RF_ASSERT(wndNodes->numAntecedents == 1);
235 1.3 oster syncNode->succedents[1 + i] = &wndNodes[i];
236 1.3 oster wndNodes[i].antecedents[0] = syncNode;
237 1.3 oster wndNodes[i].antType[0] = rf_control;
238 1.3 oster }
239 1.3 oster
240 1.3 oster /* connect the xor node to the Lpo node */
241 1.3 oster RF_ASSERT(xorNode->numSuccedents == 1);
242 1.3 oster RF_ASSERT(lpoNode->numAntecedents == 1);
243 1.3 oster xorNode->succedents[0] = lpoNode;
244 1.3 oster lpoNode->antecedents[0] = xorNode;
245 1.3 oster lpoNode->antType[0] = rf_trueData;
246 1.3 oster
247 1.3 oster /* connect the Wnd nodes to the unblock node */
248 1.3 oster RF_ASSERT(unblockNode->numAntecedents == nWndNodes + 1);
249 1.3 oster for (i = 0; i < nWndNodes; i++) {
250 1.3 oster RF_ASSERT(wndNodes->numSuccedents == 1);
251 1.3 oster wndNodes[i].succedents[0] = unblockNode;
252 1.3 oster unblockNode->antecedents[i] = &wndNodes[i];
253 1.3 oster unblockNode->antType[i] = rf_control;
254 1.3 oster }
255 1.3 oster
256 1.3 oster /* connect the Lpo node to the unblock node */
257 1.3 oster RF_ASSERT(lpoNode->numSuccedents == 1);
258 1.3 oster lpoNode->succedents[0] = unblockNode;
259 1.3 oster unblockNode->antecedents[nWndNodes] = lpoNode;
260 1.3 oster unblockNode->antType[nWndNodes] = rf_control;
261 1.3 oster
262 1.3 oster /* connect unblock node to terminator */
263 1.3 oster RF_ASSERT(unblockNode->numSuccedents == 1);
264 1.3 oster RF_ASSERT(termNode->numAntecedents == 1);
265 1.3 oster RF_ASSERT(termNode->numSuccedents == 0);
266 1.3 oster unblockNode->succedents[0] = termNode;
267 1.3 oster termNode->antecedents[0] = unblockNode;
268 1.3 oster termNode->antType[0] = rf_control;
269 1.1 oster }
270 1.1 oster
271 1.1 oster
272 1.1 oster
273 1.1 oster
274 1.1 oster /******************************************************************************
275 1.1 oster *
276 1.1 oster * creates a DAG to perform a small-write operation (either raid 5 or pq), which is as follows:
277 1.1 oster *
278 1.1 oster * Header
279 1.1 oster * |
280 1.1 oster * Block
281 1.3 oster * / | ... \ \
282 1.3 oster * / | \ \
283 1.1 oster * Rod Rod Rod Rop
284 1.3 oster * | \ /| \ / | \/ |
285 1.3 oster * | | | /\ |
286 1.3 oster * Wnd Wnd Wnd X
287 1.3 oster * | \ / |
288 1.3 oster * | \ / |
289 1.1 oster * \ \ / Lpo
290 1.3 oster * \ \ / /
291 1.3 oster * +-> Unblock <-+
292 1.1 oster * |
293 1.1 oster * T
294 1.3 oster *
295 1.1 oster *
296 1.1 oster * R = Read, W = Write, X = Xor, o = old, n = new, d = data, p = parity.
297 1.1 oster * When the access spans a stripe unit boundary and is less than one SU in size, there will
298 1.1 oster * be two Rop -- X -- Wnp branches. I call this the "double-XOR" case.
299 1.1 oster * The second output from each Rod node goes to the X node. In the double-XOR
300 1.1 oster * case, there are exactly 2 Rod nodes, and each sends one output to one X node.
301 1.1 oster * There is one Rod -- Wnd -- T branch for each stripe unit being updated.
302 1.1 oster *
303 1.1 oster * The block and unblock nodes are unused. See comment above CreateFaultFreeReadDAG.
304 1.1 oster *
305 1.1 oster * Note: this DAG ignores all the optimizations related to making the RMWs atomic.
306 1.1 oster * it also has the nasty property that none of the buffers allocated for reading
307 1.1 oster * old data & parity can be freed until the XOR node fires. Need to fix this.
308 1.1 oster *
309 1.1 oster * A null qfuncs indicates single fault tolerant
310 1.1 oster *****************************************************************************/
311 1.1 oster
312 1.3 oster void
313 1.3 oster rf_CommonCreateParityLoggingSmallWriteDAG(
314 1.3 oster RF_Raid_t * raidPtr,
315 1.3 oster RF_AccessStripeMap_t * asmap,
316 1.3 oster RF_DagHeader_t * dag_h,
317 1.3 oster void *bp,
318 1.3 oster RF_RaidAccessFlags_t flags,
319 1.3 oster RF_AllocListElem_t * allocList,
320 1.3 oster RF_RedFuncs_t * pfuncs,
321 1.3 oster RF_RedFuncs_t * qfuncs)
322 1.1 oster {
323 1.3 oster RF_DagNode_t *xorNodes, *blockNode, *unblockNode, *nodes;
324 1.3 oster RF_DagNode_t *readDataNodes, *readParityNodes;
325 1.3 oster RF_DagNode_t *writeDataNodes, *lpuNodes;
326 1.3 oster RF_DagNode_t *unlockDataNodes = NULL, *termNode;
327 1.3 oster RF_PhysDiskAddr_t *pda = asmap->physInfo;
328 1.3 oster int numDataNodes = asmap->numStripeUnitsAccessed;
329 1.3 oster int numParityNodes = (asmap->parityInfo->next) ? 2 : 1;
330 1.3 oster int i, j, nNodes, totalNumNodes;
331 1.3 oster RF_ReconUnitNum_t which_ru;
332 1.3 oster int (*func) (RF_DagNode_t * node), (*undoFunc) (RF_DagNode_t * node);
333 1.3 oster int (*qfunc) (RF_DagNode_t * node);
334 1.3 oster char *name, *qname;
335 1.3 oster RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru);
336 1.3 oster long nfaults = qfuncs ? 2 : 1;
337 1.3 oster int lu_flag = (rf_enableAtomicRMW) ? 1 : 0; /* lock/unlock flag */
338 1.3 oster
339 1.3 oster if (rf_dagDebug)
340 1.3 oster printf("[Creating parity-logging small-write DAG]\n");
341 1.3 oster RF_ASSERT(numDataNodes > 0);
342 1.3 oster RF_ASSERT(nfaults == 1);
343 1.3 oster dag_h->creator = "ParityLoggingSmallWriteDAG";
344 1.3 oster
345 1.3 oster /* DAG creation occurs in three steps: 1. count the number of nodes in
346 1.3 oster * the DAG 2. create the nodes 3. initialize the nodes 4. connect the
347 1.3 oster * nodes */
348 1.3 oster
349 1.3 oster /* Step 1. compute number of nodes in the graph */
350 1.3 oster
351 1.3 oster /* number of nodes: a read and write for each data unit a redundancy
352 1.3 oster * computation node for each parity node a read and Lpu for each
353 1.3 oster * parity unit a block and unblock node (2) a terminator node if
354 1.3 oster * atomic RMW an unlock node for each data unit, redundancy unit */
355 1.3 oster totalNumNodes = (2 * numDataNodes) + numParityNodes + (2 * numParityNodes) + 3;
356 1.3 oster if (lu_flag)
357 1.3 oster totalNumNodes += numDataNodes;
358 1.3 oster
359 1.3 oster nNodes = numDataNodes + numParityNodes;
360 1.3 oster
361 1.3 oster dag_h->numCommitNodes = numDataNodes + numParityNodes;
362 1.3 oster dag_h->numCommits = 0;
363 1.3 oster dag_h->numSuccedents = 1;
364 1.3 oster
365 1.3 oster /* Step 2. create the nodes */
366 1.3 oster RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
367 1.3 oster i = 0;
368 1.3 oster blockNode = &nodes[i];
369 1.3 oster i += 1;
370 1.3 oster unblockNode = &nodes[i];
371 1.3 oster i += 1;
372 1.3 oster readDataNodes = &nodes[i];
373 1.3 oster i += numDataNodes;
374 1.3 oster readParityNodes = &nodes[i];
375 1.3 oster i += numParityNodes;
376 1.3 oster writeDataNodes = &nodes[i];
377 1.3 oster i += numDataNodes;
378 1.3 oster lpuNodes = &nodes[i];
379 1.3 oster i += numParityNodes;
380 1.3 oster xorNodes = &nodes[i];
381 1.3 oster i += numParityNodes;
382 1.3 oster termNode = &nodes[i];
383 1.3 oster i += 1;
384 1.3 oster if (lu_flag) {
385 1.3 oster unlockDataNodes = &nodes[i];
386 1.3 oster i += numDataNodes;
387 1.3 oster }
388 1.3 oster RF_ASSERT(i == totalNumNodes);
389 1.3 oster
390 1.3 oster /* Step 3. initialize the nodes */
391 1.3 oster /* initialize block node (Nil) */
392 1.3 oster rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0, dag_h, "Nil", allocList);
393 1.3 oster
394 1.3 oster /* initialize unblock node (Nil) */
395 1.3 oster rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nNodes, 0, 0, dag_h, "Nil", allocList);
396 1.3 oster
397 1.3 oster /* initialize terminatory node (Trm) */
398 1.3 oster rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
399 1.3 oster
400 1.3 oster /* initialize nodes which read old data (Rod) */
401 1.3 oster for (i = 0; i < numDataNodes; i++) {
402 1.3 oster rf_InitNode(&readDataNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, nNodes, 1, 4, 0, dag_h, "Rod", allocList);
403 1.3 oster RF_ASSERT(pda != NULL);
404 1.3 oster readDataNodes[i].params[0].p = pda; /* physical disk addr
405 1.3 oster * desc */
406 1.3 oster readDataNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, allocList); /* buffer to hold old
407 1.3 oster * data */
408 1.3 oster readDataNodes[i].params[2].v = parityStripeID;
409 1.3 oster readDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, lu_flag, 0, which_ru);
410 1.3 oster pda = pda->next;
411 1.3 oster readDataNodes[i].propList[0] = NULL;
412 1.3 oster readDataNodes[i].propList[1] = NULL;
413 1.3 oster }
414 1.3 oster
415 1.3 oster /* initialize nodes which read old parity (Rop) */
416 1.3 oster pda = asmap->parityInfo;
417 1.3 oster i = 0;
418 1.3 oster for (i = 0; i < numParityNodes; i++) {
419 1.3 oster RF_ASSERT(pda != NULL);
420 1.3 oster rf_InitNode(&readParityNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, nNodes, 1, 4, 0, dag_h, "Rop", allocList);
421 1.3 oster readParityNodes[i].params[0].p = pda;
422 1.3 oster readParityNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, allocList); /* buffer to hold old
423 1.3 oster * parity */
424 1.3 oster readParityNodes[i].params[2].v = parityStripeID;
425 1.3 oster readParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
426 1.3 oster readParityNodes[i].propList[0] = NULL;
427 1.3 oster pda = pda->next;
428 1.3 oster }
429 1.3 oster
430 1.3 oster /* initialize nodes which write new data (Wnd) */
431 1.3 oster pda = asmap->physInfo;
432 1.3 oster for (i = 0; i < numDataNodes; i++) {
433 1.3 oster RF_ASSERT(pda != NULL);
434 1.3 oster rf_InitNode(&writeDataNodes[i], rf_wait, RF_TRUE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, nNodes, 4, 0, dag_h, "Wnd", allocList);
435 1.3 oster writeDataNodes[i].params[0].p = pda; /* physical disk addr
436 1.3 oster * desc */
437 1.3 oster writeDataNodes[i].params[1].p = pda->bufPtr; /* buffer holding new
438 1.3 oster * data to be written */
439 1.3 oster writeDataNodes[i].params[2].v = parityStripeID;
440 1.3 oster writeDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
441 1.3 oster
442 1.3 oster if (lu_flag) {
443 1.3 oster /* initialize node to unlock the disk queue */
444 1.3 oster rf_InitNode(&unlockDataNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc, rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Und", allocList);
445 1.3 oster unlockDataNodes[i].params[0].p = pda; /* physical disk addr
446 1.3 oster * desc */
447 1.3 oster unlockDataNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, lu_flag, which_ru);
448 1.3 oster }
449 1.3 oster pda = pda->next;
450 1.3 oster }
451 1.3 oster
452 1.3 oster
453 1.3 oster /* initialize nodes which compute new parity */
454 1.3 oster /* we use the simple XOR func in the double-XOR case, and when we're
455 1.3 oster * accessing only a portion of one stripe unit. the distinction
456 1.3 oster * between the two is that the regular XOR func assumes that the
457 1.3 oster * targbuf is a full SU in size, and examines the pda associated with
458 1.3 oster * the buffer to decide where within the buffer to XOR the data,
459 1.3 oster * whereas the simple XOR func just XORs the data into the start of
460 1.3 oster * the buffer. */
461 1.3 oster if ((numParityNodes == 2) || ((numDataNodes == 1) && (asmap->totalSectorsAccessed < raidPtr->Layout.sectorsPerStripeUnit))) {
462 1.3 oster func = pfuncs->simple;
463 1.3 oster undoFunc = rf_NullNodeUndoFunc;
464 1.3 oster name = pfuncs->SimpleName;
465 1.3 oster if (qfuncs) {
466 1.3 oster qfunc = qfuncs->simple;
467 1.3 oster qname = qfuncs->SimpleName;
468 1.3 oster }
469 1.3 oster } else {
470 1.3 oster func = pfuncs->regular;
471 1.3 oster undoFunc = rf_NullNodeUndoFunc;
472 1.3 oster name = pfuncs->RegularName;
473 1.3 oster if (qfuncs) {
474 1.3 oster qfunc = qfuncs->regular;
475 1.3 oster qname = qfuncs->RegularName;
476 1.3 oster }
477 1.3 oster }
478 1.3 oster /* initialize the xor nodes: params are {pda,buf} from {Rod,Wnd,Rop}
479 1.3 oster * nodes, and raidPtr */
480 1.3 oster if (numParityNodes == 2) { /* double-xor case */
481 1.3 oster for (i = 0; i < numParityNodes; i++) {
482 1.3 oster rf_InitNode(&xorNodes[i], rf_wait, RF_TRUE, func, undoFunc, NULL, 1, nNodes, 7, 1, dag_h, name, allocList); /* no wakeup func for
483 1.3 oster * xor */
484 1.3 oster xorNodes[i].flags |= RF_DAGNODE_FLAG_YIELD;
485 1.3 oster xorNodes[i].params[0] = readDataNodes[i].params[0];
486 1.3 oster xorNodes[i].params[1] = readDataNodes[i].params[1];
487 1.3 oster xorNodes[i].params[2] = readParityNodes[i].params[0];
488 1.3 oster xorNodes[i].params[3] = readParityNodes[i].params[1];
489 1.3 oster xorNodes[i].params[4] = writeDataNodes[i].params[0];
490 1.3 oster xorNodes[i].params[5] = writeDataNodes[i].params[1];
491 1.3 oster xorNodes[i].params[6].p = raidPtr;
492 1.3 oster xorNodes[i].results[0] = readParityNodes[i].params[1].p; /* use old parity buf as
493 1.3 oster * target buf */
494 1.3 oster }
495 1.3 oster } else {
496 1.3 oster /* there is only one xor node in this case */
497 1.3 oster rf_InitNode(&xorNodes[0], rf_wait, RF_TRUE, func, undoFunc, NULL, 1, nNodes, (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h, name, allocList);
498 1.3 oster xorNodes[0].flags |= RF_DAGNODE_FLAG_YIELD;
499 1.3 oster for (i = 0; i < numDataNodes + 1; i++) {
500 1.3 oster /* set up params related to Rod and Rop nodes */
501 1.3 oster xorNodes[0].params[2 * i + 0] = readDataNodes[i].params[0]; /* pda */
502 1.3 oster xorNodes[0].params[2 * i + 1] = readDataNodes[i].params[1]; /* buffer pointer */
503 1.3 oster }
504 1.3 oster for (i = 0; i < numDataNodes; i++) {
505 1.3 oster /* set up params related to Wnd and Wnp nodes */
506 1.3 oster xorNodes[0].params[2 * (numDataNodes + 1 + i) + 0] = writeDataNodes[i].params[0]; /* pda */
507 1.3 oster xorNodes[0].params[2 * (numDataNodes + 1 + i) + 1] = writeDataNodes[i].params[1]; /* buffer pointer */
508 1.3 oster }
509 1.3 oster xorNodes[0].params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr; /* xor node needs to get
510 1.3 oster * at RAID information */
511 1.3 oster xorNodes[0].results[0] = readParityNodes[0].params[1].p;
512 1.3 oster }
513 1.3 oster
514 1.3 oster /* initialize the log node(s) */
515 1.3 oster pda = asmap->parityInfo;
516 1.3 oster for (i = 0; i < numParityNodes; i++) {
517 1.3 oster RF_ASSERT(pda);
518 1.3 oster rf_InitNode(&lpuNodes[i], rf_wait, RF_FALSE, rf_ParityLogUpdateFunc, rf_ParityLogUpdateUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Lpu", allocList);
519 1.3 oster lpuNodes[i].params[0].p = pda; /* PhysDiskAddr of parity */
520 1.3 oster lpuNodes[i].params[1].p = xorNodes[i].results[0]; /* buffer pointer to
521 1.3 oster * parity */
522 1.3 oster pda = pda->next;
523 1.3 oster }
524 1.3 oster
525 1.3 oster
526 1.3 oster /* Step 4. connect the nodes */
527 1.3 oster
528 1.3 oster /* connect header to block node */
529 1.3 oster RF_ASSERT(dag_h->numSuccedents == 1);
530 1.3 oster RF_ASSERT(blockNode->numAntecedents == 0);
531 1.3 oster dag_h->succedents[0] = blockNode;
532 1.3 oster
533 1.3 oster /* connect block node to read old data nodes */
534 1.3 oster RF_ASSERT(blockNode->numSuccedents == (numDataNodes + numParityNodes));
535 1.3 oster for (i = 0; i < numDataNodes; i++) {
536 1.3 oster blockNode->succedents[i] = &readDataNodes[i];
537 1.3 oster RF_ASSERT(readDataNodes[i].numAntecedents == 1);
538 1.3 oster readDataNodes[i].antecedents[0] = blockNode;
539 1.3 oster readDataNodes[i].antType[0] = rf_control;
540 1.3 oster }
541 1.3 oster
542 1.3 oster /* connect block node to read old parity nodes */
543 1.3 oster for (i = 0; i < numParityNodes; i++) {
544 1.3 oster blockNode->succedents[numDataNodes + i] = &readParityNodes[i];
545 1.3 oster RF_ASSERT(readParityNodes[i].numAntecedents == 1);
546 1.3 oster readParityNodes[i].antecedents[0] = blockNode;
547 1.3 oster readParityNodes[i].antType[0] = rf_control;
548 1.3 oster }
549 1.3 oster
550 1.3 oster /* connect read old data nodes to write new data nodes */
551 1.3 oster for (i = 0; i < numDataNodes; i++) {
552 1.3 oster RF_ASSERT(readDataNodes[i].numSuccedents == numDataNodes + numParityNodes);
553 1.3 oster for (j = 0; j < numDataNodes; j++) {
554 1.3 oster RF_ASSERT(writeDataNodes[j].numAntecedents == numDataNodes + numParityNodes);
555 1.3 oster readDataNodes[i].succedents[j] = &writeDataNodes[j];
556 1.3 oster writeDataNodes[j].antecedents[i] = &readDataNodes[i];
557 1.3 oster if (i == j)
558 1.3 oster writeDataNodes[j].antType[i] = rf_antiData;
559 1.3 oster else
560 1.3 oster writeDataNodes[j].antType[i] = rf_control;
561 1.3 oster }
562 1.3 oster }
563 1.3 oster
564 1.3 oster /* connect read old data nodes to xor nodes */
565 1.3 oster for (i = 0; i < numDataNodes; i++)
566 1.3 oster for (j = 0; j < numParityNodes; j++) {
567 1.3 oster RF_ASSERT(xorNodes[j].numAntecedents == numDataNodes + numParityNodes);
568 1.3 oster readDataNodes[i].succedents[numDataNodes + j] = &xorNodes[j];
569 1.3 oster xorNodes[j].antecedents[i] = &readDataNodes[i];
570 1.3 oster xorNodes[j].antType[i] = rf_trueData;
571 1.3 oster }
572 1.3 oster
573 1.3 oster /* connect read old parity nodes to write new data nodes */
574 1.3 oster for (i = 0; i < numParityNodes; i++) {
575 1.3 oster RF_ASSERT(readParityNodes[i].numSuccedents == numDataNodes + numParityNodes);
576 1.3 oster for (j = 0; j < numDataNodes; j++) {
577 1.3 oster readParityNodes[i].succedents[j] = &writeDataNodes[j];
578 1.3 oster writeDataNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
579 1.3 oster writeDataNodes[j].antType[numDataNodes + i] = rf_control;
580 1.3 oster }
581 1.3 oster }
582 1.3 oster
583 1.3 oster /* connect read old parity nodes to xor nodes */
584 1.3 oster for (i = 0; i < numParityNodes; i++)
585 1.3 oster for (j = 0; j < numParityNodes; j++) {
586 1.3 oster readParityNodes[i].succedents[numDataNodes + j] = &xorNodes[j];
587 1.3 oster xorNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
588 1.3 oster xorNodes[j].antType[numDataNodes + i] = rf_trueData;
589 1.3 oster }
590 1.3 oster
591 1.3 oster /* connect xor nodes to write new parity nodes */
592 1.3 oster for (i = 0; i < numParityNodes; i++) {
593 1.3 oster RF_ASSERT(xorNodes[i].numSuccedents == 1);
594 1.3 oster RF_ASSERT(lpuNodes[i].numAntecedents == 1);
595 1.3 oster xorNodes[i].succedents[0] = &lpuNodes[i];
596 1.3 oster lpuNodes[i].antecedents[0] = &xorNodes[i];
597 1.3 oster lpuNodes[i].antType[0] = rf_trueData;
598 1.3 oster }
599 1.3 oster
600 1.3 oster for (i = 0; i < numDataNodes; i++) {
601 1.3 oster if (lu_flag) {
602 1.3 oster /* connect write new data nodes to unlock nodes */
603 1.3 oster RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
604 1.3 oster RF_ASSERT(unlockDataNodes[i].numAntecedents == 1);
605 1.3 oster writeDataNodes[i].succedents[0] = &unlockDataNodes[i];
606 1.3 oster unlockDataNodes[i].antecedents[0] = &writeDataNodes[i];
607 1.3 oster unlockDataNodes[i].antType[0] = rf_control;
608 1.3 oster
609 1.3 oster /* connect unlock nodes to unblock node */
610 1.3 oster RF_ASSERT(unlockDataNodes[i].numSuccedents == 1);
611 1.3 oster RF_ASSERT(unblockNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
612 1.3 oster unlockDataNodes[i].succedents[0] = unblockNode;
613 1.3 oster unblockNode->antecedents[i] = &unlockDataNodes[i];
614 1.3 oster unblockNode->antType[i] = rf_control;
615 1.3 oster } else {
616 1.3 oster /* connect write new data nodes to unblock node */
617 1.3 oster RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
618 1.3 oster RF_ASSERT(unblockNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
619 1.3 oster writeDataNodes[i].succedents[0] = unblockNode;
620 1.3 oster unblockNode->antecedents[i] = &writeDataNodes[i];
621 1.3 oster unblockNode->antType[i] = rf_control;
622 1.3 oster }
623 1.3 oster }
624 1.3 oster
625 1.3 oster /* connect write new parity nodes to unblock node */
626 1.3 oster for (i = 0; i < numParityNodes; i++) {
627 1.3 oster RF_ASSERT(lpuNodes[i].numSuccedents == 1);
628 1.3 oster lpuNodes[i].succedents[0] = unblockNode;
629 1.3 oster unblockNode->antecedents[numDataNodes + i] = &lpuNodes[i];
630 1.3 oster unblockNode->antType[numDataNodes + i] = rf_control;
631 1.3 oster }
632 1.3 oster
633 1.3 oster /* connect unblock node to terminator */
634 1.3 oster RF_ASSERT(unblockNode->numSuccedents == 1);
635 1.3 oster RF_ASSERT(termNode->numAntecedents == 1);
636 1.3 oster RF_ASSERT(termNode->numSuccedents == 0);
637 1.3 oster unblockNode->succedents[0] = termNode;
638 1.3 oster termNode->antecedents[0] = unblockNode;
639 1.3 oster termNode->antType[0] = rf_control;
640 1.1 oster }
641 1.1 oster
642 1.1 oster
643 1.3 oster void
644 1.3 oster rf_CreateParityLoggingSmallWriteDAG(
645 1.3 oster RF_Raid_t * raidPtr,
646 1.3 oster RF_AccessStripeMap_t * asmap,
647 1.3 oster RF_DagHeader_t * dag_h,
648 1.3 oster void *bp,
649 1.3 oster RF_RaidAccessFlags_t flags,
650 1.3 oster RF_AllocListElem_t * allocList,
651 1.3 oster RF_RedFuncs_t * pfuncs,
652 1.3 oster RF_RedFuncs_t * qfuncs)
653 1.1 oster {
654 1.3 oster dag_h->creator = "ParityLoggingSmallWriteDAG";
655 1.3 oster rf_CommonCreateParityLoggingSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_xorFuncs, NULL);
656 1.1 oster }
657 1.1 oster
658 1.1 oster
659 1.3 oster void
660 1.3 oster rf_CreateParityLoggingLargeWriteDAG(
661 1.3 oster RF_Raid_t * raidPtr,
662 1.3 oster RF_AccessStripeMap_t * asmap,
663 1.3 oster RF_DagHeader_t * dag_h,
664 1.3 oster void *bp,
665 1.3 oster RF_RaidAccessFlags_t flags,
666 1.3 oster RF_AllocListElem_t * allocList,
667 1.3 oster int nfaults,
668 1.3 oster int (*redFunc) (RF_DagNode_t *))
669 1.1 oster {
670 1.3 oster dag_h->creator = "ParityLoggingSmallWriteDAG";
671 1.3 oster rf_CommonCreateParityLoggingLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 1, rf_RegularXorFunc);
672 1.1 oster }
673 1.3 oster #endif /* RF_INCLUDE_PARITYLOGGING > 0 */
674