rf_dagdegwr.c revision 1.9 1 /* $NetBSD: rf_dagdegwr.c,v 1.9 2001/11/13 07:11:13 lukem Exp $ */
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29 /*
30 * rf_dagdegwr.c
31 *
32 * code for creating degraded write DAGs
33 *
34 */
35
36 #include <sys/cdefs.h>
37 __KERNEL_RCSID(0, "$NetBSD: rf_dagdegwr.c,v 1.9 2001/11/13 07:11:13 lukem Exp $");
38
39 #include <dev/raidframe/raidframevar.h>
40
41 #include "rf_raid.h"
42 #include "rf_dag.h"
43 #include "rf_dagutils.h"
44 #include "rf_dagfuncs.h"
45 #include "rf_debugMem.h"
46 #include "rf_memchunk.h"
47 #include "rf_general.h"
48 #include "rf_dagdegwr.h"
49
50
51 /******************************************************************************
52 *
53 * General comments on DAG creation:
54 *
55 * All DAGs in this file use roll-away error recovery. Each DAG has a single
56 * commit node, usually called "Cmt." If an error occurs before the Cmt node
57 * is reached, the execution engine will halt forward execution and work
58 * backward through the graph, executing the undo functions. Assuming that
59 * each node in the graph prior to the Cmt node is undoable and atomic - or -
60 * does not make changes to permanent state, the graph will fail atomically.
61 * If an error occurs after the Cmt node executes, the engine will roll-forward
62 * through the graph, blindly executing nodes until it reaches the end.
63 * If a graph reaches the end, it is assumed to have completed successfully.
64 *
65 * A graph has only 1 Cmt node.
66 *
67 */
68
69
70 /******************************************************************************
71 *
72 * The following wrappers map the standard DAG creation interface to the
73 * DAG creation routines. Additionally, these wrappers enable experimentation
74 * with new DAG structures by providing an extra level of indirection, allowing
75 * the DAG creation routines to be replaced at this single point.
76 */
77
78 static
79 RF_CREATE_DAG_FUNC_DECL(rf_CreateSimpleDegradedWriteDAG)
80 {
81 rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp,
82 flags, allocList, 1, rf_RecoveryXorFunc, RF_TRUE);
83 }
84
85 void
86 rf_CreateDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList)
87 RF_Raid_t *raidPtr;
88 RF_AccessStripeMap_t *asmap;
89 RF_DagHeader_t *dag_h;
90 void *bp;
91 RF_RaidAccessFlags_t flags;
92 RF_AllocListElem_t *allocList;
93 {
94
95 RF_ASSERT(asmap->numDataFailed == 1);
96 dag_h->creator = "DegradedWriteDAG";
97
98 /*
99 * if the access writes only a portion of the failed unit, and also
100 * writes some portion of at least one surviving unit, we create two
101 * DAGs, one for the failed component and one for the non-failed
102 * component, and do them sequentially. Note that the fact that we're
103 * accessing only a portion of the failed unit indicates that the
104 * access either starts or ends in the failed unit, and hence we need
105 * create only two dags. This is inefficient in that the same data or
106 * parity can get read and written twice using this structure. I need
107 * to fix this to do the access all at once.
108 */
109 RF_ASSERT(!(asmap->numStripeUnitsAccessed != 1 &&
110 asmap->failedPDAs[0]->numSector !=
111 raidPtr->Layout.sectorsPerStripeUnit));
112 rf_CreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags,
113 allocList);
114 }
115
116
117
118 /******************************************************************************
119 *
120 * DAG creation code begins here
121 */
122
123
124
125 /******************************************************************************
126 *
127 * CommonCreateSimpleDegradedWriteDAG -- creates a DAG to do a degraded-mode
128 * write, which is as follows
129 *
130 * / {Wnq} --\
131 * hdr -> blockNode -> Rod -> Xor -> Cmt -> Wnp ----> unblock -> term
132 * \ {Rod} / \ Wnd ---/
133 * \ {Wnd} -/
134 *
135 * commit nodes: Xor, Wnd
136 *
137 * IMPORTANT:
138 * This DAG generator does not work for double-degraded archs since it does not
139 * generate Q
140 *
141 * This dag is essentially identical to the large-write dag, except that the
142 * write to the failed data unit is suppressed.
143 *
144 * IMPORTANT: this dag does not work in the case where the access writes only
145 * a portion of the failed unit, and also writes some portion of at least one
146 * surviving SU. this case is handled in CreateDegradedWriteDAG above.
147 *
148 * The block & unblock nodes are leftovers from a previous version. They
149 * do nothing, but I haven't deleted them because it would be a tremendous
150 * effort to put them back in.
151 *
152 * This dag is used whenever a one of the data units in a write has failed.
153 * If it is the parity unit that failed, the nonredundant write dag (below)
154 * is used.
155 *****************************************************************************/
156
157 void
158 rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags,
159 allocList, nfaults, redFunc, allowBufferRecycle)
160 RF_Raid_t *raidPtr;
161 RF_AccessStripeMap_t *asmap;
162 RF_DagHeader_t *dag_h;
163 void *bp;
164 RF_RaidAccessFlags_t flags;
165 RF_AllocListElem_t *allocList;
166 int nfaults;
167 int (*redFunc) (RF_DagNode_t *);
168 int allowBufferRecycle;
169 {
170 int nNodes, nRrdNodes, nWndNodes, nXorBufs, i, j, paramNum,
171 rdnodesFaked;
172 RF_DagNode_t *blockNode, *unblockNode, *wnpNode, *wnqNode, *termNode;
173 RF_DagNode_t *nodes, *wndNodes, *rrdNodes, *xorNode, *commitNode;
174 RF_SectorCount_t sectorsPerSU;
175 RF_ReconUnitNum_t which_ru;
176 char *xorTargetBuf = NULL; /* the target buffer for the XOR
177 * operation */
178 char *overlappingPDAs;/* a temporary array of flags */
179 RF_AccessStripeMapHeader_t *new_asm_h[2];
180 RF_PhysDiskAddr_t *pda, *parityPDA;
181 RF_StripeNum_t parityStripeID;
182 RF_PhysDiskAddr_t *failedPDA;
183 RF_RaidLayout_t *layoutPtr;
184
185 layoutPtr = &(raidPtr->Layout);
186 parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress,
187 &which_ru);
188 sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
189 /* failedPDA points to the pda within the asm that targets the failed
190 * disk */
191 failedPDA = asmap->failedPDAs[0];
192
193 if (rf_dagDebug)
194 printf("[Creating degraded-write DAG]\n");
195
196 RF_ASSERT(asmap->numDataFailed == 1);
197 dag_h->creator = "SimpleDegradedWriteDAG";
198
199 /*
200 * Generate two ASMs identifying the surviving data
201 * we need in order to recover the lost data.
202 */
203 /* overlappingPDAs array must be zero'd */
204 RF_Calloc(overlappingPDAs, asmap->numStripeUnitsAccessed, sizeof(char), (char *));
205 rf_GenerateFailedAccessASMs(raidPtr, asmap, failedPDA, dag_h, new_asm_h,
206 &nXorBufs, NULL, overlappingPDAs, allocList);
207
208 /* create all the nodes at once */
209 nWndNodes = asmap->numStripeUnitsAccessed - 1; /* no access is
210 * generated for the
211 * failed pda */
212
213 nRrdNodes = ((new_asm_h[0]) ? new_asm_h[0]->stripeMap->numStripeUnitsAccessed : 0) +
214 ((new_asm_h[1]) ? new_asm_h[1]->stripeMap->numStripeUnitsAccessed : 0);
215 /*
216 * XXX
217 *
218 * There's a bug with a complete stripe overwrite- that means 0 reads
219 * of old data, and the rest of the DAG generation code doesn't like
220 * that. A release is coming, and I don't wanna risk breaking a critical
221 * DAG generator, so here's what I'm gonna do- if there's no read nodes,
222 * I'm gonna fake there being a read node, and I'm gonna swap in a
223 * no-op node in its place (to make all the link-up code happy).
224 * This should be fixed at some point. --jimz
225 */
226 if (nRrdNodes == 0) {
227 nRrdNodes = 1;
228 rdnodesFaked = 1;
229 } else {
230 rdnodesFaked = 0;
231 }
232 /* lock, unlock, xor, Wnd, Rrd, W(nfaults) */
233 nNodes = 5 + nfaults + nWndNodes + nRrdNodes;
234 RF_CallocAndAdd(nodes, nNodes, sizeof(RF_DagNode_t),
235 (RF_DagNode_t *), allocList);
236 i = 0;
237 blockNode = &nodes[i];
238 i += 1;
239 commitNode = &nodes[i];
240 i += 1;
241 unblockNode = &nodes[i];
242 i += 1;
243 termNode = &nodes[i];
244 i += 1;
245 xorNode = &nodes[i];
246 i += 1;
247 wnpNode = &nodes[i];
248 i += 1;
249 wndNodes = &nodes[i];
250 i += nWndNodes;
251 rrdNodes = &nodes[i];
252 i += nRrdNodes;
253 if (nfaults == 2) {
254 wnqNode = &nodes[i];
255 i += 1;
256 } else {
257 wnqNode = NULL;
258 }
259 RF_ASSERT(i == nNodes);
260
261 /* this dag can not commit until all rrd and xor Nodes have completed */
262 dag_h->numCommitNodes = 1;
263 dag_h->numCommits = 0;
264 dag_h->numSuccedents = 1;
265
266 RF_ASSERT(nRrdNodes > 0);
267 rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
268 NULL, nRrdNodes, 0, 0, 0, dag_h, "Nil", allocList);
269 rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
270 NULL, nWndNodes + nfaults, 1, 0, 0, dag_h, "Cmt", allocList);
271 rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
272 NULL, 1, nWndNodes + nfaults, 0, 0, dag_h, "Nil", allocList);
273 rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
274 NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
275 rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1,
276 nRrdNodes, 2 * nXorBufs + 2, nfaults, dag_h, "Xrc", allocList);
277
278 /*
279 * Fill in the Rrd nodes. If any of the rrd buffers are the same size as
280 * the failed buffer, save a pointer to it so we can use it as the target
281 * of the XOR. The pdas in the rrd nodes have been range-restricted, so if
282 * a buffer is the same size as the failed buffer, it must also be at the
283 * same alignment within the SU.
284 */
285 i = 0;
286 if (new_asm_h[0]) {
287 for (i = 0, pda = new_asm_h[0]->stripeMap->physInfo;
288 i < new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
289 i++, pda = pda->next) {
290 rf_InitNode(&rrdNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
291 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rrd", allocList);
292 RF_ASSERT(pda);
293 rrdNodes[i].params[0].p = pda;
294 rrdNodes[i].params[1].p = pda->bufPtr;
295 rrdNodes[i].params[2].v = parityStripeID;
296 rrdNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
297 }
298 }
299 /* i now equals the number of stripe units accessed in new_asm_h[0] */
300 if (new_asm_h[1]) {
301 for (j = 0, pda = new_asm_h[1]->stripeMap->physInfo;
302 j < new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
303 j++, pda = pda->next) {
304 rf_InitNode(&rrdNodes[i + j], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
305 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rrd", allocList);
306 RF_ASSERT(pda);
307 rrdNodes[i + j].params[0].p = pda;
308 rrdNodes[i + j].params[1].p = pda->bufPtr;
309 rrdNodes[i + j].params[2].v = parityStripeID;
310 rrdNodes[i + j].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
311 if (allowBufferRecycle && (pda->numSector == failedPDA->numSector))
312 xorTargetBuf = pda->bufPtr;
313 }
314 }
315 if (rdnodesFaked) {
316 /*
317 * This is where we'll init that fake noop read node
318 * (XXX should the wakeup func be different?)
319 */
320 rf_InitNode(&rrdNodes[0], rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
321 NULL, 1, 1, 0, 0, dag_h, "RrN", allocList);
322 }
323 /*
324 * Make a PDA for the parity unit. The parity PDA should start at
325 * the same offset into the SU as the failed PDA.
326 */
327 /* Danner comment: I don't think this copy is really necessary. We are
328 * in one of two cases here. (1) The entire failed unit is written.
329 * Then asmap->parityInfo will describe the entire parity. (2) We are
330 * only writing a subset of the failed unit and nothing else. Then the
331 * asmap->parityInfo describes the failed unit and the copy can also
332 * be avoided. */
333
334 RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
335 parityPDA->row = asmap->parityInfo->row;
336 parityPDA->col = asmap->parityInfo->col;
337 parityPDA->startSector = ((asmap->parityInfo->startSector / sectorsPerSU)
338 * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
339 parityPDA->numSector = failedPDA->numSector;
340
341 if (!xorTargetBuf) {
342 RF_CallocAndAdd(xorTargetBuf, 1,
343 rf_RaidAddressToByte(raidPtr, failedPDA->numSector), (char *), allocList);
344 }
345 /* init the Wnp node */
346 rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
347 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnp", allocList);
348 wnpNode->params[0].p = parityPDA;
349 wnpNode->params[1].p = xorTargetBuf;
350 wnpNode->params[2].v = parityStripeID;
351 wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
352
353 /* fill in the Wnq Node */
354 if (nfaults == 2) {
355 {
356 RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t),
357 (RF_PhysDiskAddr_t *), allocList);
358 parityPDA->row = asmap->qInfo->row;
359 parityPDA->col = asmap->qInfo->col;
360 parityPDA->startSector = ((asmap->qInfo->startSector / sectorsPerSU)
361 * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
362 parityPDA->numSector = failedPDA->numSector;
363
364 rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
365 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnq", allocList);
366 wnqNode->params[0].p = parityPDA;
367 RF_CallocAndAdd(xorNode->results[1], 1,
368 rf_RaidAddressToByte(raidPtr, failedPDA->numSector), (char *), allocList);
369 wnqNode->params[1].p = xorNode->results[1];
370 wnqNode->params[2].v = parityStripeID;
371 wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
372 }
373 }
374 /* fill in the Wnd nodes */
375 for (pda = asmap->physInfo, i = 0; i < nWndNodes; i++, pda = pda->next) {
376 if (pda == failedPDA) {
377 i--;
378 continue;
379 }
380 rf_InitNode(&wndNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
381 rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
382 RF_ASSERT(pda);
383 wndNodes[i].params[0].p = pda;
384 wndNodes[i].params[1].p = pda->bufPtr;
385 wndNodes[i].params[2].v = parityStripeID;
386 wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
387 }
388
389 /* fill in the results of the xor node */
390 xorNode->results[0] = xorTargetBuf;
391
392 /* fill in the params of the xor node */
393
394 paramNum = 0;
395 if (rdnodesFaked == 0) {
396 for (i = 0; i < nRrdNodes; i++) {
397 /* all the Rrd nodes need to be xored together */
398 xorNode->params[paramNum++] = rrdNodes[i].params[0];
399 xorNode->params[paramNum++] = rrdNodes[i].params[1];
400 }
401 }
402 for (i = 0; i < nWndNodes; i++) {
403 /* any Wnd nodes that overlap the failed access need to be
404 * xored in */
405 if (overlappingPDAs[i]) {
406 RF_MallocAndAdd(pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
407 bcopy((char *) wndNodes[i].params[0].p, (char *) pda, sizeof(RF_PhysDiskAddr_t));
408 rf_RangeRestrictPDA(raidPtr, failedPDA, pda, RF_RESTRICT_DOBUFFER, 0);
409 xorNode->params[paramNum++].p = pda;
410 xorNode->params[paramNum++].p = pda->bufPtr;
411 }
412 }
413 RF_Free(overlappingPDAs, asmap->numStripeUnitsAccessed * sizeof(char));
414
415 /*
416 * Install the failed PDA into the xor param list so that the
417 * new data gets xor'd in.
418 */
419 xorNode->params[paramNum++].p = failedPDA;
420 xorNode->params[paramNum++].p = failedPDA->bufPtr;
421
422 /*
423 * The last 2 params to the recovery xor node are always the failed
424 * PDA and the raidPtr. install the failedPDA even though we have just
425 * done so above. This allows us to use the same XOR function for both
426 * degraded reads and degraded writes.
427 */
428 xorNode->params[paramNum++].p = failedPDA;
429 xorNode->params[paramNum++].p = raidPtr;
430 RF_ASSERT(paramNum == 2 * nXorBufs + 2);
431
432 /*
433 * Code to link nodes begins here
434 */
435
436 /* link header to block node */
437 RF_ASSERT(blockNode->numAntecedents == 0);
438 dag_h->succedents[0] = blockNode;
439
440 /* link block node to rd nodes */
441 RF_ASSERT(blockNode->numSuccedents == nRrdNodes);
442 for (i = 0; i < nRrdNodes; i++) {
443 RF_ASSERT(rrdNodes[i].numAntecedents == 1);
444 blockNode->succedents[i] = &rrdNodes[i];
445 rrdNodes[i].antecedents[0] = blockNode;
446 rrdNodes[i].antType[0] = rf_control;
447 }
448
449 /* link read nodes to xor node */
450 RF_ASSERT(xorNode->numAntecedents == nRrdNodes);
451 for (i = 0; i < nRrdNodes; i++) {
452 RF_ASSERT(rrdNodes[i].numSuccedents == 1);
453 rrdNodes[i].succedents[0] = xorNode;
454 xorNode->antecedents[i] = &rrdNodes[i];
455 xorNode->antType[i] = rf_trueData;
456 }
457
458 /* link xor node to commit node */
459 RF_ASSERT(xorNode->numSuccedents == 1);
460 RF_ASSERT(commitNode->numAntecedents == 1);
461 xorNode->succedents[0] = commitNode;
462 commitNode->antecedents[0] = xorNode;
463 commitNode->antType[0] = rf_control;
464
465 /* link commit node to wnd nodes */
466 RF_ASSERT(commitNode->numSuccedents == nfaults + nWndNodes);
467 for (i = 0; i < nWndNodes; i++) {
468 RF_ASSERT(wndNodes[i].numAntecedents == 1);
469 commitNode->succedents[i] = &wndNodes[i];
470 wndNodes[i].antecedents[0] = commitNode;
471 wndNodes[i].antType[0] = rf_control;
472 }
473
474 /* link the commit node to wnp, wnq nodes */
475 RF_ASSERT(wnpNode->numAntecedents == 1);
476 commitNode->succedents[nWndNodes] = wnpNode;
477 wnpNode->antecedents[0] = commitNode;
478 wnpNode->antType[0] = rf_control;
479 if (nfaults == 2) {
480 RF_ASSERT(wnqNode->numAntecedents == 1);
481 commitNode->succedents[nWndNodes + 1] = wnqNode;
482 wnqNode->antecedents[0] = commitNode;
483 wnqNode->antType[0] = rf_control;
484 }
485 /* link write new data nodes to unblock node */
486 RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nfaults));
487 for (i = 0; i < nWndNodes; i++) {
488 RF_ASSERT(wndNodes[i].numSuccedents == 1);
489 wndNodes[i].succedents[0] = unblockNode;
490 unblockNode->antecedents[i] = &wndNodes[i];
491 unblockNode->antType[i] = rf_control;
492 }
493
494 /* link write new parity node to unblock node */
495 RF_ASSERT(wnpNode->numSuccedents == 1);
496 wnpNode->succedents[0] = unblockNode;
497 unblockNode->antecedents[nWndNodes] = wnpNode;
498 unblockNode->antType[nWndNodes] = rf_control;
499
500 /* link write new q node to unblock node */
501 if (nfaults == 2) {
502 RF_ASSERT(wnqNode->numSuccedents == 1);
503 wnqNode->succedents[0] = unblockNode;
504 unblockNode->antecedents[nWndNodes + 1] = wnqNode;
505 unblockNode->antType[nWndNodes + 1] = rf_control;
506 }
507 /* link unblock node to term node */
508 RF_ASSERT(unblockNode->numSuccedents == 1);
509 RF_ASSERT(termNode->numAntecedents == 1);
510 RF_ASSERT(termNode->numSuccedents == 0);
511 unblockNode->succedents[0] = termNode;
512 termNode->antecedents[0] = unblockNode;
513 termNode->antType[0] = rf_control;
514 }
/*
 * CONS_PDA(info, start, num) -- fill in the PDA that pda_p points at.
 *
 *   info   name of an asmap member (e.g. parityInfo or qInfo) that supplies
 *          the row, the column and the stripe unit
 *   start  sector offset added to the start of that stripe unit
 *   num    number of sectors; also sizes the buffer attached to the PDA
 *
 * The startSector of asmap->info is rounded down to a multiple of secPerSU
 * before start is added.  The macro expects pda_p, asmap, secPerSU, raidPtr
 * and allocList to be in scope where it is expanded.  num is evaluated
 * twice, so it must not have side effects.
 *
 * The body is wrapped in do { } while (0) and the arguments are
 * parenthesized so that the macro behaves like a single statement and is
 * safe with arbitrary expressions as arguments.
 */
#define CONS_PDA(info, start, num) do { \
	pda_p->row = asmap->info->row; \
	pda_p->col = asmap->info->col; \
	pda_p->startSector = \
	    ((asmap->info->startSector / secPerSU) * secPerSU) + (start); \
	pda_p->numSector = (num); \
	pda_p->next = NULL; \
	RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, (num)), \
	    (char *), allocList); \
} while (0)
521 #if (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_EVENODD > 0)
522 void
523 rf_WriteGenerateFailedAccessASMs(
524 RF_Raid_t * raidPtr,
525 RF_AccessStripeMap_t * asmap,
526 RF_PhysDiskAddr_t ** pdap,
527 int *nNodep,
528 RF_PhysDiskAddr_t ** pqpdap,
529 int *nPQNodep,
530 RF_AllocListElem_t * allocList)
531 {
532 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
533 int PDAPerDisk, i;
534 RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
535 int numDataCol = layoutPtr->numDataCol;
536 int state;
537 unsigned napdas;
538 RF_SectorNum_t fone_start, fone_end, ftwo_start = 0, ftwo_end;
539 RF_PhysDiskAddr_t *fone = asmap->failedPDAs[0], *ftwo = asmap->failedPDAs[1];
540 RF_PhysDiskAddr_t *pda_p;
541 RF_RaidAddr_t sosAddr;
542
543 /* determine how many pda's we will have to generate per unaccess
544 * stripe. If there is only one failed data unit, it is one; if two,
545 * possibly two, depending wether they overlap. */
546
547 fone_start = rf_StripeUnitOffset(layoutPtr, fone->startSector);
548 fone_end = fone_start + fone->numSector;
549
550 if (asmap->numDataFailed == 1) {
551 PDAPerDisk = 1;
552 state = 1;
553 RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
554 pda_p = *pqpdap;
555 /* build p */
556 CONS_PDA(parityInfo, fone_start, fone->numSector);
557 pda_p->type = RF_PDA_TYPE_PARITY;
558 pda_p++;
559 /* build q */
560 CONS_PDA(qInfo, fone_start, fone->numSector);
561 pda_p->type = RF_PDA_TYPE_Q;
562 } else {
563 ftwo_start = rf_StripeUnitOffset(layoutPtr, ftwo->startSector);
564 ftwo_end = ftwo_start + ftwo->numSector;
565 if (fone->numSector + ftwo->numSector > secPerSU) {
566 PDAPerDisk = 1;
567 state = 2;
568 RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
569 pda_p = *pqpdap;
570 CONS_PDA(parityInfo, 0, secPerSU);
571 pda_p->type = RF_PDA_TYPE_PARITY;
572 pda_p++;
573 CONS_PDA(qInfo, 0, secPerSU);
574 pda_p->type = RF_PDA_TYPE_Q;
575 } else {
576 PDAPerDisk = 2;
577 state = 3;
578 /* four of them, fone, then ftwo */
579 RF_MallocAndAdd(*pqpdap, 4 * sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
580 pda_p = *pqpdap;
581 CONS_PDA(parityInfo, fone_start, fone->numSector);
582 pda_p->type = RF_PDA_TYPE_PARITY;
583 pda_p++;
584 CONS_PDA(qInfo, fone_start, fone->numSector);
585 pda_p->type = RF_PDA_TYPE_Q;
586 pda_p++;
587 CONS_PDA(parityInfo, ftwo_start, ftwo->numSector);
588 pda_p->type = RF_PDA_TYPE_PARITY;
589 pda_p++;
590 CONS_PDA(qInfo, ftwo_start, ftwo->numSector);
591 pda_p->type = RF_PDA_TYPE_Q;
592 }
593 }
594 /* figure out number of nonaccessed pda */
595 napdas = PDAPerDisk * (numDataCol - 2);
596 *nPQNodep = PDAPerDisk;
597
598 *nNodep = napdas;
599 if (napdas == 0)
600 return; /* short circuit */
601
602 /* allocate up our list of pda's */
603
604 RF_CallocAndAdd(pda_p, napdas, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
605 *pdap = pda_p;
606
607 /* linkem together */
608 for (i = 0; i < (napdas - 1); i++)
609 pda_p[i].next = pda_p + (i + 1);
610
611 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
612 for (i = 0; i < numDataCol; i++) {
613 if ((pda_p - (*pdap)) == napdas)
614 continue;
615 pda_p->type = RF_PDA_TYPE_DATA;
616 pda_p->raidAddress = sosAddr + (i * secPerSU);
617 (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
618 /* skip over dead disks */
619 if (RF_DEAD_DISK(raidPtr->Disks[pda_p->row][pda_p->col].status))
620 continue;
621 switch (state) {
622 case 1: /* fone */
623 pda_p->numSector = fone->numSector;
624 pda_p->raidAddress += fone_start;
625 pda_p->startSector += fone_start;
626 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
627 break;
628 case 2: /* full stripe */
629 pda_p->numSector = secPerSU;
630 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, secPerSU), (char *), allocList);
631 break;
632 case 3: /* two slabs */
633 pda_p->numSector = fone->numSector;
634 pda_p->raidAddress += fone_start;
635 pda_p->startSector += fone_start;
636 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
637 pda_p++;
638 pda_p->type = RF_PDA_TYPE_DATA;
639 pda_p->raidAddress = sosAddr + (i * secPerSU);
640 (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
641 pda_p->numSector = ftwo->numSector;
642 pda_p->raidAddress += ftwo_start;
643 pda_p->startSector += ftwo_start;
644 RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr, pda_p->numSector), (char *), allocList);
645 break;
646 default:
647 RF_PANIC();
648 }
649 pda_p++;
650 }
651
652 RF_ASSERT(pda_p - *pdap == napdas);
653 return;
654 }
/* The PDA of a disk node is kept in its first parameter slot. */
#define DISK_NODE_PDA(node)  ((node)->params[0].p)

/*
 * Fill in the four parameters of a disk read/write node: the PDA, the PDA's
 * buffer, the parity stripe ID, and the packed priority / reconstruction
 * unit word.  _node_ is a node lvalue (not a pointer) and _p_ is a pointer
 * to the PDA; _p_ is evaluated twice.  parityStripeID and which_ru must be
 * in scope at the expansion site.
 *
 * Wrapped in do { } while (0) so that the macro acts as one statement.
 */
#define DISK_NODE_PARAMS(_node_,_p_) do { \
	(_node_).params[0].p = (_p_); \
	(_node_).params[1].p = (_p_)->bufPtr; \
	(_node_).params[2].v = parityStripeID; \
	(_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); \
} while (0)
662
663 void
664 rf_DoubleDegSmallWrite(
665 RF_Raid_t * raidPtr,
666 RF_AccessStripeMap_t * asmap,
667 RF_DagHeader_t * dag_h,
668 void *bp,
669 RF_RaidAccessFlags_t flags,
670 RF_AllocListElem_t * allocList,
671 char *redundantReadNodeName,
672 char *redundantWriteNodeName,
673 char *recoveryNodeName,
674 int (*recovFunc) (RF_DagNode_t *))
675 {
676 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
677 RF_DagNode_t *nodes, *wudNodes, *rrdNodes, *recoveryNode, *blockNode,
678 *unblockNode, *rpNodes, *rqNodes, *wpNodes, *wqNodes, *termNode;
679 RF_PhysDiskAddr_t *pda, *pqPDAs;
680 RF_PhysDiskAddr_t *npdas;
681 int nWriteNodes, nNodes, nReadNodes, nRrdNodes, nWudNodes, i;
682 RF_ReconUnitNum_t which_ru;
683 int nPQNodes;
684 RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress, &which_ru);
685
686 /* simple small write case - First part looks like a reconstruct-read
687 * of the failed data units. Then a write of all data units not
688 * failed. */
689
690
691 /* Hdr | ------Block- / / \ Rrd Rrd ... Rrd Rp Rq \ \
692 * / -------PQ----- / \ \ Wud Wp WQ \ | /
693 * --Unblock- | T
694 *
695 * Rrd = read recovery data (potentially none) Wud = write user data
696 * (not incl. failed disks) Wp = Write P (could be two) Wq = Write Q
697 * (could be two)
698 *
699 */
700
701 rf_WriteGenerateFailedAccessASMs(raidPtr, asmap, &npdas, &nRrdNodes, &pqPDAs, &nPQNodes, allocList);
702
703 RF_ASSERT(asmap->numDataFailed == 1);
704
705 nWudNodes = asmap->numStripeUnitsAccessed - (asmap->numDataFailed);
706 nReadNodes = nRrdNodes + 2 * nPQNodes;
707 nWriteNodes = nWudNodes + 2 * nPQNodes;
708 nNodes = 4 + nReadNodes + nWriteNodes;
709
710 RF_CallocAndAdd(nodes, nNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
711 blockNode = nodes;
712 unblockNode = blockNode + 1;
713 termNode = unblockNode + 1;
714 recoveryNode = termNode + 1;
715 rrdNodes = recoveryNode + 1;
716 rpNodes = rrdNodes + nRrdNodes;
717 rqNodes = rpNodes + nPQNodes;
718 wudNodes = rqNodes + nPQNodes;
719 wpNodes = wudNodes + nWudNodes;
720 wqNodes = wpNodes + nPQNodes;
721
722 dag_h->creator = "PQ_DDSimpleSmallWrite";
723 dag_h->numSuccedents = 1;
724 dag_h->succedents[0] = blockNode;
725 rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
726 termNode->antecedents[0] = unblockNode;
727 termNode->antType[0] = rf_control;
728
729 /* init the block and unblock nodes */
730 /* The block node has all the read nodes as successors */
731 rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nReadNodes, 0, 0, 0, dag_h, "Nil", allocList);
732 for (i = 0; i < nReadNodes; i++)
733 blockNode->succedents[i] = rrdNodes + i;
734
735 /* The unblock node has all the writes as successors */
736 rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nWriteNodes, 0, 0, dag_h, "Nil", allocList);
737 for (i = 0; i < nWriteNodes; i++) {
738 unblockNode->antecedents[i] = wudNodes + i;
739 unblockNode->antType[i] = rf_control;
740 }
741 unblockNode->succedents[0] = termNode;
742
743 #define INIT_READ_NODE(node,name) \
744 rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, allocList); \
745 (node)->succedents[0] = recoveryNode; \
746 (node)->antecedents[0] = blockNode; \
747 (node)->antType[0] = rf_control;
748
749 /* build the read nodes */
750 pda = npdas;
751 for (i = 0; i < nRrdNodes; i++, pda = pda->next) {
752 INIT_READ_NODE(rrdNodes + i, "rrd");
753 DISK_NODE_PARAMS(rrdNodes[i], pda);
754 }
755
756 /* read redundancy pdas */
757 pda = pqPDAs;
758 INIT_READ_NODE(rpNodes, "Rp");
759 RF_ASSERT(pda);
760 DISK_NODE_PARAMS(rpNodes[0], pda);
761 pda++;
762 INIT_READ_NODE(rqNodes, redundantReadNodeName);
763 RF_ASSERT(pda);
764 DISK_NODE_PARAMS(rqNodes[0], pda);
765 if (nPQNodes == 2) {
766 pda++;
767 INIT_READ_NODE(rpNodes + 1, "Rp");
768 RF_ASSERT(pda);
769 DISK_NODE_PARAMS(rpNodes[1], pda);
770 pda++;
771 INIT_READ_NODE(rqNodes + 1, redundantReadNodeName);
772 RF_ASSERT(pda);
773 DISK_NODE_PARAMS(rqNodes[1], pda);
774 }
775 /* the recovery node has all reads as precedessors and all writes as
776 * successors. It generates a result for every write P or write Q
777 * node. As parameters, it takes a pda per read and a pda per stripe
778 * of user data written. It also takes as the last params the raidPtr
779 * and asm. For results, it takes PDA for P & Q. */
780
781
782 rf_InitNode(recoveryNode, rf_wait, RF_FALSE, recovFunc, rf_NullNodeUndoFunc, NULL,
783 nWriteNodes, /* succesors */
784 nReadNodes, /* preds */
785 nReadNodes + nWudNodes + 3, /* params */
786 2 * nPQNodes, /* results */
787 dag_h, recoveryNodeName, allocList);
788
789
790
791 for (i = 0; i < nReadNodes; i++) {
792 recoveryNode->antecedents[i] = rrdNodes + i;
793 recoveryNode->antType[i] = rf_control;
794 recoveryNode->params[i].p = DISK_NODE_PDA(rrdNodes + i);
795 }
796 for (i = 0; i < nWudNodes; i++) {
797 recoveryNode->succedents[i] = wudNodes + i;
798 }
799 recoveryNode->params[nReadNodes + nWudNodes].p = asmap->failedPDAs[0];
800 recoveryNode->params[nReadNodes + nWudNodes + 1].p = raidPtr;
801 recoveryNode->params[nReadNodes + nWudNodes + 2].p = asmap;
802
803 for (; i < nWriteNodes; i++)
804 recoveryNode->succedents[i] = wudNodes + i;
805
806 pda = pqPDAs;
807 recoveryNode->results[0] = pda;
808 pda++;
809 recoveryNode->results[1] = pda;
810 if (nPQNodes == 2) {
811 pda++;
812 recoveryNode->results[2] = pda;
813 pda++;
814 recoveryNode->results[3] = pda;
815 }
816 /* fill writes */
817 #define INIT_WRITE_NODE(node,name) \
818 rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, allocList); \
819 (node)->succedents[0] = unblockNode; \
820 (node)->antecedents[0] = recoveryNode; \
821 (node)->antType[0] = rf_control;
822
823 pda = asmap->physInfo;
824 for (i = 0; i < nWudNodes; i++) {
825 INIT_WRITE_NODE(wudNodes + i, "Wd");
826 DISK_NODE_PARAMS(wudNodes[i], pda);
827 recoveryNode->params[nReadNodes + i].p = DISK_NODE_PDA(wudNodes + i);
828 pda = pda->next;
829 }
830 /* write redundancy pdas */
831 pda = pqPDAs;
832 INIT_WRITE_NODE(wpNodes, "Wp");
833 RF_ASSERT(pda);
834 DISK_NODE_PARAMS(wpNodes[0], pda);
835 pda++;
836 INIT_WRITE_NODE(wqNodes, "Wq");
837 RF_ASSERT(pda);
838 DISK_NODE_PARAMS(wqNodes[0], pda);
839 if (nPQNodes == 2) {
840 pda++;
841 INIT_WRITE_NODE(wpNodes + 1, "Wp");
842 RF_ASSERT(pda);
843 DISK_NODE_PARAMS(wpNodes[1], pda);
844 pda++;
845 INIT_WRITE_NODE(wqNodes + 1, "Wq");
846 RF_ASSERT(pda);
847 DISK_NODE_PARAMS(wqNodes[1], pda);
848 }
849 }
850 #endif /* (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_EVENODD > 0) */
851