/*	$NetBSD: rf_dagdegwr.c,v 1.37 2023/10/15 18:15:19 oster Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * rf_dagdegwr.c
 *
 * code for creating degraded write DAGs
 *
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_dagdegwr.c,v 1.37 2023/10/15 18:15:19 oster Exp $");

#include <dev/raidframe/raidframevar.h>

#include "rf_raid.h"
#include "rf_dag.h"
#include "rf_dagutils.h"
#include "rf_dagfuncs.h"
#include "rf_debugMem.h"
#include "rf_general.h"
#include "rf_dagdegwr.h"
#include "rf_map.h"


/******************************************************************************
 *
 * General comments on DAG creation:
 *
 * All DAGs in this file use roll-away error recovery.  Each DAG has a single
 * commit node, usually called "Cmt."  If an error occurs before the Cmt node
 * is reached, the execution engine will halt forward execution and work
 * backward through the graph, executing the undo functions.  Assuming that
 * each node in the graph prior to the Cmt node is undoable and atomic, or
 * does not make changes to permanent state, the graph will fail atomically.
 * If an error occurs after the Cmt node executes, the engine will roll
 * forward through the graph, blindly executing nodes until it reaches the
 * end.  If a graph reaches the end, it is assumed to have completed
 * successfully.
 *
 * A graph has only 1 Cmt node.
 *
 */
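/*
 * Illustrative sketch (not part of the driver, and deliberately not
 * compiled): a toy model of the roll-away discipline described above,
 * using a made-up node type rather than the real RF_DagNode_t, and a
 * linear node order standing in for the graph.
 */
#if 0
struct toy_node {
	int  (*doit)(void);	/* returns 0 on success */
	void (*undo)(void);
	int    is_commit;	/* the single "Cmt" point */
};

static int
toy_execute(struct toy_node *n, int nnodes)
{
	int i, committed = 0;

	for (i = 0; i < nnodes; i++) {
		if (n[i].is_commit)
			committed = 1;
		if (n[i].doit() != 0) {
			if (committed)
				continue;	/* past Cmt: roll forward */
			while (--i >= 0)	/* before Cmt: undo backward */
				n[i].undo();
			return (-1);		/* graph failed atomically */
		}
	}
	return (0);		/* reached the end: assumed successful */
}
#endif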
/******************************************************************************
 *
 * The following wrappers map the standard DAG creation interface to the
 * DAG creation routines.  Additionally, these wrappers enable experimentation
 * with new DAG structures by providing an extra level of indirection, allowing
 * the DAG creation routines to be replaced at this single point.
 */

static
RF_CREATE_DAG_FUNC_DECL(rf_CreateSimpleDegradedWriteDAG)
{
	rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp,
	    flags, allocList, 1, rf_RecoveryXorFunc, RF_TRUE);
}

void
rf_CreateDegradedWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
			  RF_DagHeader_t *dag_h, void *bp,
			  RF_RaidAccessFlags_t flags,
			  RF_AllocListElem_t *allocList)
{

	RF_ASSERT(asmap->numDataFailed == 1);
	dag_h->creator = "DegradedWriteDAG";

	/*
	 * If the access writes only a portion of the failed unit, and also
	 * writes some portion of at least one surviving unit, we create two
	 * DAGs, one for the failed component and one for the non-failed
	 * component, and do them sequentially.  Note that the fact that we're
	 * accessing only a portion of the failed unit indicates that the
	 * access either starts or ends in the failed unit, and hence we need
	 * to create only two dags.  This is inefficient in that the same
	 * data or parity can get read and written twice using this
	 * structure.  I need to fix this to do the access all at once.
	 */
	RF_ASSERT(!(asmap->numStripeUnitsAccessed != 1 &&
		    asmap->failedPDAs[0]->numSector !=
		    raidPtr->Layout.sectorsPerStripeUnit));
	rf_CreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags,
					allocList);
}



/******************************************************************************
 *
 * DAG creation code begins here
 */
#define BUF_ALLOC(num) \
	RF_MallocAndAdd(rf_RaidAddressToByte(raidPtr, num), allocList)



/******************************************************************************
 *
 * CommonCreateSimpleDegradedWriteDAG -- creates a DAG to do a degraded-mode
 * write, which is as follows
 *
 *                                        / {Wnq} --\
 * hdr -> blockNode ->  Rod -> Xor -> Cmt -> Wnp ----> unblock -> term
 *                  \  {Rod} /            \  Wnd ---/
 *                                        \ {Wnd} -/
 *
 * commit nodes: Xor, Wnd
 *
 * IMPORTANT:
 * This DAG generator does not work for double-degraded archs since it does
 * not generate Q.
 *
 * This dag is essentially identical to the large-write dag, except that the
 * write to the failed data unit is suppressed.
 *
 * IMPORTANT:  this dag does not work in the case where the access writes only
 * a portion of the failed unit, and also writes some portion of at least one
 * surviving SU.  this case is handled in CreateDegradedWriteDAG above.
 *
 * The block & unblock nodes are leftovers from a previous version.  They
 * do nothing, but I haven't deleted them because it would be a tremendous
 * effort to put them back in.
 *
 * This dag is used whenever one of the data units in a write has failed.
 * If it is the parity unit that failed, the nonredundant write dag (below)
 * is used.
 *****************************************************************************/
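/*
 * Illustrative sketch (not part of the driver, and deliberately not
 * compiled): the parity produced by the Xor node is ordinary large-write
 * parity, with the new data standing in for the failed unit's contents.
 * A toy example with three data columns, column 1 failed, byte-sized
 * "stripe units", and an access that writes only the failed column, so
 * both survivors are read:
 */
#if 0
static unsigned char
toy_degraded_write_parity(unsigned char d0_old, unsigned char d1_new,
    unsigned char d2_old)
{
	/*
	 * d0_old and d2_old come from the Rod reads of the surviving
	 * columns; d1_new is the write data destined for the failed
	 * column.  Nothing is written to column 1: the new parity alone
	 * preserves d1_new, which can later be reconstructed as
	 * P ^ d0 ^ d2.
	 */
	return (d0_old ^ d1_new ^ d2_old);
}
#endif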
void
rf_CommonCreateSimpleDegradedWriteDAG(RF_Raid_t *raidPtr,
				      RF_AccessStripeMap_t *asmap,
				      RF_DagHeader_t *dag_h, void *bp,
				      RF_RaidAccessFlags_t flags,
				      RF_AllocListElem_t *allocList,
				      int nfaults,
				      void (*redFunc) (RF_DagNode_t *),
				      int allowBufferRecycle)
{
	int nRrdNodes, nWndNodes, nXorBufs, i, j, paramNum, rdnodesFaked;
	RF_DagNode_t *blockNode, *unblockNode, *wnpNode, *termNode;
#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	RF_DagNode_t *wnqNode;
#endif
	RF_DagNode_t *wndNodes, *rrdNodes, *xorNode, *commitNode;
	RF_DagNode_t *tmpNode, *tmpwndNode, *tmprrdNode;
	RF_SectorCount_t sectorsPerSU;
	RF_ReconUnitNum_t which_ru;
	char *xorTargetBuf = NULL;	/* the target buffer for the XOR
					 * operation */
	char overlappingPDAs[RF_MAXCOL];/* a temporary array of flags */
	RF_AccessStripeMapHeader_t *new_asm_h[2];
	RF_PhysDiskAddr_t *pda, *parityPDA;
	RF_StripeNum_t parityStripeID;
	RF_PhysDiskAddr_t *failedPDA;
	RF_RaidLayout_t *layoutPtr;

	layoutPtr = &(raidPtr->Layout);
	parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress,
	    &which_ru);
	sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
	/* failedPDA points to the pda within the asm that targets the failed
	 * disk */
	failedPDA = asmap->failedPDAs[0];

#if RF_DEBUG_DAG
	if (rf_dagDebug)
		printf("[Creating degraded-write DAG]\n");
#endif

	RF_ASSERT(asmap->numDataFailed == 1);
	dag_h->creator = "SimpleDegradedWriteDAG";

	/*
	 * Generate two ASMs identifying the surviving data
	 * we need in order to recover the lost data.
	 */
	/* overlappingPDAs array must be zero'd */
	memset(overlappingPDAs, 0, RF_MAXCOL);
	rf_GenerateFailedAccessASMs(raidPtr, asmap, failedPDA, dag_h, new_asm_h,
	    &nXorBufs, NULL, overlappingPDAs, allocList);

	/* create all the nodes at once */
	nWndNodes = asmap->numStripeUnitsAccessed - 1;	/* no access is
							 * generated for the
							 * failed pda */

	nRrdNodes = ((new_asm_h[0]) ? new_asm_h[0]->stripeMap->numStripeUnitsAccessed : 0) +
	    ((new_asm_h[1]) ? new_asm_h[1]->stripeMap->numStripeUnitsAccessed : 0);
	/*
	 * XXX
	 *
	 * There's a bug with a complete stripe overwrite - that means 0
	 * reads of old data, and the rest of the DAG generation code doesn't
	 * like that.  A release is coming, and I don't wanna risk breaking a
	 * critical DAG generator, so here's what I'm gonna do: if there are
	 * no read nodes, I'm gonna fake there being a read node, and I'm
	 * gonna swap in a no-op node in its place (to make all the link-up
	 * code happy).  This should be fixed at some point.  --jimz
	 */
	if (nRrdNodes == 0) {
		nRrdNodes = 1;
		rdnodesFaked = 1;
	} else {
		rdnodesFaked = 0;
	}
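	/*
	 * The allocations below all follow one pattern: each node comes
	 * from rf_AllocDAGNode() and is pushed onto the dag_h->nodes list,
	 * so every node owned by this DAG header sits on a single list
	 * (presumably letting DAG teardown find and release them all).
	 */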
	blockNode = rf_AllocDAGNode(raidPtr);
	blockNode->list_next = dag_h->nodes;
	dag_h->nodes = blockNode;

	commitNode = rf_AllocDAGNode(raidPtr);
	commitNode->list_next = dag_h->nodes;
	dag_h->nodes = commitNode;

	unblockNode = rf_AllocDAGNode(raidPtr);
	unblockNode->list_next = dag_h->nodes;
	dag_h->nodes = unblockNode;

	termNode = rf_AllocDAGNode(raidPtr);
	termNode->list_next = dag_h->nodes;
	dag_h->nodes = termNode;

	xorNode = rf_AllocDAGNode(raidPtr);
	xorNode->list_next = dag_h->nodes;
	dag_h->nodes = xorNode;

	wnpNode = rf_AllocDAGNode(raidPtr);
	wnpNode->list_next = dag_h->nodes;
	dag_h->nodes = wnpNode;

	for (i = 0; i < nWndNodes; i++) {
		tmpNode = rf_AllocDAGNode(raidPtr);
		tmpNode->list_next = dag_h->nodes;
		dag_h->nodes = tmpNode;
	}
	wndNodes = dag_h->nodes;

	for (i = 0; i < nRrdNodes; i++) {
		tmpNode = rf_AllocDAGNode(raidPtr);
		tmpNode->list_next = dag_h->nodes;
		dag_h->nodes = tmpNode;
	}
	rrdNodes = dag_h->nodes;

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	if (nfaults == 2) {
		wnqNode = rf_AllocDAGNode(raidPtr);
		wnqNode->list_next = dag_h->nodes;
		dag_h->nodes = wnqNode;
	} else {
		wnqNode = NULL;
	}
#endif

	/* this dag cannot commit until all rrd and xor Nodes have completed */
	dag_h->numCommitNodes = 1;
	dag_h->numCommits = 0;
	dag_h->numSuccedents = 1;

	RF_ASSERT(nRrdNodes > 0);
	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
	    NULL, nRrdNodes, 0, 0, 0, dag_h, "Nil", allocList);
	rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
	    NULL, nWndNodes + nfaults, 1, 0, 0, dag_h, "Cmt", allocList);
	rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
	    NULL, 1, nWndNodes + nfaults, 0, 0, dag_h, "Nil", allocList);
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
	    NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
	rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1,
	    nRrdNodes, 2 * nXorBufs + 2, nfaults, dag_h, "Xrc", allocList);
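	/*
	 * A note on the "Xrc" sizing above: the node gets 2 * nXorBufs + 2
	 * parameters, i.e. a (pda, buffer) pair per XOR input plus the
	 * failed PDA and the raidPtr as the final two.  The RF_ASSERT on
	 * paramNum below checks exactly this once the list is filled in.
	 */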
	/*
	 * Fill in the Rrd nodes.  If any of the rrd buffers are the same
	 * size as the failed buffer, save a pointer to it so we can use it
	 * as the target of the XOR.  The pdas in the rrd nodes have been
	 * range-restricted, so if a buffer is the same size as the failed
	 * buffer, it must also be at the same alignment within the SU.
	 */
	i = 0;
	tmprrdNode = rrdNodes;
	if (new_asm_h[0]) {
		for (i = 0, pda = new_asm_h[0]->stripeMap->physInfo;
		     i < new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
		     i++, pda = pda->next) {
			rf_InitNode(tmprrdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
			    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rrd", allocList);
			RF_ASSERT(pda);
			tmprrdNode->params[0].p = pda;
			tmprrdNode->params[1].p = pda->bufPtr;
			tmprrdNode->params[2].v = parityStripeID;
			tmprrdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
			tmprrdNode = tmprrdNode->list_next;
		}
	}
	/* i now equals the number of stripe units accessed in new_asm_h[0] */
	/* Note that for tmprrdNode this means a continuation from above, so
	 * there is no need to assign it anything.. */
	if (new_asm_h[1]) {
		for (j = 0, pda = new_asm_h[1]->stripeMap->physInfo;
		     j < new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
		     j++, pda = pda->next) {
			rf_InitNode(tmprrdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
			    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rrd", allocList);
			RF_ASSERT(pda);
			tmprrdNode->params[0].p = pda;
			tmprrdNode->params[1].p = pda->bufPtr;
			tmprrdNode->params[2].v = parityStripeID;
			tmprrdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
			if (allowBufferRecycle && (pda->numSector == failedPDA->numSector))
				xorTargetBuf = pda->bufPtr;
			tmprrdNode = tmprrdNode->list_next;
		}
	}
	if (rdnodesFaked) {
		/*
		 * This is where we'll init that fake noop read node
		 * (XXX should the wakeup func be different?)
		 */
		/* note that rrdNodes will just be a single node... */
		rf_InitNode(rrdNodes, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
		    NULL, 1, 1, 0, 0, dag_h, "RrN", allocList);
	}
	/*
	 * Make a PDA for the parity unit.  The parity PDA should start at
	 * the same offset into the SU as the failed PDA.
	 */
	/* Danner comment: I don't think this copy is really necessary.  We
	 * are in one of two cases here.  (1) The entire failed unit is
	 * written.  Then asmap->parityInfo will describe the entire parity.
	 * (2) We are only writing a subset of the failed unit and nothing
	 * else.  Then the asmap->parityInfo describes the failed unit and
	 * the copy can also be avoided. */
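	/*
	 * A worked example of the startSector computation below, with
	 * made-up numbers: if sectorsPerSU is 64, parityInfo->startSector
	 * is 128 and failedPDA->startSector is 208, the parity PDA starts
	 * at (128 / 64) * 64 + (208 % 64) = 128 + 16 = 144, i.e. at the
	 * same 16-sector offset into the parity SU that the failed PDA
	 * occupies within its own SU.
	 */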
	parityPDA = rf_AllocPhysDiskAddr(raidPtr);
	parityPDA->next = dag_h->pda_cleanup_list;
	dag_h->pda_cleanup_list = parityPDA;
	parityPDA->col = asmap->parityInfo->col;
	parityPDA->startSector = ((asmap->parityInfo->startSector / sectorsPerSU)
	    * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
	parityPDA->numSector = failedPDA->numSector;

	if (!xorTargetBuf) {
		xorTargetBuf = rf_AllocBuffer(raidPtr, dag_h,
		    rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
	}
	/* init the Wnp node */
	rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
	    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnp", allocList);
	wnpNode->params[0].p = parityPDA;
	wnpNode->params[1].p = xorTargetBuf;
	wnpNode->params[2].v = parityStripeID;
	wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	/* fill in the Wnq Node */
	if (nfaults == 2) {
		{
			parityPDA = RF_MallocAndAdd(sizeof(*parityPDA), allocList);
			parityPDA->col = asmap->qInfo->col;
			parityPDA->startSector = ((asmap->qInfo->startSector / sectorsPerSU)
			    * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
			parityPDA->numSector = failedPDA->numSector;

			rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
			    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnq", allocList);
			wnqNode->params[0].p = parityPDA;
			xorNode->results[1] = BUF_ALLOC(failedPDA->numSector);
			wnqNode->params[1].p = xorNode->results[1];
			wnqNode->params[2].v = parityStripeID;
			wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
		}
	}
#endif
	/* fill in the Wnd nodes */
	tmpwndNode = wndNodes;
	for (pda = asmap->physInfo, i = 0; i < nWndNodes; i++, pda = pda->next) {
		if (pda == failedPDA) {
			/* no write is issued to the failed unit; nWndNodes
			 * counts only surviving units, so don't consume i */
			i--;
			continue;
		}
		rf_InitNode(tmpwndNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
		    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
		RF_ASSERT(pda);
		tmpwndNode->params[0].p = pda;
		tmpwndNode->params[1].p = pda->bufPtr;
		tmpwndNode->params[2].v = parityStripeID;
		tmpwndNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
		tmpwndNode = tmpwndNode->list_next;
	}
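	/*
	 * What follows wires up the Xrc node itself: its result buffer is
	 * the recycled (or freshly allocated) parity target, and its
	 * parameter list is assembled from (pda, buffer) pairs, one per
	 * Rrd read, one per Wnd write that overlaps the failed region
	 * (range-restricted to just the overlap), and finally the failed
	 * PDA with the new data destined for the failed unit.
	 */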
	/* fill in the results of the xor node */
	xorNode->results[0] = xorTargetBuf;

	/* fill in the params of the xor node */

	paramNum = 0;
	if (rdnodesFaked == 0) {
		tmprrdNode = rrdNodes;
		for (i = 0; i < nRrdNodes; i++) {
			/* all the Rrd nodes need to be xored together */
			xorNode->params[paramNum++] = tmprrdNode->params[0];
			xorNode->params[paramNum++] = tmprrdNode->params[1];
			tmprrdNode = tmprrdNode->list_next;
		}
	}
	tmpwndNode = wndNodes;
	for (i = 0; i < nWndNodes; i++) {
		/* any Wnd nodes that overlap the failed access need to be
		 * xored in */
		if (overlappingPDAs[i]) {
			pda = rf_AllocPhysDiskAddr(raidPtr);
			memcpy((char *) pda, (char *) tmpwndNode->params[0].p, sizeof(RF_PhysDiskAddr_t));
			/* add it into the pda_cleanup_list *after* the copy, TYVM */
			pda->next = dag_h->pda_cleanup_list;
			dag_h->pda_cleanup_list = pda;
			rf_RangeRestrictPDA(raidPtr, failedPDA, pda, RF_RESTRICT_DOBUFFER, 0);
			xorNode->params[paramNum++].p = pda;
			xorNode->params[paramNum++].p = pda->bufPtr;
		}
		tmpwndNode = tmpwndNode->list_next;
	}

	/*
	 * Install the failed PDA into the xor param list so that the
	 * new data gets xor'd in.
	 */
	xorNode->params[paramNum++].p = failedPDA;
	xorNode->params[paramNum++].p = failedPDA->bufPtr;

	/*
	 * The last 2 params to the recovery xor node are always the failed
	 * PDA and the raidPtr.  Install the failedPDA even though we have
	 * just done so above.  This allows us to use the same XOR function
	 * for both degraded reads and degraded writes.
	 */
	xorNode->params[paramNum++].p = failedPDA;
	xorNode->params[paramNum++].p = raidPtr;
	RF_ASSERT(paramNum == 2 * nXorBufs + 2);

	/*
	 * Code to link nodes begins here
	 */

	/* link header to block node */
	RF_ASSERT(blockNode->numAntecedents == 0);
	dag_h->succedents[0] = blockNode;

	/* link block node to rd nodes */
	RF_ASSERT(blockNode->numSuccedents == nRrdNodes);
	tmprrdNode = rrdNodes;
	for (i = 0; i < nRrdNodes; i++) {
		RF_ASSERT(tmprrdNode->numAntecedents == 1);
		blockNode->succedents[i] = tmprrdNode;
		tmprrdNode->antecedents[0] = blockNode;
		tmprrdNode->antType[0] = rf_control;
		tmprrdNode = tmprrdNode->list_next;
	}

	/* link read nodes to xor node */
	RF_ASSERT(xorNode->numAntecedents == nRrdNodes);
	tmprrdNode = rrdNodes;
	for (i = 0; i < nRrdNodes; i++) {
		RF_ASSERT(tmprrdNode->numSuccedents == 1);
		tmprrdNode->succedents[0] = xorNode;
		xorNode->antecedents[i] = tmprrdNode;
		xorNode->antType[i] = rf_trueData;
		tmprrdNode = tmprrdNode->list_next;
	}

	/* link xor node to commit node */
	RF_ASSERT(xorNode->numSuccedents == 1);
	RF_ASSERT(commitNode->numAntecedents == 1);
	xorNode->succedents[0] = commitNode;
	commitNode->antecedents[0] = xorNode;
	commitNode->antType[0] = rf_control;
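	/*
	 * Note the convention used by the linking code in this function:
	 * every edge is recorded twice, once in the predecessor's
	 * succedents[] array and once in the successor's antecedents[] and
	 * antType[] arrays, and the RF_ASSERTs compare the counts declared
	 * in rf_InitNode() against the edges actually installed.
	 */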
	/* link commit node to wnd nodes */
	RF_ASSERT(commitNode->numSuccedents == nfaults + nWndNodes);
	tmpwndNode = wndNodes;
	for (i = 0; i < nWndNodes; i++) {
		RF_ASSERT(tmpwndNode->numAntecedents == 1);
		commitNode->succedents[i] = tmpwndNode;
		tmpwndNode->antecedents[0] = commitNode;
		tmpwndNode->antType[0] = rf_control;
		tmpwndNode = tmpwndNode->list_next;
	}

	/* link the commit node to wnp, wnq nodes */
	RF_ASSERT(wnpNode->numAntecedents == 1);
	commitNode->succedents[nWndNodes] = wnpNode;
	wnpNode->antecedents[0] = commitNode;
	wnpNode->antType[0] = rf_control;
#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	if (nfaults == 2) {
		RF_ASSERT(wnqNode->numAntecedents == 1);
		commitNode->succedents[nWndNodes + 1] = wnqNode;
		wnqNode->antecedents[0] = commitNode;
		wnqNode->antType[0] = rf_control;
	}
#endif
	/* link write new data nodes to unblock node */
	RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nfaults));
	tmpwndNode = wndNodes;
	for (i = 0; i < nWndNodes; i++) {
		RF_ASSERT(tmpwndNode->numSuccedents == 1);
		tmpwndNode->succedents[0] = unblockNode;
		unblockNode->antecedents[i] = tmpwndNode;
		unblockNode->antType[i] = rf_control;
		tmpwndNode = tmpwndNode->list_next;
	}

	/* link write new parity node to unblock node */
	RF_ASSERT(wnpNode->numSuccedents == 1);
	wnpNode->succedents[0] = unblockNode;
	unblockNode->antecedents[nWndNodes] = wnpNode;
	unblockNode->antType[nWndNodes] = rf_control;

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	/* link write new q node to unblock node */
	if (nfaults == 2) {
		RF_ASSERT(wnqNode->numSuccedents == 1);
		wnqNode->succedents[0] = unblockNode;
		unblockNode->antecedents[nWndNodes + 1] = wnqNode;
		unblockNode->antType[nWndNodes + 1] = rf_control;
	}
#endif
	/* link unblock node to term node */
	RF_ASSERT(unblockNode->numSuccedents == 1);
	RF_ASSERT(termNode->numAntecedents == 1);
	RF_ASSERT(termNode->numSuccedents == 0);
	unblockNode->succedents[0] = termNode;
	termNode->antecedents[0] = unblockNode;
	termNode->antType[0] = rf_control;
}
#define CONS_PDA(if,start,num) \
	pda_p->col = asmap->if->col; \
	pda_p->startSector = ((asmap->if->startSector / secPerSU) * secPerSU) + start; \
	pda_p->numSector = num; \
	pda_p->next = NULL; \
	pda_p->bufPtr = BUF_ALLOC(num)
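/*
 * CONS_PDA fills in the PDA at pda_p for the named redundancy unit
 * (asmap->parityInfo or asmap->qInfo): it aligns the start to the
 * stripe-unit boundary plus the given offset, sets the length, and
 * allocates a buffer of that size.  Note that it relies on pda_p, asmap,
 * secPerSU, raidPtr and allocList all being in scope at the expansion
 * site.
 */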
#if (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_EVENODD > 0)
void
rf_WriteGenerateFailedAccessASMs(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_PhysDiskAddr_t ** pdap,
    int *nNodep,
    RF_PhysDiskAddr_t ** pqpdap,
    int *nPQNodep,
    RF_AllocListElem_t * allocList)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	int PDAPerDisk, i;
	RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
	int numDataCol = layoutPtr->numDataCol;
	int state;
	unsigned napdas;
	RF_SectorNum_t fone_start, ftwo_start = 0;
	RF_PhysDiskAddr_t *fone = asmap->failedPDAs[0], *ftwo = asmap->failedPDAs[1];
	RF_PhysDiskAddr_t *pda_p;
	RF_RaidAddr_t sosAddr;

	/* determine how many pda's we will have to generate per unaccessed
	 * stripe.  If there is only one failed data unit, it is one; if
	 * two, possibly two, depending on whether they overlap. */

	fone_start = rf_StripeUnitOffset(layoutPtr, fone->startSector);

	if (asmap->numDataFailed == 1) {
		PDAPerDisk = 1;
		state = 1;
		*pqpdap = RF_MallocAndAdd(2 * sizeof(**pqpdap), allocList);
		pda_p = *pqpdap;
		/* build p */
		CONS_PDA(parityInfo, fone_start, fone->numSector);
		pda_p->type = RF_PDA_TYPE_PARITY;
		pda_p++;
		/* build q */
		CONS_PDA(qInfo, fone_start, fone->numSector);
		pda_p->type = RF_PDA_TYPE_Q;
	} else {
		ftwo_start = rf_StripeUnitOffset(layoutPtr, ftwo->startSector);
		if (fone->numSector + ftwo->numSector > secPerSU) {
			PDAPerDisk = 1;
			state = 2;
			*pqpdap = RF_MallocAndAdd(2 * sizeof(**pqpdap),
			    allocList);
			pda_p = *pqpdap;
			CONS_PDA(parityInfo, 0, secPerSU);
			pda_p->type = RF_PDA_TYPE_PARITY;
			pda_p++;
			CONS_PDA(qInfo, 0, secPerSU);
			pda_p->type = RF_PDA_TYPE_Q;
		} else {
			PDAPerDisk = 2;
			state = 3;
			/* four of them, fone, then ftwo */
			*pqpdap = RF_MallocAndAdd(4 * sizeof(**pqpdap),
			    allocList);
			pda_p = *pqpdap;
			CONS_PDA(parityInfo, fone_start, fone->numSector);
			pda_p->type = RF_PDA_TYPE_PARITY;
			pda_p++;
			CONS_PDA(qInfo, fone_start, fone->numSector);
			pda_p->type = RF_PDA_TYPE_Q;
			pda_p++;
			CONS_PDA(parityInfo, ftwo_start, ftwo->numSector);
			pda_p->type = RF_PDA_TYPE_PARITY;
			pda_p++;
			CONS_PDA(qInfo, ftwo_start, ftwo->numSector);
			pda_p->type = RF_PDA_TYPE_Q;
		}
	}
	/* figure out number of nonaccessed pda */
	napdas = PDAPerDisk * (numDataCol - 2);
	*nPQNodep = PDAPerDisk;

	*nNodep = napdas;
	if (napdas == 0)
		return;		/* short circuit */

	/* allocate up our list of pda's */

	pda_p = RF_MallocAndAdd(napdas * sizeof(*pda_p), allocList);
	*pdap = pda_p;

	/* link them together */
	for (i = 0; i < (napdas - 1); i++)
		pda_p[i].next = pda_p + (i + 1);
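	/*
	 * The loop below walks every data column in the stripe and builds
	 * recovery-read PDAs for the columns this access does not touch.
	 * The state chosen above selects the shape: 1 means one failed
	 * unit, so read the matching subrange; 2 means the two failed
	 * ranges together exceed a stripe unit, so read the full stripe
	 * unit; 3 means two disjoint slabs, so read one subrange for each.
	 */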
	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
	for (i = 0; i < numDataCol; i++) {
		if ((pda_p - (*pdap)) == napdas)
			continue;
		pda_p->type = RF_PDA_TYPE_DATA;
		pda_p->raidAddress = sosAddr + (i * secPerSU);
		(raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress,
		    &(pda_p->col), &(pda_p->startSector), 0);
		/* skip over dead disks */
		if (RF_DEAD_DISK(raidPtr->Disks[pda_p->col].status))
			continue;
		switch (state) {
		case 1:	/* fone */
			pda_p->numSector = fone->numSector;
			pda_p->raidAddress += fone_start;
			pda_p->startSector += fone_start;
			pda_p->bufPtr = BUF_ALLOC(pda_p->numSector);
			break;
		case 2:	/* full stripe */
			pda_p->numSector = secPerSU;
			pda_p->bufPtr = BUF_ALLOC(secPerSU);
			break;
		case 3:	/* two slabs */
			pda_p->numSector = fone->numSector;
			pda_p->raidAddress += fone_start;
			pda_p->startSector += fone_start;
			pda_p->bufPtr = BUF_ALLOC(pda_p->numSector);
			pda_p++;
			pda_p->type = RF_PDA_TYPE_DATA;
			pda_p->raidAddress = sosAddr + (i * secPerSU);
			(raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0);
			pda_p->numSector = ftwo->numSector;
			pda_p->raidAddress += ftwo_start;
			pda_p->startSector += ftwo_start;
			pda_p->bufPtr = BUF_ALLOC(pda_p->numSector);
			break;
		default:
			RF_PANIC();
		}
		pda_p++;
	}

	RF_ASSERT(pda_p - *pdap == napdas);
	return;
}
#define DISK_NODE_PDA(node) ((node)->params[0].p)

#define DISK_NODE_PARAMS(_node_,_p_) \
	(_node_).params[0].p = _p_ ; \
	(_node_).params[1].p = (_p_)->bufPtr; \
	(_node_).params[2].v = parityStripeID; \
	(_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru)
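/*
 * DISK_NODE_PARAMS fills in the standard four parameters taken by every
 * disk read/write node in this file: the PDA, its buffer, the parity
 * stripe ID, and the packed (priority, reconstruction unit) value.  Like
 * CONS_PDA, it expects parityStripeID and which_ru to be in scope where
 * it is expanded.
 */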
void
rf_DoubleDegSmallWrite(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
		       RF_DagHeader_t *dag_h, void *bp,
		       RF_RaidAccessFlags_t flags,
		       RF_AllocListElem_t *allocList,
		       const char *redundantReadNodeName,
		       const char *redundantWriteNodeName,
		       const char *recoveryNodeName,
		       void (*recovFunc) (RF_DagNode_t *))
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_DagNode_t *nodes, *wudNodes, *rrdNodes, *recoveryNode, *blockNode,
	    *unblockNode, *rpNodes, *rqNodes, *wpNodes, *wqNodes, *termNode;
	RF_PhysDiskAddr_t *pda, *pqPDAs;
	RF_PhysDiskAddr_t *npdas;
	int nWriteNodes, nNodes, nReadNodes, nRrdNodes, nWudNodes, i;
	RF_ReconUnitNum_t which_ru;
	int nPQNodes;
	RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress, &which_ru);

	/* simple small write case - First part looks like a reconstruct-read
	 * of the failed data units.  Then a write of all data units not
	 * failed. */


	/*
	 *               Hdr
	 *                |
	 *          ------Block-
	 *         /  /        \   \
	 *      Rrd  Rrd ... Rrd  Rp  Rq
	 *         \  \        /   /
	 *          -------PQ-----
	 *          /    |    \
	 *       Wud    Wp    WQ
	 *          \    |    /
	 *          --Unblock-
	 *               |
	 *               T
	 *
	 * Rrd = read recovery data (potentially none)
	 * Wud = write user data (not incl. failed disks)
	 * Wp = Write P (could be two)
	 * Wq = Write Q (could be two)
	 */

	rf_WriteGenerateFailedAccessASMs(raidPtr, asmap, &npdas, &nRrdNodes, &pqPDAs, &nPQNodes, allocList);

	RF_ASSERT(asmap->numDataFailed == 1);

	nWudNodes = asmap->numStripeUnitsAccessed - (asmap->numDataFailed);
	nReadNodes = nRrdNodes + 2 * nPQNodes;
	nWriteNodes = nWudNodes + 2 * nPQNodes;
	nNodes = 4 + nReadNodes + nWriteNodes;

	nodes = RF_MallocAndAdd(nNodes * sizeof(*nodes), allocList);
	blockNode = nodes;
	unblockNode = blockNode + 1;
	termNode = unblockNode + 1;
	recoveryNode = termNode + 1;
	rrdNodes = recoveryNode + 1;
	rpNodes = rrdNodes + nRrdNodes;
	rqNodes = rpNodes + nPQNodes;
	wudNodes = rqNodes + nPQNodes;
	wpNodes = wudNodes + nWudNodes;
	wqNodes = wpNodes + nPQNodes;

	dag_h->creator = "PQ_DDSimpleSmallWrite";
	dag_h->numSuccedents = 1;
	dag_h->succedents[0] = blockNode;
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
	termNode->antecedents[0] = unblockNode;
	termNode->antType[0] = rf_control;

	/* init the block and unblock nodes */
	/* The block node has all the read nodes as successors.  Note that
	 * the read, then write, nodes are laid out contiguously in the
	 * nodes array, so rrdNodes + i and wudNodes + i below walk all of
	 * them, including the Rp/Rq and Wp/Wq nodes. */
	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nReadNodes, 0, 0, 0, dag_h, "Nil", allocList);
	for (i = 0; i < nReadNodes; i++)
		blockNode->succedents[i] = rrdNodes + i;

	/* The unblock node has all the writes as successors */
	rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nWriteNodes, 0, 0, dag_h, "Nil", allocList);
	for (i = 0; i < nWriteNodes; i++) {
		unblockNode->antecedents[i] = wudNodes + i;
		unblockNode->antType[i] = rf_control;
	}
	unblockNode->succedents[0] = termNode;

#define INIT_READ_NODE(node,name) \
	rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, allocList); \
	(node)->succedents[0] = recoveryNode; \
	(node)->antecedents[0] = blockNode; \
	(node)->antType[0] = rf_control;

	/* build the read nodes */
	pda = npdas;
	for (i = 0; i < nRrdNodes; i++, pda = pda->next) {
		INIT_READ_NODE(rrdNodes + i, "rrd");
		DISK_NODE_PARAMS(rrdNodes[i], pda);
	}

	/* read redundancy pdas */
	pda = pqPDAs;
	INIT_READ_NODE(rpNodes, "Rp");
	RF_ASSERT(pda);
	DISK_NODE_PARAMS(rpNodes[0], pda);
	pda++;
	INIT_READ_NODE(rqNodes, redundantReadNodeName);
	RF_ASSERT(pda);
	DISK_NODE_PARAMS(rqNodes[0], pda);
	if (nPQNodes == 2) {
		pda++;
		INIT_READ_NODE(rpNodes + 1, "Rp");
		RF_ASSERT(pda);
		DISK_NODE_PARAMS(rpNodes[1], pda);
		pda++;
		INIT_READ_NODE(rqNodes + 1, redundantReadNodeName);
		RF_ASSERT(pda);
		DISK_NODE_PARAMS(rqNodes[1], pda);
	}
	/* the recovery node has all reads as predecessors and all writes as
	 * successors.  It generates a result for every write P or write Q
	 * node.  As parameters, it takes a pda per read and a pda per stripe
	 * of user data written.  It also takes as the last params the
	 * raidPtr and asm.  For results, it takes PDA for P & Q. */


	rf_InitNode(recoveryNode, rf_wait, RF_FALSE, recovFunc, rf_NullNodeUndoFunc, NULL,
	    nWriteNodes,		/* successors */
	    nReadNodes,			/* predecessors */
	    nReadNodes + nWudNodes + 3,	/* params */
	    2 * nPQNodes,		/* results */
	    dag_h, recoveryNodeName, allocList);



	for (i = 0; i < nReadNodes; i++) {
		recoveryNode->antecedents[i] = rrdNodes + i;
		recoveryNode->antType[i] = rf_control;
		recoveryNode->params[i].p = DISK_NODE_PDA(rrdNodes + i);
	}
	for (i = 0; i < nWudNodes; i++) {
		recoveryNode->succedents[i] = wudNodes + i;
	}
	recoveryNode->params[nReadNodes + nWudNodes].p = asmap->failedPDAs[0];
	recoveryNode->params[nReadNodes + nWudNodes + 1].p = raidPtr;
	recoveryNode->params[nReadNodes + nWudNodes + 2].p = asmap;

	/* i continues from nWudNodes: the remaining successors are the P
	 * and Q write nodes, which follow wudNodes in the node array */
	for (; i < nWriteNodes; i++)
		recoveryNode->succedents[i] = wudNodes + i;

	pda = pqPDAs;
	recoveryNode->results[0] = pda;
	pda++;
	recoveryNode->results[1] = pda;
	if (nPQNodes == 2) {
		pda++;
		recoveryNode->results[2] = pda;
		pda++;
		recoveryNode->results[3] = pda;
	}
	/* fill writes */
#define INIT_WRITE_NODE(node,name) \
	rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, allocList); \
	(node)->succedents[0] = unblockNode; \
	(node)->antecedents[0] = recoveryNode; \
	(node)->antType[0] = rf_control;

	pda = asmap->physInfo;
	for (i = 0; i < nWudNodes; i++) {
		INIT_WRITE_NODE(wudNodes + i, "Wd");
		DISK_NODE_PARAMS(wudNodes[i], pda);
		recoveryNode->params[nReadNodes + i].p = DISK_NODE_PDA(wudNodes + i);
		pda = pda->next;
	}
	/* write redundancy pdas */
	pda = pqPDAs;
	INIT_WRITE_NODE(wpNodes, "Wp");
	RF_ASSERT(pda);
	DISK_NODE_PARAMS(wpNodes[0], pda);
	pda++;
	INIT_WRITE_NODE(wqNodes, "Wq");
	RF_ASSERT(pda);
	DISK_NODE_PARAMS(wqNodes[0], pda);
	if (nPQNodes == 2) {
		pda++;
		INIT_WRITE_NODE(wpNodes + 1, "Wp");
		RF_ASSERT(pda);
		DISK_NODE_PARAMS(wpNodes[1], pda);
		pda++;
		INIT_WRITE_NODE(wqNodes + 1, "Wq");
		RF_ASSERT(pda);
		DISK_NODE_PARAMS(wqNodes[1], pda);
	}
}
#endif /* (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_EVENODD > 0) */