1 1.33 andvar /* $NetBSD: rf_dagdegrd.c,v 1.33 2022/01/24 09:14:37 andvar Exp $ */ 2 1.1 oster /* 3 1.1 oster * Copyright (c) 1995 Carnegie-Mellon University. 4 1.1 oster * All rights reserved. 5 1.1 oster * 6 1.1 oster * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II 7 1.1 oster * 8 1.1 oster * Permission to use, copy, modify and distribute this software and 9 1.1 oster * its documentation is hereby granted, provided that both the copyright 10 1.1 oster * notice and this permission notice appear in all copies of the 11 1.1 oster * software, derivative works or modified versions, and any portions 12 1.1 oster * thereof, and that both notices appear in supporting documentation. 13 1.1 oster * 14 1.1 oster * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 1.1 oster * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 1.1 oster * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 1.1 oster * 18 1.1 oster * Carnegie Mellon requests users of this software to return to 19 1.1 oster * 20 1.1 oster * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU 21 1.1 oster * School of Computer Science 22 1.1 oster * Carnegie Mellon University 23 1.1 oster * Pittsburgh PA 15213-3890 24 1.1 oster * 25 1.1 oster * any improvements or extensions that they make and grant Carnegie the 26 1.1 oster * rights to redistribute these changes. 
27 1.1 oster */ 28 1.1 oster 29 1.1 oster /* 30 1.1 oster * rf_dagdegrd.c 31 1.1 oster * 32 1.1 oster * code for creating degraded read DAGs 33 1.1 oster */ 34 1.10 lukem 35 1.10 lukem #include <sys/cdefs.h> 36 1.33 andvar __KERNEL_RCSID(0, "$NetBSD: rf_dagdegrd.c,v 1.33 2022/01/24 09:14:37 andvar Exp $"); 37 1.1 oster 38 1.9 oster #include <dev/raidframe/raidframevar.h> 39 1.9 oster 40 1.6 oster #include "rf_archs.h" 41 1.1 oster #include "rf_raid.h" 42 1.1 oster #include "rf_dag.h" 43 1.1 oster #include "rf_dagutils.h" 44 1.1 oster #include "rf_dagfuncs.h" 45 1.1 oster #include "rf_debugMem.h" 46 1.1 oster #include "rf_general.h" 47 1.1 oster #include "rf_dagdegrd.h" 48 1.21 oster #include "rf_map.h" 49 1.1 oster 50 1.1 oster 51 1.1 oster /****************************************************************************** 52 1.1 oster * 53 1.1 oster * General comments on DAG creation: 54 1.3 oster * 55 1.1 oster * All DAGs in this file use roll-away error recovery. Each DAG has a single 56 1.1 oster * commit node, usually called "Cmt." If an error occurs before the Cmt node 57 1.1 oster * is reached, the execution engine will halt forward execution and work 58 1.1 oster * backward through the graph, executing the undo functions. Assuming that 59 1.1 oster * each node in the graph prior to the Cmt node are undoable and atomic - or - 60 1.1 oster * does not make changes to permanent state, the graph will fail atomically. 61 1.1 oster * If an error occurs after the Cmt node executes, the engine will roll-forward 62 1.1 oster * through the graph, blindly executing nodes until it reaches the end. 63 1.1 oster * If a graph reaches the end, it is assumed to have completed successfully. 64 1.1 oster * 65 1.1 oster * A graph has only 1 Cmt node. 
66 1.1 oster * 67 1.1 oster */ 68 1.1 oster 69 1.1 oster 70 1.1 oster /****************************************************************************** 71 1.1 oster * 72 1.1 oster * The following wrappers map the standard DAG creation interface to the 73 1.1 oster * DAG creation routines. Additionally, these wrappers enable experimentation 74 1.1 oster * with new DAG structures by providing an extra level of indirection, allowing 75 1.1 oster * the DAG creation routines to be replaced at this single point. 76 1.1 oster */ 77 1.1 oster 78 1.23 perry void 79 1.16 oster rf_CreateRaidFiveDegradedReadDAG(RF_Raid_t *raidPtr, 80 1.16 oster RF_AccessStripeMap_t *asmap, 81 1.16 oster RF_DagHeader_t *dag_h, 82 1.16 oster void *bp, 83 1.16 oster RF_RaidAccessFlags_t flags, 84 1.16 oster RF_AllocListElem_t *allocList) 85 1.1 oster { 86 1.3 oster rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 87 1.3 oster &rf_xorRecoveryFuncs); 88 1.1 oster } 89 1.1 oster 90 1.1 oster 91 1.1 oster /****************************************************************************** 92 1.1 oster * 93 1.1 oster * DAG creation code begins here 94 1.1 oster */ 95 1.1 oster 96 1.1 oster 97 1.1 oster /****************************************************************************** 98 1.1 oster * Create a degraded read DAG for RAID level 1 99 1.1 oster * 100 1.1 oster * Hdr -> Nil -> R(p/s)d -> Commit -> Trm 101 1.1 oster * 102 1.1 oster * The "Rd" node reads data from the surviving disk in the mirror pair 103 1.1 oster * Rpd - read of primary copy 104 1.1 oster * Rsd - read of secondary copy 105 1.1 oster * 106 1.1 oster * Parameters: raidPtr - description of the physical array 107 1.1 oster * asmap - logical & physical addresses for this access 108 1.1 oster * bp - buffer ptr (for holding write data) 109 1.3 oster * flags - general flags (e.g. 
disk locking) 110 1.1 oster * allocList - list of memory allocated in DAG creation 111 1.1 oster *****************************************************************************/ 112 1.1 oster 113 1.23 perry void 114 1.16 oster rf_CreateRaidOneDegradedReadDAG(RF_Raid_t *raidPtr, 115 1.16 oster RF_AccessStripeMap_t *asmap, 116 1.16 oster RF_DagHeader_t *dag_h, 117 1.27 christos void *bp, 118 1.27 christos RF_RaidAccessFlags_t flags, 119 1.16 oster RF_AllocListElem_t *allocList) 120 1.1 oster { 121 1.20 oster RF_DagNode_t *rdNode, *blockNode, *commitNode, *termNode; 122 1.3 oster RF_StripeNum_t parityStripeID; 123 1.3 oster RF_ReconUnitNum_t which_ru; 124 1.3 oster RF_PhysDiskAddr_t *pda; 125 1.20 oster int useMirror; 126 1.3 oster 127 1.3 oster useMirror = 0; 128 1.3 oster parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), 129 1.3 oster asmap->raidAddress, &which_ru); 130 1.19 oster #if RF_DEBUG_DAG 131 1.3 oster if (rf_dagDebug) { 132 1.3 oster printf("[Creating RAID level 1 degraded read DAG]\n"); 133 1.3 oster } 134 1.19 oster #endif 135 1.3 oster dag_h->creator = "RaidOneDegradedReadDAG"; 136 1.3 oster /* alloc the Wnd nodes and the Wmir node */ 137 1.3 oster if (asmap->numDataFailed == 0) 138 1.3 oster useMirror = RF_FALSE; 139 1.3 oster else 140 1.3 oster useMirror = RF_TRUE; 141 1.3 oster 142 1.3 oster /* total number of nodes = 1 + (block + commit + terminator) */ 143 1.20 oster 144 1.32 oster rdNode = rf_AllocDAGNode(raidPtr); 145 1.20 oster rdNode->list_next = dag_h->nodes; 146 1.20 oster dag_h->nodes = rdNode; 147 1.20 oster 148 1.32 oster blockNode = rf_AllocDAGNode(raidPtr); 149 1.20 oster blockNode->list_next = dag_h->nodes; 150 1.20 oster dag_h->nodes = blockNode; 151 1.20 oster 152 1.32 oster commitNode = rf_AllocDAGNode(raidPtr); 153 1.20 oster commitNode->list_next = dag_h->nodes; 154 1.20 oster dag_h->nodes = commitNode; 155 1.20 oster 156 1.32 oster termNode = rf_AllocDAGNode(raidPtr); 157 1.20 oster termNode->list_next = 
dag_h->nodes; 158 1.20 oster dag_h->nodes = termNode; 159 1.3 oster 160 1.3 oster /* this dag can not commit until the commit node is reached. errors 161 1.3 oster * prior to the commit point imply the dag has failed and must be 162 1.3 oster * retried */ 163 1.3 oster dag_h->numCommitNodes = 1; 164 1.3 oster dag_h->numCommits = 0; 165 1.3 oster dag_h->numSuccedents = 1; 166 1.3 oster 167 1.3 oster /* initialize the block, commit, and terminator nodes */ 168 1.3 oster rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, 169 1.3 oster NULL, 1, 0, 0, 0, dag_h, "Nil", allocList); 170 1.3 oster rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, 171 1.3 oster NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList); 172 1.3 oster rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, 173 1.3 oster NULL, 0, 1, 0, 0, dag_h, "Trm", allocList); 174 1.3 oster 175 1.3 oster pda = asmap->physInfo; 176 1.3 oster RF_ASSERT(pda != NULL); 177 1.3 oster /* parityInfo must describe entire parity unit */ 178 1.3 oster RF_ASSERT(asmap->parityInfo->next == NULL); 179 1.3 oster 180 1.3 oster /* initialize the data node */ 181 1.3 oster if (!useMirror) { 182 1.3 oster /* read primary copy of data */ 183 1.3 oster rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, 184 1.3 oster rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rpd", allocList); 185 1.3 oster rdNode->params[0].p = pda; 186 1.3 oster rdNode->params[1].p = pda->bufPtr; 187 1.3 oster rdNode->params[2].v = parityStripeID; 188 1.18 oster rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 189 1.18 oster which_ru); 190 1.3 oster } else { 191 1.3 oster /* read secondary copy of data */ 192 1.3 oster rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, 193 1.3 oster rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rsd", allocList); 194 1.3 oster rdNode->params[0].p = asmap->parityInfo; 195 1.3 oster rdNode->params[1].p 
= pda->bufPtr; 196 1.3 oster rdNode->params[2].v = parityStripeID; 197 1.18 oster rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 198 1.18 oster which_ru); 199 1.3 oster } 200 1.3 oster 201 1.3 oster /* connect header to block node */ 202 1.3 oster RF_ASSERT(dag_h->numSuccedents == 1); 203 1.3 oster RF_ASSERT(blockNode->numAntecedents == 0); 204 1.3 oster dag_h->succedents[0] = blockNode; 205 1.3 oster 206 1.3 oster /* connect block node to rdnode */ 207 1.3 oster RF_ASSERT(blockNode->numSuccedents == 1); 208 1.3 oster RF_ASSERT(rdNode->numAntecedents == 1); 209 1.3 oster blockNode->succedents[0] = rdNode; 210 1.3 oster rdNode->antecedents[0] = blockNode; 211 1.3 oster rdNode->antType[0] = rf_control; 212 1.3 oster 213 1.3 oster /* connect rdnode to commit node */ 214 1.3 oster RF_ASSERT(rdNode->numSuccedents == 1); 215 1.3 oster RF_ASSERT(commitNode->numAntecedents == 1); 216 1.3 oster rdNode->succedents[0] = commitNode; 217 1.3 oster commitNode->antecedents[0] = rdNode; 218 1.3 oster commitNode->antType[0] = rf_control; 219 1.3 oster 220 1.3 oster /* connect commit node to terminator */ 221 1.3 oster RF_ASSERT(commitNode->numSuccedents == 1); 222 1.3 oster RF_ASSERT(termNode->numAntecedents == 1); 223 1.3 oster RF_ASSERT(termNode->numSuccedents == 0); 224 1.3 oster commitNode->succedents[0] = termNode; 225 1.3 oster termNode->antecedents[0] = commitNode; 226 1.3 oster termNode->antType[0] = rf_control; 227 1.1 oster } 228 1.1 oster 229 1.1 oster 230 1.1 oster 231 1.1 oster /****************************************************************************** 232 1.1 oster * 233 1.1 oster * creates a DAG to perform a degraded-mode read of data within one stripe. 
 * This DAG is as follows:
 *
 * Hdr -> Block -> Rud -> Xor -> Cmt -> T
 *              -> Rrd ->
 *              -> Rp -->
 *
 * Each R node is a successor of the Block node.
 * The single successor arc from each R node goes to the Xor node.
 * There is one Rud for each chunk of surviving user data requested by the
 * user, and one Rrd for each chunk of surviving user data _not_ being read by
 * the user.
 * R = read, ud = user data, rd = recovery (surviving) data, p = parity
 * Xor = XOR, Cmt = Commit, T = terminate
 *
 * The block node guarantees a single source node.
 *
 * Note:  The target buffer for the XOR node is set to the actual user buffer
 * where the failed data is supposed to end up.  This buffer is zero'd by the
 * code here.  Thus, if you create a degraded read dag, use it, and then
 * re-use it, you have to be sure to zero the target buffer prior to the
 * re-use.
 *
 * The recFunc argument at the end specifies the name and function used for
 * the redundancy recovery function.
258 1.1 oster * 259 1.1 oster *****************************************************************************/ 260 1.1 oster 261 1.23 perry void 262 1.16 oster rf_CreateDegradedReadDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap, 263 1.27 christos RF_DagHeader_t *dag_h, void *bp, 264 1.27 christos RF_RaidAccessFlags_t flags, 265 1.16 oster RF_AllocListElem_t *allocList, 266 1.16 oster const RF_RedFuncs_t *recFunc) 267 1.1 oster { 268 1.20 oster RF_DagNode_t *rudNodes, *rrdNodes, *xorNode, *blockNode; 269 1.3 oster RF_DagNode_t *commitNode, *rpNode, *termNode; 270 1.20 oster RF_DagNode_t *tmpNode, *tmprudNode, *tmprrdNode; 271 1.29 martin int nRrdNodes, nRudNodes, nXorBufs, i; 272 1.3 oster int j, paramNum; 273 1.3 oster RF_SectorCount_t sectorsPerSU; 274 1.3 oster RF_ReconUnitNum_t which_ru; 275 1.21 oster char overlappingPDAs[RF_MAXCOL];/* a temporary array of flags */ 276 1.3 oster RF_AccessStripeMapHeader_t *new_asm_h[2]; 277 1.3 oster RF_PhysDiskAddr_t *pda, *parityPDA; 278 1.3 oster RF_StripeNum_t parityStripeID; 279 1.3 oster RF_PhysDiskAddr_t *failedPDA; 280 1.3 oster RF_RaidLayout_t *layoutPtr; 281 1.3 oster char *rpBuf; 282 1.3 oster 283 1.3 oster layoutPtr = &(raidPtr->Layout); 284 1.3 oster /* failedPDA points to the pda within the asm that targets the failed 285 1.3 oster * disk */ 286 1.3 oster failedPDA = asmap->failedPDAs[0]; 287 1.3 oster parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, 288 1.3 oster asmap->raidAddress, &which_ru); 289 1.3 oster sectorsPerSU = layoutPtr->sectorsPerStripeUnit; 290 1.3 oster 291 1.19 oster #if RF_DEBUG_DAG 292 1.3 oster if (rf_dagDebug) { 293 1.3 oster printf("[Creating degraded read DAG]\n"); 294 1.3 oster } 295 1.19 oster #endif 296 1.3 oster RF_ASSERT(asmap->numDataFailed == 1); 297 1.3 oster dag_h->creator = "DegradedReadDAG"; 298 1.3 oster 299 1.3 oster /* 300 1.3 oster * generate two ASMs identifying the surviving data we need 301 1.3 oster * in order to recover the lost data 302 1.3 oster */ 303 1.3 
oster 304 1.3 oster /* overlappingPDAs array must be zero'd */ 305 1.21 oster memset(overlappingPDAs, 0, RF_MAXCOL); 306 1.3 oster rf_GenerateFailedAccessASMs(raidPtr, asmap, failedPDA, dag_h, new_asm_h, &nXorBufs, 307 1.3 oster &rpBuf, overlappingPDAs, allocList); 308 1.3 oster 309 1.3 oster /* 310 1.3 oster * create all the nodes at once 311 1.3 oster * 312 1.3 oster * -1 because no access is generated for the failed pda 313 1.3 oster */ 314 1.3 oster nRudNodes = asmap->numStripeUnitsAccessed - 1; 315 1.3 oster nRrdNodes = ((new_asm_h[0]) ? new_asm_h[0]->stripeMap->numStripeUnitsAccessed : 0) + 316 1.3 oster ((new_asm_h[1]) ? new_asm_h[1]->stripeMap->numStripeUnitsAccessed : 0); 317 1.20 oster 318 1.32 oster blockNode = rf_AllocDAGNode(raidPtr); 319 1.20 oster blockNode->list_next = dag_h->nodes; 320 1.20 oster dag_h->nodes = blockNode; 321 1.20 oster 322 1.32 oster commitNode = rf_AllocDAGNode(raidPtr); 323 1.20 oster commitNode->list_next = dag_h->nodes; 324 1.20 oster dag_h->nodes = commitNode; 325 1.20 oster 326 1.32 oster xorNode = rf_AllocDAGNode(raidPtr); 327 1.20 oster xorNode->list_next = dag_h->nodes; 328 1.20 oster dag_h->nodes = xorNode; 329 1.20 oster 330 1.32 oster rpNode = rf_AllocDAGNode(raidPtr); 331 1.20 oster rpNode->list_next = dag_h->nodes; 332 1.20 oster dag_h->nodes = rpNode; 333 1.20 oster 334 1.32 oster termNode = rf_AllocDAGNode(raidPtr); 335 1.20 oster termNode->list_next = dag_h->nodes; 336 1.20 oster dag_h->nodes = termNode; 337 1.20 oster 338 1.20 oster for (i = 0; i < nRudNodes; i++) { 339 1.32 oster tmpNode = rf_AllocDAGNode(raidPtr); 340 1.20 oster tmpNode->list_next = dag_h->nodes; 341 1.20 oster dag_h->nodes = tmpNode; 342 1.20 oster } 343 1.20 oster rudNodes = dag_h->nodes; 344 1.20 oster 345 1.20 oster for (i = 0; i < nRrdNodes; i++) { 346 1.32 oster tmpNode = rf_AllocDAGNode(raidPtr); 347 1.20 oster tmpNode->list_next = dag_h->nodes; 348 1.20 oster dag_h->nodes = tmpNode; 349 1.20 oster } 350 1.20 oster rrdNodes = 
dag_h->nodes; 351 1.3 oster 352 1.3 oster /* initialize nodes */ 353 1.3 oster dag_h->numCommitNodes = 1; 354 1.3 oster dag_h->numCommits = 0; 355 1.3 oster /* this dag can not commit until the commit node is reached errors 356 1.3 oster * prior to the commit point imply the dag has failed */ 357 1.3 oster dag_h->numSuccedents = 1; 358 1.3 oster 359 1.3 oster rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, 360 1.3 oster NULL, nRudNodes + nRrdNodes + 1, 0, 0, 0, dag_h, "Nil", allocList); 361 1.3 oster rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, 362 1.3 oster NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList); 363 1.3 oster rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, 364 1.3 oster NULL, 0, 1, 0, 0, dag_h, "Trm", allocList); 365 1.3 oster rf_InitNode(xorNode, rf_wait, RF_FALSE, recFunc->simple, rf_NullNodeUndoFunc, 366 1.3 oster NULL, 1, nRudNodes + nRrdNodes + 1, 2 * nXorBufs + 2, 1, dag_h, 367 1.3 oster recFunc->SimpleName, allocList); 368 1.3 oster 369 1.3 oster /* fill in the Rud nodes */ 370 1.20 oster tmprudNode = rudNodes; 371 1.3 oster for (pda = asmap->physInfo, i = 0; i < nRudNodes; i++, pda = pda->next) { 372 1.3 oster if (pda == failedPDA) { 373 1.3 oster i--; 374 1.3 oster continue; 375 1.3 oster } 376 1.20 oster rf_InitNode(tmprudNode, rf_wait, RF_FALSE, rf_DiskReadFunc, 377 1.3 oster rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, 378 1.3 oster "Rud", allocList); 379 1.3 oster RF_ASSERT(pda); 380 1.20 oster tmprudNode->params[0].p = pda; 381 1.20 oster tmprudNode->params[1].p = pda->bufPtr; 382 1.20 oster tmprudNode->params[2].v = parityStripeID; 383 1.20 oster tmprudNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru); 384 1.20 oster tmprudNode = tmprudNode->list_next; 385 1.3 oster } 386 1.3 oster 387 1.3 oster /* fill in the Rrd nodes */ 388 1.3 oster i = 0; 389 1.20 oster tmprrdNode = rrdNodes; 390 1.3 oster if 
(new_asm_h[0]) { 391 1.3 oster for (pda = new_asm_h[0]->stripeMap->physInfo; 392 1.3 oster i < new_asm_h[0]->stripeMap->numStripeUnitsAccessed; 393 1.3 oster i++, pda = pda->next) { 394 1.20 oster rf_InitNode(tmprrdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, 395 1.3 oster rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, 396 1.3 oster dag_h, "Rrd", allocList); 397 1.3 oster RF_ASSERT(pda); 398 1.20 oster tmprrdNode->params[0].p = pda; 399 1.20 oster tmprrdNode->params[1].p = pda->bufPtr; 400 1.20 oster tmprrdNode->params[2].v = parityStripeID; 401 1.20 oster tmprrdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru); 402 1.20 oster tmprrdNode = tmprrdNode->list_next; 403 1.3 oster } 404 1.3 oster } 405 1.3 oster if (new_asm_h[1]) { 406 1.20 oster /* tmprrdNode = rrdNodes; */ /* don't set this here -- old code was using i+j, which means 407 1.20 oster we need to just continue using tmprrdNode for the next 'j' elements. */ 408 1.3 oster for (j = 0, pda = new_asm_h[1]->stripeMap->physInfo; 409 1.3 oster j < new_asm_h[1]->stripeMap->numStripeUnitsAccessed; 410 1.3 oster j++, pda = pda->next) { 411 1.20 oster rf_InitNode(tmprrdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, 412 1.3 oster rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, 413 1.3 oster dag_h, "Rrd", allocList); 414 1.3 oster RF_ASSERT(pda); 415 1.20 oster tmprrdNode->params[0].p = pda; 416 1.20 oster tmprrdNode->params[1].p = pda->bufPtr; 417 1.20 oster tmprrdNode->params[2].v = parityStripeID; 418 1.20 oster tmprrdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru); 419 1.20 oster tmprrdNode = tmprrdNode->list_next; 420 1.3 oster } 421 1.3 oster } 422 1.3 oster /* make a PDA for the parity unit */ 423 1.32 oster parityPDA = rf_AllocPhysDiskAddr(raidPtr); 424 1.21 oster parityPDA->next = dag_h->pda_cleanup_list; 425 1.21 oster dag_h->pda_cleanup_list = parityPDA; 426 1.3 oster parityPDA->col = asmap->parityInfo->col; 427 1.3 oster parityPDA->startSector = 
((asmap->parityInfo->startSector / sectorsPerSU) 428 1.3 oster * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU); 429 1.3 oster parityPDA->numSector = failedPDA->numSector; 430 1.3 oster 431 1.3 oster /* initialize the Rp node */ 432 1.3 oster rf_InitNode(rpNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, 433 1.3 oster rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rp ", allocList); 434 1.3 oster rpNode->params[0].p = parityPDA; 435 1.3 oster rpNode->params[1].p = rpBuf; 436 1.3 oster rpNode->params[2].v = parityStripeID; 437 1.18 oster rpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru); 438 1.3 oster 439 1.3 oster /* 440 1.3 oster * the last and nastiest step is to assign all 441 1.3 oster * the parameters of the Xor node 442 1.3 oster */ 443 1.3 oster paramNum = 0; 444 1.20 oster tmprrdNode = rrdNodes; 445 1.3 oster for (i = 0; i < nRrdNodes; i++) { 446 1.3 oster /* all the Rrd nodes need to be xored together */ 447 1.20 oster xorNode->params[paramNum++] = tmprrdNode->params[0]; 448 1.20 oster xorNode->params[paramNum++] = tmprrdNode->params[1]; 449 1.20 oster tmprrdNode = tmprrdNode->list_next; 450 1.3 oster } 451 1.20 oster tmprudNode = rudNodes; 452 1.3 oster for (i = 0; i < nRudNodes; i++) { 453 1.3 oster /* any Rud nodes that overlap the failed access need to be 454 1.3 oster * xored in */ 455 1.3 oster if (overlappingPDAs[i]) { 456 1.32 oster pda = rf_AllocPhysDiskAddr(raidPtr); 457 1.20 oster memcpy((char *) pda, (char *) tmprudNode->params[0].p, sizeof(RF_PhysDiskAddr_t)); 458 1.21 oster /* add it into the pda_cleanup_list *after* the copy, TYVM */ 459 1.21 oster pda->next = dag_h->pda_cleanup_list; 460 1.21 oster dag_h->pda_cleanup_list = pda; 461 1.3 oster rf_RangeRestrictPDA(raidPtr, failedPDA, pda, RF_RESTRICT_DOBUFFER, 0); 462 1.3 oster xorNode->params[paramNum++].p = pda; 463 1.3 oster xorNode->params[paramNum++].p = pda->bufPtr; 464 1.3 oster } 465 1.20 oster tmprudNode = tmprudNode->list_next; 466 1.3 oster } 
467 1.3 oster 468 1.3 oster /* install parity pda as last set of params to be xor'd */ 469 1.3 oster xorNode->params[paramNum++].p = parityPDA; 470 1.3 oster xorNode->params[paramNum++].p = rpBuf; 471 1.3 oster 472 1.3 oster /* 473 1.3 oster * the last 2 params to the recovery xor node are 474 1.3 oster * the failed PDA and the raidPtr 475 1.3 oster */ 476 1.3 oster xorNode->params[paramNum++].p = failedPDA; 477 1.3 oster xorNode->params[paramNum++].p = raidPtr; 478 1.3 oster RF_ASSERT(paramNum == 2 * nXorBufs + 2); 479 1.3 oster 480 1.3 oster /* 481 1.3 oster * The xor node uses results[0] as the target buffer. 482 1.3 oster * Set pointer and zero the buffer. In the kernel, this 483 1.3 oster * may be a user buffer in which case we have to remap it. 484 1.3 oster */ 485 1.3 oster xorNode->results[0] = failedPDA->bufPtr; 486 1.17 oster memset(failedPDA->bufPtr, 0, rf_RaidAddressToByte(raidPtr, 487 1.3 oster failedPDA->numSector)); 488 1.3 oster 489 1.3 oster /* connect nodes to form graph */ 490 1.3 oster /* connect the header to the block node */ 491 1.3 oster RF_ASSERT(dag_h->numSuccedents == 1); 492 1.3 oster RF_ASSERT(blockNode->numAntecedents == 0); 493 1.3 oster dag_h->succedents[0] = blockNode; 494 1.3 oster 495 1.3 oster /* connect the block node to the read nodes */ 496 1.3 oster RF_ASSERT(blockNode->numSuccedents == (1 + nRrdNodes + nRudNodes)); 497 1.3 oster RF_ASSERT(rpNode->numAntecedents == 1); 498 1.3 oster blockNode->succedents[0] = rpNode; 499 1.3 oster rpNode->antecedents[0] = blockNode; 500 1.3 oster rpNode->antType[0] = rf_control; 501 1.20 oster tmprrdNode = rrdNodes; 502 1.3 oster for (i = 0; i < nRrdNodes; i++) { 503 1.20 oster RF_ASSERT(tmprrdNode->numSuccedents == 1); 504 1.20 oster blockNode->succedents[1 + i] = tmprrdNode; 505 1.20 oster tmprrdNode->antecedents[0] = blockNode; 506 1.20 oster tmprrdNode->antType[0] = rf_control; 507 1.20 oster tmprrdNode = tmprrdNode->list_next; 508 1.3 oster } 509 1.20 oster tmprudNode = rudNodes; 510 1.3 
oster for (i = 0; i < nRudNodes; i++) { 511 1.20 oster RF_ASSERT(tmprudNode->numSuccedents == 1); 512 1.20 oster blockNode->succedents[1 + nRrdNodes + i] = tmprudNode; 513 1.20 oster tmprudNode->antecedents[0] = blockNode; 514 1.20 oster tmprudNode->antType[0] = rf_control; 515 1.20 oster tmprudNode = tmprudNode->list_next; 516 1.3 oster } 517 1.3 oster 518 1.3 oster /* connect the read nodes to the xor node */ 519 1.3 oster RF_ASSERT(xorNode->numAntecedents == (1 + nRrdNodes + nRudNodes)); 520 1.3 oster RF_ASSERT(rpNode->numSuccedents == 1); 521 1.3 oster rpNode->succedents[0] = xorNode; 522 1.3 oster xorNode->antecedents[0] = rpNode; 523 1.3 oster xorNode->antType[0] = rf_trueData; 524 1.20 oster tmprrdNode = rrdNodes; 525 1.3 oster for (i = 0; i < nRrdNodes; i++) { 526 1.22 oster RF_ASSERT(tmprrdNode->numSuccedents == 1); 527 1.20 oster tmprrdNode->succedents[0] = xorNode; 528 1.20 oster xorNode->antecedents[1 + i] = tmprrdNode; 529 1.3 oster xorNode->antType[1 + i] = rf_trueData; 530 1.20 oster tmprrdNode = tmprrdNode->list_next; 531 1.3 oster } 532 1.20 oster tmprudNode = rudNodes; 533 1.3 oster for (i = 0; i < nRudNodes; i++) { 534 1.20 oster RF_ASSERT(tmprudNode->numSuccedents == 1); 535 1.20 oster tmprudNode->succedents[0] = xorNode; 536 1.20 oster xorNode->antecedents[1 + nRrdNodes + i] = tmprudNode; 537 1.3 oster xorNode->antType[1 + nRrdNodes + i] = rf_trueData; 538 1.20 oster tmprudNode = tmprudNode->list_next; 539 1.3 oster } 540 1.3 oster 541 1.3 oster /* connect the xor node to the commit node */ 542 1.3 oster RF_ASSERT(xorNode->numSuccedents == 1); 543 1.3 oster RF_ASSERT(commitNode->numAntecedents == 1); 544 1.3 oster xorNode->succedents[0] = commitNode; 545 1.3 oster commitNode->antecedents[0] = xorNode; 546 1.3 oster commitNode->antType[0] = rf_control; 547 1.3 oster 548 1.3 oster /* connect the termNode to the commit node */ 549 1.3 oster RF_ASSERT(commitNode->numSuccedents == 1); 550 1.3 oster RF_ASSERT(termNode->numAntecedents == 1); 551 1.3 
oster RF_ASSERT(termNode->numSuccedents == 0); 552 1.3 oster commitNode->succedents[0] = termNode; 553 1.3 oster termNode->antType[0] = rf_control; 554 1.3 oster termNode->antecedents[0] = commitNode; 555 1.1 oster } 556 1.1 oster 557 1.6 oster #if (RF_INCLUDE_CHAINDECLUSTER > 0) 558 1.1 oster /****************************************************************************** 559 1.1 oster * Create a degraded read DAG for Chained Declustering 560 1.1 oster * 561 1.1 oster * Hdr -> Nil -> R(p/s)d -> Cmt -> Trm 562 1.1 oster * 563 1.1 oster * The "Rd" node reads data from the surviving disk in the mirror pair 564 1.1 oster * Rpd - read of primary copy 565 1.1 oster * Rsd - read of secondary copy 566 1.1 oster * 567 1.1 oster * Parameters: raidPtr - description of the physical array 568 1.1 oster * asmap - logical & physical addresses for this access 569 1.1 oster * bp - buffer ptr (for holding write data) 570 1.3 oster * flags - general flags (e.g. disk locking) 571 1.1 oster * allocList - list of memory allocated in DAG creation 572 1.1 oster *****************************************************************************/ 573 1.1 oster 574 1.23 perry void 575 1.16 oster rf_CreateRaidCDegradedReadDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap, 576 1.27 christos RF_DagHeader_t *dag_h, void *bp, 577 1.27 christos RF_RaidAccessFlags_t flags, 578 1.16 oster RF_AllocListElem_t *allocList) 579 1.1 oster { 580 1.3 oster RF_DagNode_t *nodes, *rdNode, *blockNode, *commitNode, *termNode; 581 1.3 oster RF_StripeNum_t parityStripeID; 582 1.3 oster int useMirror, i, shiftable; 583 1.3 oster RF_ReconUnitNum_t which_ru; 584 1.3 oster RF_PhysDiskAddr_t *pda; 585 1.3 oster 586 1.3 oster if ((asmap->numDataFailed + asmap->numParityFailed) == 0) { 587 1.3 oster shiftable = RF_TRUE; 588 1.3 oster } else { 589 1.3 oster shiftable = RF_FALSE; 590 1.3 oster } 591 1.3 oster useMirror = 0; 592 1.3 oster parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), 593 1.3 oster 
asmap->raidAddress, &which_ru); 594 1.3 oster 595 1.19 oster #if RF_DEBUG_DAG 596 1.3 oster if (rf_dagDebug) { 597 1.3 oster printf("[Creating RAID C degraded read DAG]\n"); 598 1.3 oster } 599 1.19 oster #endif 600 1.3 oster dag_h->creator = "RaidCDegradedReadDAG"; 601 1.3 oster /* alloc the Wnd nodes and the Wmir node */ 602 1.3 oster if (asmap->numDataFailed == 0) 603 1.3 oster useMirror = RF_FALSE; 604 1.3 oster else 605 1.3 oster useMirror = RF_TRUE; 606 1.3 oster 607 1.3 oster /* total number of nodes = 1 + (block + commit + terminator) */ 608 1.30 christos nodes = RF_MallocAndAdd(4 * sizeof(*nodes), allocList); 609 1.3 oster i = 0; 610 1.3 oster rdNode = &nodes[i]; 611 1.3 oster i++; 612 1.3 oster blockNode = &nodes[i]; 613 1.3 oster i++; 614 1.3 oster commitNode = &nodes[i]; 615 1.3 oster i++; 616 1.3 oster termNode = &nodes[i]; 617 1.3 oster i++; 618 1.3 oster 619 1.3 oster /* 620 1.3 oster * This dag can not commit until the commit node is reached. 621 1.3 oster * Errors prior to the commit point imply the dag has failed 622 1.3 oster * and must be retried. 
623 1.3 oster */ 624 1.3 oster dag_h->numCommitNodes = 1; 625 1.3 oster dag_h->numCommits = 0; 626 1.3 oster dag_h->numSuccedents = 1; 627 1.3 oster 628 1.3 oster /* initialize the block, commit, and terminator nodes */ 629 1.3 oster rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, 630 1.3 oster NULL, 1, 0, 0, 0, dag_h, "Nil", allocList); 631 1.3 oster rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, 632 1.3 oster NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList); 633 1.3 oster rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, 634 1.3 oster NULL, 0, 1, 0, 0, dag_h, "Trm", allocList); 635 1.3 oster 636 1.3 oster pda = asmap->physInfo; 637 1.3 oster RF_ASSERT(pda != NULL); 638 1.3 oster /* parityInfo must describe entire parity unit */ 639 1.3 oster RF_ASSERT(asmap->parityInfo->next == NULL); 640 1.3 oster 641 1.3 oster /* initialize the data node */ 642 1.3 oster if (!useMirror) { 643 1.3 oster rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, 644 1.3 oster rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rpd", allocList); 645 1.3 oster if (shiftable && rf_compute_workload_shift(raidPtr, pda)) { 646 1.3 oster /* shift this read to the next disk in line */ 647 1.3 oster rdNode->params[0].p = asmap->parityInfo; 648 1.3 oster rdNode->params[1].p = pda->bufPtr; 649 1.3 oster rdNode->params[2].v = parityStripeID; 650 1.18 oster rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru); 651 1.3 oster } else { 652 1.3 oster /* read primary copy */ 653 1.3 oster rdNode->params[0].p = pda; 654 1.3 oster rdNode->params[1].p = pda->bufPtr; 655 1.3 oster rdNode->params[2].v = parityStripeID; 656 1.18 oster rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru); 657 1.3 oster } 658 1.3 oster } else { 659 1.3 oster /* read secondary copy of data */ 660 1.3 oster rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, 661 1.3 
oster rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rsd", allocList); 662 1.3 oster rdNode->params[0].p = asmap->parityInfo; 663 1.3 oster rdNode->params[1].p = pda->bufPtr; 664 1.3 oster rdNode->params[2].v = parityStripeID; 665 1.18 oster rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru); 666 1.3 oster } 667 1.3 oster 668 1.3 oster /* connect header to block node */ 669 1.3 oster RF_ASSERT(dag_h->numSuccedents == 1); 670 1.3 oster RF_ASSERT(blockNode->numAntecedents == 0); 671 1.3 oster dag_h->succedents[0] = blockNode; 672 1.3 oster 673 1.3 oster /* connect block node to rdnode */ 674 1.3 oster RF_ASSERT(blockNode->numSuccedents == 1); 675 1.3 oster RF_ASSERT(rdNode->numAntecedents == 1); 676 1.3 oster blockNode->succedents[0] = rdNode; 677 1.3 oster rdNode->antecedents[0] = blockNode; 678 1.3 oster rdNode->antType[0] = rf_control; 679 1.3 oster 680 1.3 oster /* connect rdnode to commit node */ 681 1.3 oster RF_ASSERT(rdNode->numSuccedents == 1); 682 1.3 oster RF_ASSERT(commitNode->numAntecedents == 1); 683 1.3 oster rdNode->succedents[0] = commitNode; 684 1.3 oster commitNode->antecedents[0] = rdNode; 685 1.3 oster commitNode->antType[0] = rf_control; 686 1.3 oster 687 1.3 oster /* connect commit node to terminator */ 688 1.3 oster RF_ASSERT(commitNode->numSuccedents == 1); 689 1.3 oster RF_ASSERT(termNode->numAntecedents == 1); 690 1.3 oster RF_ASSERT(termNode->numSuccedents == 0); 691 1.3 oster commitNode->succedents[0] = termNode; 692 1.3 oster termNode->antecedents[0] = commitNode; 693 1.3 oster termNode->antType[0] = rf_control; 694 1.1 oster } 695 1.8 mrg #endif /* (RF_INCLUDE_CHAINDECLUSTER > 0) */ 696 1.6 oster 697 1.7 oster #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0) 698 1.1 oster /* 699 1.1 oster * XXX move this elsewhere? 
700 1.1 oster */ 701 1.23 perry void 702 1.16 oster rf_DD_GenerateFailedAccessASMs(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap, 703 1.16 oster RF_PhysDiskAddr_t **pdap, int *nNodep, 704 1.16 oster RF_PhysDiskAddr_t **pqpdap, int *nPQNodep, 705 1.16 oster RF_AllocListElem_t *allocList) 706 1.1 oster { 707 1.3 oster RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); 708 1.3 oster int PDAPerDisk, i; 709 1.3 oster RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit; 710 1.3 oster int numDataCol = layoutPtr->numDataCol; 711 1.3 oster int state; 712 1.3 oster RF_SectorNum_t suoff, suend; 713 1.3 oster unsigned firstDataCol, napdas, count; 714 1.3 oster RF_SectorNum_t fone_start, fone_end, ftwo_start = 0, ftwo_end = 0; 715 1.3 oster RF_PhysDiskAddr_t *fone = asmap->failedPDAs[0], *ftwo = asmap->failedPDAs[1]; 716 1.3 oster RF_PhysDiskAddr_t *pda_p; 717 1.3 oster RF_PhysDiskAddr_t *phys_p; 718 1.3 oster RF_RaidAddr_t sosAddr; 719 1.3 oster 720 1.3 oster /* determine how many pda's we will have to generate per unaccess 721 1.3 oster * stripe. If there is only one failed data unit, it is one; if two, 722 1.28 mbalmer * possibly two, depending whether they overlap. 
*/ 723 1.1 oster 724 1.3 oster fone_start = rf_StripeUnitOffset(layoutPtr, fone->startSector); 725 1.3 oster fone_end = fone_start + fone->numSector; 726 1.1 oster 727 1.30 christos #define BUF_ALLOC(num) \ 728 1.30 christos RF_MallocAndAdd(rf_RaidAddressToByte(raidPtr, num), allocList) 729 1.1 oster #define CONS_PDA(if,start,num) \ 730 1.14 oster pda_p->col = asmap->if->col; \ 731 1.1 oster pda_p->startSector = ((asmap->if->startSector / secPerSU) * secPerSU) + start; \ 732 1.1 oster pda_p->numSector = num; \ 733 1.1 oster pda_p->next = NULL; \ 734 1.30 christos pda_p->bufPtr = BUF_ALLOC(num) 735 1.1 oster 736 1.3 oster if (asmap->numDataFailed == 1) { 737 1.3 oster PDAPerDisk = 1; 738 1.3 oster state = 1; 739 1.30 christos *pqpdap = RF_MallocAndAdd(2 * sizeof(**pqpdap), allocList); 740 1.3 oster pda_p = *pqpdap; 741 1.3 oster /* build p */ 742 1.3 oster CONS_PDA(parityInfo, fone_start, fone->numSector); 743 1.3 oster pda_p->type = RF_PDA_TYPE_PARITY; 744 1.1 oster pda_p++; 745 1.3 oster /* build q */ 746 1.3 oster CONS_PDA(qInfo, fone_start, fone->numSector); 747 1.3 oster pda_p->type = RF_PDA_TYPE_Q; 748 1.3 oster } else { 749 1.3 oster ftwo_start = rf_StripeUnitOffset(layoutPtr, ftwo->startSector); 750 1.3 oster ftwo_end = ftwo_start + ftwo->numSector; 751 1.3 oster if (fone->numSector + ftwo->numSector > secPerSU) { 752 1.3 oster PDAPerDisk = 1; 753 1.3 oster state = 2; 754 1.30 christos *pqpdap = RF_MallocAndAdd(2 * sizeof(**pqpdap), allocList); 755 1.3 oster pda_p = *pqpdap; 756 1.3 oster CONS_PDA(parityInfo, 0, secPerSU); 757 1.3 oster pda_p->type = RF_PDA_TYPE_PARITY; 758 1.3 oster pda_p++; 759 1.3 oster CONS_PDA(qInfo, 0, secPerSU); 760 1.3 oster pda_p->type = RF_PDA_TYPE_Q; 761 1.3 oster } else { 762 1.3 oster PDAPerDisk = 2; 763 1.3 oster state = 3; 764 1.3 oster /* four of them, fone, then ftwo */ 765 1.30 christos *pqpdap = RF_MallocAndAdd(4 * sizeof(**pqpdap), allocList); 766 1.3 oster pda_p = *pqpdap; 767 1.3 oster CONS_PDA(parityInfo, fone_start, 
fone->numSector); 768 1.3 oster pda_p->type = RF_PDA_TYPE_PARITY; 769 1.3 oster pda_p++; 770 1.3 oster CONS_PDA(qInfo, fone_start, fone->numSector); 771 1.3 oster pda_p->type = RF_PDA_TYPE_Q; 772 1.3 oster pda_p++; 773 1.3 oster CONS_PDA(parityInfo, ftwo_start, ftwo->numSector); 774 1.3 oster pda_p->type = RF_PDA_TYPE_PARITY; 775 1.3 oster pda_p++; 776 1.3 oster CONS_PDA(qInfo, ftwo_start, ftwo->numSector); 777 1.3 oster pda_p->type = RF_PDA_TYPE_Q; 778 1.1 oster } 779 1.3 oster } 780 1.3 oster /* figure out number of nonaccessed pda */ 781 1.3 oster napdas = PDAPerDisk * (numDataCol - asmap->numStripeUnitsAccessed - (ftwo == NULL ? 1 : 0)); 782 1.3 oster *nPQNodep = PDAPerDisk; 783 1.3 oster 784 1.3 oster /* sweep over the over accessed pda's, figuring out the number of 785 1.3 oster * additional pda's to generate. Of course, skip the failed ones */ 786 1.3 oster 787 1.3 oster count = 0; 788 1.3 oster for (pda_p = asmap->physInfo; pda_p; pda_p = pda_p->next) { 789 1.3 oster if ((pda_p == fone) || (pda_p == ftwo)) 790 1.3 oster continue; 791 1.3 oster suoff = rf_StripeUnitOffset(layoutPtr, pda_p->startSector); 792 1.3 oster suend = suoff + pda_p->numSector; 793 1.3 oster switch (state) { 794 1.3 oster case 1: /* one failed PDA to overlap */ 795 1.3 oster /* if a PDA doesn't contain the failed unit, it can 796 1.3 oster * only miss the start or end, not both */ 797 1.3 oster if ((suoff > fone_start) || (suend < fone_end)) 798 1.3 oster count++; 799 1.3 oster break; 800 1.3 oster case 2: /* whole stripe */ 801 1.33 andvar if (suoff) /* leak at beginning */ 802 1.3 oster count++; 803 1.3 oster if (suend < numDataCol) /* leak at end */ 804 1.3 oster count++; 805 1.3 oster break; 806 1.3 oster case 3: /* two disjoint units */ 807 1.3 oster if ((suoff > fone_start) || (suend < fone_end)) 808 1.3 oster count++; 809 1.3 oster if ((suoff > ftwo_start) || (suend < ftwo_end)) 810 1.3 oster count++; 811 1.3 oster break; 812 1.3 oster default: 813 1.3 oster RF_PANIC(); 814 1.1 
oster } 815 1.3 oster } 816 1.3 oster 817 1.3 oster napdas += count; 818 1.3 oster *nNodep = napdas; 819 1.3 oster if (napdas == 0) 820 1.3 oster return; /* short circuit */ 821 1.3 oster 822 1.3 oster /* allocate up our list of pda's */ 823 1.3 oster 824 1.30 christos pda_p = RF_MallocAndAdd(napdas * sizeof(*pdap), allocList); 825 1.3 oster *pdap = pda_p; 826 1.3 oster 827 1.3 oster /* linkem together */ 828 1.3 oster for (i = 0; i < (napdas - 1); i++) 829 1.3 oster pda_p[i].next = pda_p + (i + 1); 830 1.3 oster 831 1.3 oster /* march through the one's up to the first accessed disk */ 832 1.3 oster firstDataCol = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), asmap->physInfo->raidAddress) % numDataCol; 833 1.3 oster sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); 834 1.3 oster for (i = 0; i < firstDataCol; i++) { 835 1.3 oster if ((pda_p - (*pdap)) == napdas) 836 1.3 oster continue; 837 1.3 oster pda_p->type = RF_PDA_TYPE_DATA; 838 1.3 oster pda_p->raidAddress = sosAddr + (i * secPerSU); 839 1.14 oster (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0); 840 1.3 oster /* skip over dead disks */ 841 1.14 oster if (RF_DEAD_DISK(raidPtr->Disks[pda_p->col].status)) 842 1.3 oster continue; 843 1.3 oster switch (state) { 844 1.3 oster case 1: /* fone */ 845 1.3 oster pda_p->numSector = fone->numSector; 846 1.3 oster pda_p->raidAddress += fone_start; 847 1.3 oster pda_p->startSector += fone_start; 848 1.30 christos pda_p->bufPtr = BUF_ALLOC(pda_p->numSector); 849 1.3 oster break; 850 1.3 oster case 2: /* full stripe */ 851 1.3 oster pda_p->numSector = secPerSU; 852 1.30 christos pda_p->bufPtr = BUF_ALLOC(secPerSU); 853 1.3 oster break; 854 1.3 oster case 3: /* two slabs */ 855 1.3 oster pda_p->numSector = fone->numSector; 856 1.3 oster pda_p->raidAddress += fone_start; 857 1.3 oster pda_p->startSector += fone_start; 858 1.30 christos pda_p->bufPtr = BUF_ALLOC(pda_p->numSector); 859 1.3 
oster pda_p++; 860 1.3 oster pda_p->type = RF_PDA_TYPE_DATA; 861 1.3 oster pda_p->raidAddress = sosAddr + (i * secPerSU); 862 1.14 oster (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0); 863 1.3 oster pda_p->numSector = ftwo->numSector; 864 1.3 oster pda_p->raidAddress += ftwo_start; 865 1.3 oster pda_p->startSector += ftwo_start; 866 1.30 christos pda_p->bufPtr = BUF_ALLOC(pda_p->numSector); 867 1.3 oster break; 868 1.3 oster default: 869 1.3 oster RF_PANIC(); 870 1.1 oster } 871 1.3 oster pda_p++; 872 1.3 oster } 873 1.3 oster 874 1.3 oster /* march through the touched stripe units */ 875 1.3 oster for (phys_p = asmap->physInfo; phys_p; phys_p = phys_p->next, i++) { 876 1.3 oster if ((phys_p == asmap->failedPDAs[0]) || (phys_p == asmap->failedPDAs[1])) 877 1.3 oster continue; 878 1.3 oster suoff = rf_StripeUnitOffset(layoutPtr, phys_p->startSector); 879 1.3 oster suend = suoff + phys_p->numSector; 880 1.3 oster switch (state) { 881 1.3 oster case 1: /* single buffer */ 882 1.3 oster if (suoff > fone_start) { 883 1.3 oster RF_ASSERT(suend >= fone_end); 884 1.3 oster /* The data read starts after the mapped 885 1.33 andvar * access, snip off the beginning */ 886 1.3 oster pda_p->numSector = suoff - fone_start; 887 1.3 oster pda_p->raidAddress = sosAddr + (i * secPerSU) + fone_start; 888 1.14 oster (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0); 889 1.30 christos pda_p->bufPtr = BUF_ALLOC(pda_p->numSector); 890 1.3 oster pda_p++; 891 1.3 oster } 892 1.3 oster if (suend < fone_end) { 893 1.3 oster RF_ASSERT(suoff <= fone_start); 894 1.3 oster /* The data read stops before the end of the 895 1.3 oster * failed access, extend */ 896 1.3 oster pda_p->numSector = fone_end - suend; 897 1.3 oster pda_p->raidAddress = sosAddr + (i * secPerSU) + suend; /* off by one? 
*/ 898 1.14 oster (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0); 899 1.30 christos pda_p->bufPtr = BUF_ALLOC(pda_p->numSector); 900 1.3 oster pda_p++; 901 1.3 oster } 902 1.3 oster break; 903 1.3 oster case 2: /* whole stripe unit */ 904 1.3 oster RF_ASSERT((suoff == 0) || (suend == secPerSU)); 905 1.3 oster if (suend < secPerSU) { /* short read, snip from end 906 1.3 oster * on */ 907 1.3 oster pda_p->numSector = secPerSU - suend; 908 1.3 oster pda_p->raidAddress = sosAddr + (i * secPerSU) + suend; /* off by one? */ 909 1.14 oster (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0); 910 1.30 christos pda_p->bufPtr = BUF_ALLOC(pda_p->numSector); 911 1.3 oster pda_p++; 912 1.3 oster } else 913 1.3 oster if (suoff > 0) { /* short at front */ 914 1.3 oster pda_p->numSector = suoff; 915 1.3 oster pda_p->raidAddress = sosAddr + (i * secPerSU); 916 1.14 oster (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0); 917 1.30 christos pda_p->bufPtr = 918 1.30 christos BUF_ALLOC(pda_p->numSector); 919 1.3 oster pda_p++; 920 1.3 oster } 921 1.3 oster break; 922 1.3 oster case 3: /* two nonoverlapping failures */ 923 1.3 oster if ((suoff > fone_start) || (suend < fone_end)) { 924 1.3 oster if (suoff > fone_start) { 925 1.3 oster RF_ASSERT(suend >= fone_end); 926 1.3 oster /* The data read starts after the 927 1.3 oster * mapped access, snip off the 928 1.33 andvar * beginning */ 929 1.3 oster pda_p->numSector = suoff - fone_start; 930 1.3 oster pda_p->raidAddress = sosAddr + (i * secPerSU) + fone_start; 931 1.14 oster (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0); 932 1.30 christos pda_p->bufPtr = 933 1.30 christos BUF_ALLOC(pda_p->numSector); 934 1.3 oster pda_p++; 935 1.3 oster } 936 1.3 oster if (suend < fone_end) { 937 1.3 oster RF_ASSERT(suoff <= fone_start); 
938 1.3 oster /* The data read stops before the end 939 1.3 oster * of the failed access, extend */ 940 1.3 oster pda_p->numSector = fone_end - suend; 941 1.3 oster pda_p->raidAddress = sosAddr + (i * secPerSU) + suend; /* off by one? */ 942 1.14 oster (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0); 943 1.30 christos pda_p->bufPtr = 944 1.30 christos BUF_ALLOC(pda_p->numSector); 945 1.3 oster pda_p++; 946 1.3 oster } 947 1.3 oster } 948 1.3 oster if ((suoff > ftwo_start) || (suend < ftwo_end)) { 949 1.3 oster if (suoff > ftwo_start) { 950 1.3 oster RF_ASSERT(suend >= ftwo_end); 951 1.3 oster /* The data read starts after the 952 1.3 oster * mapped access, snip off the 953 1.33 andvar * beginning */ 954 1.3 oster pda_p->numSector = suoff - ftwo_start; 955 1.3 oster pda_p->raidAddress = sosAddr + (i * secPerSU) + ftwo_start; 956 1.14 oster (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0); 957 1.30 christos pda_p->bufPtr = 958 1.30 christos BUF_ALLOC(pda_p->numSector); 959 1.3 oster pda_p++; 960 1.3 oster } 961 1.3 oster if (suend < ftwo_end) { 962 1.3 oster RF_ASSERT(suoff <= ftwo_start); 963 1.3 oster /* The data read stops before the end 964 1.3 oster * of the failed access, extend */ 965 1.3 oster pda_p->numSector = ftwo_end - suend; 966 1.3 oster pda_p->raidAddress = sosAddr + (i * secPerSU) + suend; /* off by one? 
*/ 967 1.14 oster (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0); 968 1.30 christos pda_p->bufPtr = 969 1.30 christos BUF_ALLOC(pda_p->numSector); 970 1.3 oster pda_p++; 971 1.3 oster } 972 1.3 oster } 973 1.3 oster break; 974 1.3 oster default: 975 1.3 oster RF_PANIC(); 976 1.1 oster } 977 1.1 oster } 978 1.1 oster 979 1.3 oster /* after the last accessed disk */ 980 1.3 oster for (; i < numDataCol; i++) { 981 1.3 oster if ((pda_p - (*pdap)) == napdas) 982 1.3 oster continue; 983 1.3 oster pda_p->type = RF_PDA_TYPE_DATA; 984 1.3 oster pda_p->raidAddress = sosAddr + (i * secPerSU); 985 1.14 oster (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0); 986 1.3 oster /* skip over dead disks */ 987 1.14 oster if (RF_DEAD_DISK(raidPtr->Disks[pda_p->col].status)) 988 1.3 oster continue; 989 1.3 oster switch (state) { 990 1.3 oster case 1: /* fone */ 991 1.3 oster pda_p->numSector = fone->numSector; 992 1.3 oster pda_p->raidAddress += fone_start; 993 1.3 oster pda_p->startSector += fone_start; 994 1.30 christos pda_p->bufPtr = BUF_ALLOC(pda_p->numSector); 995 1.3 oster break; 996 1.3 oster case 2: /* full stripe */ 997 1.3 oster pda_p->numSector = secPerSU; 998 1.30 christos pda_p->bufPtr = BUF_ALLOC(secPerSU); 999 1.3 oster break; 1000 1.3 oster case 3: /* two slabs */ 1001 1.3 oster pda_p->numSector = fone->numSector; 1002 1.3 oster pda_p->raidAddress += fone_start; 1003 1.3 oster pda_p->startSector += fone_start; 1004 1.30 christos pda_p->bufPtr = BUF_ALLOC(pda_p->numSector); 1005 1.3 oster pda_p++; 1006 1.3 oster pda_p->type = RF_PDA_TYPE_DATA; 1007 1.3 oster pda_p->raidAddress = sosAddr + (i * secPerSU); 1008 1.14 oster (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress, &(pda_p->col), &(pda_p->startSector), 0); 1009 1.3 oster pda_p->numSector = ftwo->numSector; 1010 1.3 oster pda_p->raidAddress += ftwo_start; 1011 1.3 oster pda_p->startSector += 
ftwo_start; 1012 1.30 christos pda_p->bufPtr = BUF_ALLOC(pda_p->numSector); 1013 1.3 oster break; 1014 1.3 oster default: 1015 1.3 oster RF_PANIC(); 1016 1.3 oster } 1017 1.3 oster pda_p++; 1018 1.3 oster } 1019 1.3 oster 1020 1.3 oster RF_ASSERT(pda_p - *pdap == napdas); 1021 1.3 oster return; 1022 1.1 oster } 1023 1.1 oster #define INIT_DISK_NODE(node,name) \ 1024 1.1 oster rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \ 1025 1.1 oster (node)->succedents[0] = unblockNode; \ 1026 1.1 oster (node)->succedents[1] = recoveryNode; \ 1027 1.1 oster (node)->antecedents[0] = blockNode; \ 1028 1.1 oster (node)->antType[0] = rf_control 1029 1.1 oster 1030 1.1 oster #define DISK_NODE_PARAMS(_node_,_p_) \ 1031 1.1 oster (_node_).params[0].p = _p_ ; \ 1032 1.1 oster (_node_).params[1].p = (_p_)->bufPtr; \ 1033 1.1 oster (_node_).params[2].v = parityStripeID; \ 1034 1.18 oster (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru) 1035 1.1 oster 1036 1.23 perry void 1037 1.16 oster rf_DoubleDegRead(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap, 1038 1.27 christos RF_DagHeader_t *dag_h, void *bp, 1039 1.27 christos RF_RaidAccessFlags_t flags, 1040 1.16 oster RF_AllocListElem_t *allocList, 1041 1.26 christos const char *redundantReadNodeName, 1042 1.26 christos const char *recoveryNodeName, 1043 1.31 christos void (*recovFunc) (RF_DagNode_t *)) 1044 1.1 oster { 1045 1.3 oster RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); 1046 1.3 oster RF_DagNode_t *nodes, *rudNodes, *rrdNodes, *recoveryNode, *blockNode, 1047 1.3 oster *unblockNode, *rpNodes, *rqNodes, *termNode; 1048 1.3 oster RF_PhysDiskAddr_t *pda, *pqPDAs; 1049 1.3 oster RF_PhysDiskAddr_t *npdas; 1050 1.3 oster int nNodes, nRrdNodes, nRudNodes, i; 1051 1.3 oster RF_ReconUnitNum_t which_ru; 1052 1.3 oster int nReadNodes, nPQNodes; 1053 1.3 oster RF_PhysDiskAddr_t *failedPDA = asmap->failedPDAs[0]; 1054 1.3 oster 
RF_PhysDiskAddr_t *failedPDAtwo = asmap->failedPDAs[1]; 1055 1.3 oster RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress, &which_ru); 1056 1.3 oster 1057 1.19 oster #if RF_DEBUG_DAG 1058 1.3 oster if (rf_dagDebug) 1059 1.3 oster printf("[Creating Double Degraded Read DAG]\n"); 1060 1.19 oster #endif 1061 1.3 oster rf_DD_GenerateFailedAccessASMs(raidPtr, asmap, &npdas, &nRrdNodes, &pqPDAs, &nPQNodes, allocList); 1062 1.3 oster 1063 1.3 oster nRudNodes = asmap->numStripeUnitsAccessed - (asmap->numDataFailed); 1064 1.3 oster nReadNodes = nRrdNodes + nRudNodes + 2 * nPQNodes; 1065 1.3 oster nNodes = 4 /* block, unblock, recovery, term */ + nReadNodes; 1066 1.3 oster 1067 1.30 christos nodes = RF_MallocAndAdd(nNodes * sizeof(*nodes), allocList); 1068 1.3 oster i = 0; 1069 1.3 oster blockNode = &nodes[i]; 1070 1.3 oster i += 1; 1071 1.3 oster unblockNode = &nodes[i]; 1072 1.3 oster i += 1; 1073 1.3 oster recoveryNode = &nodes[i]; 1074 1.3 oster i += 1; 1075 1.3 oster termNode = &nodes[i]; 1076 1.3 oster i += 1; 1077 1.3 oster rudNodes = &nodes[i]; 1078 1.3 oster i += nRudNodes; 1079 1.3 oster rrdNodes = &nodes[i]; 1080 1.3 oster i += nRrdNodes; 1081 1.3 oster rpNodes = &nodes[i]; 1082 1.3 oster i += nPQNodes; 1083 1.3 oster rqNodes = &nodes[i]; 1084 1.3 oster i += nPQNodes; 1085 1.3 oster RF_ASSERT(i == nNodes); 1086 1.3 oster 1087 1.3 oster dag_h->numSuccedents = 1; 1088 1.3 oster dag_h->succedents[0] = blockNode; 1089 1.3 oster dag_h->creator = "DoubleDegRead"; 1090 1.3 oster dag_h->numCommits = 0; 1091 1.3 oster dag_h->numCommitNodes = 1; /* unblock */ 1092 1.3 oster 1093 1.3 oster rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 2, 0, 0, dag_h, "Trm", allocList); 1094 1.3 oster termNode->antecedents[0] = unblockNode; 1095 1.3 oster termNode->antType[0] = rf_control; 1096 1.3 oster termNode->antecedents[1] = recoveryNode; 1097 1.3 oster termNode->antType[1] = rf_control; 1098 1.3 oster 
1099 1.3 oster /* init the block and unblock nodes */ 1100 1.3 oster /* The block node has all nodes except itself, unblock and recovery as 1101 1.3 oster * successors. Similarly for predecessors of the unblock. */ 1102 1.3 oster rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nReadNodes, 0, 0, 0, dag_h, "Nil", allocList); 1103 1.3 oster rf_InitNode(unblockNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nReadNodes, 0, 0, dag_h, "Nil", allocList); 1104 1.3 oster 1105 1.3 oster for (i = 0; i < nReadNodes; i++) { 1106 1.3 oster blockNode->succedents[i] = rudNodes + i; 1107 1.3 oster unblockNode->antecedents[i] = rudNodes + i; 1108 1.3 oster unblockNode->antType[i] = rf_control; 1109 1.3 oster } 1110 1.3 oster unblockNode->succedents[0] = termNode; 1111 1.3 oster 1112 1.3 oster /* The recovery node has all the reads as predecessors, and the term 1113 1.3 oster * node as successors. It gets a pda as a param from each of the read 1114 1.3 oster * nodes plus the raidPtr. For each failed unit is has a result pda. 
*/ 1115 1.3 oster rf_InitNode(recoveryNode, rf_wait, RF_FALSE, recovFunc, rf_NullNodeUndoFunc, NULL, 1116 1.3 oster 1, /* succesors */ 1117 1.3 oster nReadNodes, /* preds */ 1118 1.3 oster nReadNodes + 2, /* params */ 1119 1.3 oster asmap->numDataFailed, /* results */ 1120 1.3 oster dag_h, recoveryNodeName, allocList); 1121 1.3 oster 1122 1.3 oster recoveryNode->succedents[0] = termNode; 1123 1.3 oster for (i = 0; i < nReadNodes; i++) { 1124 1.3 oster recoveryNode->antecedents[i] = rudNodes + i; 1125 1.3 oster recoveryNode->antType[i] = rf_trueData; 1126 1.3 oster } 1127 1.3 oster 1128 1.3 oster /* build the read nodes, then come back and fill in recovery params 1129 1.3 oster * and results */ 1130 1.3 oster pda = asmap->physInfo; 1131 1.3 oster for (i = 0; i < nRudNodes; pda = pda->next) { 1132 1.3 oster if ((pda == failedPDA) || (pda == failedPDAtwo)) 1133 1.3 oster continue; 1134 1.3 oster INIT_DISK_NODE(rudNodes + i, "Rud"); 1135 1.3 oster RF_ASSERT(pda); 1136 1.3 oster DISK_NODE_PARAMS(rudNodes[i], pda); 1137 1.3 oster i++; 1138 1.3 oster } 1139 1.3 oster 1140 1.3 oster pda = npdas; 1141 1.3 oster for (i = 0; i < nRrdNodes; i++, pda = pda->next) { 1142 1.3 oster INIT_DISK_NODE(rrdNodes + i, "Rrd"); 1143 1.3 oster RF_ASSERT(pda); 1144 1.3 oster DISK_NODE_PARAMS(rrdNodes[i], pda); 1145 1.3 oster } 1146 1.3 oster 1147 1.3 oster /* redundancy pdas */ 1148 1.3 oster pda = pqPDAs; 1149 1.3 oster INIT_DISK_NODE(rpNodes, "Rp"); 1150 1.3 oster RF_ASSERT(pda); 1151 1.3 oster DISK_NODE_PARAMS(rpNodes[0], pda); 1152 1.3 oster pda++; 1153 1.3 oster INIT_DISK_NODE(rqNodes, redundantReadNodeName); 1154 1.3 oster RF_ASSERT(pda); 1155 1.3 oster DISK_NODE_PARAMS(rqNodes[0], pda); 1156 1.3 oster if (nPQNodes == 2) { 1157 1.3 oster pda++; 1158 1.3 oster INIT_DISK_NODE(rpNodes + 1, "Rp"); 1159 1.3 oster RF_ASSERT(pda); 1160 1.3 oster DISK_NODE_PARAMS(rpNodes[1], pda); 1161 1.3 oster pda++; 1162 1.3 oster INIT_DISK_NODE(rqNodes + 1, redundantReadNodeName); 1163 1.3 oster 
RF_ASSERT(pda); 1164 1.3 oster DISK_NODE_PARAMS(rqNodes[1], pda); 1165 1.3 oster } 1166 1.3 oster /* fill in recovery node params */ 1167 1.3 oster for (i = 0; i < nReadNodes; i++) 1168 1.3 oster recoveryNode->params[i] = rudNodes[i].params[0]; /* pda */ 1169 1.3 oster recoveryNode->params[i++].p = (void *) raidPtr; 1170 1.3 oster recoveryNode->params[i++].p = (void *) asmap; 1171 1.3 oster recoveryNode->results[0] = failedPDA; 1172 1.3 oster if (asmap->numDataFailed == 2) 1173 1.3 oster recoveryNode->results[1] = failedPDAtwo; 1174 1.1 oster 1175 1.3 oster /* zero fill the target data buffers? */ 1176 1.1 oster } 1177 1.6 oster 1178 1.7 oster #endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0) */ 1179