1 1.35 thorpej /* $NetBSD: rf_dagfuncs.c,v 1.35 2021/08/07 16:19:15 thorpej Exp $ */ 2 1.1 oster /* 3 1.1 oster * Copyright (c) 1995 Carnegie-Mellon University. 4 1.1 oster * All rights reserved. 5 1.1 oster * 6 1.1 oster * Author: Mark Holland, William V. Courtright II 7 1.1 oster * 8 1.1 oster * Permission to use, copy, modify and distribute this software and 9 1.1 oster * its documentation is hereby granted, provided that both the copyright 10 1.1 oster * notice and this permission notice appear in all copies of the 11 1.1 oster * software, derivative works or modified versions, and any portions 12 1.1 oster * thereof, and that both notices appear in supporting documentation. 13 1.1 oster * 14 1.1 oster * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 1.1 oster * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 1.1 oster * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 1.1 oster * 18 1.1 oster * Carnegie Mellon requests users of this software to return to 19 1.1 oster * 20 1.1 oster * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU 21 1.1 oster * School of Computer Science 22 1.1 oster * Carnegie Mellon University 23 1.1 oster * Pittsburgh PA 15213-3890 24 1.1 oster * 25 1.1 oster * any improvements or extensions that they make and grant Carnegie the 26 1.1 oster * rights to redistribute these changes. 27 1.1 oster */ 28 1.1 oster 29 1.1 oster /* 30 1.1 oster * dagfuncs.c -- DAG node execution routines 31 1.1 oster * 32 1.1 oster * Rules: 33 1.1 oster * 1. Every DAG execution function must eventually cause node->status to 34 1.1 oster * get set to "good" or "bad", and "FinishNode" to be called. In the 35 1.1 oster * case of nodes that complete immediately (xor, NullNodeFunc, etc), 36 1.1 oster * the node execution function can do these two things directly. 
In 37 1.1 oster * the case of nodes that have to wait for some event (a disk read to 38 1.1 oster * complete, a lock to be released, etc) to occur before they can 39 1.1 oster * complete, this is typically achieved by having whatever module 40 1.1 oster * is doing the operation call GenericWakeupFunc upon completion. 41 1.1 oster * 2. DAG execution functions should check the status in the DAG header 42 1.1 oster * and NOP out their operations if the status is not "enable". However, 43 1.1 oster * execution functions that release resources must be sure to release 44 1.1 oster * them even when they NOP out the function that would use them. 45 1.1 oster * Functions that acquire resources should go ahead and acquire them 46 1.1 oster * even when they NOP, so that a downstream release node will not have 47 1.1 oster * to check to find out whether or not the acquire was suppressed. 48 1.1 oster */ 49 1.8 lukem 50 1.8 lukem #include <sys/cdefs.h> 51 1.35 thorpej __KERNEL_RCSID(0, "$NetBSD: rf_dagfuncs.c,v 1.35 2021/08/07 16:19:15 thorpej Exp $"); 52 1.1 oster 53 1.7 mrg #include <sys/param.h> 54 1.1 oster #include <sys/ioctl.h> 55 1.1 oster 56 1.1 oster #include "rf_archs.h" 57 1.1 oster #include "rf_raid.h" 58 1.1 oster #include "rf_dag.h" 59 1.1 oster #include "rf_layout.h" 60 1.1 oster #include "rf_etimer.h" 61 1.1 oster #include "rf_acctrace.h" 62 1.1 oster #include "rf_diskqueue.h" 63 1.1 oster #include "rf_dagfuncs.h" 64 1.1 oster #include "rf_general.h" 65 1.1 oster #include "rf_engine.h" 66 1.1 oster #include "rf_dagutils.h" 67 1.1 oster 68 1.1 oster #include "rf_kintf.h" 69 1.1 oster 70 1.1 oster #if RF_INCLUDE_PARITYLOGGING > 0 71 1.1 oster #include "rf_paritylog.h" 72 1.3 oster #endif /* RF_INCLUDE_PARITYLOGGING > 0 */ 73 1.1 oster 74 1.31 christos void (*rf_DiskReadFunc) (RF_DagNode_t *); 75 1.31 christos void (*rf_DiskWriteFunc) (RF_DagNode_t *); 76 1.31 christos void (*rf_DiskReadUndoFunc) (RF_DagNode_t *); 77 1.31 christos void (*rf_DiskWriteUndoFunc) 
(RF_DagNode_t *); 78 1.31 christos void (*rf_RegularXorUndoFunc) (RF_DagNode_t *); 79 1.31 christos void (*rf_SimpleXorUndoFunc) (RF_DagNode_t *); 80 1.31 christos void (*rf_RecoveryXorUndoFunc) (RF_DagNode_t *); 81 1.1 oster 82 1.14 oster /***************************************************************************** 83 1.1 oster * main (only) configuration routine for this module 84 1.14 oster ****************************************************************************/ 85 1.23 perry int 86 1.28 christos rf_ConfigureDAGFuncs(RF_ShutdownList_t **listp) 87 1.3 oster { 88 1.23 perry RF_ASSERT(((sizeof(long) == 8) && RF_LONGSHIFT == 3) || 89 1.14 oster ((sizeof(long) == 4) && RF_LONGSHIFT == 2)); 90 1.3 oster rf_DiskReadFunc = rf_DiskReadFuncForThreads; 91 1.3 oster rf_DiskReadUndoFunc = rf_DiskUndoFunc; 92 1.3 oster rf_DiskWriteFunc = rf_DiskWriteFuncForThreads; 93 1.3 oster rf_DiskWriteUndoFunc = rf_DiskUndoFunc; 94 1.3 oster rf_RegularXorUndoFunc = rf_NullNodeUndoFunc; 95 1.3 oster rf_SimpleXorUndoFunc = rf_NullNodeUndoFunc; 96 1.3 oster rf_RecoveryXorUndoFunc = rf_NullNodeUndoFunc; 97 1.3 oster return (0); 98 1.1 oster } 99 1.1 oster 100 1.1 oster 101 1.1 oster 102 1.14 oster /***************************************************************************** 103 1.1 oster * the execution function associated with a terminate node 104 1.14 oster ****************************************************************************/ 105 1.31 christos void 106 1.15 oster rf_TerminateFunc(RF_DagNode_t *node) 107 1.1 oster { 108 1.3 oster RF_ASSERT(node->dagHdr->numCommits == node->dagHdr->numCommitNodes); 109 1.3 oster node->status = rf_good; 110 1.31 christos rf_FinishNode(node, RF_THREAD_CONTEXT); 111 1.1 oster } 112 1.1 oster 113 1.31 christos void 114 1.28 christos rf_TerminateUndoFunc(RF_DagNode_t *node) 115 1.1 oster { 116 1.1 oster } 117 1.1 oster 118 1.1 oster 119 1.15 oster /***************************************************************************** 120 1.1 oster * 
execution functions associated with a mirror node 121 1.1 oster * 122 1.1 oster * parameters: 123 1.1 oster * 124 1.34 andvar * 0 - physical disk address of data 125 1.1 oster * 1 - buffer for holding read data 126 1.1 oster * 2 - parity stripe ID 127 1.1 oster * 3 - flags 128 1.1 oster * 4 - physical disk address of mirror (parity) 129 1.1 oster * 130 1.15 oster ****************************************************************************/ 131 1.1 oster 132 1.31 christos void 133 1.15 oster rf_DiskReadMirrorIdleFunc(RF_DagNode_t *node) 134 1.1 oster { 135 1.3 oster /* select the mirror copy with the shortest queue and fill in node 136 1.3 oster * parameters with physical disk address */ 137 1.1 oster 138 1.3 oster rf_SelectMirrorDiskIdle(node); 139 1.31 christos rf_DiskReadFunc(node); 140 1.1 oster } 141 1.1 oster 142 1.11 oster #if (RF_INCLUDE_CHAINDECLUSTER > 0) || (RF_INCLUDE_INTERDECLUSTER > 0) || (RF_DEBUG_VALIDATE_DAG > 0) 143 1.31 christos void 144 1.15 oster rf_DiskReadMirrorPartitionFunc(RF_DagNode_t *node) 145 1.1 oster { 146 1.3 oster /* select the mirror copy with the shortest queue and fill in node 147 1.3 oster * parameters with physical disk address */ 148 1.1 oster 149 1.3 oster rf_SelectMirrorDiskPartition(node); 150 1.31 christos rf_DiskReadFunc(node); 151 1.1 oster } 152 1.11 oster #endif 153 1.1 oster 154 1.31 christos void 155 1.28 christos rf_DiskReadMirrorUndoFunc(RF_DagNode_t *node) 156 1.1 oster { 157 1.1 oster } 158 1.1 oster 159 1.1 oster 160 1.1 oster 161 1.1 oster #if RF_INCLUDE_PARITYLOGGING > 0 162 1.14 oster /***************************************************************************** 163 1.1 oster * the execution function associated with a parity log update node 164 1.14 oster ****************************************************************************/ 165 1.31 christos void 166 1.15 oster rf_ParityLogUpdateFunc(RF_DagNode_t *node) 167 1.3 oster { 168 1.3 oster RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p; 169 
1.29 christos void *bf = (void *) node->params[1].p; 170 1.3 oster RF_ParityLogData_t *logData; 171 1.19 oster #if RF_ACC_TRACE > 0 172 1.3 oster RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 173 1.3 oster RF_Etimer_t timer; 174 1.19 oster #endif 175 1.3 oster 176 1.3 oster if (node->dagHdr->status == rf_enable) { 177 1.19 oster #if RF_ACC_TRACE > 0 178 1.3 oster RF_ETIMER_START(timer); 179 1.19 oster #endif 180 1.24 christos logData = rf_CreateParityLogData(RF_UPDATE, pda, bf, 181 1.3 oster (RF_Raid_t *) (node->dagHdr->raidPtr), 182 1.31 christos node->wakeFunc, node, 183 1.3 oster node->dagHdr->tracerec, timer); 184 1.3 oster if (logData) 185 1.3 oster rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE); 186 1.3 oster else { 187 1.19 oster #if RF_ACC_TRACE > 0 188 1.3 oster RF_ETIMER_STOP(timer); 189 1.3 oster RF_ETIMER_EVAL(timer); 190 1.3 oster tracerec->plog_us += RF_ETIMER_VAL_US(timer); 191 1.19 oster #endif 192 1.3 oster (node->wakeFunc) (node, ENOMEM); 193 1.3 oster } 194 1.1 oster } 195 1.1 oster } 196 1.1 oster 197 1.1 oster 198 1.15 oster /***************************************************************************** 199 1.1 oster * the execution function associated with a parity log overwrite node 200 1.15 oster ****************************************************************************/ 201 1.31 christos void 202 1.15 oster rf_ParityLogOverwriteFunc(RF_DagNode_t *node) 203 1.3 oster { 204 1.3 oster RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p; 205 1.29 christos void *bf = (void *) node->params[1].p; 206 1.3 oster RF_ParityLogData_t *logData; 207 1.19 oster #if RF_ACC_TRACE > 0 208 1.3 oster RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 209 1.3 oster RF_Etimer_t timer; 210 1.19 oster #endif 211 1.3 oster 212 1.3 oster if (node->dagHdr->status == rf_enable) { 213 1.19 oster #if RF_ACC_TRACE > 0 214 1.3 oster RF_ETIMER_START(timer); 215 1.19 oster #endif 216 1.24 christos logData = 
rf_CreateParityLogData(RF_OVERWRITE, pda, bf, 217 1.14 oster (RF_Raid_t *) (node->dagHdr->raidPtr), 218 1.31 christos node->wakeFunc, node, node->dagHdr->tracerec, timer); 219 1.3 oster if (logData) 220 1.3 oster rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE); 221 1.3 oster else { 222 1.19 oster #if RF_ACC_TRACE > 0 223 1.3 oster RF_ETIMER_STOP(timer); 224 1.3 oster RF_ETIMER_EVAL(timer); 225 1.3 oster tracerec->plog_us += RF_ETIMER_VAL_US(timer); 226 1.19 oster #endif 227 1.3 oster (node->wakeFunc) (node, ENOMEM); 228 1.3 oster } 229 1.1 oster } 230 1.1 oster } 231 1.1 oster 232 1.31 christos void 233 1.28 christos rf_ParityLogUpdateUndoFunc(RF_DagNode_t *node) 234 1.1 oster { 235 1.1 oster } 236 1.1 oster 237 1.31 christos void 238 1.28 christos rf_ParityLogOverwriteUndoFunc(RF_DagNode_t *node) 239 1.1 oster { 240 1.1 oster } 241 1.10 oster #endif /* RF_INCLUDE_PARITYLOGGING > 0 */ 242 1.10 oster 243 1.14 oster /***************************************************************************** 244 1.1 oster * the execution function associated with a NOP node 245 1.14 oster ****************************************************************************/ 246 1.31 christos void 247 1.15 oster rf_NullNodeFunc(RF_DagNode_t *node) 248 1.1 oster { 249 1.3 oster node->status = rf_good; 250 1.31 christos rf_FinishNode(node, RF_THREAD_CONTEXT); 251 1.1 oster } 252 1.1 oster 253 1.31 christos void 254 1.15 oster rf_NullNodeUndoFunc(RF_DagNode_t *node) 255 1.1 oster { 256 1.3 oster node->status = rf_undone; 257 1.31 christos rf_FinishNode(node, RF_THREAD_CONTEXT); 258 1.1 oster } 259 1.1 oster 260 1.1 oster 261 1.14 oster /***************************************************************************** 262 1.1 oster * the execution function associated with a disk-read node 263 1.14 oster ****************************************************************************/ 264 1.31 christos void 265 1.15 oster rf_DiskReadFuncForThreads(RF_DagNode_t *node) 266 1.3 oster { 267 1.3 oster 
RF_DiskQueueData_t *req; 268 1.3 oster RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p; 269 1.29 christos void *bf = (void *) node->params[1].p; 270 1.3 oster RF_StripeNum_t parityStripeID = (RF_StripeNum_t) node->params[2].v; 271 1.3 oster unsigned priority = RF_EXTRACT_PRIORITY(node->params[3].v); 272 1.3 oster unsigned which_ru = RF_EXTRACT_RU(node->params[3].v); 273 1.3 oster RF_IoType_t iotype = (node->dagHdr->status == rf_enable) ? RF_IO_TYPE_READ : RF_IO_TYPE_NOP; 274 1.13 oster RF_DiskQueue_t *dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues; 275 1.1 oster 276 1.3 oster req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector, 277 1.31 christos bf, parityStripeID, which_ru, node->wakeFunc, node, 278 1.19 oster #if RF_ACC_TRACE > 0 279 1.19 oster node->dagHdr->tracerec, 280 1.19 oster #else 281 1.19 oster NULL, 282 1.19 oster #endif 283 1.33 oster (void *) (node->dagHdr->raidPtr), 0, node->dagHdr->bp); 284 1.33 oster 285 1.33 oster node->dagFuncData = (void *) req; 286 1.33 oster rf_DiskIOEnqueue(&(dqs[pda->col]), req, priority); 287 1.1 oster } 288 1.1 oster 289 1.1 oster 290 1.14 oster /***************************************************************************** 291 1.1 oster * the execution function associated with a disk-write node 292 1.14 oster ****************************************************************************/ 293 1.31 christos void 294 1.15 oster rf_DiskWriteFuncForThreads(RF_DagNode_t *node) 295 1.3 oster { 296 1.3 oster RF_DiskQueueData_t *req; 297 1.3 oster RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p; 298 1.29 christos void *bf = (void *) node->params[1].p; 299 1.3 oster RF_StripeNum_t parityStripeID = (RF_StripeNum_t) node->params[2].v; 300 1.3 oster unsigned priority = RF_EXTRACT_PRIORITY(node->params[3].v); 301 1.3 oster unsigned which_ru = RF_EXTRACT_RU(node->params[3].v); 302 1.3 oster RF_IoType_t iotype = (node->dagHdr->status == rf_enable) ? 
RF_IO_TYPE_WRITE : RF_IO_TYPE_NOP; 303 1.13 oster RF_DiskQueue_t *dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues; 304 1.1 oster 305 1.3 oster /* normal processing (rollaway or forward recovery) begins here */ 306 1.3 oster req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector, 307 1.31 christos bf, parityStripeID, which_ru, node->wakeFunc, node, 308 1.19 oster #if RF_ACC_TRACE > 0 309 1.3 oster node->dagHdr->tracerec, 310 1.19 oster #else 311 1.19 oster NULL, 312 1.19 oster #endif 313 1.3 oster (void *) (node->dagHdr->raidPtr), 314 1.33 oster 0, node->dagHdr->bp); 315 1.3 oster 316 1.33 oster node->dagFuncData = (void *) req; 317 1.33 oster rf_DiskIOEnqueue(&(dqs[pda->col]), req, priority); 318 1.1 oster } 319 1.14 oster /***************************************************************************** 320 1.1 oster * the undo function for disk nodes 321 1.1 oster * Note: this is not a proper undo of a write node, only locks are released. 322 1.1 oster * old data is not restored to disk! 
323 1.14 oster ****************************************************************************/ 324 1.31 christos void 325 1.15 oster rf_DiskUndoFunc(RF_DagNode_t *node) 326 1.3 oster { 327 1.3 oster RF_DiskQueueData_t *req; 328 1.3 oster RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p; 329 1.13 oster RF_DiskQueue_t *dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues; 330 1.3 oster 331 1.3 oster req = rf_CreateDiskQueueData(RF_IO_TYPE_NOP, 332 1.31 christos 0L, 0, NULL, 0L, 0, node->wakeFunc, node, 333 1.19 oster #if RF_ACC_TRACE > 0 334 1.19 oster node->dagHdr->tracerec, 335 1.19 oster #else 336 1.19 oster NULL, 337 1.19 oster #endif 338 1.3 oster (void *) (node->dagHdr->raidPtr), 339 1.33 oster 0, NULL); 340 1.33 oster 341 1.33 oster node->dagFuncData = (void *) req; 342 1.33 oster rf_DiskIOEnqueue(&(dqs[pda->col]), req, RF_IO_NORMAL_PRIORITY); 343 1.1 oster } 344 1.3 oster 345 1.14 oster /***************************************************************************** 346 1.14 oster * Callback routine for DiskRead and DiskWrite nodes. When the disk 347 1.14 oster * op completes, the routine is called to set the node status and 348 1.14 oster * inform the execution engine that the node has fired. 
349 1.14 oster ****************************************************************************/ 350 1.31 christos void 351 1.31 christos rf_GenericWakeupFunc(void *v, int status) 352 1.3 oster { 353 1.31 christos RF_DagNode_t *node = v; 354 1.15 oster 355 1.3 oster switch (node->status) { 356 1.3 oster case rf_fired: 357 1.3 oster if (status) 358 1.3 oster node->status = rf_bad; 359 1.3 oster else 360 1.3 oster node->status = rf_good; 361 1.3 oster break; 362 1.3 oster case rf_recover: 363 1.3 oster /* probably should never reach this case */ 364 1.3 oster if (status) 365 1.3 oster node->status = rf_panic; 366 1.3 oster else 367 1.3 oster node->status = rf_undone; 368 1.3 oster break; 369 1.3 oster default: 370 1.4 oster printf("rf_GenericWakeupFunc:"); 371 1.4 oster printf("node->status is %d,", node->status); 372 1.4 oster printf("status is %d \n", status); 373 1.3 oster RF_PANIC(); 374 1.3 oster break; 375 1.3 oster } 376 1.3 oster if (node->dagFuncData) 377 1.3 oster rf_FreeDiskQueueData((RF_DiskQueueData_t *) node->dagFuncData); 378 1.31 christos rf_FinishNode(node, RF_INTR_CONTEXT); 379 1.1 oster } 380 1.1 oster 381 1.1 oster 382 1.14 oster /***************************************************************************** 383 1.14 oster * there are three distinct types of xor nodes: 384 1.14 oster 385 1.14 oster * A "regular xor" is used in the fault-free case where the access 386 1.14 oster * spans a complete stripe unit. It assumes that the result buffer is 387 1.14 oster * one full stripe unit in size, and uses the stripe-unit-offset 388 1.14 oster * values that it computes from the PDAs to determine where within the 389 1.14 oster * stripe unit to XOR each argument buffer. 390 1.14 oster * 391 1.14 oster * A "simple xor" is used in the fault-free case where the access 392 1.14 oster * touches only a portion of one (or two, in some cases) stripe 393 1.14 oster * unit(s). 
It assumes that all the argument buffers are of the same 394 1.14 oster * size and have the same stripe unit offset. 395 1.14 oster * 396 1.14 oster * A "recovery xor" is used in the degraded-mode case. It's similar 397 1.14 oster * to the regular xor function except that it takes the failed PDA as 398 1.14 oster * an additional parameter, and uses it to determine what portions of 399 1.14 oster * the argument buffers need to be xor'd into the result buffer, and 400 1.14 oster * where in the result buffer they should go. 401 1.14 oster ****************************************************************************/ 402 1.1 oster 403 1.1 oster /* xor the params together and store the result in the result field. 404 1.14 oster * assume the result field points to a buffer that is the size of one 405 1.14 oster * SU, and use the pda params to determine where within the buffer to 406 1.14 oster * XOR the input buffers. */ 407 1.31 christos void 408 1.15 oster rf_RegularXorFunc(RF_DagNode_t *node) 409 1.3 oster { 410 1.3 oster RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p; 411 1.19 oster #if RF_ACC_TRACE > 0 412 1.3 oster RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 413 1.3 oster RF_Etimer_t timer; 414 1.19 oster #endif 415 1.3 oster int i, retcode; 416 1.1 oster 417 1.3 oster retcode = 0; 418 1.3 oster if (node->dagHdr->status == rf_enable) { 419 1.3 oster /* don't do the XOR if the input is the same as the output */ 420 1.19 oster #if RF_ACC_TRACE > 0 421 1.3 oster RF_ETIMER_START(timer); 422 1.19 oster #endif 423 1.3 oster for (i = 0; i < node->numParams - 1; i += 2) 424 1.3 oster if (node->params[i + 1].p != node->results[0]) { 425 1.3 oster retcode = rf_XorIntoBuffer(raidPtr, (RF_PhysDiskAddr_t *) node->params[i].p, 426 1.17 oster (char *) node->params[i + 1].p, (char *) node->results[0]); 427 1.3 oster } 428 1.19 oster #if RF_ACC_TRACE > 0 429 1.3 oster RF_ETIMER_STOP(timer); 430 1.3 oster RF_ETIMER_EVAL(timer); 431 1.3 oster 
tracerec->xor_us += RF_ETIMER_VAL_US(timer); 432 1.19 oster #endif 433 1.3 oster } 434 1.31 christos rf_GenericWakeupFunc(node, retcode); /* call wake func 435 1.31 christos * explicitly since no 436 1.31 christos * I/O in this node */ 437 1.1 oster } 438 1.1 oster /* xor the inputs into the result buffer, ignoring placement issues */ 439 1.31 christos void 440 1.15 oster rf_SimpleXorFunc(RF_DagNode_t *node) 441 1.3 oster { 442 1.3 oster RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p; 443 1.3 oster int i, retcode = 0; 444 1.19 oster #if RF_ACC_TRACE > 0 445 1.3 oster RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 446 1.3 oster RF_Etimer_t timer; 447 1.19 oster #endif 448 1.1 oster 449 1.3 oster if (node->dagHdr->status == rf_enable) { 450 1.19 oster #if RF_ACC_TRACE > 0 451 1.3 oster RF_ETIMER_START(timer); 452 1.19 oster #endif 453 1.3 oster /* don't do the XOR if the input is the same as the output */ 454 1.3 oster for (i = 0; i < node->numParams - 1; i += 2) 455 1.3 oster if (node->params[i + 1].p != node->results[0]) { 456 1.3 oster retcode = rf_bxor((char *) node->params[i + 1].p, (char *) node->results[0], 457 1.17 oster rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[i].p)->numSector)); 458 1.3 oster } 459 1.19 oster #if RF_ACC_TRACE > 0 460 1.3 oster RF_ETIMER_STOP(timer); 461 1.3 oster RF_ETIMER_EVAL(timer); 462 1.3 oster tracerec->xor_us += RF_ETIMER_VAL_US(timer); 463 1.19 oster #endif 464 1.3 oster } 465 1.31 christos rf_GenericWakeupFunc(node, retcode); /* call wake func 466 1.31 christos * explicitly since no 467 1.31 christos * I/O in this node */ 468 1.1 oster } 469 1.14 oster /* this xor is used by the degraded-mode dag functions to recover lost 470 1.14 oster * data. the second-to-last parameter is the PDA for the failed 471 1.14 oster * portion of the access. 
the code here looks at this PDA and assumes 472 1.14 oster * that the xor target buffer is equal in size to the number of 473 1.14 oster * sectors in the failed PDA. It then uses the other PDAs in the 474 1.14 oster * parameter list to determine where within the target buffer the 475 1.14 oster * corresponding data should be xored. */ 476 1.31 christos void 477 1.15 oster rf_RecoveryXorFunc(RF_DagNode_t *node) 478 1.3 oster { 479 1.3 oster RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p; 480 1.3 oster RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout; 481 1.3 oster RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p; 482 1.3 oster int i, retcode = 0; 483 1.3 oster RF_PhysDiskAddr_t *pda; 484 1.3 oster int suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector); 485 1.3 oster char *srcbuf, *destbuf; 486 1.19 oster #if RF_ACC_TRACE > 0 487 1.3 oster RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; 488 1.3 oster RF_Etimer_t timer; 489 1.19 oster #endif 490 1.1 oster 491 1.3 oster if (node->dagHdr->status == rf_enable) { 492 1.19 oster #if RF_ACC_TRACE > 0 493 1.3 oster RF_ETIMER_START(timer); 494 1.19 oster #endif 495 1.3 oster for (i = 0; i < node->numParams - 2; i += 2) 496 1.3 oster if (node->params[i + 1].p != node->results[0]) { 497 1.3 oster pda = (RF_PhysDiskAddr_t *) node->params[i].p; 498 1.3 oster srcbuf = (char *) node->params[i + 1].p; 499 1.3 oster suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); 500 1.3 oster destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset); 501 1.17 oster retcode = rf_bxor(srcbuf, destbuf, rf_RaidAddressToByte(raidPtr, pda->numSector)); 502 1.3 oster } 503 1.19 oster #if RF_ACC_TRACE > 0 504 1.3 oster RF_ETIMER_STOP(timer); 505 1.3 oster RF_ETIMER_EVAL(timer); 506 1.3 oster tracerec->xor_us += RF_ETIMER_VAL_US(timer); 507 1.19 oster #endif 508 1.3 oster } 509 1.31 
christos rf_GenericWakeupFunc(node, retcode); 510 1.1 oster } 511 1.14 oster /***************************************************************************** 512 1.14 oster * The next three functions are utilities used by the above 513 1.14 oster * xor-execution functions. 514 1.14 oster ****************************************************************************/ 515 1.1 oster 516 1.1 oster 517 1.1 oster /* 518 1.14 oster * this is just a glorified buffer xor. targbuf points to a buffer 519 1.14 oster * that is one full stripe unit in size. srcbuf points to a buffer 520 1.14 oster * that may be less than 1 SU, but never more. When the access 521 1.14 oster * described by pda is one SU in size (which by implication means it's 522 1.14 oster * SU-aligned), all that happens is (targbuf) <- (srcbuf ^ targbuf). 523 1.14 oster * When the access is less than one SU in size the XOR occurs on only 524 1.14 oster * the portion of targbuf identified in the pda. */ 525 1.1 oster 526 1.23 perry int 527 1.15 oster rf_XorIntoBuffer(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda, 528 1.17 oster char *srcbuf, char *targbuf) 529 1.3 oster { 530 1.3 oster char *targptr; 531 1.3 oster int sectPerSU = raidPtr->Layout.sectorsPerStripeUnit; 532 1.3 oster int SUOffset = pda->startSector % sectPerSU; 533 1.3 oster int length, retcode = 0; 534 1.3 oster 535 1.3 oster RF_ASSERT(pda->numSector <= sectPerSU); 536 1.3 oster 537 1.3 oster targptr = targbuf + rf_RaidAddressToByte(raidPtr, SUOffset); 538 1.3 oster length = rf_RaidAddressToByte(raidPtr, pda->numSector); 539 1.17 oster retcode = rf_bxor(srcbuf, targptr, length); 540 1.3 oster return (retcode); 541 1.1 oster } 542 1.14 oster /* it really should be the case that the buffer pointers (returned by 543 1.14 oster * malloc) are aligned to the natural word size of the machine, so 544 1.14 oster * this is the only case we optimize for. 
The length should always be
 * a multiple of the sector size, so there should be no problem with
 * leftover bytes at the end.
 *
 * XORs len bytes of src into dest.  Returns the result of
 * rf_longword_bxor() when src, dest and len are all multiples of
 * sizeof(long).  Otherwise nothing is XORed: the assertion fires, and
 * where assertions are compiled out EINVAL is returned so that the
 * caller sees a failure instead of a "successful" XOR that never
 * happened. */
int
rf_bxor(char *src, char *dest, int len)
{
	unsigned mask = sizeof(long) - 1;
	int     retcode;

	if (!(((unsigned long) src) & mask) &&
	    !(((unsigned long) dest) & mask) && !(len & mask)) {
		retcode = rf_longword_bxor((unsigned long *) src,
		    (unsigned long *) dest,
		    len >> RF_LONGSHIFT);
	} else {
		RF_ASSERT(0);
		/* unaligned buffer or length: report it, nothing was done */
		retcode = EINVAL;
	}
	return (retcode);
}

/* When XORing in kernel mode, we need to map each user page to kernel
 * space before we can access it.  We don't want to assume anything
 * about which input buffers are in kernel/user space, nor about their
 * alignment, so in each loop we compute the maximum number of bytes
 * that we can xor without crossing any page boundaries, and do only
 * this many bytes before the next remap.
569 1.23 perry * 570 1.23 perry * len - is in longwords 571 1.15 oster */ 572 1.23 perry int 573 1.17 oster rf_longword_bxor(unsigned long *src, unsigned long *dest, int len) 574 1.3 oster { 575 1.6 augustss unsigned long *end = src + len; 576 1.6 augustss unsigned long d0, d1, d2, d3, s0, s1, s2, s3; /* temps */ 577 1.14 oster unsigned long *pg_src, *pg_dest; /* per-page source/dest pointers */ 578 1.3 oster int longs_this_time;/* # longwords to xor in the current iteration */ 579 1.3 oster 580 1.16 oster pg_src = src; 581 1.16 oster pg_dest = dest; 582 1.3 oster if (!pg_src || !pg_dest) 583 1.3 oster return (EFAULT); 584 1.3 oster 585 1.3 oster while (len >= 4) { 586 1.3 oster longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(pg_src), RF_BLIP(pg_dest)) >> RF_LONGSHIFT); /* note len in longwords */ 587 1.3 oster src += longs_this_time; 588 1.3 oster dest += longs_this_time; 589 1.3 oster len -= longs_this_time; 590 1.3 oster while (longs_this_time >= 4) { 591 1.3 oster d0 = pg_dest[0]; 592 1.3 oster d1 = pg_dest[1]; 593 1.3 oster d2 = pg_dest[2]; 594 1.3 oster d3 = pg_dest[3]; 595 1.3 oster s0 = pg_src[0]; 596 1.3 oster s1 = pg_src[1]; 597 1.3 oster s2 = pg_src[2]; 598 1.3 oster s3 = pg_src[3]; 599 1.3 oster pg_dest[0] = d0 ^ s0; 600 1.3 oster pg_dest[1] = d1 ^ s1; 601 1.3 oster pg_dest[2] = d2 ^ s2; 602 1.3 oster pg_dest[3] = d3 ^ s3; 603 1.3 oster pg_src += 4; 604 1.3 oster pg_dest += 4; 605 1.3 oster longs_this_time -= 4; 606 1.3 oster } 607 1.3 oster while (longs_this_time > 0) { /* cannot cross any page 608 1.3 oster * boundaries here */ 609 1.3 oster *pg_dest++ ^= *pg_src++; 610 1.3 oster longs_this_time--; 611 1.3 oster } 612 1.3 oster 613 1.3 oster /* either we're done, or we've reached a page boundary on one 614 1.3 oster * (or possibly both) of the pointers */ 615 1.3 oster if (len) { 616 1.3 oster if (RF_PAGE_ALIGNED(src)) 617 1.16 oster pg_src = src; 618 1.3 oster if (RF_PAGE_ALIGNED(dest)) 619 1.16 oster pg_dest = dest; 620 1.3 oster if (!pg_src || 
!pg_dest) 621 1.3 oster return (EFAULT); 622 1.3 oster } 623 1.3 oster } 624 1.3 oster while (src < end) { 625 1.3 oster *pg_dest++ ^= *pg_src++; 626 1.3 oster src++; 627 1.3 oster dest++; 628 1.3 oster len--; 629 1.3 oster if (RF_PAGE_ALIGNED(src)) 630 1.16 oster pg_src = src; 631 1.3 oster if (RF_PAGE_ALIGNED(dest)) 632 1.16 oster pg_dest = dest; 633 1.3 oster } 634 1.3 oster RF_ASSERT(len == 0); 635 1.3 oster return (0); 636 1.1 oster } 637 1.1 oster 638 1.9 oster #if 0 639 1.1 oster /* 640 1.1 oster dst = a ^ b ^ c; 641 1.1 oster a may equal dst 642 1.1 oster see comment above longword_bxor 643 1.15 oster len is length in longwords 644 1.1 oster */ 645 1.23 perry int 646 1.15 oster rf_longword_bxor3(unsigned long *dst, unsigned long *a, unsigned long *b, 647 1.15 oster unsigned long *c, int len, void *bp) 648 1.3 oster { 649 1.3 oster unsigned long a0, a1, a2, a3, b0, b1, b2, b3; 650 1.6 augustss unsigned long *pg_a, *pg_b, *pg_c, *pg_dst; /* per-page source/dest 651 1.3 oster * pointers */ 652 1.3 oster int longs_this_time;/* # longs to xor in the current iteration */ 653 1.3 oster char dst_is_a = 0; 654 1.3 oster 655 1.16 oster pg_a = a; 656 1.16 oster pg_b = b; 657 1.16 oster pg_c = c; 658 1.3 oster if (a == dst) { 659 1.3 oster pg_dst = pg_a; 660 1.3 oster dst_is_a = 1; 661 1.3 oster } else { 662 1.16 oster pg_dst = dst; 663 1.3 oster } 664 1.3 oster 665 1.3 oster /* align dest to cache line. Can't cross a pg boundary on dst here. 
*/ 666 1.3 oster while ((((unsigned long) pg_dst) & 0x1f)) { 667 1.3 oster *pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++; 668 1.3 oster dst++; 669 1.3 oster a++; 670 1.3 oster b++; 671 1.3 oster c++; 672 1.3 oster if (RF_PAGE_ALIGNED(a)) { 673 1.16 oster pg_a = a; 674 1.3 oster if (!pg_a) 675 1.3 oster return (EFAULT); 676 1.3 oster } 677 1.3 oster if (RF_PAGE_ALIGNED(b)) { 678 1.16 oster pg_b = a; 679 1.3 oster if (!pg_b) 680 1.3 oster return (EFAULT); 681 1.3 oster } 682 1.3 oster if (RF_PAGE_ALIGNED(c)) { 683 1.16 oster pg_c = a; 684 1.3 oster if (!pg_c) 685 1.3 oster return (EFAULT); 686 1.3 oster } 687 1.3 oster len--; 688 1.3 oster } 689 1.3 oster 690 1.3 oster while (len > 4) { 691 1.3 oster longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(a), RF_MIN(RF_BLIP(b), RF_MIN(RF_BLIP(c), RF_BLIP(dst)))) >> RF_LONGSHIFT); 692 1.3 oster a += longs_this_time; 693 1.3 oster b += longs_this_time; 694 1.3 oster c += longs_this_time; 695 1.3 oster dst += longs_this_time; 696 1.3 oster len -= longs_this_time; 697 1.3 oster while (longs_this_time >= 4) { 698 1.3 oster a0 = pg_a[0]; 699 1.3 oster longs_this_time -= 4; 700 1.3 oster 701 1.3 oster a1 = pg_a[1]; 702 1.3 oster a2 = pg_a[2]; 703 1.3 oster 704 1.3 oster a3 = pg_a[3]; 705 1.3 oster pg_a += 4; 706 1.3 oster 707 1.3 oster b0 = pg_b[0]; 708 1.3 oster b1 = pg_b[1]; 709 1.3 oster 710 1.3 oster b2 = pg_b[2]; 711 1.3 oster b3 = pg_b[3]; 712 1.3 oster /* start dual issue */ 713 1.3 oster a0 ^= b0; 714 1.3 oster b0 = pg_c[0]; 715 1.3 oster 716 1.3 oster pg_b += 4; 717 1.3 oster a1 ^= b1; 718 1.3 oster 719 1.3 oster a2 ^= b2; 720 1.3 oster a3 ^= b3; 721 1.3 oster 722 1.3 oster b1 = pg_c[1]; 723 1.3 oster a0 ^= b0; 724 1.3 oster 725 1.3 oster b2 = pg_c[2]; 726 1.3 oster a1 ^= b1; 727 1.3 oster 728 1.3 oster b3 = pg_c[3]; 729 1.3 oster a2 ^= b2; 730 1.3 oster 731 1.3 oster pg_dst[0] = a0; 732 1.3 oster a3 ^= b3; 733 1.3 oster pg_dst[1] = a1; 734 1.3 oster pg_c += 4; 735 1.3 oster pg_dst[2] = a2; 736 1.3 oster pg_dst[3] = a3; 737 1.3 
oster pg_dst += 4; 738 1.3 oster } 739 1.3 oster while (longs_this_time > 0) { /* cannot cross any page 740 1.3 oster * boundaries here */ 741 1.3 oster *pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++; 742 1.3 oster longs_this_time--; 743 1.3 oster } 744 1.3 oster 745 1.3 oster if (len) { 746 1.3 oster if (RF_PAGE_ALIGNED(a)) { 747 1.16 oster pg_a = a; 748 1.3 oster if (!pg_a) 749 1.3 oster return (EFAULT); 750 1.3 oster if (dst_is_a) 751 1.3 oster pg_dst = pg_a; 752 1.3 oster } 753 1.3 oster if (RF_PAGE_ALIGNED(b)) { 754 1.16 oster pg_b = b; 755 1.3 oster if (!pg_b) 756 1.3 oster return (EFAULT); 757 1.3 oster } 758 1.3 oster if (RF_PAGE_ALIGNED(c)) { 759 1.16 oster pg_c = c; 760 1.3 oster if (!pg_c) 761 1.3 oster return (EFAULT); 762 1.3 oster } 763 1.3 oster if (!dst_is_a) 764 1.3 oster if (RF_PAGE_ALIGNED(dst)) { 765 1.16 oster pg_dst = dst; 766 1.3 oster if (!pg_dst) 767 1.3 oster return (EFAULT); 768 1.3 oster } 769 1.3 oster } 770 1.3 oster } 771 1.3 oster while (len) { 772 1.3 oster *pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++; 773 1.3 oster dst++; 774 1.3 oster a++; 775 1.3 oster b++; 776 1.3 oster c++; 777 1.3 oster if (RF_PAGE_ALIGNED(a)) { 778 1.16 oster pg_a = a; 779 1.3 oster if (!pg_a) 780 1.3 oster return (EFAULT); 781 1.3 oster if (dst_is_a) 782 1.3 oster pg_dst = pg_a; 783 1.3 oster } 784 1.3 oster if (RF_PAGE_ALIGNED(b)) { 785 1.16 oster pg_b = b; 786 1.3 oster if (!pg_b) 787 1.3 oster return (EFAULT); 788 1.3 oster } 789 1.3 oster if (RF_PAGE_ALIGNED(c)) { 790 1.16 oster pg_c = c; 791 1.3 oster if (!pg_c) 792 1.3 oster return (EFAULT); 793 1.3 oster } 794 1.3 oster if (!dst_is_a) 795 1.3 oster if (RF_PAGE_ALIGNED(dst)) { 796 1.16 oster pg_dst = dst; 797 1.3 oster if (!pg_dst) 798 1.3 oster return (EFAULT); 799 1.3 oster } 800 1.3 oster len--; 801 1.3 oster } 802 1.3 oster return (0); 803 1.3 oster } 804 1.3 oster 805 1.23 perry int 806 1.23 perry rf_bxor3(unsigned char *dst, unsigned char *a, unsigned char *b, 807 1.15 oster unsigned char *c, unsigned long 
len, void *bp) 808 1.1 oster { 809 1.3 oster RF_ASSERT(((RF_UL(dst) | RF_UL(a) | RF_UL(b) | RF_UL(c) | len) & 0x7) == 0); 810 1.1 oster 811 1.3 oster return (rf_longword_bxor3((unsigned long *) dst, (unsigned long *) a, 812 1.3 oster (unsigned long *) b, (unsigned long *) c, len >> RF_LONGSHIFT, bp)); 813 1.1 oster } 814 1.9 oster #endif 815