/*	$NetBSD: rf_dagfuncs.c,v 1.10 2002/09/21 00:52:49 oster Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Mark Holland, William V. Courtright II
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * dagfuncs.c -- DAG node execution routines
 *
 * Rules:
 * 1. Every DAG execution function must eventually cause node->status to
 *    get set to "good" or "bad", and "FinishNode" to be called. In the
 *    case of nodes that complete immediately (xor, NullNodeFunc, etc),
 *    the node execution function can do these two things directly. In
 *    the case of nodes that have to wait for some event (a disk read to
 *    complete, a lock to be released, etc) to occur before they can
 *    complete, this is typically achieved by having whatever module
 *    is doing the operation call GenericWakeupFunc upon completion.
 * 2. DAG execution functions should check the status in the DAG header
 *    and NOP out their operations if the status is not "enable". However,
 *    execution functions that release resources must be sure to release
 *    them even when they NOP out the function that would use them.
 *    Functions that acquire resources should go ahead and acquire them
 *    even when they NOP, so that a downstream release node will not have
 *    to check to find out whether or not the acquire was suppressed.
 */
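
/*
 * Illustrative sketch only (no such routine exists in this file): for a node
 * that completes immediately, rule 1 above reduces to the two-step pattern
 * used by rf_TerminateFunc and rf_NullNodeFunc below, i.e.
 *
 *	node->status = rf_good;
 *	return (rf_FinishNode(node, RF_THREAD_CONTEXT));
 *
 * Nodes that must wait for I/O instead arrange for rf_GenericWakeupFunc to be
 * called (through node->wakeFunc) when the operation completes; it then
 * performs the equivalent steps.
 */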

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_dagfuncs.c,v 1.10 2002/09/21 00:52:49 oster Exp $");

#include <sys/param.h>
#include <sys/ioctl.h>

#include "rf_archs.h"
#include "rf_raid.h"
#include "rf_dag.h"
#include "rf_layout.h"
#include "rf_etimer.h"
#include "rf_acctrace.h"
#include "rf_diskqueue.h"
#include "rf_dagfuncs.h"
#include "rf_general.h"
#include "rf_engine.h"
#include "rf_dagutils.h"

#include "rf_kintf.h"

#if RF_INCLUDE_PARITYLOGGING > 0
#include "rf_paritylog.h"
#endif				/* RF_INCLUDE_PARITYLOGGING > 0 */

int     (*rf_DiskReadFunc) (RF_DagNode_t *);
int     (*rf_DiskWriteFunc) (RF_DagNode_t *);
int     (*rf_DiskReadUndoFunc) (RF_DagNode_t *);
int     (*rf_DiskWriteUndoFunc) (RF_DagNode_t *);
int     (*rf_DiskUnlockFunc) (RF_DagNode_t *);
int     (*rf_DiskUnlockUndoFunc) (RF_DagNode_t *);
int     (*rf_RegularXorUndoFunc) (RF_DagNode_t *);
int     (*rf_SimpleXorUndoFunc) (RF_DagNode_t *);
int     (*rf_RecoveryXorUndoFunc) (RF_DagNode_t *);

/*****************************************************************************************
 * main (only) configuration routine for this module
 ****************************************************************************************/
int
rf_ConfigureDAGFuncs(listp)
	RF_ShutdownList_t **listp;
{
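	/*
	 * The assertion below checks that RF_LONGSHIFT is log2(sizeof(long));
	 * the bulk-xor routines at the end of this file (rf_bxor and friends)
	 * rely on that to convert byte counts into longword counts.
	 */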
	RF_ASSERT(((sizeof(long) == 8) && RF_LONGSHIFT == 3) || ((sizeof(long) == 4) && RF_LONGSHIFT == 2));
	rf_DiskReadFunc = rf_DiskReadFuncForThreads;
	rf_DiskReadUndoFunc = rf_DiskUndoFunc;
	rf_DiskWriteFunc = rf_DiskWriteFuncForThreads;
	rf_DiskWriteUndoFunc = rf_DiskUndoFunc;
	rf_DiskUnlockFunc = rf_DiskUnlockFuncForThreads;
	rf_DiskUnlockUndoFunc = rf_NullNodeUndoFunc;
	rf_RegularXorUndoFunc = rf_NullNodeUndoFunc;
	rf_SimpleXorUndoFunc = rf_NullNodeUndoFunc;
	rf_RecoveryXorUndoFunc = rf_NullNodeUndoFunc;
	return (0);
}



/*****************************************************************************************
 * the execution function associated with a terminate node
 ****************************************************************************************/
int
rf_TerminateFunc(node)
	RF_DagNode_t *node;
{
	RF_ASSERT(node->dagHdr->numCommits == node->dagHdr->numCommitNodes);
	node->status = rf_good;
	return (rf_FinishNode(node, RF_THREAD_CONTEXT));
}

int
rf_TerminateUndoFunc(node)
	RF_DagNode_t *node;
{
	return (0);
}

/*****************************************************************************************
 * execution functions associated with a mirror node
 *
 * parameters:
 *
 * 0 - physical disk address of data
 * 1 - buffer for holding read data
 * 2 - parity stripe ID
 * 3 - flags
 * 4 - physical disk address of mirror (parity)
 *
 ****************************************************************************************/

int
rf_DiskReadMirrorIdleFunc(node)
	RF_DagNode_t *node;
{
	/* select the mirror copy with the shortest queue and fill in node
	 * parameters with physical disk address */

	rf_SelectMirrorDiskIdle(node);
	return (rf_DiskReadFunc(node));
}

int
rf_DiskReadMirrorPartitionFunc(node)
	RF_DagNode_t *node;
{
	/* select the mirror copy with the shortest queue and fill in node
	 * parameters with physical disk address */

	rf_SelectMirrorDiskPartition(node);
	return (rf_DiskReadFunc(node));
}

int
rf_DiskReadMirrorUndoFunc(node)
	RF_DagNode_t *node;
{
	return (0);
}



#if RF_INCLUDE_PARITYLOGGING > 0
/*****************************************************************************************
 * the execution function associated with a parity log update node
 ****************************************************************************************/
int
rf_ParityLogUpdateFunc(node)
	RF_DagNode_t *node;
{
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	caddr_t buf = (caddr_t) node->params[1].p;
	RF_ParityLogData_t *logData;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;

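	/*
	 * When the DAG is enabled, hand the update off to the parity log.
	 * If the log data cannot be allocated, the elapsed time is charged
	 * to the trace record and the node is woken directly with ENOMEM,
	 * so the failure is reported through the normal completion path.
	 */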
	if (node->dagHdr->status == rf_enable) {
		RF_ETIMER_START(timer);
		logData = rf_CreateParityLogData(RF_UPDATE, pda, buf,
		    (RF_Raid_t *) (node->dagHdr->raidPtr),
		    node->wakeFunc, (void *) node,
		    node->dagHdr->tracerec, timer);
		if (logData)
			rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE);
		else {
			RF_ETIMER_STOP(timer);
			RF_ETIMER_EVAL(timer);
			tracerec->plog_us += RF_ETIMER_VAL_US(timer);
			(node->wakeFunc) (node, ENOMEM);
		}
	}
	return (0);
}


/*****************************************************************************************
 * the execution function associated with a parity log overwrite node
 ****************************************************************************************/
int
rf_ParityLogOverwriteFunc(node)
	RF_DagNode_t *node;
{
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	caddr_t buf = (caddr_t) node->params[1].p;
	RF_ParityLogData_t *logData;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;

	if (node->dagHdr->status == rf_enable) {
		RF_ETIMER_START(timer);
		logData = rf_CreateParityLogData(RF_OVERWRITE, pda, buf, (RF_Raid_t *) (node->dagHdr->raidPtr),
		    node->wakeFunc, (void *) node, node->dagHdr->tracerec, timer);
		if (logData)
			rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE);
		else {
			RF_ETIMER_STOP(timer);
			RF_ETIMER_EVAL(timer);
			tracerec->plog_us += RF_ETIMER_VAL_US(timer);
			(node->wakeFunc) (node, ENOMEM);
		}
	}
	return (0);
}

int
rf_ParityLogUpdateUndoFunc(node)
	RF_DagNode_t *node;
{
	return (0);
}

int
rf_ParityLogOverwriteUndoFunc(node)
	RF_DagNode_t *node;
{
	return (0);
}
#endif				/* RF_INCLUDE_PARITYLOGGING > 0 */

/*****************************************************************************************
 * the execution function associated with a NOP node
 ****************************************************************************************/
int
rf_NullNodeFunc(node)
	RF_DagNode_t *node;
{
	node->status = rf_good;
	return (rf_FinishNode(node, RF_THREAD_CONTEXT));
}

int
rf_NullNodeUndoFunc(node)
	RF_DagNode_t *node;
{
	node->status = rf_undone;
	return (rf_FinishNode(node, RF_THREAD_CONTEXT));
}


/*****************************************************************************************
 * the execution function associated with a disk-read node
 ****************************************************************************************/
int
rf_DiskReadFuncForThreads(node)
	RF_DagNode_t *node;
{
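	/*
	 * Node parameters for disk-read (and disk-write) nodes, as unpacked
	 * below: params[0] is the physical disk address, params[1] the data
	 * buffer, params[2] the parity stripe ID, and params[3] packs the
	 * I/O priority, the lock/unlock flags, and the RU number (which_ru),
	 * extracted with the RF_EXTRACT_* macros.
	 */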
	RF_DiskQueueData_t *req;
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	caddr_t buf = (caddr_t) node->params[1].p;
	RF_StripeNum_t parityStripeID = (RF_StripeNum_t) node->params[2].v;
	unsigned priority = RF_EXTRACT_PRIORITY(node->params[3].v);
	unsigned lock = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
	unsigned unlock = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
	unsigned which_ru = RF_EXTRACT_RU(node->params[3].v);
	RF_DiskQueueDataFlags_t flags = 0;
	RF_IoType_t iotype = (node->dagHdr->status == rf_enable) ? RF_IO_TYPE_READ : RF_IO_TYPE_NOP;
	RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
	void   *b_proc = NULL;

	if (node->dagHdr->bp)
		b_proc = (void *) ((struct buf *) node->dagHdr->bp)->b_proc;

	RF_ASSERT(!(lock && unlock));
	flags |= (lock) ? RF_LOCK_DISK_QUEUE : 0;
	flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0;

	req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector,
	    buf, parityStripeID, which_ru,
	    (int (*) (void *, int)) node->wakeFunc,
	    node, NULL, node->dagHdr->tracerec,
	    (void *) (node->dagHdr->raidPtr), flags, b_proc);
	if (!req) {
		(node->wakeFunc) (node, ENOMEM);
	} else {
		node->dagFuncData = (void *) req;
		rf_DiskIOEnqueue(&(dqs[pda->row][pda->col]), req, priority);
	}
	return (0);
}


/*****************************************************************************************
 * the execution function associated with a disk-write node
 ****************************************************************************************/
int
rf_DiskWriteFuncForThreads(node)
	RF_DagNode_t *node;
{
	RF_DiskQueueData_t *req;
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	caddr_t buf = (caddr_t) node->params[1].p;
	RF_StripeNum_t parityStripeID = (RF_StripeNum_t) node->params[2].v;
	unsigned priority = RF_EXTRACT_PRIORITY(node->params[3].v);
	unsigned lock = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
	unsigned unlock = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
	unsigned which_ru = RF_EXTRACT_RU(node->params[3].v);
	RF_DiskQueueDataFlags_t flags = 0;
	RF_IoType_t iotype = (node->dagHdr->status == rf_enable) ? RF_IO_TYPE_WRITE : RF_IO_TYPE_NOP;
	RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
	void   *b_proc = NULL;

	if (node->dagHdr->bp)
		b_proc = (void *) ((struct buf *) node->dagHdr->bp)->b_proc;

	/* normal processing (rollaway or forward recovery) begins here */
	RF_ASSERT(!(lock && unlock));
	flags |= (lock) ? RF_LOCK_DISK_QUEUE : 0;
	flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0;
	req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector,
	    buf, parityStripeID, which_ru,
	    (int (*) (void *, int)) node->wakeFunc,
	    (void *) node, NULL,
	    node->dagHdr->tracerec,
	    (void *) (node->dagHdr->raidPtr),
	    flags, b_proc);

	if (!req) {
		(node->wakeFunc) (node, ENOMEM);
	} else {
		node->dagFuncData = (void *) req;
		rf_DiskIOEnqueue(&(dqs[pda->row][pda->col]), req, priority);
	}

	return (0);
}
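
/*
 * Completion of the requests created by the two functions above is signalled
 * through node->wakeFunc (normally rf_GenericWakeupFunc, below), which frees
 * the RF_DiskQueueData_t stashed in node->dagFuncData and calls rf_FinishNode.
 */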
/*****************************************************************************************
 * the undo function for disk nodes
 * Note:  this is not a proper undo of a write node; only the disk queue locks
 *        are released.  The old data is not restored to disk!
 ****************************************************************************************/
int
rf_DiskUndoFunc(node)
	RF_DagNode_t *node;
{
	RF_DiskQueueData_t *req;
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;

	req = rf_CreateDiskQueueData(RF_IO_TYPE_NOP,
	    0L, 0, NULL, 0L, 0,
	    (int (*) (void *, int)) node->wakeFunc,
	    (void *) node,
	    NULL, node->dagHdr->tracerec,
	    (void *) (node->dagHdr->raidPtr),
	    RF_UNLOCK_DISK_QUEUE, NULL);
	if (!req)
		(node->wakeFunc) (node, ENOMEM);
	else {
		node->dagFuncData = (void *) req;
		rf_DiskIOEnqueue(&(dqs[pda->row][pda->col]), req, RF_IO_NORMAL_PRIORITY);
	}

	return (0);
}
/*****************************************************************************************
 * the execution function associated with an "unlock disk queue" node
 ****************************************************************************************/
int
rf_DiskUnlockFuncForThreads(node)
	RF_DagNode_t *node;
{
	RF_DiskQueueData_t *req;
	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
	RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;

	req = rf_CreateDiskQueueData(RF_IO_TYPE_NOP,
	    0L, 0, NULL, 0L, 0,
	    (int (*) (void *, int)) node->wakeFunc,
	    (void *) node,
	    NULL, node->dagHdr->tracerec,
	    (void *) (node->dagHdr->raidPtr),
	    RF_UNLOCK_DISK_QUEUE, NULL);
	if (!req)
		(node->wakeFunc) (node, ENOMEM);
	else {
		node->dagFuncData = (void *) req;
		rf_DiskIOEnqueue(&(dqs[pda->row][pda->col]), req, RF_IO_NORMAL_PRIORITY);
	}

	return (0);
}
/*****************************************************************************************
 * Callback routine for DiskRead and DiskWrite nodes.  When the disk op completes,
 * the routine is called to set the node status and inform the execution engine that
 * the node has fired.
 ****************************************************************************************/
int
rf_GenericWakeupFunc(node, status)
	RF_DagNode_t *node;
	int     status;
{
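	/*
	 * Dispatch on the node's current state: rf_bwd1 (backward recovery)
	 * frees the old request and reissues the node as a disk write,
	 * moving it to rf_bwd2; rf_fired is the normal case and maps the
	 * I/O status to rf_good or rf_bad; rf_recover maps it to rf_undone
	 * or rf_panic; anything else is a fatal error.
	 */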
	switch (node->status) {
	case rf_bwd1:
		node->status = rf_bwd2;
		if (node->dagFuncData)
			rf_FreeDiskQueueData((RF_DiskQueueData_t *) node->dagFuncData);
		return (rf_DiskWriteFuncForThreads(node));
		break;
	case rf_fired:
		if (status)
			node->status = rf_bad;
		else
			node->status = rf_good;
		break;
	case rf_recover:
		/* probably should never reach this case */
		if (status)
			node->status = rf_panic;
		else
			node->status = rf_undone;
		break;
	default:
		printf("rf_GenericWakeupFunc:");
		printf("node->status is %d,", node->status);
		printf("status is %d \n", status);
		RF_PANIC();
		break;
	}
	if (node->dagFuncData)
		rf_FreeDiskQueueData((RF_DiskQueueData_t *) node->dagFuncData);
	return (rf_FinishNode(node, RF_INTR_CONTEXT));
}


/*****************************************************************************************
 * there are three distinct types of xor nodes
 * A "regular xor" is used in the fault-free case where the access spans a complete
 * stripe unit.  It assumes that the result buffer is one full stripe unit in size,
 * and uses the stripe-unit-offset values that it computes from the PDAs to determine
 * where within the stripe unit to XOR each argument buffer.
 *
 * A "simple xor" is used in the fault-free case where the access touches only a portion
 * of one (or two, in some cases) stripe unit(s).  It assumes that all the argument
 * buffers are of the same size and have the same stripe unit offset.
 *
 * A "recovery xor" is used in the degraded-mode case.  It's similar to the regular
 * xor function except that it takes the failed PDA as an additional parameter, and
 * uses it to determine what portions of the argument buffers need to be xor'd into
 * the result buffer, and where in the result buffer they should go.
 ****************************************************************************************/
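
/*
 * For all three xor node types, node->params holds (PDA, buffer) pairs and
 * the final parameter is the RF_Raid_t pointer; the recovery xor additionally
 * carries the failed PDA as its second-to-last parameter.
 */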

/* xor the params together and store the result in the result field.
 * assume the result field points to a buffer that is the size of one SU,
 * and use the pda params to determine where within the buffer to XOR
 * the input buffers.
 */
int
rf_RegularXorFunc(node)
	RF_DagNode_t *node;
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;
	int     i, retcode;

	retcode = 0;
	if (node->dagHdr->status == rf_enable) {
		/* don't do the XOR if the input is the same as the output */
		RF_ETIMER_START(timer);
		for (i = 0; i < node->numParams - 1; i += 2)
			if (node->params[i + 1].p != node->results[0]) {
				retcode = rf_XorIntoBuffer(raidPtr, (RF_PhysDiskAddr_t *) node->params[i].p,
				    (char *) node->params[i + 1].p, (char *) node->results[0], node->dagHdr->bp);
			}
		RF_ETIMER_STOP(timer);
		RF_ETIMER_EVAL(timer);
		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
	}
	return (rf_GenericWakeupFunc(node, retcode));	/* call wake func
							 * explicitly since no
							 * I/O in this node */
}
/* xor the inputs into the result buffer, ignoring placement issues */
int
rf_SimpleXorFunc(node)
	RF_DagNode_t *node;
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	int     i, retcode = 0;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;

	if (node->dagHdr->status == rf_enable) {
		RF_ETIMER_START(timer);
		/* don't do the XOR if the input is the same as the output */
		for (i = 0; i < node->numParams - 1; i += 2)
			if (node->params[i + 1].p != node->results[0]) {
				retcode = rf_bxor((char *) node->params[i + 1].p, (char *) node->results[0],
				    rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[i].p)->numSector),
				    (struct buf *) node->dagHdr->bp);
			}
		RF_ETIMER_STOP(timer);
		RF_ETIMER_EVAL(timer);
		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
	}
	return (rf_GenericWakeupFunc(node, retcode));	/* call wake func
							 * explicitly since no
							 * I/O in this node */
}
/* this xor is used by the degraded-mode dag functions to recover lost data.
 * the second-to-last parameter is the PDA for the failed portion of the access.
 * the code here looks at this PDA and assumes that the xor target buffer is
 * equal in size to the number of sectors in the failed PDA.  It then uses
 * the other PDAs in the parameter list to determine where within the target
 * buffer the corresponding data should be xored.
 */
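/*
 * Small illustrative example (numbers invented): if the failed PDA starts at
 * sector 8 of its stripe unit and a surviving PDA starts at sector 10, then
 * suoffset - failedSUOffset == 2, and that PDA's buffer is xor'd into the
 * result buffer starting rf_RaidAddressToByte(raidPtr, 2) bytes in.
 */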
int
rf_RecoveryXorFunc(node)
	RF_DagNode_t *node;
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
	int     i, retcode = 0;
	RF_PhysDiskAddr_t *pda;
	int     suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
	char   *srcbuf, *destbuf;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;

	if (node->dagHdr->status == rf_enable) {
		RF_ETIMER_START(timer);
		for (i = 0; i < node->numParams - 2; i += 2)
			if (node->params[i + 1].p != node->results[0]) {
				pda = (RF_PhysDiskAddr_t *) node->params[i].p;
				srcbuf = (char *) node->params[i + 1].p;
				suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
				destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
				retcode = rf_bxor(srcbuf, destbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), node->dagHdr->bp);
			}
		RF_ETIMER_STOP(timer);
		RF_ETIMER_EVAL(timer);
		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
	}
	return (rf_GenericWakeupFunc(node, retcode));
}
/*****************************************************************************************
 * The next three functions are utilities used by the above xor-execution functions.
 ****************************************************************************************/


/*
 * this is just a glorified buffer xor.  targbuf points to a buffer that is one full stripe unit
 * in size.  srcbuf points to a buffer that may be less than 1 SU, but never more.  When the
 * access described by pda is one SU in size (which by implication means it's SU-aligned),
 * all that happens is (targbuf) <- (srcbuf ^ targbuf).  When the access is less than one
 * SU in size the XOR occurs on only the portion of targbuf identified in the pda.
 */

int
rf_XorIntoBuffer(raidPtr, pda, srcbuf, targbuf, bp)
	RF_Raid_t *raidPtr;
	RF_PhysDiskAddr_t *pda;
	char   *srcbuf;
	char   *targbuf;
	void   *bp;
{
	char   *targptr;
	int     sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	int     SUOffset = pda->startSector % sectPerSU;
	int     length, retcode = 0;

	RF_ASSERT(pda->numSector <= sectPerSU);

	targptr = targbuf + rf_RaidAddressToByte(raidPtr, SUOffset);
	length = rf_RaidAddressToByte(raidPtr, pda->numSector);
	retcode = rf_bxor(srcbuf, targptr, length, bp);
	return (retcode);
}
/* It really should be the case that the buffer pointers (returned by malloc)
 * are aligned to the natural word size of the machine, so that is the only
 * case we handle; anything else trips the assertion below.  The length should
 * always be a multiple of the sector size, so there should be no problem with
 * leftover bytes at the end.
 */
int
rf_bxor(src, dest, len, bp)
	char   *src;
	char   *dest;
	int     len;
	void   *bp;
{
	unsigned mask = sizeof(long) - 1, retcode = 0;

	if (!(((unsigned long) src) & mask) && !(((unsigned long) dest) & mask) && !(len & mask)) {
		retcode = rf_longword_bxor((unsigned long *) src, (unsigned long *) dest, len >> RF_LONGSHIFT, bp);
	} else {
		RF_ASSERT(0);
	}
	return (retcode);
}
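
/*
 * Illustrative only: a typical call from the xor routines above is
 *
 *	retcode = rf_bxor(srcbuf, destbuf, length, bp);
 *
 * where both buffers are longword-aligned and length (in bytes) is a multiple
 * of sizeof(long); rf_bxor converts the byte count into a longword count
 * before handing off to rf_longword_bxor.
 */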
/* map a user buffer into kernel space, if necessary */
#define REMAP_VA(_bp,x,y) (y) = (x)

/* When XORing in kernel mode, we need to map each user page to kernel space before we can access it.
 * We don't want to assume anything about which input buffers are in kernel/user
 * space, nor about their alignment, so in each loop we compute the maximum number
 * of bytes that we can xor without crossing any page boundaries, and do only this many
 * bytes before the next remap.
 */
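/*
 * In this implementation REMAP_VA is the identity macro defined above, so
 * pg_src/pg_dest simply alias src/dest and no remapping actually occurs; the
 * page-boundary logic below is kept for environments where a real remap is
 * required.
 */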
int
rf_longword_bxor(src, dest, len, bp)
	unsigned long *src;
	unsigned long *dest;
	int     len;		/* longwords */
	void   *bp;
{
	unsigned long *end = src + len;
	unsigned long d0, d1, d2, d3, s0, s1, s2, s3;	/* temps */
	unsigned long *pg_src, *pg_dest;	/* per-page source/dest
						 * pointers */
	int     longs_this_time;/* # longwords to xor in the current iteration */

	REMAP_VA(bp, src, pg_src);
	REMAP_VA(bp, dest, pg_dest);
	if (!pg_src || !pg_dest)
		return (EFAULT);

	while (len >= 4) {
		longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(pg_src), RF_BLIP(pg_dest)) >> RF_LONGSHIFT);	/* note len in longwords */
		src += longs_this_time;
		dest += longs_this_time;
		len -= longs_this_time;
		while (longs_this_time >= 4) {
			d0 = pg_dest[0];
			d1 = pg_dest[1];
			d2 = pg_dest[2];
			d3 = pg_dest[3];
			s0 = pg_src[0];
			s1 = pg_src[1];
			s2 = pg_src[2];
			s3 = pg_src[3];
			pg_dest[0] = d0 ^ s0;
			pg_dest[1] = d1 ^ s1;
			pg_dest[2] = d2 ^ s2;
			pg_dest[3] = d3 ^ s3;
			pg_src += 4;
			pg_dest += 4;
			longs_this_time -= 4;
		}
		while (longs_this_time > 0) {	/* cannot cross any page
						 * boundaries here */
			*pg_dest++ ^= *pg_src++;
			longs_this_time--;
		}

		/* either we're done, or we've reached a page boundary on one
		 * (or possibly both) of the pointers */
		if (len) {
			if (RF_PAGE_ALIGNED(src))
				REMAP_VA(bp, src, pg_src);
			if (RF_PAGE_ALIGNED(dest))
				REMAP_VA(bp, dest, pg_dest);
			if (!pg_src || !pg_dest)
				return (EFAULT);
		}
	}
	while (src < end) {
		*pg_dest++ ^= *pg_src++;
		src++;
		dest++;
		len--;
		if (RF_PAGE_ALIGNED(src))
			REMAP_VA(bp, src, pg_src);
		if (RF_PAGE_ALIGNED(dest))
			REMAP_VA(bp, dest, pg_dest);
	}
	RF_ASSERT(len == 0);
	return (0);
}

#if 0
/*
   dst = a ^ b ^ c;
   a may equal dst
   see comment above longword_bxor
*/
int
rf_longword_bxor3(dst, a, b, c, len, bp)
	unsigned long *dst;
	unsigned long *a;
	unsigned long *b;
	unsigned long *c;
	int     len;		/* length in longwords */
	void   *bp;
{
	unsigned long a0, a1, a2, a3, b0, b1, b2, b3;
	unsigned long *pg_a, *pg_b, *pg_c, *pg_dst;	/* per-page source/dest
							 * pointers */
	int     longs_this_time;/* # longs to xor in the current iteration */
	char    dst_is_a = 0;

	REMAP_VA(bp, a, pg_a);
	REMAP_VA(bp, b, pg_b);
	REMAP_VA(bp, c, pg_c);
	if (a == dst) {
		pg_dst = pg_a;
		dst_is_a = 1;
	} else {
		REMAP_VA(bp, dst, pg_dst);
	}

	/* align dest to cache line.  Can't cross a pg boundary on dst here. */
	while ((((unsigned long) pg_dst) & 0x1f)) {
		*pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
		dst++;
		a++;
		b++;
		c++;
		if (RF_PAGE_ALIGNED(a)) {
			REMAP_VA(bp, a, pg_a);
			if (!pg_a)
				return (EFAULT);
		}
		if (RF_PAGE_ALIGNED(b)) {
			REMAP_VA(bp, b, pg_b);
			if (!pg_b)
				return (EFAULT);
		}
		if (RF_PAGE_ALIGNED(c)) {
			REMAP_VA(bp, c, pg_c);
			if (!pg_c)
				return (EFAULT);
		}
		len--;
	}

	while (len > 4) {
		longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(a), RF_MIN(RF_BLIP(b), RF_MIN(RF_BLIP(c), RF_BLIP(dst)))) >> RF_LONGSHIFT);
		a += longs_this_time;
		b += longs_this_time;
		c += longs_this_time;
		dst += longs_this_time;
		len -= longs_this_time;
		while (longs_this_time >= 4) {
			a0 = pg_a[0];
			longs_this_time -= 4;

			a1 = pg_a[1];
			a2 = pg_a[2];

			a3 = pg_a[3];
			pg_a += 4;

			b0 = pg_b[0];
			b1 = pg_b[1];

			b2 = pg_b[2];
			b3 = pg_b[3];
			/* start dual issue */
			a0 ^= b0;
			b0 = pg_c[0];

			pg_b += 4;
			a1 ^= b1;

			a2 ^= b2;
			a3 ^= b3;

			b1 = pg_c[1];
			a0 ^= b0;

			b2 = pg_c[2];
			a1 ^= b1;

			b3 = pg_c[3];
			a2 ^= b2;

			pg_dst[0] = a0;
			a3 ^= b3;
			pg_dst[1] = a1;
			pg_c += 4;
			pg_dst[2] = a2;
			pg_dst[3] = a3;
			pg_dst += 4;
		}
		while (longs_this_time > 0) {	/* cannot cross any page
						 * boundaries here */
			*pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
			longs_this_time--;
		}

		if (len) {
			if (RF_PAGE_ALIGNED(a)) {
				REMAP_VA(bp, a, pg_a);
				if (!pg_a)
					return (EFAULT);
				if (dst_is_a)
					pg_dst = pg_a;
			}
			if (RF_PAGE_ALIGNED(b)) {
				REMAP_VA(bp, b, pg_b);
				if (!pg_b)
					return (EFAULT);
			}
			if (RF_PAGE_ALIGNED(c)) {
				REMAP_VA(bp, c, pg_c);
				if (!pg_c)
					return (EFAULT);
			}
			if (!dst_is_a)
				if (RF_PAGE_ALIGNED(dst)) {
					REMAP_VA(bp, dst, pg_dst);
					if (!pg_dst)
						return (EFAULT);
				}
		}
	}
	while (len) {
		*pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
		dst++;
		a++;
		b++;
		c++;
		if (RF_PAGE_ALIGNED(a)) {
			REMAP_VA(bp, a, pg_a);
			if (!pg_a)
				return (EFAULT);
			if (dst_is_a)
				pg_dst = pg_a;
		}
		if (RF_PAGE_ALIGNED(b)) {
			REMAP_VA(bp, b, pg_b);
			if (!pg_b)
				return (EFAULT);
		}
		if (RF_PAGE_ALIGNED(c)) {
			REMAP_VA(bp, c, pg_c);
			if (!pg_c)
				return (EFAULT);
		}
		if (!dst_is_a)
			if (RF_PAGE_ALIGNED(dst)) {
				REMAP_VA(bp, dst, pg_dst);
				if (!pg_dst)
					return (EFAULT);
			}
		len--;
	}
	return (0);
}

int
rf_bxor3(dst, a, b, c, len, bp)
	unsigned char *dst;
	unsigned char *a;
	unsigned char *b;
	unsigned char *c;
	unsigned long len;
	void   *bp;
{
	RF_ASSERT(((RF_UL(dst) | RF_UL(a) | RF_UL(b) | RF_UL(c) | len) & 0x7) == 0);

	return (rf_longword_bxor3((unsigned long *) dst, (unsigned long *) a,
		(unsigned long *) b, (unsigned long *) c, len >> RF_LONGSHIFT, bp));
}
#endif