Home | History | Annotate | Line # | Download | only in raidframe
rf_dagfuncs.c revision 1.32.6.1
      1  1.32.6.1   thorpej /*	$NetBSD: rf_dagfuncs.c,v 1.32.6.1 2021/08/01 22:42:31 thorpej Exp $	*/
      2       1.1     oster /*
      3       1.1     oster  * Copyright (c) 1995 Carnegie-Mellon University.
      4       1.1     oster  * All rights reserved.
      5       1.1     oster  *
      6       1.1     oster  * Author: Mark Holland, William V. Courtright II
      7       1.1     oster  *
      8       1.1     oster  * Permission to use, copy, modify and distribute this software and
      9       1.1     oster  * its documentation is hereby granted, provided that both the copyright
     10       1.1     oster  * notice and this permission notice appear in all copies of the
     11       1.1     oster  * software, derivative works or modified versions, and any portions
     12       1.1     oster  * thereof, and that both notices appear in supporting documentation.
     13       1.1     oster  *
     14       1.1     oster  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     15       1.1     oster  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     16       1.1     oster  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     17       1.1     oster  *
     18       1.1     oster  * Carnegie Mellon requests users of this software to return to
     19       1.1     oster  *
     20       1.1     oster  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     21       1.1     oster  *  School of Computer Science
     22       1.1     oster  *  Carnegie Mellon University
     23       1.1     oster  *  Pittsburgh PA 15213-3890
     24       1.1     oster  *
     25       1.1     oster  * any improvements or extensions that they make and grant Carnegie the
     26       1.1     oster  * rights to redistribute these changes.
     27       1.1     oster  */
     28       1.1     oster 
     29       1.1     oster /*
     30       1.1     oster  * dagfuncs.c -- DAG node execution routines
     31       1.1     oster  *
     32       1.1     oster  * Rules:
     33       1.1     oster  * 1. Every DAG execution function must eventually cause node->status to
     34       1.1     oster  *    get set to "good" or "bad", and "FinishNode" to be called. In the
     35       1.1     oster  *    case of nodes that complete immediately (xor, NullNodeFunc, etc),
     36       1.1     oster  *    the node execution function can do these two things directly. In
     37       1.1     oster  *    the case of nodes that have to wait for some event (a disk read to
     38       1.1     oster  *    complete, a lock to be released, etc) to occur before they can
     39       1.1     oster  *    complete, this is typically achieved by having whatever module
     40       1.1     oster  *    is doing the operation call GenericWakeupFunc upon completion.
     41       1.1     oster  * 2. DAG execution functions should check the status in the DAG header
     42       1.1     oster  *    and NOP out their operations if the status is not "enable". However,
     43       1.1     oster  *    execution functions that release resources must be sure to release
     44       1.1     oster  *    them even when they NOP out the function that would use them.
     45       1.1     oster  *    Functions that acquire resources should go ahead and acquire them
     46       1.1     oster  *    even when they NOP, so that a downstream release node will not have
     47       1.1     oster  *    to check to find out whether or not the acquire was suppressed.
     48       1.1     oster  */
     49       1.8     lukem 
     50       1.8     lukem #include <sys/cdefs.h>
     51  1.32.6.1   thorpej __KERNEL_RCSID(0, "$NetBSD: rf_dagfuncs.c,v 1.32.6.1 2021/08/01 22:42:31 thorpej Exp $");
     52       1.1     oster 
     53       1.7       mrg #include <sys/param.h>
     54       1.1     oster #include <sys/ioctl.h>
     55       1.1     oster 
     56       1.1     oster #include "rf_archs.h"
     57       1.1     oster #include "rf_raid.h"
     58       1.1     oster #include "rf_dag.h"
     59       1.1     oster #include "rf_layout.h"
     60       1.1     oster #include "rf_etimer.h"
     61       1.1     oster #include "rf_acctrace.h"
     62       1.1     oster #include "rf_diskqueue.h"
     63       1.1     oster #include "rf_dagfuncs.h"
     64       1.1     oster #include "rf_general.h"
     65       1.1     oster #include "rf_engine.h"
     66       1.1     oster #include "rf_dagutils.h"
     67       1.1     oster 
     68       1.1     oster #include "rf_kintf.h"
     69       1.1     oster 
     70       1.1     oster #if RF_INCLUDE_PARITYLOGGING > 0
     71       1.1     oster #include "rf_paritylog.h"
     72       1.3     oster #endif				/* RF_INCLUDE_PARITYLOGGING > 0 */
     73       1.1     oster 
     74      1.31  christos void     (*rf_DiskReadFunc) (RF_DagNode_t *);
     75      1.31  christos void     (*rf_DiskWriteFunc) (RF_DagNode_t *);
     76      1.31  christos void     (*rf_DiskReadUndoFunc) (RF_DagNode_t *);
     77      1.31  christos void     (*rf_DiskWriteUndoFunc) (RF_DagNode_t *);
     78      1.31  christos void     (*rf_RegularXorUndoFunc) (RF_DagNode_t *);
     79      1.31  christos void     (*rf_SimpleXorUndoFunc) (RF_DagNode_t *);
     80      1.31  christos void     (*rf_RecoveryXorUndoFunc) (RF_DagNode_t *);
     81       1.1     oster 
     82      1.14     oster /*****************************************************************************
     83       1.1     oster  * main (only) configuration routine for this module
     84      1.14     oster  ****************************************************************************/
     85      1.23     perry int
     86      1.28  christos rf_ConfigureDAGFuncs(RF_ShutdownList_t **listp)
     87       1.3     oster {
     88      1.23     perry 	RF_ASSERT(((sizeof(long) == 8) && RF_LONGSHIFT == 3) ||
     89      1.14     oster 		  ((sizeof(long) == 4) && RF_LONGSHIFT == 2));
     90       1.3     oster 	rf_DiskReadFunc = rf_DiskReadFuncForThreads;
     91       1.3     oster 	rf_DiskReadUndoFunc = rf_DiskUndoFunc;
     92       1.3     oster 	rf_DiskWriteFunc = rf_DiskWriteFuncForThreads;
     93       1.3     oster 	rf_DiskWriteUndoFunc = rf_DiskUndoFunc;
     94       1.3     oster 	rf_RegularXorUndoFunc = rf_NullNodeUndoFunc;
     95       1.3     oster 	rf_SimpleXorUndoFunc = rf_NullNodeUndoFunc;
     96       1.3     oster 	rf_RecoveryXorUndoFunc = rf_NullNodeUndoFunc;
     97       1.3     oster 	return (0);
     98       1.1     oster }
     99       1.1     oster 
    100       1.1     oster 
    101       1.1     oster 
    102      1.14     oster /*****************************************************************************
    103       1.1     oster  * the execution function associated with a terminate node
    104      1.14     oster  ****************************************************************************/
    105      1.31  christos void
    106      1.15     oster rf_TerminateFunc(RF_DagNode_t *node)
    107       1.1     oster {
    108       1.3     oster 	RF_ASSERT(node->dagHdr->numCommits == node->dagHdr->numCommitNodes);
    109       1.3     oster 	node->status = rf_good;
    110      1.31  christos 	rf_FinishNode(node, RF_THREAD_CONTEXT);
    111       1.1     oster }
    112       1.1     oster 
    113      1.31  christos void
    114      1.28  christos rf_TerminateUndoFunc(RF_DagNode_t *node)
    115       1.1     oster {
    116       1.1     oster }
    117       1.1     oster 
    118       1.1     oster 
    119      1.15     oster /*****************************************************************************
    120       1.1     oster  * execution functions associated with a mirror node
    121       1.1     oster  *
    122       1.1     oster  * parameters:
    123       1.1     oster  *
    124       1.1     oster  * 0 - physical disk addres of data
    125       1.1     oster  * 1 - buffer for holding read data
    126       1.1     oster  * 2 - parity stripe ID
    127       1.1     oster  * 3 - flags
    128       1.1     oster  * 4 - physical disk address of mirror (parity)
    129       1.1     oster  *
    130      1.15     oster  ****************************************************************************/
    131       1.1     oster 
    132      1.31  christos void
    133      1.15     oster rf_DiskReadMirrorIdleFunc(RF_DagNode_t *node)
    134       1.1     oster {
    135       1.3     oster 	/* select the mirror copy with the shortest queue and fill in node
    136       1.3     oster 	 * parameters with physical disk address */
    137       1.1     oster 
    138       1.3     oster 	rf_SelectMirrorDiskIdle(node);
    139      1.31  christos 	rf_DiskReadFunc(node);
    140       1.1     oster }
    141       1.1     oster 
    142      1.11     oster #if (RF_INCLUDE_CHAINDECLUSTER > 0) || (RF_INCLUDE_INTERDECLUSTER > 0) || (RF_DEBUG_VALIDATE_DAG > 0)
    143      1.31  christos void
    144      1.15     oster rf_DiskReadMirrorPartitionFunc(RF_DagNode_t *node)
    145       1.1     oster {
    146       1.3     oster 	/* select the mirror copy with the shortest queue and fill in node
    147       1.3     oster 	 * parameters with physical disk address */
    148       1.1     oster 
    149       1.3     oster 	rf_SelectMirrorDiskPartition(node);
    150      1.31  christos 	rf_DiskReadFunc(node);
    151       1.1     oster }
    152      1.11     oster #endif
    153       1.1     oster 
    154      1.31  christos void
    155      1.28  christos rf_DiskReadMirrorUndoFunc(RF_DagNode_t *node)
    156       1.1     oster {
    157       1.1     oster }
    158       1.1     oster 
    159       1.1     oster 
    160       1.1     oster 
    161       1.1     oster #if RF_INCLUDE_PARITYLOGGING > 0
    162      1.14     oster /*****************************************************************************
    163       1.1     oster  * the execution function associated with a parity log update node
    164      1.14     oster  ****************************************************************************/
    165      1.31  christos void
    166      1.15     oster rf_ParityLogUpdateFunc(RF_DagNode_t *node)
    167       1.3     oster {
    168       1.3     oster 	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
    169      1.29  christos 	void *bf = (void *) node->params[1].p;
    170       1.3     oster 	RF_ParityLogData_t *logData;
    171      1.19     oster #if RF_ACC_TRACE > 0
    172       1.3     oster 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    173       1.3     oster 	RF_Etimer_t timer;
    174      1.19     oster #endif
    175       1.3     oster 
    176       1.3     oster 	if (node->dagHdr->status == rf_enable) {
    177      1.19     oster #if RF_ACC_TRACE > 0
    178       1.3     oster 		RF_ETIMER_START(timer);
    179      1.19     oster #endif
    180      1.24  christos 		logData = rf_CreateParityLogData(RF_UPDATE, pda, bf,
    181       1.3     oster 		    (RF_Raid_t *) (node->dagHdr->raidPtr),
    182      1.31  christos 		    node->wakeFunc, node,
    183       1.3     oster 		    node->dagHdr->tracerec, timer);
    184       1.3     oster 		if (logData)
    185       1.3     oster 			rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE);
    186       1.3     oster 		else {
    187      1.19     oster #if RF_ACC_TRACE > 0
    188       1.3     oster 			RF_ETIMER_STOP(timer);
    189       1.3     oster 			RF_ETIMER_EVAL(timer);
    190       1.3     oster 			tracerec->plog_us += RF_ETIMER_VAL_US(timer);
    191      1.19     oster #endif
    192       1.3     oster 			(node->wakeFunc) (node, ENOMEM);
    193       1.3     oster 		}
    194       1.1     oster 	}
    195       1.1     oster }
    196       1.1     oster 
    197       1.1     oster 
    198      1.15     oster /*****************************************************************************
    199       1.1     oster  * the execution function associated with a parity log overwrite node
    200      1.15     oster  ****************************************************************************/
    201      1.31  christos void
    202      1.15     oster rf_ParityLogOverwriteFunc(RF_DagNode_t *node)
    203       1.3     oster {
    204       1.3     oster 	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
    205      1.29  christos 	void *bf = (void *) node->params[1].p;
    206       1.3     oster 	RF_ParityLogData_t *logData;
    207      1.19     oster #if RF_ACC_TRACE > 0
    208       1.3     oster 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    209       1.3     oster 	RF_Etimer_t timer;
    210      1.19     oster #endif
    211       1.3     oster 
    212       1.3     oster 	if (node->dagHdr->status == rf_enable) {
    213      1.19     oster #if RF_ACC_TRACE > 0
    214       1.3     oster 		RF_ETIMER_START(timer);
    215      1.19     oster #endif
    216      1.24  christos 		logData = rf_CreateParityLogData(RF_OVERWRITE, pda, bf,
    217      1.14     oster (RF_Raid_t *) (node->dagHdr->raidPtr),
    218      1.31  christos 		    node->wakeFunc, node, node->dagHdr->tracerec, timer);
    219       1.3     oster 		if (logData)
    220       1.3     oster 			rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE);
    221       1.3     oster 		else {
    222      1.19     oster #if RF_ACC_TRACE > 0
    223       1.3     oster 			RF_ETIMER_STOP(timer);
    224       1.3     oster 			RF_ETIMER_EVAL(timer);
    225       1.3     oster 			tracerec->plog_us += RF_ETIMER_VAL_US(timer);
    226      1.19     oster #endif
    227       1.3     oster 			(node->wakeFunc) (node, ENOMEM);
    228       1.3     oster 		}
    229       1.1     oster 	}
    230       1.1     oster }
    231       1.1     oster 
    232      1.31  christos void
    233      1.28  christos rf_ParityLogUpdateUndoFunc(RF_DagNode_t *node)
    234       1.1     oster {
    235       1.1     oster }
    236       1.1     oster 
    237      1.31  christos void
    238      1.28  christos rf_ParityLogOverwriteUndoFunc(RF_DagNode_t *node)
    239       1.1     oster {
    240       1.1     oster }
    241      1.10     oster #endif				/* RF_INCLUDE_PARITYLOGGING > 0 */
    242      1.10     oster 
    243      1.14     oster /*****************************************************************************
    244       1.1     oster  * the execution function associated with a NOP node
    245      1.14     oster  ****************************************************************************/
    246      1.31  christos void
    247      1.15     oster rf_NullNodeFunc(RF_DagNode_t *node)
    248       1.1     oster {
    249       1.3     oster 	node->status = rf_good;
    250      1.31  christos 	rf_FinishNode(node, RF_THREAD_CONTEXT);
    251       1.1     oster }
    252       1.1     oster 
    253      1.31  christos void
    254      1.15     oster rf_NullNodeUndoFunc(RF_DagNode_t *node)
    255       1.1     oster {
    256       1.3     oster 	node->status = rf_undone;
    257      1.31  christos 	rf_FinishNode(node, RF_THREAD_CONTEXT);
    258       1.1     oster }
    259       1.1     oster 
    260       1.1     oster 
    261      1.14     oster /*****************************************************************************
    262       1.1     oster  * the execution function associated with a disk-read node
    263      1.14     oster  ****************************************************************************/
    264      1.31  christos void
    265      1.15     oster rf_DiskReadFuncForThreads(RF_DagNode_t *node)
    266       1.3     oster {
    267       1.3     oster 	RF_DiskQueueData_t *req;
    268       1.3     oster 	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
    269      1.29  christos 	void *bf = (void *) node->params[1].p;
    270       1.3     oster 	RF_StripeNum_t parityStripeID = (RF_StripeNum_t) node->params[2].v;
    271       1.3     oster 	unsigned priority = RF_EXTRACT_PRIORITY(node->params[3].v);
    272       1.3     oster 	unsigned which_ru = RF_EXTRACT_RU(node->params[3].v);
    273       1.3     oster 	RF_IoType_t iotype = (node->dagHdr->status == rf_enable) ? RF_IO_TYPE_READ : RF_IO_TYPE_NOP;
    274      1.13     oster 	RF_DiskQueue_t *dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
    275       1.1     oster 
    276       1.3     oster 	req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector,
    277      1.31  christos 	    bf, parityStripeID, which_ru, node->wakeFunc, node,
    278      1.19     oster #if RF_ACC_TRACE > 0
    279      1.19     oster 	     node->dagHdr->tracerec,
    280      1.19     oster #else
    281      1.19     oster              NULL,
    282      1.19     oster #endif
    283  1.32.6.1   thorpej 	    (void *) (node->dagHdr->raidPtr), 0, node->dagHdr->bp);
    284  1.32.6.1   thorpej 
    285  1.32.6.1   thorpej 	node->dagFuncData = (void *) req;
    286  1.32.6.1   thorpej 	rf_DiskIOEnqueue(&(dqs[pda->col]), req, priority);
    287       1.1     oster }
    288       1.1     oster 
    289       1.1     oster 
    290      1.14     oster /*****************************************************************************
    291       1.1     oster  * the execution function associated with a disk-write node
    292      1.14     oster  ****************************************************************************/
    293      1.31  christos void
    294      1.15     oster rf_DiskWriteFuncForThreads(RF_DagNode_t *node)
    295       1.3     oster {
    296       1.3     oster 	RF_DiskQueueData_t *req;
    297       1.3     oster 	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
    298      1.29  christos 	void *bf = (void *) node->params[1].p;
    299       1.3     oster 	RF_StripeNum_t parityStripeID = (RF_StripeNum_t) node->params[2].v;
    300       1.3     oster 	unsigned priority = RF_EXTRACT_PRIORITY(node->params[3].v);
    301       1.3     oster 	unsigned which_ru = RF_EXTRACT_RU(node->params[3].v);
    302       1.3     oster 	RF_IoType_t iotype = (node->dagHdr->status == rf_enable) ? RF_IO_TYPE_WRITE : RF_IO_TYPE_NOP;
    303      1.13     oster 	RF_DiskQueue_t *dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
    304       1.1     oster 
    305       1.3     oster 	/* normal processing (rollaway or forward recovery) begins here */
    306       1.3     oster 	req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector,
    307      1.31  christos 	    bf, parityStripeID, which_ru, node->wakeFunc, node,
    308      1.19     oster #if RF_ACC_TRACE > 0
    309       1.3     oster 	    node->dagHdr->tracerec,
    310      1.19     oster #else
    311      1.19     oster 	    NULL,
    312      1.19     oster #endif
    313       1.3     oster 	    (void *) (node->dagHdr->raidPtr),
    314  1.32.6.1   thorpej 	    0, node->dagHdr->bp);
    315       1.3     oster 
    316  1.32.6.1   thorpej 	node->dagFuncData = (void *) req;
    317  1.32.6.1   thorpej 	rf_DiskIOEnqueue(&(dqs[pda->col]), req, priority);
    318       1.1     oster }
    319      1.14     oster /*****************************************************************************
    320       1.1     oster  * the undo function for disk nodes
    321       1.1     oster  * Note:  this is not a proper undo of a write node, only locks are released.
    322       1.1     oster  *        old data is not restored to disk!
    323      1.14     oster  ****************************************************************************/
    324      1.31  christos void
    325      1.15     oster rf_DiskUndoFunc(RF_DagNode_t *node)
    326       1.3     oster {
    327       1.3     oster 	RF_DiskQueueData_t *req;
    328       1.3     oster 	RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
    329      1.13     oster 	RF_DiskQueue_t *dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
    330       1.3     oster 
    331       1.3     oster 	req = rf_CreateDiskQueueData(RF_IO_TYPE_NOP,
    332      1.31  christos 	    0L, 0, NULL, 0L, 0, node->wakeFunc, node,
    333      1.19     oster #if RF_ACC_TRACE > 0
    334      1.19     oster 	     node->dagHdr->tracerec,
    335      1.19     oster #else
    336      1.19     oster 	     NULL,
    337      1.19     oster #endif
    338       1.3     oster 	    (void *) (node->dagHdr->raidPtr),
    339  1.32.6.1   thorpej 	    0, NULL);
    340  1.32.6.1   thorpej 
    341  1.32.6.1   thorpej 	node->dagFuncData = (void *) req;
    342  1.32.6.1   thorpej 	rf_DiskIOEnqueue(&(dqs[pda->col]), req, RF_IO_NORMAL_PRIORITY);
    343       1.1     oster }
    344       1.3     oster 
    345      1.14     oster /*****************************************************************************
    346      1.14     oster  * Callback routine for DiskRead and DiskWrite nodes.  When the disk
    347      1.14     oster  * op completes, the routine is called to set the node status and
    348      1.14     oster  * inform the execution engine that the node has fired.
    349      1.14     oster  ****************************************************************************/
    350      1.31  christos void
    351      1.31  christos rf_GenericWakeupFunc(void *v, int status)
    352       1.3     oster {
    353      1.31  christos 	RF_DagNode_t *node = v;
    354      1.15     oster 
    355       1.3     oster 	switch (node->status) {
    356       1.3     oster 	case rf_fired:
    357       1.3     oster 		if (status)
    358       1.3     oster 			node->status = rf_bad;
    359       1.3     oster 		else
    360       1.3     oster 			node->status = rf_good;
    361       1.3     oster 		break;
    362       1.3     oster 	case rf_recover:
    363       1.3     oster 		/* probably should never reach this case */
    364       1.3     oster 		if (status)
    365       1.3     oster 			node->status = rf_panic;
    366       1.3     oster 		else
    367       1.3     oster 			node->status = rf_undone;
    368       1.3     oster 		break;
    369       1.3     oster 	default:
    370       1.4     oster 		printf("rf_GenericWakeupFunc:");
    371       1.4     oster 		printf("node->status is %d,", node->status);
    372       1.4     oster 		printf("status is %d \n", status);
    373       1.3     oster 		RF_PANIC();
    374       1.3     oster 		break;
    375       1.3     oster 	}
    376       1.3     oster 	if (node->dagFuncData)
    377       1.3     oster 		rf_FreeDiskQueueData((RF_DiskQueueData_t *) node->dagFuncData);
    378      1.31  christos 	rf_FinishNode(node, RF_INTR_CONTEXT);
    379       1.1     oster }
    380       1.1     oster 
    381       1.1     oster 
    382      1.14     oster /*****************************************************************************
    383      1.14     oster  * there are three distinct types of xor nodes:
    384      1.14     oster 
    385      1.14     oster  * A "regular xor" is used in the fault-free case where the access
    386      1.14     oster  * spans a complete stripe unit.  It assumes that the result buffer is
    387      1.14     oster  * one full stripe unit in size, and uses the stripe-unit-offset
    388      1.14     oster  * values that it computes from the PDAs to determine where within the
    389      1.14     oster  * stripe unit to XOR each argument buffer.
    390      1.14     oster  *
    391      1.14     oster  * A "simple xor" is used in the fault-free case where the access
    392      1.14     oster  * touches only a portion of one (or two, in some cases) stripe
    393      1.14     oster  * unit(s).  It assumes that all the argument buffers are of the same
    394      1.14     oster  * size and have the same stripe unit offset.
    395      1.14     oster  *
    396      1.14     oster  * A "recovery xor" is used in the degraded-mode case.  It's similar
    397      1.14     oster  * to the regular xor function except that it takes the failed PDA as
    398      1.14     oster  * an additional parameter, and uses it to determine what portions of
    399      1.14     oster  * the argument buffers need to be xor'd into the result buffer, and
    400      1.14     oster  * where in the result buffer they should go.
    401      1.14     oster  ****************************************************************************/
    402       1.1     oster 
    403       1.1     oster /* xor the params together and store the result in the result field.
    404      1.14     oster  * assume the result field points to a buffer that is the size of one
    405      1.14     oster  * SU, and use the pda params to determine where within the buffer to
    406      1.14     oster  * XOR the input buffers.  */
    407      1.31  christos void
    408      1.15     oster rf_RegularXorFunc(RF_DagNode_t *node)
    409       1.3     oster {
    410       1.3     oster 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
    411      1.19     oster #if RF_ACC_TRACE > 0
    412       1.3     oster 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    413       1.3     oster 	RF_Etimer_t timer;
    414      1.19     oster #endif
    415       1.3     oster 	int     i, retcode;
    416       1.1     oster 
    417       1.3     oster 	retcode = 0;
    418       1.3     oster 	if (node->dagHdr->status == rf_enable) {
    419       1.3     oster 		/* don't do the XOR if the input is the same as the output */
    420      1.19     oster #if RF_ACC_TRACE > 0
    421       1.3     oster 		RF_ETIMER_START(timer);
    422      1.19     oster #endif
    423       1.3     oster 		for (i = 0; i < node->numParams - 1; i += 2)
    424       1.3     oster 			if (node->params[i + 1].p != node->results[0]) {
    425       1.3     oster 				retcode = rf_XorIntoBuffer(raidPtr, (RF_PhysDiskAddr_t *) node->params[i].p,
    426      1.17     oster 							   (char *) node->params[i + 1].p, (char *) node->results[0]);
    427       1.3     oster 			}
    428      1.19     oster #if RF_ACC_TRACE > 0
    429       1.3     oster 		RF_ETIMER_STOP(timer);
    430       1.3     oster 		RF_ETIMER_EVAL(timer);
    431       1.3     oster 		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
    432      1.19     oster #endif
    433       1.3     oster 	}
    434      1.31  christos 	rf_GenericWakeupFunc(node, retcode);	/* call wake func
    435      1.31  christos 						 * explicitly since no
    436      1.31  christos 						 * I/O in this node */
    437       1.1     oster }
    438       1.1     oster /* xor the inputs into the result buffer, ignoring placement issues */
    439      1.31  christos void
    440      1.15     oster rf_SimpleXorFunc(RF_DagNode_t *node)
    441       1.3     oster {
    442       1.3     oster 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
    443       1.3     oster 	int     i, retcode = 0;
    444      1.19     oster #if RF_ACC_TRACE > 0
    445       1.3     oster 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    446       1.3     oster 	RF_Etimer_t timer;
    447      1.19     oster #endif
    448       1.1     oster 
    449       1.3     oster 	if (node->dagHdr->status == rf_enable) {
    450      1.19     oster #if RF_ACC_TRACE > 0
    451       1.3     oster 		RF_ETIMER_START(timer);
    452      1.19     oster #endif
    453       1.3     oster 		/* don't do the XOR if the input is the same as the output */
    454       1.3     oster 		for (i = 0; i < node->numParams - 1; i += 2)
    455       1.3     oster 			if (node->params[i + 1].p != node->results[0]) {
    456       1.3     oster 				retcode = rf_bxor((char *) node->params[i + 1].p, (char *) node->results[0],
    457      1.17     oster 				    rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[i].p)->numSector));
    458       1.3     oster 			}
    459      1.19     oster #if RF_ACC_TRACE > 0
    460       1.3     oster 		RF_ETIMER_STOP(timer);
    461       1.3     oster 		RF_ETIMER_EVAL(timer);
    462       1.3     oster 		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
    463      1.19     oster #endif
    464       1.3     oster 	}
    465      1.31  christos 	rf_GenericWakeupFunc(node, retcode);	/* call wake func
    466      1.31  christos 						 * explicitly since no
    467      1.31  christos 						 * I/O in this node */
    468       1.1     oster }
    469      1.14     oster /* this xor is used by the degraded-mode dag functions to recover lost
    470      1.14     oster  * data.  the second-to-last parameter is the PDA for the failed
    471      1.14     oster  * portion of the access.  the code here looks at this PDA and assumes
    472      1.14     oster  * that the xor target buffer is equal in size to the number of
    473      1.14     oster  * sectors in the failed PDA.  It then uses the other PDAs in the
    474      1.14     oster  * parameter list to determine where within the target buffer the
    475      1.14     oster  * corresponding data should be xored.  */
    476      1.31  christos void
    477      1.15     oster rf_RecoveryXorFunc(RF_DagNode_t *node)
    478       1.3     oster {
    479       1.3     oster 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
    480       1.3     oster 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
    481       1.3     oster 	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
    482       1.3     oster 	int     i, retcode = 0;
    483       1.3     oster 	RF_PhysDiskAddr_t *pda;
    484       1.3     oster 	int     suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
    485       1.3     oster 	char   *srcbuf, *destbuf;
    486      1.19     oster #if RF_ACC_TRACE > 0
    487       1.3     oster 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    488       1.3     oster 	RF_Etimer_t timer;
    489      1.19     oster #endif
    490       1.1     oster 
    491       1.3     oster 	if (node->dagHdr->status == rf_enable) {
    492      1.19     oster #if RF_ACC_TRACE > 0
    493       1.3     oster 		RF_ETIMER_START(timer);
    494      1.19     oster #endif
    495       1.3     oster 		for (i = 0; i < node->numParams - 2; i += 2)
    496       1.3     oster 			if (node->params[i + 1].p != node->results[0]) {
    497       1.3     oster 				pda = (RF_PhysDiskAddr_t *) node->params[i].p;
    498       1.3     oster 				srcbuf = (char *) node->params[i + 1].p;
    499       1.3     oster 				suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
    500       1.3     oster 				destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
    501      1.17     oster 				retcode = rf_bxor(srcbuf, destbuf, rf_RaidAddressToByte(raidPtr, pda->numSector));
    502       1.3     oster 			}
    503      1.19     oster #if RF_ACC_TRACE > 0
    504       1.3     oster 		RF_ETIMER_STOP(timer);
    505       1.3     oster 		RF_ETIMER_EVAL(timer);
    506       1.3     oster 		tracerec->xor_us += RF_ETIMER_VAL_US(timer);
    507      1.19     oster #endif
    508       1.3     oster 	}
    509      1.31  christos 	rf_GenericWakeupFunc(node, retcode);
    510       1.1     oster }
    511      1.14     oster /*****************************************************************************
    512      1.14     oster  * The next three functions are utilities used by the above
    513      1.14     oster  * xor-execution functions.
    514      1.14     oster  ****************************************************************************/
    515       1.1     oster 
    516       1.1     oster 
    517       1.1     oster /*
    518      1.14     oster  * this is just a glorified buffer xor.  targbuf points to a buffer
    519      1.14     oster  * that is one full stripe unit in size.  srcbuf points to a buffer
    520      1.14     oster  * that may be less than 1 SU, but never more.  When the access
    521      1.14     oster  * described by pda is one SU in size (which by implication means it's
    522      1.14     oster  * SU-aligned), all that happens is (targbuf) <- (srcbuf ^ targbuf).
    523      1.14     oster  * When the access is less than one SU in size the XOR occurs on only
    524      1.14     oster  * the portion of targbuf identified in the pda.  */
    525       1.1     oster 
    526      1.23     perry int
    527      1.15     oster rf_XorIntoBuffer(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda,
    528      1.17     oster 		 char *srcbuf, char *targbuf)
    529       1.3     oster {
    530       1.3     oster 	char   *targptr;
    531       1.3     oster 	int     sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    532       1.3     oster 	int     SUOffset = pda->startSector % sectPerSU;
    533       1.3     oster 	int     length, retcode = 0;
    534       1.3     oster 
    535       1.3     oster 	RF_ASSERT(pda->numSector <= sectPerSU);
    536       1.3     oster 
    537       1.3     oster 	targptr = targbuf + rf_RaidAddressToByte(raidPtr, SUOffset);
    538       1.3     oster 	length = rf_RaidAddressToByte(raidPtr, pda->numSector);
    539      1.17     oster 	retcode = rf_bxor(srcbuf, targptr, length);
    540       1.3     oster 	return (retcode);
    541       1.1     oster }
    542      1.14     oster /* it really should be the case that the buffer pointers (returned by
    543      1.14     oster  * malloc) are aligned to the natural word size of the machine, so
    544      1.14     oster  * this is the only case we optimize for.  The length should always be
    545      1.14     oster  * a multiple of the sector size, so there should be no problem with
    546      1.14     oster  * leftover bytes at the end.  */
    547      1.23     perry int
    548      1.17     oster rf_bxor(char *src, char *dest, int len)
    549       1.3     oster {
    550       1.3     oster 	unsigned mask = sizeof(long) - 1, retcode = 0;
    551       1.3     oster 
    552      1.23     perry 	if (!(((unsigned long) src) & mask) &&
    553      1.14     oster 	    !(((unsigned long) dest) & mask) && !(len & mask)) {
    554      1.23     perry 		retcode = rf_longword_bxor((unsigned long *) src,
    555      1.23     perry 					   (unsigned long *) dest,
    556      1.17     oster 					   len >> RF_LONGSHIFT);
    557       1.3     oster 	} else {
    558       1.3     oster 		RF_ASSERT(0);
    559       1.3     oster 	}
    560       1.3     oster 	return (retcode);
    561       1.1     oster }
    562       1.1     oster 
    563      1.14     oster /* When XORing in kernel mode, we need to map each user page to kernel
    564      1.14     oster  * space before we can access it.  We don't want to assume anything
    565      1.14     oster  * about which input buffers are in kernel/user space, nor about their
    566      1.14     oster  * alignment, so in each loop we compute the maximum number of bytes
    567      1.14     oster  * that we can xor without crossing any page boundaries, and do only
    568      1.23     perry  * this many bytes before the next remap.
    569      1.23     perry  *
    570      1.23     perry  * len - is in longwords
    571      1.15     oster  */
    572      1.23     perry int
    573      1.17     oster rf_longword_bxor(unsigned long *src, unsigned long *dest, int len)
    574       1.3     oster {
    575       1.6  augustss 	unsigned long *end = src + len;
    576       1.6  augustss 	unsigned long d0, d1, d2, d3, s0, s1, s2, s3;	/* temps */
    577      1.14     oster 	unsigned long *pg_src, *pg_dest;   /* per-page source/dest pointers */
    578       1.3     oster 	int     longs_this_time;/* # longwords to xor in the current iteration */
    579       1.3     oster 
    580      1.16     oster 	pg_src = src;
    581      1.16     oster 	pg_dest = dest;
    582       1.3     oster 	if (!pg_src || !pg_dest)
    583       1.3     oster 		return (EFAULT);
    584       1.3     oster 
    585       1.3     oster 	while (len >= 4) {
    586       1.3     oster 		longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(pg_src), RF_BLIP(pg_dest)) >> RF_LONGSHIFT);	/* note len in longwords */
    587       1.3     oster 		src += longs_this_time;
    588       1.3     oster 		dest += longs_this_time;
    589       1.3     oster 		len -= longs_this_time;
    590       1.3     oster 		while (longs_this_time >= 4) {
    591       1.3     oster 			d0 = pg_dest[0];
    592       1.3     oster 			d1 = pg_dest[1];
    593       1.3     oster 			d2 = pg_dest[2];
    594       1.3     oster 			d3 = pg_dest[3];
    595       1.3     oster 			s0 = pg_src[0];
    596       1.3     oster 			s1 = pg_src[1];
    597       1.3     oster 			s2 = pg_src[2];
    598       1.3     oster 			s3 = pg_src[3];
    599       1.3     oster 			pg_dest[0] = d0 ^ s0;
    600       1.3     oster 			pg_dest[1] = d1 ^ s1;
    601       1.3     oster 			pg_dest[2] = d2 ^ s2;
    602       1.3     oster 			pg_dest[3] = d3 ^ s3;
    603       1.3     oster 			pg_src += 4;
    604       1.3     oster 			pg_dest += 4;
    605       1.3     oster 			longs_this_time -= 4;
    606       1.3     oster 		}
    607       1.3     oster 		while (longs_this_time > 0) {	/* cannot cross any page
    608       1.3     oster 						 * boundaries here */
    609       1.3     oster 			*pg_dest++ ^= *pg_src++;
    610       1.3     oster 			longs_this_time--;
    611       1.3     oster 		}
    612       1.3     oster 
    613       1.3     oster 		/* either we're done, or we've reached a page boundary on one
    614       1.3     oster 		 * (or possibly both) of the pointers */
    615       1.3     oster 		if (len) {
    616       1.3     oster 			if (RF_PAGE_ALIGNED(src))
    617      1.16     oster 				pg_src = src;
    618       1.3     oster 			if (RF_PAGE_ALIGNED(dest))
    619      1.16     oster 				pg_dest = dest;
    620       1.3     oster 			if (!pg_src || !pg_dest)
    621       1.3     oster 				return (EFAULT);
    622       1.3     oster 		}
    623       1.3     oster 	}
    624       1.3     oster 	while (src < end) {
    625       1.3     oster 		*pg_dest++ ^= *pg_src++;
    626       1.3     oster 		src++;
    627       1.3     oster 		dest++;
    628       1.3     oster 		len--;
    629       1.3     oster 		if (RF_PAGE_ALIGNED(src))
    630      1.16     oster 			pg_src = src;
    631       1.3     oster 		if (RF_PAGE_ALIGNED(dest))
    632      1.16     oster 			pg_dest = dest;
    633       1.3     oster 	}
    634       1.3     oster 	RF_ASSERT(len == 0);
    635       1.3     oster 	return (0);
    636       1.1     oster }
    637       1.1     oster 
    638       1.9     oster #if 0
    639       1.1     oster /*
    640       1.1     oster    dst = a ^ b ^ c;
    641       1.1     oster    a may equal dst
    642       1.1     oster    see comment above longword_bxor
    643      1.15     oster    len is length in longwords
    644       1.1     oster */
    645      1.23     perry int
    646      1.15     oster rf_longword_bxor3(unsigned long *dst, unsigned long *a, unsigned long *b,
    647      1.15     oster 		  unsigned long *c, int len, void *bp)
    648       1.3     oster {
    649       1.3     oster 	unsigned long a0, a1, a2, a3, b0, b1, b2, b3;
    650       1.6  augustss 	unsigned long *pg_a, *pg_b, *pg_c, *pg_dst;	/* per-page source/dest
    651       1.3     oster 								 * pointers */
    652       1.3     oster 	int     longs_this_time;/* # longs to xor in the current iteration */
    653       1.3     oster 	char    dst_is_a = 0;
    654       1.3     oster 
    655      1.16     oster 	pg_a = a;
    656      1.16     oster 	pg_b = b;
    657      1.16     oster 	pg_c = c;
    658       1.3     oster 	if (a == dst) {
    659       1.3     oster 		pg_dst = pg_a;
    660       1.3     oster 		dst_is_a = 1;
    661       1.3     oster 	} else {
    662      1.16     oster 		pg_dst = dst;
    663       1.3     oster 	}
    664       1.3     oster 
    665       1.3     oster 	/* align dest to cache line.  Can't cross a pg boundary on dst here. */
    666       1.3     oster 	while ((((unsigned long) pg_dst) & 0x1f)) {
    667       1.3     oster 		*pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
    668       1.3     oster 		dst++;
    669       1.3     oster 		a++;
    670       1.3     oster 		b++;
    671       1.3     oster 		c++;
    672       1.3     oster 		if (RF_PAGE_ALIGNED(a)) {
    673      1.16     oster 			pg_a = a;
    674       1.3     oster 			if (!pg_a)
    675       1.3     oster 				return (EFAULT);
    676       1.3     oster 		}
    677       1.3     oster 		if (RF_PAGE_ALIGNED(b)) {
    678      1.16     oster 			pg_b = a;
    679       1.3     oster 			if (!pg_b)
    680       1.3     oster 				return (EFAULT);
    681       1.3     oster 		}
    682       1.3     oster 		if (RF_PAGE_ALIGNED(c)) {
    683      1.16     oster 			pg_c = a;
    684       1.3     oster 			if (!pg_c)
    685       1.3     oster 				return (EFAULT);
    686       1.3     oster 		}
    687       1.3     oster 		len--;
    688       1.3     oster 	}
    689       1.3     oster 
    690       1.3     oster 	while (len > 4) {
    691       1.3     oster 		longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(a), RF_MIN(RF_BLIP(b), RF_MIN(RF_BLIP(c), RF_BLIP(dst)))) >> RF_LONGSHIFT);
    692       1.3     oster 		a += longs_this_time;
    693       1.3     oster 		b += longs_this_time;
    694       1.3     oster 		c += longs_this_time;
    695       1.3     oster 		dst += longs_this_time;
    696       1.3     oster 		len -= longs_this_time;
    697       1.3     oster 		while (longs_this_time >= 4) {
    698       1.3     oster 			a0 = pg_a[0];
    699       1.3     oster 			longs_this_time -= 4;
    700       1.3     oster 
    701       1.3     oster 			a1 = pg_a[1];
    702       1.3     oster 			a2 = pg_a[2];
    703       1.3     oster 
    704       1.3     oster 			a3 = pg_a[3];
    705       1.3     oster 			pg_a += 4;
    706       1.3     oster 
    707       1.3     oster 			b0 = pg_b[0];
    708       1.3     oster 			b1 = pg_b[1];
    709       1.3     oster 
    710       1.3     oster 			b2 = pg_b[2];
    711       1.3     oster 			b3 = pg_b[3];
    712       1.3     oster 			/* start dual issue */
    713       1.3     oster 			a0 ^= b0;
    714       1.3     oster 			b0 = pg_c[0];
    715       1.3     oster 
    716       1.3     oster 			pg_b += 4;
    717       1.3     oster 			a1 ^= b1;
    718       1.3     oster 
    719       1.3     oster 			a2 ^= b2;
    720       1.3     oster 			a3 ^= b3;
    721       1.3     oster 
    722       1.3     oster 			b1 = pg_c[1];
    723       1.3     oster 			a0 ^= b0;
    724       1.3     oster 
    725       1.3     oster 			b2 = pg_c[2];
    726       1.3     oster 			a1 ^= b1;
    727       1.3     oster 
    728       1.3     oster 			b3 = pg_c[3];
    729       1.3     oster 			a2 ^= b2;
    730       1.3     oster 
    731       1.3     oster 			pg_dst[0] = a0;
    732       1.3     oster 			a3 ^= b3;
    733       1.3     oster 			pg_dst[1] = a1;
    734       1.3     oster 			pg_c += 4;
    735       1.3     oster 			pg_dst[2] = a2;
    736       1.3     oster 			pg_dst[3] = a3;
    737       1.3     oster 			pg_dst += 4;
    738       1.3     oster 		}
    739       1.3     oster 		while (longs_this_time > 0) {	/* cannot cross any page
    740       1.3     oster 						 * boundaries here */
    741       1.3     oster 			*pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
    742       1.3     oster 			longs_this_time--;
    743       1.3     oster 		}
    744       1.3     oster 
    745       1.3     oster 		if (len) {
    746       1.3     oster 			if (RF_PAGE_ALIGNED(a)) {
    747      1.16     oster 				pg_a = a;
    748       1.3     oster 				if (!pg_a)
    749       1.3     oster 					return (EFAULT);
    750       1.3     oster 				if (dst_is_a)
    751       1.3     oster 					pg_dst = pg_a;
    752       1.3     oster 			}
    753       1.3     oster 			if (RF_PAGE_ALIGNED(b)) {
    754      1.16     oster 				pg_b = b;
    755       1.3     oster 				if (!pg_b)
    756       1.3     oster 					return (EFAULT);
    757       1.3     oster 			}
    758       1.3     oster 			if (RF_PAGE_ALIGNED(c)) {
    759      1.16     oster 				pg_c = c;
    760       1.3     oster 				if (!pg_c)
    761       1.3     oster 					return (EFAULT);
    762       1.3     oster 			}
    763       1.3     oster 			if (!dst_is_a)
    764       1.3     oster 				if (RF_PAGE_ALIGNED(dst)) {
    765      1.16     oster 					pg_dst = dst;
    766       1.3     oster 					if (!pg_dst)
    767       1.3     oster 						return (EFAULT);
    768       1.3     oster 				}
    769       1.3     oster 		}
    770       1.3     oster 	}
    771       1.3     oster 	while (len) {
    772       1.3     oster 		*pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
    773       1.3     oster 		dst++;
    774       1.3     oster 		a++;
    775       1.3     oster 		b++;
    776       1.3     oster 		c++;
    777       1.3     oster 		if (RF_PAGE_ALIGNED(a)) {
    778      1.16     oster 			pg_a = a;
    779       1.3     oster 			if (!pg_a)
    780       1.3     oster 				return (EFAULT);
    781       1.3     oster 			if (dst_is_a)
    782       1.3     oster 				pg_dst = pg_a;
    783       1.3     oster 		}
    784       1.3     oster 		if (RF_PAGE_ALIGNED(b)) {
    785      1.16     oster 			pg_b = b;
    786       1.3     oster 			if (!pg_b)
    787       1.3     oster 				return (EFAULT);
    788       1.3     oster 		}
    789       1.3     oster 		if (RF_PAGE_ALIGNED(c)) {
    790      1.16     oster 			pg_c = c;
    791       1.3     oster 			if (!pg_c)
    792       1.3     oster 				return (EFAULT);
    793       1.3     oster 		}
    794       1.3     oster 		if (!dst_is_a)
    795       1.3     oster 			if (RF_PAGE_ALIGNED(dst)) {
    796      1.16     oster 				pg_dst = dst;
    797       1.3     oster 				if (!pg_dst)
    798       1.3     oster 					return (EFAULT);
    799       1.3     oster 			}
    800       1.3     oster 		len--;
    801       1.3     oster 	}
    802       1.3     oster 	return (0);
    803       1.3     oster }
    804       1.3     oster 
    805      1.23     perry int
    806      1.23     perry rf_bxor3(unsigned char *dst, unsigned char *a, unsigned char *b,
    807      1.15     oster 	 unsigned char *c, unsigned long len, void *bp)
    808       1.1     oster {
    809       1.3     oster 	RF_ASSERT(((RF_UL(dst) | RF_UL(a) | RF_UL(b) | RF_UL(c) | len) & 0x7) == 0);
    810       1.1     oster 
    811       1.3     oster 	return (rf_longword_bxor3((unsigned long *) dst, (unsigned long *) a,
    812       1.3     oster 		(unsigned long *) b, (unsigned long *) c, len >> RF_LONGSHIFT, bp));
    813       1.1     oster }
    814       1.9     oster #endif
    815