Home | History | Annotate | Line # | Download | only in raidframe
rf_dagfuncs.c revision 1.2
      1 /*	$NetBSD: rf_dagfuncs.c,v 1.2 1999/01/26 02:33:53 oster Exp $	*/
      2 /*
      3  * Copyright (c) 1995 Carnegie-Mellon University.
      4  * All rights reserved.
      5  *
      6  * Author: Mark Holland, William V. Courtright II
      7  *
      8  * Permission to use, copy, modify and distribute this software and
      9  * its documentation is hereby granted, provided that both the copyright
     10  * notice and this permission notice appear in all copies of the
     11  * software, derivative works or modified versions, and any portions
     12  * thereof, and that both notices appear in supporting documentation.
     13  *
     14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     17  *
     18  * Carnegie Mellon requests users of this software to return to
     19  *
     20  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     21  *  School of Computer Science
     22  *  Carnegie Mellon University
     23  *  Pittsburgh PA 15213-3890
     24  *
     25  * any improvements or extensions that they make and grant Carnegie the
     26  * rights to redistribute these changes.
     27  */
     28 
     29 /*
     30  * dagfuncs.c -- DAG node execution routines
     31  *
     32  * Rules:
     33  * 1. Every DAG execution function must eventually cause node->status to
     34  *    get set to "good" or "bad", and "FinishNode" to be called. In the
     35  *    case of nodes that complete immediately (xor, NullNodeFunc, etc),
     36  *    the node execution function can do these two things directly. In
     37  *    the case of nodes that have to wait for some event (a disk read to
     38  *    complete, a lock to be released, etc) to occur before they can
     39  *    complete, this is typically achieved by having whatever module
     40  *    is doing the operation call GenericWakeupFunc upon completion.
     41  * 2. DAG execution functions should check the status in the DAG header
     42  *    and NOP out their operations if the status is not "enable". However,
     43  *    execution functions that release resources must be sure to release
     44  *    them even when they NOP out the function that would use them.
     45  *    Functions that acquire resources should go ahead and acquire them
     46  *    even when they NOP, so that a downstream release node will not have
     47  *    to check to find out whether or not the acquire was suppressed.
     48  */
     49 
     50 #include <sys/ioctl.h>
     51 #include <sys/param.h>
     52 
     53 #include "rf_archs.h"
     54 #include "rf_raid.h"
     55 #include "rf_dag.h"
     56 #include "rf_layout.h"
     57 #include "rf_etimer.h"
     58 #include "rf_acctrace.h"
     59 #include "rf_diskqueue.h"
     60 #include "rf_dagfuncs.h"
     61 #include "rf_general.h"
     62 #include "rf_engine.h"
     63 #include "rf_dagutils.h"
     64 
     65 #include "rf_kintf.h"
     66 
     67 #if RF_INCLUDE_PARITYLOGGING > 0
     68 #include "rf_paritylog.h"
     69 #endif /* RF_INCLUDE_PARITYLOGGING > 0 */
     70 
/*
 * Dispatch table for DAG node do/undo routines.  These pointers are
 * installed by rf_ConfigureDAGFuncs() below; DAG construction code
 * references the pointers rather than the concrete thread-based
 * implementations so the bindings can be chosen at configure time.
 */
int (*rf_DiskReadFunc)(RF_DagNode_t *);
int (*rf_DiskWriteFunc)(RF_DagNode_t *);
int (*rf_DiskReadUndoFunc)(RF_DagNode_t *);
int (*rf_DiskWriteUndoFunc)(RF_DagNode_t *);
int (*rf_DiskUnlockFunc)(RF_DagNode_t *);
int (*rf_DiskUnlockUndoFunc)(RF_DagNode_t *);
int (*rf_RegularXorUndoFunc)(RF_DagNode_t *);
int (*rf_SimpleXorUndoFunc)(RF_DagNode_t *);
int (*rf_RecoveryXorUndoFunc)(RF_DagNode_t *);
     80 
     81 /*****************************************************************************************
     82  * main (only) configuration routine for this module
     83  ****************************************************************************************/
     84 int rf_ConfigureDAGFuncs(listp)
     85   RF_ShutdownList_t  **listp;
     86 {
     87   RF_ASSERT( ((sizeof(long)==8) && RF_LONGSHIFT==3) || ((sizeof(long)==4)  && RF_LONGSHIFT==2) );
     88   rf_DiskReadFunc  = rf_DiskReadFuncForThreads;
     89   rf_DiskReadUndoFunc = rf_DiskUndoFunc;
     90   rf_DiskWriteFunc = rf_DiskWriteFuncForThreads;
     91   rf_DiskWriteUndoFunc = rf_DiskUndoFunc;
     92   rf_DiskUnlockFunc = rf_DiskUnlockFuncForThreads;
     93   rf_DiskUnlockUndoFunc = rf_NullNodeUndoFunc;
     94   rf_RegularXorUndoFunc = rf_NullNodeUndoFunc;
     95   rf_SimpleXorUndoFunc = rf_NullNodeUndoFunc;
     96   rf_RecoveryXorUndoFunc = rf_NullNodeUndoFunc;
     97   return(0);
     98 }
     99 
    100 
    101 
    102 /*****************************************************************************************
    103  * the execution function associated with a terminate node
    104  ****************************************************************************************/
    105 int rf_TerminateFunc(node)
    106   RF_DagNode_t  *node;
    107 {
    108   RF_ASSERT(node->dagHdr->numCommits == node->dagHdr->numCommitNodes);
    109   node->status = rf_good;
    110   return(rf_FinishNode(node, RF_THREAD_CONTEXT));
    111 }
    112 
    113 int rf_TerminateUndoFunc(node)
    114   RF_DagNode_t  *node;
    115 {
    116   return(0);
    117 }
    118 
    119 
    120 /*****************************************************************************************
    121  * execution functions associated with a mirror node
    122  *
    123  * parameters:
    124  *
    125  * 0 - physical disk addres of data
    126  * 1 - buffer for holding read data
    127  * 2 - parity stripe ID
    128  * 3 - flags
    129  * 4 - physical disk address of mirror (parity)
    130  *
    131  ****************************************************************************************/
    132 
    133 int rf_DiskReadMirrorIdleFunc(node)
    134   RF_DagNode_t  *node;
    135 {
    136   /* select the mirror copy with the shortest queue and fill in node parameters
    137      with physical disk address */
    138 
    139   rf_SelectMirrorDiskIdle(node);
    140   return(rf_DiskReadFunc(node));
    141 }
    142 
    143 int rf_DiskReadMirrorPartitionFunc(node)
    144   RF_DagNode_t  *node;
    145 {
    146   /* select the mirror copy with the shortest queue and fill in node parameters
    147      with physical disk address */
    148 
    149   rf_SelectMirrorDiskPartition(node);
    150   return(rf_DiskReadFunc(node));
    151 }
    152 
    153 int rf_DiskReadMirrorUndoFunc(node)
    154   RF_DagNode_t  *node;
    155 {
    156   return(0);
    157 }
    158 
    159 
    160 
    161 #if RF_INCLUDE_PARITYLOGGING > 0
    162 /*****************************************************************************************
    163  * the execution function associated with a parity log update node
    164  ****************************************************************************************/
/*
 * Parity log update node: package the node's buffer and physical disk
 * address into a parity-log record and append it to the parity log.
 * params[0] = PDA of the data, params[1] = data buffer.
 * Completion is signalled through node->wakeFunc (passed into the log
 * data so the log module can wake the node when the append completes).
 */
int rf_ParityLogUpdateFunc(node)
  RF_DagNode_t  *node;
{
  RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
  caddr_t buf = (caddr_t) node->params[1].p;
  RF_ParityLogData_t *logData;
  RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
  RF_Etimer_t timer;

  /* NOP if the DAG has been disabled (see rule 2 in the file header) */
  if (node->dagHdr->status == rf_enable)
    {
      RF_ETIMER_START(timer);
      logData = rf_CreateParityLogData(RF_UPDATE, pda, buf,
				       (RF_Raid_t *) (node->dagHdr->raidPtr),
				       node->wakeFunc, (void *) node,
				       node->dagHdr->tracerec, timer);
      if (logData)
	rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE);
      else
	{
	  /* allocation failed: charge elapsed time to parity-log tracing
	   * and wake the node with ENOMEM so the DAG can react */
	  RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->plog_us += RF_ETIMER_VAL_US(timer);
	  (node->wakeFunc)(node, ENOMEM);
	}
    }
    /* NOTE(review): when the DAG is not enabled, wakeFunc is not invoked
     * here — presumably the engine handles suppressed nodes; confirm. */
    return(0);
}
    191 
    192 
    193 /*****************************************************************************************
    194  * the execution function associated with a parity log overwrite node
    195  ****************************************************************************************/
/*
 * Parity log overwrite node: identical in structure to the update node
 * above, but logs an RF_OVERWRITE record instead of RF_UPDATE.
 * params[0] = PDA of the data, params[1] = data buffer.
 */
int rf_ParityLogOverwriteFunc(node)
  RF_DagNode_t  *node;
{
  RF_PhysDiskAddr_t  *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
  caddr_t buf = (caddr_t) node->params[1].p;
  RF_ParityLogData_t *logData;
  RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
  RF_Etimer_t timer;

  /* NOP if the DAG has been disabled */
  if (node->dagHdr->status == rf_enable)
    {
      RF_ETIMER_START(timer);
      logData = rf_CreateParityLogData(RF_OVERWRITE, pda, buf, (RF_Raid_t *) (node->dagHdr->raidPtr),
				    node->wakeFunc, (void *) node, node->dagHdr->tracerec, timer);
      if (logData)
	rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE);
      else
	{
	  /* allocation failed: record elapsed time, wake node with ENOMEM */
	  RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->plog_us += RF_ETIMER_VAL_US(timer);
	  (node->wakeFunc)(node, ENOMEM);
	}
    }
    return(0);
}
    220 
    221 #else /* RF_INCLUDE_PARITYLOGGING > 0 */
    222 
    223 int rf_ParityLogUpdateFunc(node)
    224   RF_DagNode_t  *node;
    225 {
    226   return(0);
    227 }
    228 int rf_ParityLogOverwriteFunc(node)
    229   RF_DagNode_t  *node;
    230 {
    231   return(0);
    232 }
    233 
    234 #endif /* RF_INCLUDE_PARITYLOGGING > 0 */
    235 
    236 int rf_ParityLogUpdateUndoFunc(node)
    237   RF_DagNode_t  *node;
    238 {
    239   return(0);
    240 }
    241 
    242 int rf_ParityLogOverwriteUndoFunc(node)
    243   RF_DagNode_t  *node;
    244 {
    245   return(0);
    246 }
    247 
    248 /*****************************************************************************************
    249  * the execution function associated with a NOP node
    250  ****************************************************************************************/
    251 int rf_NullNodeFunc(node)
    252   RF_DagNode_t  *node;
    253 {
    254   node->status = rf_good;
    255   return(rf_FinishNode(node, RF_THREAD_CONTEXT));
    256 }
    257 
    258 int rf_NullNodeUndoFunc(node)
    259   RF_DagNode_t  *node;
    260 {
    261   node->status = rf_undone;
    262   return(rf_FinishNode(node, RF_THREAD_CONTEXT));
    263 }
    264 
    265 
    266 /*****************************************************************************************
    267  * the execution function associated with a disk-read node
    268  ****************************************************************************************/
/*
 * Disk-read node: build a disk queue request from the node params and
 * enqueue it on the target disk's queue.  Completion is signalled via
 * node->wakeFunc (normally rf_GenericWakeupFunc).
 *
 * params[0] = PDA (target row/col/sector range)
 * params[1] = destination buffer
 * params[2] = parity stripe ID
 * params[3] = packed priority / lock / unlock / which_ru flags
 *
 * If the DAG is disabled the request is still enqueued, but as a NOP,
 * so that any lock/unlock flags it carries are still honored (rule 2
 * in the file header).  Always returns 0; errors are delivered through
 * the wake function.
 */
int rf_DiskReadFuncForThreads(node)
  RF_DagNode_t  *node;
{
  RF_DiskQueueData_t *req;
  RF_PhysDiskAddr_t  *pda       = (RF_PhysDiskAddr_t *)node->params[0].p;
  caddr_t        buf            = (caddr_t)node->params[1].p;
  RF_StripeNum_t parityStripeID = (RF_StripeNum_t)node->params[2].v;
  unsigned       priority       = RF_EXTRACT_PRIORITY(node->params[3].v);
  unsigned       lock           = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
  unsigned       unlock         = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
  unsigned       which_ru       = RF_EXTRACT_RU(node->params[3].v);
  RF_DiskQueueDataFlags_t flags = 0;
  RF_IoType_t    iotype = (node->dagHdr->status == rf_enable) ? RF_IO_TYPE_READ : RF_IO_TYPE_NOP;
  RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
  void *b_proc = NULL;
#if RF_BACKWARD > 0
  caddr_t        undoBuf;
#endif

  /* propagate the originating process from the kernel buf, if any */
  if (node->dagHdr->bp) b_proc = (void *) ((struct buf *) node->dagHdr->bp)->b_proc;

  /* a single request may lock OR unlock the queue, never both */
  RF_ASSERT( !(lock && unlock) );
  flags |= (lock)   ? RF_LOCK_DISK_QUEUE   : 0;
  flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0;
#if RF_BACKWARD > 0
  /* backward-error-recovery experiments only: allocate and zero an undo
   * buffer, mimicking a copy of the original contents before the read.
   * XXX hardcoded 512 bytes per sector!
   */
  if (node->dagHdr->allocList == NULL)
    rf_MakeAllocList(node->dagHdr->allocList);
  RF_CallocAndAdd(undoBuf, 1, 512 * pda->numSector, (caddr_t), node->dagHdr->allocList);
#endif /* RF_BACKWARD > 0 */
  req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector,
			       buf, parityStripeID, which_ru,
			       (int (*)(void *,int)) node->wakeFunc,
			       node, NULL, node->dagHdr->tracerec,
			    (void *)(node->dagHdr->raidPtr), flags, b_proc);
  if (!req) {
    /* no request structure available: report ENOMEM via the wake func */
    (node->wakeFunc)(node, ENOMEM);
  } else {
    /* remember the request so the wakeup func can free it later */
    node->dagFuncData = (void *) req;
    rf_DiskIOEnqueue( &(dqs[pda->row][pda->col]), req, priority );
  }
  return(0);
}
    316 
    317 
    318 /*****************************************************************************************
    319  * the execution function associated with a disk-write node
    320  ****************************************************************************************/
/*
 * Disk-write node: build a disk queue request from the node params and
 * enqueue it on the target disk's queue.  Parameter layout is identical
 * to rf_DiskReadFuncForThreads above.
 *
 * When compiled with RF_BACKWARD > 0 the node runs in two phases for
 * backward-error-recovery experiments: a pre-read of the old data into
 * an undo buffer (status rf_bwd1), after which rf_GenericWakeupFunc
 * re-invokes this function (status rf_bwd2) to issue the real write.
 * Note that under RF_BACKWARD the function returns inside the #if
 * block, so the "normal processing" code below it is compiled out of
 * reach.  Always returns 0; errors are delivered via the wake func.
 */
int rf_DiskWriteFuncForThreads(node)
  RF_DagNode_t  *node;
{
  RF_DiskQueueData_t *req;
  RF_PhysDiskAddr_t  *pda       = (RF_PhysDiskAddr_t *)node->params[0].p;
  caddr_t        buf            = (caddr_t)node->params[1].p;
  RF_StripeNum_t parityStripeID = (RF_StripeNum_t)node->params[2].v;
  unsigned       priority       = RF_EXTRACT_PRIORITY(node->params[3].v);
  unsigned       lock           = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
  unsigned       unlock         = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
  unsigned       which_ru       = RF_EXTRACT_RU(node->params[3].v);
  RF_DiskQueueDataFlags_t flags = 0;
  RF_IoType_t    iotype = (node->dagHdr->status == rf_enable) ? RF_IO_TYPE_WRITE : RF_IO_TYPE_NOP;
  RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
  void *b_proc = NULL;
#if RF_BACKWARD > 0
  caddr_t undoBuf;
#endif

  /* propagate the originating process from the kernel buf, if any */
  if (node->dagHdr->bp) b_proc = (void *) ((struct buf *) node->dagHdr->bp)->b_proc;

#if RF_BACKWARD > 0
  /* This area is used only for backward error recovery experiments.
   * First, allocate a buffer and schedule a pre-read of the disk.
   * After the pre-read, proceed with the normal disk write.
   */
  if (node->status == rf_bwd2) {
    /* phase 2: undo logging done, now perform the real write */
    node->status = rf_fired;
    RF_ASSERT( !(lock && unlock) );
    flags |= (lock)   ? RF_LOCK_DISK_QUEUE   : 0;
    flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0;
    req = rf_CreateDiskQueueData(iotype,
			      pda->startSector, pda->numSector, buf, parityStripeID, which_ru,
			      node->wakeFunc, (void *) node, NULL, node->dagHdr->tracerec,
			      (void *) (node->dagHdr->raidPtr), flags, b_proc);

    if (!req) {
      (node->wakeFunc)(node, ENOMEM);
    } else {
      node->dagFuncData = (void *) req;
      rf_DiskIOEnqueue( &(dqs[pda->row][pda->col]), req, priority );
    }
  }

  else {
    /* phase 1: node status should be rf_fired; schedule a disk pre-read
     * of the old contents into a freshly allocated undo buffer.
     * XXX hardcoded 512 bytes per sector!
     */
    node->status = rf_bwd1;
    RF_ASSERT( !(lock && unlock) );
    flags |= (lock)   ? RF_LOCK_DISK_QUEUE   : 0;
    flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0;
    if (node->dagHdr->allocList == NULL)
      rf_MakeAllocList(node->dagHdr->allocList);
    RF_CallocAndAdd(undoBuf, 1, 512 * pda->numSector, (caddr_t), node->dagHdr->allocList);
    req = rf_CreateDiskQueueData(RF_IO_TYPE_READ,
			      pda->startSector, pda->numSector, undoBuf, parityStripeID, which_ru,
			      node->wakeFunc, (void *) node, NULL, node->dagHdr->tracerec,
			      (void *) (node->dagHdr->raidPtr), flags, b_proc);

    if (!req) {
      (node->wakeFunc)(node, ENOMEM);
    } else {
      node->dagFuncData = (void *) req;
      rf_DiskIOEnqueue( &(dqs[pda->row][pda->col]), req, priority );
    }
  }
  return(0);
#endif /* RF_BACKWARD > 0 */

  /* normal processing (rollaway or forward recovery) begins here */
  RF_ASSERT( !(lock && unlock) );
  flags |= (lock)   ? RF_LOCK_DISK_QUEUE   : 0;
  flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0;
  req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector,
			       buf, parityStripeID, which_ru,
			       (int (*)(void *,int)) node->wakeFunc,
			       (void *) node, NULL,
			       node->dagHdr->tracerec,
			       (void *) (node->dagHdr->raidPtr),
			       flags, b_proc);

  if (!req) {
    /* no request structure available: report ENOMEM via the wake func */
    (node->wakeFunc)(node, ENOMEM);
  } else {
    /* remember the request so the wakeup func can free it later */
    node->dagFuncData = (void *) req;
    rf_DiskIOEnqueue( &(dqs[pda->row][pda->col]), req, priority );
  }

  return(0);
}
    412 
    413 /*****************************************************************************************
    414  * the undo function for disk nodes
    415  * Note:  this is not a proper undo of a write node, only locks are released.
    416  *        old data is not restored to disk!
    417  ****************************************************************************************/
/*
 * Undo for disk read/write nodes: enqueue a NOP request carrying the
 * RF_UNLOCK_DISK_QUEUE flag so the target queue's lock is released.
 * As the banner above says, this does NOT restore old data to disk;
 * only the queue lock is released.  Always returns 0.
 */
int rf_DiskUndoFunc(node)
  RF_DagNode_t  *node;
{
  RF_DiskQueueData_t *req;
  RF_PhysDiskAddr_t  *pda = (RF_PhysDiskAddr_t *)node->params[0].p;
  RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;

  /* NOP I/O: no sectors, no buffer — only the unlock flag matters */
  req = rf_CreateDiskQueueData(RF_IO_TYPE_NOP,
			       0L, 0, NULL, 0L, 0,
			       (int (*)(void *,int)) node->wakeFunc,
			       (void *) node,
			       NULL, node->dagHdr->tracerec,
			       (void *) (node->dagHdr->raidPtr),
			       RF_UNLOCK_DISK_QUEUE, NULL);
  if (!req)
    /* no request structure available: report ENOMEM via the wake func */
    (node->wakeFunc)(node, ENOMEM);
  else {
    node->dagFuncData = (void *) req;
    rf_DiskIOEnqueue( &(dqs[pda->row][pda->col]), req, RF_IO_NORMAL_PRIORITY );
  }

  return(0);
}
    441 
    442 /*****************************************************************************************
    443  * the execution function associated with an "unlock disk queue" node
    444  ****************************************************************************************/
    445 int rf_DiskUnlockFuncForThreads(node)
    446   RF_DagNode_t  *node;
    447 {
    448   RF_DiskQueueData_t *req;
    449   RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *)node->params[0].p;
    450   RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
    451 
    452   req = rf_CreateDiskQueueData(RF_IO_TYPE_NOP,
    453 			       0L, 0, NULL, 0L, 0,
    454 			       (int (*)(void *,int)) node->wakeFunc,
    455 			       (void *) node,
    456 			       NULL, node->dagHdr->tracerec,
    457 			       (void *) (node->dagHdr->raidPtr),
    458 			       RF_UNLOCK_DISK_QUEUE, NULL);
    459   if (!req)
    460     (node->wakeFunc)(node, ENOMEM);
    461   else {
    462     node->dagFuncData = (void *) req;
    463     rf_DiskIOEnqueue( &(dqs[pda->row][pda->col]), req, RF_IO_NORMAL_PRIORITY );
    464   }
    465 
    466   return(0);
    467 }
    468 
    469 /*****************************************************************************************
    470  * Callback routine for DiskRead and DiskWrite nodes.  When the disk op completes,
    471  * the routine is called to set the node status and inform the execution engine that
    472  * the node has fired.
    473  ****************************************************************************************/
/*
 * Completion callback for DiskRead/DiskWrite nodes: translate the I/O
 * status into a node status, free the queued request, and finish the
 * node in interrupt context.  `status' is the I/O result (0 = success).
 *
 * Special case rf_bwd1 (backward-recovery pre-read finished): free the
 * pre-read request, advance to rf_bwd2, and re-dispatch the write node
 * — in that path the node is NOT finished here.
 */
int rf_GenericWakeupFunc(node, status)
  RF_DagNode_t  *node;
  int            status;
{
  switch (node->status) {
  case rf_bwd1 :
    /* pre-read phase done: free its request and issue the real write */
    node->status = rf_bwd2;
    if (node->dagFuncData)
      rf_FreeDiskQueueData((RF_DiskQueueData_t *) node->dagFuncData);
    return(rf_DiskWriteFuncForThreads(node));
    break;	/* unreachable after the return above */
  case rf_fired :
    /* normal completion: map the I/O status onto good/bad */
    if (status) node->status = rf_bad;
    else node->status = rf_good;
    break;
  case rf_recover :
    /* probably should never reach this case */
    if (status) node->status = rf_panic;
    else node->status = rf_undone;
    break;
  default :
    /* waking a node in any other state is a bug */
    RF_PANIC();
    break;
  }
  /* release the disk queue request allocated when the node fired */
  if (node->dagFuncData)
    rf_FreeDiskQueueData((RF_DiskQueueData_t *) node->dagFuncData);
  return(rf_FinishNode(node, RF_INTR_CONTEXT));
}
    502 
    503 
    504 /*****************************************************************************************
    505  * there are three distinct types of xor nodes
    506  * A "regular xor" is used in the fault-free case where the access spans a complete
    507  * stripe unit.  It assumes that the result buffer is one full stripe unit in size,
    508  * and uses the stripe-unit-offset values that it computes from the PDAs to determine
    509  * where within the stripe unit to XOR each argument buffer.
    510  *
    511  * A "simple xor" is used in the fault-free case where the access touches only a portion
    512  * of one (or two, in some cases) stripe unit(s).  It assumes that all the argument
    513  * buffers are of the same size and have the same stripe unit offset.
    514  *
    515  * A "recovery xor" is used in the degraded-mode case.  It's similar to the regular
    516  * xor function except that it takes the failed PDA as an additional parameter, and
    517  * uses it to determine what portions of the argument buffers need to be xor'd into
    518  * the result buffer, and where in the result buffer they should go.
    519  ****************************************************************************************/
    520 
    521 /* xor the params together and store the result in the result field.
    522  * assume the result field points to a buffer that is the size of one SU,
    523  * and use the pda params to determine where within the buffer to XOR
    524  * the input buffers.
    525  */
/* xor the params together and store the result in the result field.
 * assume the result field points to a buffer that is the size of one SU,
 * and use the pda params to determine where within the buffer to XOR
 * the input buffers.
 *
 * params are (pda, buf) pairs; the final param is the RF_Raid_t pointer.
 * Completes immediately: rf_GenericWakeupFunc is called directly since
 * this node performs no I/O.
 */
int rf_RegularXorFunc(node)
  RF_DagNode_t  *node;
{
  RF_Raid_t *raidPtr = (RF_Raid_t *)node->params[node->numParams-1].p;
  RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
  RF_Etimer_t timer;
  int i, retcode;
#if RF_BACKWARD > 0
  RF_PhysDiskAddr_t *pda;
  caddr_t undoBuf;
#endif

  retcode = 0;
  if (node->dagHdr->status == rf_enable) {
    /* don't do the XOR if the input is the same as the output */
    RF_ETIMER_START(timer);
    for (i=0; i<node->numParams-1; i+=2) if (node->params[i+1].p != node->results[0]) {
#if RF_BACKWARD > 0
      /* backward-error-recovery experiments only: mimic undo logging by
       * allocating and zeroing a buffer per input.
       * XXX 512 byte sector size is hard coded!
       */
      pda = node->params[i].p;
      if (node->dagHdr->allocList == NULL)
	rf_MakeAllocList(node->dagHdr->allocList);
      RF_CallocAndAdd(undoBuf, 1, 512 * pda->numSector, (caddr_t), node->dagHdr->allocList);
#endif /* RF_BACKWARD > 0 */
      retcode = rf_XorIntoBuffer(raidPtr, (RF_PhysDiskAddr_t *) node->params[i].p,
			      (char *)node->params[i+1].p, (char *) node->results[0], node->dagHdr->bp);
    }
    /* charge elapsed time to the access trace's xor counter */
    RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->xor_us += RF_ETIMER_VAL_US(timer);
  }
  return(rf_GenericWakeupFunc(node, retcode));     /* call wake func explicitly since no I/O in this node */
}
    560 
    561 /* xor the inputs into the result buffer, ignoring placement issues */
/* xor the inputs into the result buffer, ignoring placement issues:
 * all argument buffers are assumed to be the same size with the same
 * stripe-unit offset (see the banner above).  params are (pda, buf)
 * pairs; the final param is the RF_Raid_t pointer.  Completes
 * immediately via rf_GenericWakeupFunc since this node performs no I/O.
 */
int rf_SimpleXorFunc(node)
  RF_DagNode_t  *node;
{
  RF_Raid_t *raidPtr = (RF_Raid_t *)node->params[node->numParams-1].p;
  int i, retcode = 0;
  RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
  RF_Etimer_t timer;
#if RF_BACKWARD > 0
  RF_PhysDiskAddr_t *pda;
  caddr_t undoBuf;
#endif

  if (node->dagHdr->status == rf_enable) {
    RF_ETIMER_START(timer);
    /* don't do the XOR if the input is the same as the output */
    for (i=0; i<node->numParams-1; i+=2) if (node->params[i+1].p != node->results[0]) {
#if RF_BACKWARD > 0
      /* backward-error-recovery experiments only: mimic undo logging by
       * allocating and zeroing a buffer per input.
       * XXX 512 byte sector size is hard coded!
       */
      pda = node->params[i].p;
      if (node->dagHdr->allocList == NULL)
	rf_MakeAllocList(node->dagHdr->allocList);
      RF_CallocAndAdd(undoBuf, 1, 512 * pda->numSector, (caddr_t), node->dagHdr->allocList);
#endif /* RF_BACKWARD > 0 */
      retcode = rf_bxor((char *)node->params[i+1].p, (char *) node->results[0],
		     rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *)node->params[i].p)->numSector),
		     (struct buf *) node->dagHdr->bp);
    }
    /* charge elapsed time to the access trace's xor counter */
    RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->xor_us += RF_ETIMER_VAL_US(timer);
  }

  return(rf_GenericWakeupFunc(node, retcode));     /* call wake func explicitly since no I/O in this node */
}
    597 
    598 /* this xor is used by the degraded-mode dag functions to recover lost data.
    599  * the second-to-last parameter is the PDA for the failed portion of the access.
    600  * the code here looks at this PDA and assumes that the xor target buffer is
    601  * equal in size to the number of sectors in the failed PDA.  It then uses
    602  * the other PDAs in the parameter list to determine where within the target
    603  * buffer the corresponding data should be xored.
    604  */
/* this xor is used by the degraded-mode dag functions to recover lost data.
 * the second-to-last parameter is the PDA for the failed portion of the access.
 * the code here looks at this PDA and assumes that the xor target buffer is
 * equal in size to the number of sectors in the failed PDA.  It then uses
 * the other PDAs in the parameter list to determine where within the target
 * buffer the corresponding data should be xored.
 *
 * Completes immediately via rf_GenericWakeupFunc since this node
 * performs no I/O.
 */
int rf_RecoveryXorFunc(node)
  RF_DagNode_t  *node;
{
  RF_Raid_t *raidPtr = (RF_Raid_t *)node->params[node->numParams-1].p;
  RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout;
  RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *)node->params[node->numParams-2].p;
  int i, retcode = 0;
  RF_PhysDiskAddr_t *pda;
  int suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr,failedPDA->startSector);
  char *srcbuf, *destbuf;
  RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
  RF_Etimer_t timer;
#if RF_BACKWARD > 0
  caddr_t undoBuf;
#endif

  if (node->dagHdr->status == rf_enable) {
    RF_ETIMER_START(timer);
    /* walk the (pda, buf) pairs, skipping any whose buffer IS the target */
    for (i=0; i<node->numParams-2; i+=2) if (node->params[i+1].p != node->results[0]) {
      pda = (RF_PhysDiskAddr_t *)node->params[i].p;
#if RF_BACKWARD > 0
      /* backward-error-recovery experiments only: mimic undo logging by
       * allocating and zeroing a buffer per input.
       * XXX 512 byte sector size is hard coded!
       */
      if (node->dagHdr->allocList == NULL)
	rf_MakeAllocList(node->dagHdr->allocList);
      RF_CallocAndAdd(undoBuf, 1, 512 * pda->numSector, (caddr_t), node->dagHdr->allocList);
#endif /* RF_BACKWARD > 0 */
      srcbuf = (char *)node->params[i+1].p;
      /* place this input relative to the failed PDA's stripe-unit offset */
      suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
      destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr,suoffset-failedSUOffset);
      retcode = rf_bxor(srcbuf, destbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), node->dagHdr->bp);
    }
    /* charge elapsed time to the access trace's xor counter */
    RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->xor_us += RF_ETIMER_VAL_US(timer);
  }
  return (rf_GenericWakeupFunc(node, retcode));
}
    643 
    644 /*****************************************************************************************
    645  * The next three functions are utilities used by the above xor-execution functions.
    646  ****************************************************************************************/
    647 
    648 
    649 /*
    650  * this is just a glorified buffer xor.  targbuf points to a buffer that is one full stripe unit
    651  * in size.  srcbuf points to a buffer that may be less than 1 SU, but never more.  When the
    652  * access described by pda is one SU in size (which by implication means it's SU-aligned),
    653  * all that happens is (targbuf) <- (srcbuf ^ targbuf).  When the access is less than one
    654  * SU in size the XOR occurs on only the portion of targbuf identified in the pda.
    655  */
    656 
    657 int rf_XorIntoBuffer(raidPtr, pda, srcbuf, targbuf, bp)
    658   RF_Raid_t          *raidPtr;
    659   RF_PhysDiskAddr_t  *pda;
    660   char               *srcbuf;
    661   char               *targbuf;
    662   void               *bp;
    663 {
    664   char *targptr;
    665   int sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    666   int SUOffset = pda->startSector % sectPerSU;
    667   int length, retcode = 0;
    668 
    669   RF_ASSERT(pda->numSector <= sectPerSU);
    670 
    671   targptr = targbuf + rf_RaidAddressToByte(raidPtr, SUOffset);
    672   length  = rf_RaidAddressToByte(raidPtr, pda->numSector);
    673   retcode = rf_bxor(srcbuf, targptr, length, bp);
    674   return(retcode);
    675 }
    676 
    677 /* it really should be the case that the buffer pointers (returned by malloc)
    678  * are aligned to the natural word size of the machine, so this is the only
    679  * case we optimize for.  The length should always be a multiple of the sector
    680  * size, so there should be no problem with leftover bytes at the end.
    681  */
/* it really should be the case that the buffer pointers (returned by malloc)
 * are aligned to the natural word size of the machine, so this is the only
 * case we optimize for.  The length should always be a multiple of the sector
 * size, so there should be no problem with leftover bytes at the end.
 *
 * src/dest must both be long-aligned and len (bytes) a multiple of
 * sizeof(long); anything else trips the assertion below — the
 * byte-at-a-time fallback is deliberately not implemented.
 */
int rf_bxor(src, dest, len, bp)
  char  *src;
  char  *dest;
  int    len;
  void  *bp;
{
  unsigned mask = sizeof(long) -1, retcode = 0;

  /* fast path: everything aligned, hand off to the longword xor */
  if ( !(((unsigned long) src) & mask) && !(((unsigned long) dest) & mask) && !(len&mask) ) {
    retcode = rf_longword_bxor((unsigned long *) src, (unsigned long *) dest, len>>RF_LONGSHIFT, bp);
  } else {
    RF_ASSERT(0);
  }
  return(retcode);
}
    697 
    698 /* map a user buffer into kernel space, if necessary */
    699 #define REMAP_VA(_bp,x,y) (y) = (x)
    700 
    701 /* When XORing in kernel mode, we need to map each user page to kernel space before we can access it.
    702  * We don't want to assume anything about which input buffers are in kernel/user
    703  * space, nor about their alignment, so in each loop we compute the maximum number
    704  * of bytes that we can xor without crossing any page boundaries, and do only this many
    705  * bytes before the next remap.
    706  */
/* XOR `len` longwords of src into dest (dest ^= src), remapping user pages
 * into kernel space as page boundaries are crossed.  With the identity
 * REMAP_VA above, pg_src/pg_dest simply track src/dest.
 * Returns 0 on success, EFAULT if a remap yields a NULL pointer.
 */
int rf_longword_bxor(src, dest, len, bp)
  register unsigned long  *src;
  register unsigned long  *dest;
  int                      len; /* longwords */
  void                    *bp;
{
  register unsigned long *end = src+len;                   /* one past the last source word */
  register unsigned long d0, d1, d2, d3, s0, s1, s2, s3;   /* temps */
  register unsigned long *pg_src, *pg_dest;                /* per-page source/dest pointers */
  int longs_this_time;                                     /* # longwords to xor in the current iteration */

  REMAP_VA(bp, src, pg_src);
  REMAP_VA(bp, dest, pg_dest);
  if (!pg_src || !pg_dest) return(EFAULT);

  while (len >= 4 ) {
    /* xor at most up to the nearer page boundary of the two buffers */
    longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(pg_src), RF_BLIP(pg_dest)) >> RF_LONGSHIFT);  /* note len in longwords */
    /* advance the virtual-address cursors now; pg_* do the actual walking below */
    src += longs_this_time; dest+= longs_this_time; len -= longs_this_time;
    /* unrolled 4-wide: load all eight words before storing, to help dual issue */
    while (longs_this_time >= 4) {
      d0 = pg_dest[0];
      d1 = pg_dest[1];
      d2 = pg_dest[2];
      d3 = pg_dest[3];
      s0 = pg_src[0];
      s1 = pg_src[1];
      s2 = pg_src[2];
      s3 = pg_src[3];
      pg_dest[0] = d0 ^ s0;
      pg_dest[1] = d1 ^ s1;
      pg_dest[2] = d2 ^ s2;
      pg_dest[3] = d3 ^ s3;
      pg_src += 4;
      pg_dest += 4;
      longs_this_time -= 4;
    }
    while (longs_this_time > 0) {   /* cannot cross any page boundaries here */
      *pg_dest++ ^= *pg_src++;
      longs_this_time--;
    }

    /* either we're done, or we've reached a page boundary on one (or possibly both) of the pointers */
    if (len) {
      if (RF_PAGE_ALIGNED(src))  REMAP_VA(bp, src, pg_src);
      if (RF_PAGE_ALIGNED(dest)) REMAP_VA(bp, dest, pg_dest);
      if (!pg_src || !pg_dest) return(EFAULT);
    }
  }
  /* tail: fewer than 4 longwords left; go one word at a time, remapping
   * whenever a cursor lands on a page boundary.
   * NOTE(review): unlike the loop above, these remaps are not followed by a
   * NULL check — benign with the identity REMAP_VA, but worth confirming if
   * a real remapping implementation is ever restored. */
  while (src < end) {
    *pg_dest++ ^=  *pg_src++;
    src++; dest++; len--;
    if (RF_PAGE_ALIGNED(src)) REMAP_VA(bp, src, pg_src);
    if (RF_PAGE_ALIGNED(dest)) REMAP_VA(bp, dest, pg_dest);
  }
  RF_ASSERT(len == 0);
  return(0);
}
    763 
    764 
    765 /*
    766    dst = a ^ b ^ c;
    767    a may equal dst
    768    see comment above longword_bxor
    769 */
    770 int rf_longword_bxor3(dst,a,b,c,len, bp)
    771   register unsigned long  *dst;
    772   register unsigned long  *a;
    773   register unsigned long  *b;
    774   register unsigned long  *c;
    775   int                      len; /* length in longwords */
    776   void                    *bp;
    777 {
    778   unsigned long a0,a1,a2,a3, b0,b1,b2,b3;
    779   register unsigned long *pg_a, *pg_b, *pg_c, *pg_dst;    /* per-page source/dest pointers */
    780   int longs_this_time;                                     /* # longs to xor in the current iteration */
    781   char dst_is_a = 0;
    782 
    783   REMAP_VA(bp, a, pg_a);
    784   REMAP_VA(bp, b, pg_b);
    785   REMAP_VA(bp, c, pg_c);
    786   if (a == dst) {pg_dst = pg_a; dst_is_a = 1;} else { REMAP_VA(bp, dst, pg_dst); }
    787 
    788   /* align dest to cache line.  Can't cross a pg boundary on dst here. */
    789   while ((((unsigned long) pg_dst) & 0x1f)) {
    790     *pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
    791     dst++; a++; b++; c++;
    792     if (RF_PAGE_ALIGNED(a)) {REMAP_VA(bp, a, pg_a); if (!pg_a) return(EFAULT);}
    793     if (RF_PAGE_ALIGNED(b)) {REMAP_VA(bp, a, pg_b); if (!pg_b) return(EFAULT);}
    794     if (RF_PAGE_ALIGNED(c)) {REMAP_VA(bp, a, pg_c); if (!pg_c) return(EFAULT);}
    795     len--;
    796   }
    797 
    798   while (len > 4 ) {
    799     longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(a), RF_MIN(RF_BLIP(b), RF_MIN(RF_BLIP(c), RF_BLIP(dst)))) >> RF_LONGSHIFT);
    800     a+= longs_this_time; b+= longs_this_time; c+= longs_this_time; dst+=longs_this_time; len-=longs_this_time;
    801     while (longs_this_time >= 4) {
    802       a0 = pg_a[0]; longs_this_time -= 4;
    803 
    804       a1 = pg_a[1];
    805       a2 = pg_a[2];
    806 
    807       a3 = pg_a[3];  pg_a += 4;
    808 
    809       b0 = pg_b[0];
    810       b1 = pg_b[1];
    811 
    812       b2 = pg_b[2];
    813       b3 = pg_b[3];
    814       /* start dual issue */
    815       a0 ^= b0; b0 =  pg_c[0];
    816 
    817       pg_b += 4;  a1 ^= b1;
    818 
    819       a2 ^= b2; a3 ^= b3;
    820 
    821       b1 =  pg_c[1]; a0 ^= b0;
    822 
    823       b2 =  pg_c[2]; a1 ^= b1;
    824 
    825       b3 =  pg_c[3]; a2 ^= b2;
    826 
    827       pg_dst[0] = a0; a3 ^= b3;
    828       pg_dst[1] = a1; pg_c += 4;
    829       pg_dst[2] = a2;
    830       pg_dst[3] = a3; pg_dst += 4;
    831     }
    832     while (longs_this_time > 0) {   /* cannot cross any page boundaries here */
    833       *pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
    834       longs_this_time--;
    835     }
    836 
    837     if (len) {
    838       if (RF_PAGE_ALIGNED(a)) {REMAP_VA(bp, a, pg_a); if (!pg_a) return(EFAULT); if (dst_is_a) pg_dst = pg_a;}
    839       if (RF_PAGE_ALIGNED(b)) {REMAP_VA(bp, b, pg_b); if (!pg_b) return(EFAULT);}
    840       if (RF_PAGE_ALIGNED(c)) {REMAP_VA(bp, c, pg_c); if (!pg_c) return(EFAULT);}
    841       if (!dst_is_a) if (RF_PAGE_ALIGNED(dst)) {REMAP_VA(bp, dst, pg_dst); if (!pg_dst) return(EFAULT);}
    842     }
    843   }
    844   while (len) {
    845     *pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
    846     dst++; a++; b++; c++;
    847     if (RF_PAGE_ALIGNED(a)) {REMAP_VA(bp, a, pg_a); if (!pg_a) return(EFAULT); if (dst_is_a) pg_dst = pg_a;}
    848     if (RF_PAGE_ALIGNED(b)) {REMAP_VA(bp, b, pg_b); if (!pg_b) return(EFAULT);}
    849     if (RF_PAGE_ALIGNED(c)) {REMAP_VA(bp, c, pg_c); if (!pg_c) return(EFAULT);}
    850     if (!dst_is_a) if (RF_PAGE_ALIGNED(dst)) {REMAP_VA(bp, dst, pg_dst); if (!pg_dst) return(EFAULT);}
    851     len--;
    852   }
    853   return(0);
    854 }
    855 
    856 int rf_bxor3(dst,a,b,c,len, bp)
    857   register unsigned char  *dst;
    858   register unsigned char  *a;
    859   register unsigned char  *b;
    860   register unsigned char  *c;
    861   unsigned long            len;
    862   void                    *bp;
    863 {
    864 	RF_ASSERT(((RF_UL(dst)|RF_UL(a)|RF_UL(b)|RF_UL(c)|len) & 0x7) == 0);
    865 
    866 	return(rf_longword_bxor3((unsigned long *)dst, (unsigned long *)a,
    867 		(unsigned long *)b, (unsigned long *)c, len>>RF_LONGSHIFT, bp));
    868 }
    869