Home | History | Annotate | Line # | Download | only in raidframe
rf_dagdegrd.c revision 1.1
      1 /*	$NetBSD: rf_dagdegrd.c,v 1.1 1998/11/13 04:20:27 oster Exp $	*/
      2 /*
      3  * Copyright (c) 1995 Carnegie-Mellon University.
      4  * All rights reserved.
      5  *
      6  * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
      7  *
      8  * Permission to use, copy, modify and distribute this software and
      9  * its documentation is hereby granted, provided that both the copyright
     10  * notice and this permission notice appear in all copies of the
     11  * software, derivative works or modified versions, and any portions
     12  * thereof, and that both notices appear in supporting documentation.
     13  *
     14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     17  *
     18  * Carnegie Mellon requests users of this software to return to
     19  *
     20  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     21  *  School of Computer Science
     22  *  Carnegie Mellon University
     23  *  Pittsburgh PA 15213-3890
     24  *
     25  * any improvements or extensions that they make and grant Carnegie the
     26  * rights to redistribute these changes.
     27  */
     28 
     29 /*
     30  * rf_dagdegrd.c
     31  *
     32  * code for creating degraded read DAGs
     33  *
     34  * :
     35  * Log: rf_dagdegrd.c,v
     36  * Revision 1.20  1996/11/05 21:10:40  jimz
     37  * failed pda generalization
     38  *
     39  * Revision 1.19  1996/08/19  23:30:36  jimz
     40  * fix chained declustered accesses in degraded mode when mirror copy is failed
     41  * (workload shifting not allowed when there are no duplicate copies extant)
     42  *
     43  * Revision 1.18  1996/07/31  16:29:01  jimz
     44  * asm/asmap re-fix (EO merge)
     45  *
     46  * Revision 1.17  1996/07/31  15:34:34  jimz
     47  * evenodd changes; bugfixes for double-degraded archs, generalize
     48  * some formerly PQ-only functions
     49  *
     50  * Revision 1.16  1996/07/28  20:31:39  jimz
     51  * i386netbsd port
     52  * true/false fixup
     53  *
     54  * Revision 1.15  1996/07/27  23:36:08  jimz
     55  * Solaris port of simulator
     56  *
     57  * Revision 1.14  1996/07/22  19:52:16  jimz
     58  * switched node params to RF_DagParam_t, a union of
     59  * a 64-bit int and a void *, for better portability
     60  * attempted hpux port, but failed partway through for
     61  * lack of a single C compiler capable of compiling all
     62  * source files
     63  *
     64  * Revision 1.13  1996/06/09  02:36:46  jimz
     65  * lots of little crufty cleanup- fixup whitespace
     66  * issues, comment #ifdefs, improve typing in some
     67  * places (esp size-related)
     68  *
     69  * Revision 1.12  1996/06/07  22:26:27  jimz
     70  * type-ify which_ru (RF_ReconUnitNum_t)
     71  *
     72  * Revision 1.11  1996/06/07  21:33:04  jimz
     73  * begin using consistent types for sector numbers,
     74  * stripe numbers, row+col numbers, recon unit numbers
     75  *
     76  * Revision 1.10  1996/05/31  22:26:54  jimz
     77  * fix a lot of mapping problems, memory allocation problems
     78  * found some weird lock issues, fixed 'em
     79  * more code cleanup
     80  *
     81  * Revision 1.9  1996/05/30  11:29:41  jimz
     82  * Numerous bug fixes. Stripe lock release code disagreed with the taking code
     83  * about when stripes should be locked (I made it consistent: no parity, no lock)
     84  * There was a lot of extra serialization of I/Os which I've removed- a lot of
     85  * it was to calculate values for the cache code, which is no longer with us.
     86  * More types, function, macro cleanup. Added code to properly quiesce the array
     87  * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
     88  * before. Fixed memory allocation, freeing bugs.
     89  *
     90  * Revision 1.8  1996/05/27  18:56:37  jimz
     91  * more code cleanup
     92  * better typing
     93  * compiles in all 3 environments
     94  *
     95  * Revision 1.7  1996/05/24  22:17:04  jimz
     96  * continue code + namespace cleanup
     97  * typed a bunch of flags
     98  *
     99  * Revision 1.6  1996/05/24  04:28:55  jimz
    100  * release cleanup ckpt
    101  *
    102  * Revision 1.5  1996/05/23  21:46:35  jimz
    103  * checkpoint in code cleanup (release prep)
    104  * lots of types, function names have been fixed
    105  *
    106  * Revision 1.4  1996/05/23  00:33:23  jimz
    107  * code cleanup: move all debug decls to rf_options.c, all extern
    108  * debug decls to rf_options.h, all debug vars preceded by rf_
    109  *
    110  * Revision 1.3  1996/05/18  19:51:34  jimz
    111  * major code cleanup- fix syntax, make some types consistent,
    112  * add prototypes, clean out dead code, et cetera
    113  *
    114  * Revision 1.2  1996/05/08  21:01:24  jimz
    115  * fixed up enum type names that were conflicting with other
    116  * enums and function names (ie, "panic")
    117  * future naming trends will be towards RF_ and rf_ for
    118  * everything raidframe-related
    119  *
    120  * Revision 1.1  1996/05/03  19:22:23  wvcii
    121  * Initial revision
    122  *
    123  */
    124 
    125 #include "rf_types.h"
    126 #include "rf_raid.h"
    127 #include "rf_dag.h"
    128 #include "rf_dagutils.h"
    129 #include "rf_dagfuncs.h"
    130 #include "rf_threadid.h"
    131 #include "rf_debugMem.h"
    132 #include "rf_memchunk.h"
    133 #include "rf_general.h"
    134 #include "rf_dagdegrd.h"
    135 #include "rf_sys.h"
    136 
    137 
    138 /******************************************************************************
    139  *
    140  * General comments on DAG creation:
    141  *
    142  * All DAGs in this file use roll-away error recovery.  Each DAG has a single
    143  * commit node, usually called "Cmt."  If an error occurs before the Cmt node
    144  * is reached, the execution engine will halt forward execution and work
    145  * backward through the graph, executing the undo functions.  Assuming that
    146  * each node in the graph prior to the Cmt node is undoable and atomic - or -
    147  * does not make changes to permanent state, the graph will fail atomically.
    148  * If an error occurs after the Cmt node executes, the engine will roll-forward
    149  * through the graph, blindly executing nodes until it reaches the end.
    150  * If a graph reaches the end, it is assumed to have completed successfully.
    151  *
    152  * A graph has only 1 Cmt node.
    153  *
    154  */
    155 
    156 
    157 /******************************************************************************
    158  *
    159  * The following wrappers map the standard DAG creation interface to the
    160  * DAG creation routines.  Additionally, these wrappers enable experimentation
    161  * with new DAG structures by providing an extra level of indirection, allowing
    162  * the DAG creation routines to be replaced at this single point.
    163  */
    164 
    165 void rf_CreateRaidFiveDegradedReadDAG(
    166   RF_Raid_t             *raidPtr,
    167   RF_AccessStripeMap_t  *asmap,
    168   RF_DagHeader_t        *dag_h,
    169   void                  *bp,
    170   RF_RaidAccessFlags_t   flags,
    171   RF_AllocListElem_t    *allocList)
    172 {
    173   rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
    174     &rf_xorRecoveryFuncs);
    175 }
    176 
    177 
    178 /******************************************************************************
    179  *
    180  * DAG creation code begins here
    181  */
    182 
    183 
    184 /******************************************************************************
    185  * Create a degraded read DAG for RAID level 1
    186  *
    187  * Hdr -> Nil -> R(p/s)d -> Commit -> Trm
    188  *
    189  * The "Rd" node reads data from the surviving disk in the mirror pair
    190  *   Rpd - read of primary copy
    191  *   Rsd - read of secondary copy
    192  *
    193  * Parameters:  raidPtr   - description of the physical array
    194  *              asmap     - logical & physical addresses for this access
    195  *              bp        - buffer ptr (for holding write data)
    196  *              flags     - general flags (e.g. disk locking)
    197  *              allocList - list of memory allocated in DAG creation
    198  *****************************************************************************/
    199 
    200 void rf_CreateRaidOneDegradedReadDAG(
    201   RF_Raid_t             *raidPtr,
    202   RF_AccessStripeMap_t  *asmap,
    203   RF_DagHeader_t        *dag_h,
    204   void                  *bp,
    205   RF_RaidAccessFlags_t   flags,
    206   RF_AllocListElem_t    *allocList)
    207 {
    208   RF_DagNode_t *nodes, *rdNode, *blockNode, *commitNode, *termNode;
    209   RF_StripeNum_t parityStripeID;
    210   RF_ReconUnitNum_t which_ru;
    211   RF_PhysDiskAddr_t *pda;
    212   int useMirror, i;
    213 
    214   useMirror = 0;
    215   parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
    216     asmap->raidAddress, &which_ru);
    217   if (rf_dagDebug) {
    218     printf("[Creating RAID level 1 degraded read DAG]\n");
    219   }
    220   dag_h->creator = "RaidOneDegradedReadDAG";
    221   /* alloc the Wnd nodes and the Wmir node */
    222   if (asmap->numDataFailed == 0)
    223     useMirror = RF_FALSE;
    224   else
    225     useMirror = RF_TRUE;
    226 
    227   /* total number of nodes = 1 + (block + commit + terminator) */
    228   RF_CallocAndAdd(nodes, 4, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
    229   i = 0;
    230   rdNode      = &nodes[i]; i++;
    231   blockNode   = &nodes[i]; i++;
    232   commitNode = &nodes[i]; i++;
    233   termNode    = &nodes[i]; i++;
    234 
    235   /* this dag can not commit until the commit node is reached.   errors prior
    236    * to the commit point imply the dag has failed and must be retried
    237    */
    238   dag_h->numCommitNodes = 1;
    239   dag_h->numCommits = 0;
    240   dag_h->numSuccedents = 1;
    241 
    242   /* initialize the block, commit, and terminator nodes */
    243   rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
    244     NULL, 1, 0, 0, 0, dag_h, "Nil", allocList);
    245   rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
    246     NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList);
    247   rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
    248     NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
    249 
    250   pda = asmap->physInfo;
    251   RF_ASSERT(pda != NULL);
    252   /* parityInfo must describe entire parity unit */
    253   RF_ASSERT(asmap->parityInfo->next == NULL);
    254 
    255   /* initialize the data node */
    256   if (!useMirror) {
    257     /* read primary copy of data */
    258     rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
    259       rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rpd", allocList);
    260     rdNode->params[0].p = pda;
    261     rdNode->params[1].p = pda->bufPtr;
    262     rdNode->params[2].v = parityStripeID;
    263     rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
    264   }
    265   else {
    266     /* read secondary copy of data */
    267     rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
    268       rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rsd", allocList);
    269     rdNode->params[0].p = asmap->parityInfo;
    270     rdNode->params[1].p = pda->bufPtr;
    271     rdNode->params[2].v = parityStripeID;
    272     rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
    273   }
    274 
    275   /* connect header to block node */
    276   RF_ASSERT(dag_h->numSuccedents == 1);
    277   RF_ASSERT(blockNode->numAntecedents == 0);
    278   dag_h->succedents[0] = blockNode;
    279 
    280   /* connect block node to rdnode */
    281   RF_ASSERT(blockNode->numSuccedents == 1);
    282   RF_ASSERT(rdNode->numAntecedents == 1);
    283   blockNode->succedents[0] = rdNode;
    284   rdNode->antecedents[0] = blockNode;
    285   rdNode->antType[0] = rf_control;
    286 
    287   /* connect rdnode to commit node */
    288   RF_ASSERT(rdNode->numSuccedents == 1);
    289   RF_ASSERT(commitNode->numAntecedents == 1);
    290   rdNode->succedents[0] = commitNode;
    291   commitNode->antecedents[0] = rdNode;
    292   commitNode->antType[0] = rf_control;
    293 
    294   /* connect commit node to terminator */
    295   RF_ASSERT(commitNode->numSuccedents == 1);
    296   RF_ASSERT(termNode->numAntecedents == 1);
    297   RF_ASSERT(termNode->numSuccedents == 0);
    298   commitNode->succedents[0] = termNode;
    299   termNode->antecedents[0] = commitNode;
    300   termNode->antType[0] = rf_control;
    301 }
    302 
    303 
    304 
    305 /******************************************************************************
    306  *
    307  * creates a DAG to perform a degraded-mode read of data within one stripe.
    308  * This DAG is as follows:
    309  *
    310  * Hdr -> Block -> Rud -> Xor -> Cmt -> T
    311  *              -> Rrd ->
    312  *              -> Rp -->
    313  *
    314  * Each R node is a successor of the L node
    315  * One successor arc from each R node goes to C, and the other to X
    316  * There is one Rud for each chunk of surviving user data requested by the
    317  * user, and one Rrd for each chunk of surviving user data _not_ being read by
    318  * the user
    319  * R = read, ud = user data, rd = recovery (surviving) data, p = parity
    320  * X = XOR, C = Commit, T = terminate
    321  *
    322  * The block node guarantees a single source node.
    323  *
    324  * Note:  The target buffer for the XOR node is set to the actual user buffer
    325  * where the failed data is supposed to end up.  This buffer is zero'd by the
    326  * code here.  Thus, if you create a degraded read dag, use it, and then
    327  * re-use, you have to be sure to zero the target buffer prior to the re-use.
    328  *
    329  * The recfunc argument at the end specifies the name and function used for
    330  * the redundancy
    331  * recovery function.
    332  *
    333  *****************************************************************************/
    334 
    335 void rf_CreateDegradedReadDAG(
    336   RF_Raid_t             *raidPtr,
    337   RF_AccessStripeMap_t  *asmap,
    338   RF_DagHeader_t        *dag_h,
    339   void                  *bp,
    340   RF_RaidAccessFlags_t   flags,
    341   RF_AllocListElem_t    *allocList,
    342   RF_RedFuncs_t         *recFunc)
    343 {
    344   RF_DagNode_t *nodes, *rudNodes, *rrdNodes, *xorNode, *blockNode;
    345   RF_DagNode_t *commitNode, *rpNode, *termNode;
    346   int nNodes, nRrdNodes, nRudNodes, nXorBufs, i;
    347   int j, paramNum;
    348   RF_SectorCount_t sectorsPerSU;
    349   RF_ReconUnitNum_t which_ru;
    350   char *overlappingPDAs; /* a temporary array of flags */
    351   RF_AccessStripeMapHeader_t *new_asm_h[2];
    352   RF_PhysDiskAddr_t *pda, *parityPDA;
    353   RF_StripeNum_t parityStripeID;
    354   RF_PhysDiskAddr_t *failedPDA;
    355   RF_RaidLayout_t *layoutPtr;
    356   char *rpBuf;
    357 
    358   layoutPtr = &(raidPtr->Layout);
    359   /* failedPDA points to the pda within the asm that targets the failed disk */
    360   failedPDA = asmap->failedPDAs[0];
    361   parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr,
    362     asmap->raidAddress, &which_ru);
    363   sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
    364 
    365   if (rf_dagDebug) {
    366     printf("[Creating degraded read DAG]\n");
    367   }
    368 
    369   RF_ASSERT( asmap->numDataFailed == 1 );
    370   dag_h->creator = "DegradedReadDAG";
    371 
    372   /*
    373    * generate two ASMs identifying the surviving data we need
    374    * in order to recover the lost data
    375    */
    376 
    377   /* overlappingPDAs array must be zero'd */
    378   RF_Calloc(overlappingPDAs, asmap->numStripeUnitsAccessed, sizeof(char), (char *));
    379   rf_GenerateFailedAccessASMs(raidPtr, asmap, failedPDA, dag_h, new_asm_h, &nXorBufs,
    380     &rpBuf, overlappingPDAs, allocList);
    381 
    382   /*
    383    * create all the nodes at once
    384    *
    385    * -1 because no access is generated for the failed pda
    386    */
    387   nRudNodes = asmap->numStripeUnitsAccessed-1;
    388   nRrdNodes = ((new_asm_h[0]) ? new_asm_h[0]->stripeMap->numStripeUnitsAccessed : 0) +
    389               ((new_asm_h[1]) ? new_asm_h[1]->stripeMap->numStripeUnitsAccessed : 0);
    390   nNodes = 5 + nRudNodes + nRrdNodes; /* lock, unlock, xor, Rp, Rud, Rrd */
    391   RF_CallocAndAdd(nodes, nNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *),
    392     allocList);
    393   i = 0;
    394   blockNode   = &nodes[i]; i++;
    395   commitNode  = &nodes[i]; i++;
    396   xorNode     = &nodes[i]; i++;
    397   rpNode      = &nodes[i]; i++;
    398   termNode    = &nodes[i]; i++;
    399   rudNodes    = &nodes[i]; i += nRudNodes;
    400   rrdNodes    = &nodes[i]; i += nRrdNodes;
    401   RF_ASSERT(i == nNodes);
    402 
    403   /* initialize nodes */
    404   dag_h->numCommitNodes = 1;
    405   dag_h->numCommits = 0;
    406   /* this dag can not commit until the commit node is reached
    407    * errors prior to the commit point imply the dag has failed
    408    */
    409   dag_h->numSuccedents = 1;
    410 
    411   rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
    412     NULL, nRudNodes+nRrdNodes+1, 0, 0, 0, dag_h, "Nil", allocList);
    413   rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
    414     NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList);
    415   rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
    416     NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
    417   rf_InitNode(xorNode, rf_wait, RF_FALSE, recFunc->simple, rf_NullNodeUndoFunc,
    418     NULL, 1, nRudNodes+nRrdNodes+1, 2*nXorBufs+2, 1, dag_h,
    419 	recFunc->SimpleName, allocList);
    420 
    421   /* fill in the Rud nodes */
    422   for (pda=asmap->physInfo, i=0; i<nRudNodes; i++, pda=pda->next) {
    423     if (pda == failedPDA) {i--; continue;}
    424     rf_InitNode(&rudNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc,
    425       rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
    426       "Rud", allocList);
    427     RF_ASSERT(pda);
    428     rudNodes[i].params[0].p = pda;
    429     rudNodes[i].params[1].p = pda->bufPtr;
    430     rudNodes[i].params[2].v = parityStripeID;
    431     rudNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
    432   }
    433 
    434   /* fill in the Rrd nodes */
    435   i = 0;
    436   if (new_asm_h[0]) {
    437     for (pda=new_asm_h[0]->stripeMap->physInfo;
    438          i<new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
    439          i++, pda=pda->next)
    440     {
    441       rf_InitNode(&rrdNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc,
    442         rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0,
    443         dag_h, "Rrd", allocList);
    444       RF_ASSERT(pda);
    445       rrdNodes[i].params[0].p = pda;
    446       rrdNodes[i].params[1].p = pda->bufPtr;
    447       rrdNodes[i].params[2].v = parityStripeID;
    448       rrdNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
    449     }
    450   }
    451   if (new_asm_h[1]) {
    452     for (j=0,pda=new_asm_h[1]->stripeMap->physInfo;
    453          j<new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
    454          j++, pda=pda->next)
    455     {
    456       rf_InitNode(&rrdNodes[i+j], rf_wait, RF_FALSE, rf_DiskReadFunc,
    457         rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0,
    458         dag_h, "Rrd", allocList);
    459       RF_ASSERT(pda);
    460       rrdNodes[i+j].params[0].p = pda;
    461       rrdNodes[i+j].params[1].p = pda->bufPtr;
    462       rrdNodes[i+j].params[2].v = parityStripeID;
    463       rrdNodes[i+j].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
    464     }
    465   }
    466 
    467   /* make a PDA for the parity unit */
    468   RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
    469   parityPDA->row = asmap->parityInfo->row;
    470   parityPDA->col = asmap->parityInfo->col;
    471   parityPDA->startSector = ((asmap->parityInfo->startSector / sectorsPerSU)
    472     * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
    473   parityPDA->numSector = failedPDA->numSector;
    474 
    475   /* initialize the Rp node */
    476   rf_InitNode(rpNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
    477     rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rp ", allocList);
    478   rpNode->params[0].p = parityPDA;
    479   rpNode->params[1].p = rpBuf;
    480   rpNode->params[2].v = parityStripeID;
    481   rpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
    482 
    483   /*
    484    * the last and nastiest step is to assign all
    485    * the parameters of the Xor node
    486    */
    487   paramNum=0;
    488   for (i=0; i<nRrdNodes; i++) {
    489     /* all the Rrd nodes need to be xored together */
    490     xorNode->params[paramNum++] = rrdNodes[i].params[0];
    491     xorNode->params[paramNum++] = rrdNodes[i].params[1];
    492   }
    493   for (i=0; i<nRudNodes; i++) {
    494     /* any Rud nodes that overlap the failed access need to be xored in */
    495     if (overlappingPDAs[i]) {
    496       RF_MallocAndAdd(pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
    497       bcopy((char *)rudNodes[i].params[0].p, (char *)pda, sizeof(RF_PhysDiskAddr_t));
    498       rf_RangeRestrictPDA(raidPtr, failedPDA, pda, RF_RESTRICT_DOBUFFER, 0);
    499       xorNode->params[paramNum++].p = pda;
    500       xorNode->params[paramNum++].p = pda->bufPtr;
    501     }
    502   }
    503   RF_Free(overlappingPDAs, asmap->numStripeUnitsAccessed * sizeof(char));
    504 
    505   /* install parity pda as last set of params to be xor'd */
    506   xorNode->params[paramNum++].p = parityPDA;
    507   xorNode->params[paramNum++].p = rpBuf;
    508 
    509   /*
    510    * the last 2 params to the recovery xor node are
    511    * the failed PDA and the raidPtr
    512    */
    513   xorNode->params[paramNum++].p = failedPDA;
    514   xorNode->params[paramNum++].p = raidPtr;
    515   RF_ASSERT( paramNum == 2*nXorBufs+2 );
    516 
    517   /*
    518    * The xor node uses results[0] as the target buffer.
    519    * Set pointer and zero the buffer. In the kernel, this
    520    * may be a user buffer in which case we have to remap it.
    521    */
    522   xorNode->results[0] = failedPDA->bufPtr;
    523   RF_BZERO(bp, failedPDA->bufPtr, rf_RaidAddressToByte(raidPtr,
    524     failedPDA->numSector));
    525 
    526   /* connect nodes to form graph */
    527   /* connect the header to the block node */
    528   RF_ASSERT(dag_h->numSuccedents == 1);
    529   RF_ASSERT(blockNode->numAntecedents == 0);
    530   dag_h->succedents[0] = blockNode;
    531 
    532   /* connect the block node to the read nodes */
    533   RF_ASSERT(blockNode->numSuccedents == (1 + nRrdNodes + nRudNodes));
    534   RF_ASSERT(rpNode->numAntecedents == 1);
    535   blockNode->succedents[0] = rpNode;
    536   rpNode->antecedents[0] = blockNode;
    537   rpNode->antType[0] = rf_control;
    538   for (i = 0; i < nRrdNodes; i++) {
    539     RF_ASSERT(rrdNodes[i].numSuccedents == 1);
    540     blockNode->succedents[1 + i] = &rrdNodes[i];
    541     rrdNodes[i].antecedents[0] = blockNode;
    542     rrdNodes[i].antType[0] = rf_control;
    543   }
    544   for (i = 0; i < nRudNodes; i++) {
    545     RF_ASSERT(rudNodes[i].numSuccedents == 1);
    546     blockNode->succedents[1 + nRrdNodes + i] = &rudNodes[i];
    547     rudNodes[i].antecedents[0] = blockNode;
    548     rudNodes[i].antType[0] = rf_control;
    549   }
    550 
    551   /* connect the read nodes to the xor node */
    552   RF_ASSERT(xorNode->numAntecedents == (1 + nRrdNodes + nRudNodes));
    553   RF_ASSERT(rpNode->numSuccedents == 1);
    554   rpNode->succedents[0] = xorNode;
    555   xorNode->antecedents[0] = rpNode;
    556   xorNode->antType[0] = rf_trueData;
    557   for (i = 0; i < nRrdNodes; i++) {
    558     RF_ASSERT(rrdNodes[i].numSuccedents == 1);
    559     rrdNodes[i].succedents[0] = xorNode;
    560     xorNode->antecedents[1 + i] = &rrdNodes[i];
    561     xorNode->antType[1 + i] = rf_trueData;
    562   }
    563   for (i = 0; i < nRudNodes; i++) {
    564     RF_ASSERT(rudNodes[i].numSuccedents == 1);
    565     rudNodes[i].succedents[0] = xorNode;
    566     xorNode->antecedents[1 + nRrdNodes + i] = &rudNodes[i];
    567     xorNode->antType[1 + nRrdNodes + i] = rf_trueData;
    568   }
    569 
    570   /* connect the xor node to the commit node */
    571   RF_ASSERT(xorNode->numSuccedents == 1);
    572   RF_ASSERT(commitNode->numAntecedents == 1);
    573   xorNode->succedents[0] = commitNode;
    574   commitNode->antecedents[0] = xorNode;
    575   commitNode->antType[0] = rf_control;
    576 
    577   /* connect the termNode to the commit node */
    578   RF_ASSERT(commitNode->numSuccedents == 1);
    579   RF_ASSERT(termNode->numAntecedents == 1);
    580   RF_ASSERT(termNode->numSuccedents == 0);
    581   commitNode->succedents[0] = termNode;
    582   termNode->antType[0] = rf_control;
    583   termNode->antecedents[0]  = commitNode;
    584 }
    585 
    586 
    587 /******************************************************************************
    588  * Create a degraded read DAG for Chained Declustering
    589  *
    590  * Hdr -> Nil -> R(p/s)d -> Cmt -> Trm
    591  *
    592  * The "Rd" node reads data from the surviving disk in the mirror pair
    593  *   Rpd - read of primary copy
    594  *   Rsd - read of secondary copy
    595  *
    596  * Parameters:  raidPtr   - description of the physical array
    597  *              asmap     - logical & physical addresses for this access
    598  *              bp        - buffer ptr (for holding write data)
    599  *              flags     - general flags (e.g. disk locking)
    600  *              allocList - list of memory allocated in DAG creation
    601  *****************************************************************************/
    602 
    603 void rf_CreateRaidCDegradedReadDAG(
    604   RF_Raid_t             *raidPtr,
    605   RF_AccessStripeMap_t  *asmap,
    606   RF_DagHeader_t        *dag_h,
    607   void                  *bp,
    608   RF_RaidAccessFlags_t   flags,
    609   RF_AllocListElem_t    *allocList)
    610 {
    611   RF_DagNode_t *nodes, *rdNode, *blockNode, *commitNode, *termNode;
    612   RF_StripeNum_t parityStripeID;
    613   int useMirror, i, shiftable;
    614   RF_ReconUnitNum_t which_ru;
    615   RF_PhysDiskAddr_t *pda;
    616 
    617   if ((asmap->numDataFailed + asmap->numParityFailed) == 0) {
    618     shiftable = RF_TRUE;
    619   }
    620   else {
    621     shiftable = RF_FALSE;
    622   }
    623   useMirror = 0;
    624   parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
    625     asmap->raidAddress, &which_ru);
    626 
    627   if (rf_dagDebug) {
    628     printf("[Creating RAID C degraded read DAG]\n");
    629   }
    630   dag_h->creator = "RaidCDegradedReadDAG";
    631   /* alloc the Wnd nodes and the Wmir node */
    632   if (asmap->numDataFailed == 0)
    633     useMirror = RF_FALSE;
    634   else
    635     useMirror = RF_TRUE;
    636 
    637   /* total number of nodes = 1 + (block + commit + terminator) */
    638   RF_CallocAndAdd(nodes, 4, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
    639   i = 0;
    640   rdNode      = &nodes[i]; i++;
    641   blockNode   = &nodes[i]; i++;
    642   commitNode = &nodes[i]; i++;
    643   termNode    = &nodes[i]; i++;
    644 
    645   /*
    646    * This dag can not commit until the commit node is reached.
    647    * Errors prior to the commit point imply the dag has failed
    648    * and must be retried.
    649    */
    650   dag_h->numCommitNodes = 1;
    651   dag_h->numCommits = 0;
    652   dag_h->numSuccedents = 1;
    653 
    654   /* initialize the block, commit, and terminator nodes */
    655   rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
    656     NULL, 1, 0, 0, 0, dag_h, "Nil", allocList);
    657   rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
    658     NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList);
    659   rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
    660     NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
    661 
    662   pda = asmap->physInfo;
    663   RF_ASSERT(pda != NULL);
    664   /* parityInfo must describe entire parity unit */
    665   RF_ASSERT(asmap->parityInfo->next == NULL);
    666 
    667   /* initialize the data node */
    668   if (!useMirror) {
    669     rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
    670       rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rpd", allocList);
    671     if (shiftable && rf_compute_workload_shift(raidPtr, pda)) {
    672      /* shift this read to the next disk in line */
    673   	 rdNode->params[0].p = asmap->parityInfo;
    674    	 rdNode->params[1].p = pda->bufPtr;
    675 	 rdNode->params[2].v = parityStripeID;
    676 	 rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
    677 	}
    678     else {
    679       /* read primary copy */
    680       rdNode->params[0].p = pda;
    681       rdNode->params[1].p = pda->bufPtr;
    682       rdNode->params[2].v = parityStripeID;
    683       rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
    684     }
    685   }
    686   else {
    687     /* read secondary copy of data */
    688     rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
    689       rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rsd", allocList);
    690     rdNode->params[0].p = asmap->parityInfo;
    691     rdNode->params[1].p = pda->bufPtr;
    692     rdNode->params[2].v = parityStripeID;
    693     rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
    694   }
    695 
    696   /* connect header to block node */
    697   RF_ASSERT(dag_h->numSuccedents == 1);
    698   RF_ASSERT(blockNode->numAntecedents == 0);
    699   dag_h->succedents[0] = blockNode;
    700 
    701   /* connect block node to rdnode */
    702   RF_ASSERT(blockNode->numSuccedents == 1);
    703   RF_ASSERT(rdNode->numAntecedents == 1);
    704   blockNode->succedents[0] = rdNode;
    705   rdNode->antecedents[0] = blockNode;
    706   rdNode->antType[0] = rf_control;
    707 
    708   /* connect rdnode to commit node */
    709   RF_ASSERT(rdNode->numSuccedents == 1);
    710   RF_ASSERT(commitNode->numAntecedents == 1);
    711   rdNode->succedents[0] = commitNode;
    712   commitNode->antecedents[0] = rdNode;
    713   commitNode->antType[0] = rf_control;
    714 
    715   /* connect commit node to terminator */
    716   RF_ASSERT(commitNode->numSuccedents == 1);
    717   RF_ASSERT(termNode->numAntecedents == 1);
    718   RF_ASSERT(termNode->numSuccedents == 0);
    719   commitNode->succedents[0] = termNode;
    720   termNode->antecedents[0] = commitNode;
    721   termNode->antType[0] = rf_control;
    722 }
    723 
    724 /*
    725  * XXX move this elsewhere?
    726  */
    727 void rf_DD_GenerateFailedAccessASMs(
    728   RF_Raid_t              *raidPtr,
    729   RF_AccessStripeMap_t   *asmap,
    730   RF_PhysDiskAddr_t     **pdap,
    731   int                    *nNodep,
    732   RF_PhysDiskAddr_t     **pqpdap,
    733   int                    *nPQNodep,
    734   RF_AllocListElem_t     *allocList)
    735 {
    736   RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    737   int PDAPerDisk,i;
    738   RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
    739   int numDataCol = layoutPtr->numDataCol;
    740   int state;
    741   RF_SectorNum_t suoff, suend;
    742   unsigned firstDataCol, napdas, count;
    743   RF_SectorNum_t fone_start, fone_end, ftwo_start = 0, ftwo_end = 0;
    744   RF_PhysDiskAddr_t *fone = asmap->failedPDAs[0], *ftwo = asmap->failedPDAs[1];
    745   RF_PhysDiskAddr_t *pda_p;
    746   RF_PhysDiskAddr_t *phys_p;
    747   RF_RaidAddr_t sosAddr;
    748 
    749   /* determine how many pda's we will have to generate per unaccess stripe.
    750      If there is only one failed data unit, it is one; if two, possibly two,
    751      depending wether they overlap. */
    752 
    753   fone_start = rf_StripeUnitOffset(layoutPtr,fone->startSector);
    754   fone_end = fone_start + fone->numSector;
    755 
    756 #define CONS_PDA(if,start,num) \
    757   pda_p->row = asmap->if->row;    pda_p->col = asmap->if->col; \
    758   pda_p->startSector = ((asmap->if->startSector / secPerSU) * secPerSU) + start; \
    759   pda_p->numSector = num; \
    760   pda_p->next = NULL; \
    761   RF_MallocAndAdd(pda_p->bufPtr,rf_RaidAddressToByte(raidPtr,num),(char *), allocList)
    762 
    763   if (asmap->numDataFailed==1)
    764     {
    765       PDAPerDisk = 1;
    766       state = 1;
    767       RF_MallocAndAdd(*pqpdap,2*sizeof(RF_PhysDiskAddr_t),(RF_PhysDiskAddr_t *), allocList);
    768       pda_p = *pqpdap;
    769       /* build p */
    770       CONS_PDA(parityInfo,fone_start,fone->numSector);
    771       pda_p->type = RF_PDA_TYPE_PARITY;
    772       pda_p++;
    773       /* build q */
    774       CONS_PDA(qInfo,fone_start,fone->numSector);
    775       pda_p->type = RF_PDA_TYPE_Q;
    776     }
    777   else
    778     {
    779       ftwo_start = rf_StripeUnitOffset(layoutPtr,ftwo->startSector);
    780       ftwo_end = ftwo_start + ftwo->numSector;
    781       if (fone->numSector + ftwo->numSector > secPerSU)
    782 	{
    783 	  PDAPerDisk = 1;
    784 	  state = 2;
    785 	  RF_MallocAndAdd(*pqpdap,2*sizeof(RF_PhysDiskAddr_t),(RF_PhysDiskAddr_t *), allocList);
    786 	  pda_p = *pqpdap;
    787 	  CONS_PDA(parityInfo,0,secPerSU);
    788 	  pda_p->type = RF_PDA_TYPE_PARITY;
    789 	  pda_p++;
    790 	  CONS_PDA(qInfo,0,secPerSU);
    791 	  pda_p->type = RF_PDA_TYPE_Q;
    792 	}
    793       else
    794 	{
    795 	  PDAPerDisk = 2;
    796 	  state = 3;
    797 	  /* four of them, fone, then ftwo */
    798 	  RF_MallocAndAdd(*pqpdap,4*sizeof(RF_PhysDiskAddr_t),(RF_PhysDiskAddr_t *), allocList);
    799 	  pda_p = *pqpdap;
    800 	  CONS_PDA(parityInfo,fone_start,fone->numSector);
    801 	  pda_p->type = RF_PDA_TYPE_PARITY;
    802 	  pda_p++;
    803 	  CONS_PDA(qInfo,fone_start,fone->numSector);
    804 	  pda_p->type = RF_PDA_TYPE_Q;
    805 	  pda_p++;
    806 	  CONS_PDA(parityInfo,ftwo_start,ftwo->numSector);
    807 	  pda_p->type = RF_PDA_TYPE_PARITY;
    808 	  pda_p++;
    809 	  CONS_PDA(qInfo,ftwo_start,ftwo->numSector);
    810 	  pda_p->type = RF_PDA_TYPE_Q;
    811 	}
    812     }
    813   /* figure out number of nonaccessed pda */
    814   napdas = PDAPerDisk * (numDataCol - asmap->numStripeUnitsAccessed - (ftwo==NULL ? 1 : 0));
    815   *nPQNodep = PDAPerDisk;
    816 
    817   /* sweep over the over accessed pda's, figuring out the number of
    818      additional pda's to generate. Of course, skip the failed ones */
    819 
    820   count = 0;
    821   for ( pda_p=asmap->physInfo; pda_p; pda_p= pda_p->next)
    822     {
    823       if ((pda_p == fone) || (pda_p == ftwo))
    824 	continue;
    825       suoff = rf_StripeUnitOffset(layoutPtr,pda_p->startSector);
    826       suend = suoff + pda_p->numSector;
    827       switch (state)
    828 	{
    829 	case 1: /* one failed PDA to overlap */
    830 	  /* if a PDA doesn't contain the failed unit, it can
    831 	     only miss the start or end, not both */
    832 	  if ((suoff > fone_start) || (suend <fone_end))
    833 	    count++;
    834 	  break;
    835 	case 2: /* whole stripe */
    836 	  if (suoff) /* leak at begining */
    837 	    count++;
    838 	  if (suend < numDataCol) /* leak at end */
    839 	    count++;
    840 	  break;
    841 	case 3: /* two disjoint units */
    842 	  if ((suoff > fone_start) || (suend <fone_end))
    843 	    count++;
    844 	  if ((suoff > ftwo_start) || (suend <ftwo_end))
    845 	    count++;
    846 	  break;
    847 	default:
    848 	  RF_PANIC();
    849 	}
    850     }
    851 
    852   napdas += count;
    853   *nNodep = napdas;
    854   if (napdas == 0) return; /* short circuit */
    855 
    856   /* allocate up our list of pda's */
    857 
    858   RF_CallocAndAdd(pda_p, napdas, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
    859   *pdap = pda_p;
    860 
    861   /* linkem together */
    862   for (i=0; i < (napdas-1); i++)
    863     pda_p[i].next = pda_p+(i+1);
    864 
    865   /* march through the one's up to the first accessed disk */
    866   firstDataCol = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),asmap->physInfo->raidAddress) % numDataCol;
    867   sosAddr      = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
    868   for (i=0; i < firstDataCol; i++)
    869     {
    870       if ((pda_p - (*pdap)) == napdas)
    871 	continue;
    872       pda_p->type = RF_PDA_TYPE_DATA;
    873       pda_p->raidAddress = sosAddr + (i * secPerSU);
    874       (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
    875       /* skip over dead disks */
    876       if (RF_DEAD_DISK(raidPtr->Disks[pda_p->row][pda_p->col].status))
    877 	continue;
    878       switch (state)
    879 	{
    880 	case 1: /* fone */
    881 	  pda_p->numSector = fone->numSector;
    882 	  pda_p->raidAddress += fone_start;
    883 	  pda_p->startSector += fone_start;
    884 	  RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
    885 	  break;
    886 	case 2: /* full stripe */
    887 	  pda_p->numSector = secPerSU;
    888 	  RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,secPerSU), (char *), allocList);
    889 	  break;
    890 	case 3: /* two slabs */
    891 	  pda_p->numSector = fone->numSector;
    892 	  pda_p->raidAddress += fone_start;
    893 	  pda_p->startSector += fone_start;
    894 	  RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
    895 	  pda_p++;
    896           pda_p->type = RF_PDA_TYPE_DATA;
    897           pda_p->raidAddress = sosAddr + (i * secPerSU);
    898           (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
    899 	  pda_p->numSector = ftwo->numSector;
    900 	  pda_p->raidAddress += ftwo_start;
    901 	  pda_p->startSector += ftwo_start;
    902 	  RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
    903 	  break;
    904 	default:
    905 	  RF_PANIC();
    906 	}
    907       pda_p++;
    908     }
    909 
    910   /* march through the touched stripe units */
    911   for (phys_p = asmap->physInfo; phys_p; phys_p = phys_p->next, i++)
    912     {
    913       if ((phys_p == asmap->failedPDAs[0]) || (phys_p == asmap->failedPDAs[1]))
    914 	continue;
    915       suoff = rf_StripeUnitOffset(layoutPtr,phys_p->startSector);
    916       suend = suoff + phys_p->numSector;
    917       switch(state)
    918 	{
    919 	case 1: /* single buffer */
    920 	  if (suoff > fone_start)
    921 	    {
    922 	      RF_ASSERT( suend >= fone_end );
    923 	      /* The data read starts after the mapped access,
    924 		 snip off the begining */
    925 	      pda_p->numSector = suoff - fone_start;
    926 	      pda_p->raidAddress = sosAddr + (i*secPerSU) + fone_start;
    927 	      (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
    928 	      RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
    929 	      pda_p++;
    930 	    }
    931 	  if (suend < fone_end)
    932 	    {
    933 	      RF_ASSERT ( suoff <= fone_start);
    934 	      /* The data read stops before the end of the failed access, extend */
    935 	      pda_p->numSector = fone_end - suend;
    936 	      pda_p->raidAddress = sosAddr + (i*secPerSU) + suend; /* off by one? */
    937 	      (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
    938 	      RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
    939 	      pda_p++;
    940 	    }
    941 	  break;
    942 	case 2: /* whole stripe unit */
    943 	  RF_ASSERT( (suoff == 0) || (suend == secPerSU));
    944 	  if (suend < secPerSU)
    945 	    { /* short read, snip from end on */
    946 	      pda_p->numSector = secPerSU - suend;
    947 	      pda_p->raidAddress = sosAddr + (i*secPerSU) + suend; /* off by one? */
    948 	      (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
    949 	      RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
    950 	      pda_p++;
    951 	    }
    952 	  else
    953 	    if (suoff > 0)
    954 	      { /* short at front */
    955 		pda_p->numSector = suoff;
    956 		pda_p->raidAddress = sosAddr + (i*secPerSU);
    957 		(raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
    958 		RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
    959 		pda_p++;
    960 	      }
    961 	  break;
    962 	case 3: /* two nonoverlapping failures */
    963 	  if ((suoff > fone_start) || (suend <fone_end))
    964 	    {
    965 	      if (suoff > fone_start)
    966 		{
    967 		  RF_ASSERT( suend >= fone_end );
    968 		  /* The data read starts after the mapped access,
    969 		     snip off the begining */
    970 		  pda_p->numSector = suoff - fone_start;
    971 		  pda_p->raidAddress = sosAddr + (i*secPerSU) + fone_start;
    972 		  (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
    973 		  RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
    974 		  pda_p++;
    975 		}
    976 	      if (suend < fone_end)
    977 		{
    978 		  RF_ASSERT ( suoff <= fone_start);
    979 		  /* The data read stops before the end of the failed access, extend */
    980 		  pda_p->numSector = fone_end - suend;
    981 		  pda_p->raidAddress = sosAddr + (i*secPerSU) + suend; /* off by one? */
    982 		  (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
    983 		  RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
    984 		  pda_p++;
    985 		}
    986 	    }
    987 	  if ((suoff > ftwo_start) || (suend <ftwo_end))
    988 	    {
    989 	      if (suoff > ftwo_start)
    990 		{
    991 		  RF_ASSERT( suend >= ftwo_end );
    992 		  /* The data read starts after the mapped access,
    993 		     snip off the begining */
    994 		  pda_p->numSector = suoff - ftwo_start;
    995 		  pda_p->raidAddress = sosAddr + (i*secPerSU) + ftwo_start;
    996 		  (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
    997 		  RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
    998 		  pda_p++;
    999 		}
   1000 	      if (suend < ftwo_end)
   1001 		{
   1002 		  RF_ASSERT ( suoff <= ftwo_start);
   1003 		  /* The data read stops before the end of the failed access, extend */
   1004 		  pda_p->numSector = ftwo_end - suend;
   1005 		  pda_p->raidAddress = sosAddr + (i*secPerSU) + suend; /* off by one? */
   1006 		  (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
   1007 		  RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
   1008 		  pda_p++;
   1009 		}
   1010 	    }
   1011 	  break;
   1012 	default:
   1013 	  RF_PANIC();
   1014         }
   1015     }
   1016 
   1017   /* after the last accessed disk */
   1018   for (; i < numDataCol; i++ )
   1019     {
   1020       if ((pda_p - (*pdap)) == napdas)
   1021 	continue;
   1022       pda_p->type = RF_PDA_TYPE_DATA;
   1023       pda_p->raidAddress = sosAddr + (i * secPerSU);
   1024       (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
   1025       /* skip over dead disks */
   1026       if (RF_DEAD_DISK(raidPtr->Disks[pda_p->row][pda_p->col].status))
   1027 	continue;
   1028       switch (state)
   1029 	{
   1030 	case 1: /* fone */
   1031 	  pda_p->numSector = fone->numSector;
   1032 	  pda_p->raidAddress += fone_start;
   1033 	  pda_p->startSector += fone_start;
   1034 	  RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
   1035 	  break;
   1036 	case 2: /* full stripe */
   1037 	  pda_p->numSector = secPerSU;
   1038 	  RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,secPerSU), (char *), allocList);
   1039 	  break;
   1040 	case 3: /* two slabs */
   1041 	  pda_p->numSector = fone->numSector;
   1042 	  pda_p->raidAddress += fone_start;
   1043 	  pda_p->startSector += fone_start;
   1044 	  RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
   1045 	  pda_p++;
   1046           pda_p->type = RF_PDA_TYPE_DATA;
   1047           pda_p->raidAddress = sosAddr + (i * secPerSU);
   1048           (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
   1049 	  pda_p->numSector = ftwo->numSector;
   1050 	  pda_p->raidAddress += ftwo_start;
   1051 	  pda_p->startSector += ftwo_start;
   1052 	  RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
   1053 	  break;
   1054 	default:
   1055 	  RF_PANIC();
   1056 	}
   1057       pda_p++;
   1058     }
   1059 
   1060   RF_ASSERT  (pda_p - *pdap == napdas);
   1061   return;
   1062 }
   1063 
   1064 #define INIT_DISK_NODE(node,name) \
   1065 rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \
   1066 (node)->succedents[0] = unblockNode; \
   1067 (node)->succedents[1] = recoveryNode; \
   1068 (node)->antecedents[0] = blockNode; \
   1069 (node)->antType[0] = rf_control
   1070 
   1071 #define DISK_NODE_PARAMS(_node_,_p_) \
   1072   (_node_).params[0].p = _p_ ; \
   1073   (_node_).params[1].p = (_p_)->bufPtr; \
   1074   (_node_).params[2].v = parityStripeID; \
   1075   (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru)
   1076 
   1077 void rf_DoubleDegRead(
   1078   RF_Raid_t              *raidPtr,
   1079   RF_AccessStripeMap_t   *asmap,
   1080   RF_DagHeader_t         *dag_h,
   1081   void                   *bp,
   1082   RF_RaidAccessFlags_t    flags,
   1083   RF_AllocListElem_t     *allocList,
   1084   char                   *redundantReadNodeName,
   1085   char                   *recoveryNodeName,
   1086   int                   (*recovFunc)(RF_DagNode_t *))
   1087 {
   1088   RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
   1089   RF_DagNode_t *nodes, *rudNodes, *rrdNodes, *recoveryNode, *blockNode, *unblockNode, *rpNodes, *rqNodes, *termNode;
   1090   RF_PhysDiskAddr_t *pda, *pqPDAs;
   1091   RF_PhysDiskAddr_t *npdas;
   1092   int nNodes, nRrdNodes, nRudNodes, i;
   1093   RF_ReconUnitNum_t which_ru;
   1094   int nReadNodes, nPQNodes;
   1095   RF_PhysDiskAddr_t *failedPDA = asmap->failedPDAs[0];
   1096   RF_PhysDiskAddr_t *failedPDAtwo = asmap->failedPDAs[1];
   1097   RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress, &which_ru);
   1098 
   1099   if (rf_dagDebug) printf("[Creating Double Degraded Read DAG]\n");
   1100   rf_DD_GenerateFailedAccessASMs(raidPtr, asmap, &npdas, &nRrdNodes, &pqPDAs, &nPQNodes,allocList);
   1101 
   1102   nRudNodes = asmap->numStripeUnitsAccessed - (asmap->numDataFailed);
   1103   nReadNodes = nRrdNodes + nRudNodes + 2*nPQNodes;
   1104   nNodes = 4 /* block, unblock, recovery, term */ + nReadNodes;
   1105 
   1106   RF_CallocAndAdd(nodes, nNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
   1107   i = 0;
   1108   blockNode    = &nodes[i]; i += 1;
   1109   unblockNode  = &nodes[i]; i += 1;
   1110   recoveryNode = &nodes[i]; i += 1;
   1111   termNode     = &nodes[i]; i += 1;
   1112   rudNodes     = &nodes[i]; i += nRudNodes;
   1113   rrdNodes     = &nodes[i]; i += nRrdNodes;
   1114   rpNodes      = &nodes[i]; i += nPQNodes;
   1115   rqNodes      = &nodes[i]; i += nPQNodes;
   1116   RF_ASSERT(i == nNodes);
   1117 
   1118   dag_h->numSuccedents = 1;
   1119   dag_h->succedents[0] = blockNode;
   1120   dag_h->creator = "DoubleDegRead";
   1121   dag_h->numCommits = 0;
   1122   dag_h->numCommitNodes = 1; /*unblock */
   1123 
   1124   rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 2, 0, 0, dag_h, "Trm", allocList);
   1125   termNode->antecedents[0]  = unblockNode;
   1126   termNode->antType[0] = rf_control;
   1127   termNode->antecedents[1]  = recoveryNode;
   1128   termNode->antType[1] = rf_control;
   1129 
   1130   /* init the block and unblock nodes */
   1131   /* The block node has all nodes except itself, unblock and recovery as successors. Similarly for
   1132      predecessors of the unblock. */
   1133   rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nReadNodes, 0, 0, 0, dag_h, "Nil", allocList);
   1134   rf_InitNode(unblockNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nReadNodes, 0, 0, dag_h, "Nil", allocList);
   1135 
   1136   for (i=0; i < nReadNodes; i++)
   1137     {
   1138       blockNode->succedents[i] = rudNodes+i;
   1139       unblockNode->antecedents[i] = rudNodes+i;
   1140       unblockNode->antType[i] = rf_control;
   1141     }
   1142   unblockNode->succedents[0] = termNode;
   1143 
   1144   /* The recovery node has all the reads as predecessors, and the term node as successors. It gets a pda as a param
   1145      from each of the read nodes plus the raidPtr.
   1146      For each failed unit is has a result pda. */
   1147   rf_InitNode(recoveryNode, rf_wait, RF_FALSE, recovFunc, rf_NullNodeUndoFunc, NULL,
   1148 	   1, /* succesors */
   1149 	   nReadNodes, /* preds */
   1150 	   nReadNodes+2, /* params */
   1151 	   asmap->numDataFailed, /* results */
   1152 	   dag_h, recoveryNodeName, allocList);
   1153 
   1154   recoveryNode->succedents[0] = termNode;
   1155   for (i=0; i < nReadNodes; i++) {
   1156     recoveryNode->antecedents[i] = rudNodes+i;
   1157     recoveryNode->antType[i] = rf_trueData;
   1158   }
   1159 
   1160   /* build the read nodes, then come back and fill in recovery params and results */
   1161   pda = asmap->physInfo;
   1162   for (i=0; i < nRudNodes; pda = pda->next)
   1163     {
   1164       if ((pda == failedPDA) || (pda == failedPDAtwo))
   1165 	continue;
   1166       INIT_DISK_NODE(rudNodes+i,"Rud");
   1167       RF_ASSERT(pda);
   1168       DISK_NODE_PARAMS(rudNodes[i],pda);
   1169       i++;
   1170     }
   1171 
   1172   pda = npdas;
   1173   for (i=0; i < nRrdNodes; i++, pda = pda->next)
   1174     {
   1175       INIT_DISK_NODE(rrdNodes+i,"Rrd");
   1176       RF_ASSERT(pda);
   1177       DISK_NODE_PARAMS(rrdNodes[i],pda);
   1178     }
   1179 
   1180   /* redundancy pdas */
   1181   pda = pqPDAs;
   1182   INIT_DISK_NODE(rpNodes,"Rp");
   1183   RF_ASSERT(pda);
   1184   DISK_NODE_PARAMS(rpNodes[0],pda);
   1185   pda++;
   1186   INIT_DISK_NODE(rqNodes,redundantReadNodeName );
   1187   RF_ASSERT(pda);
   1188   DISK_NODE_PARAMS(rqNodes[0],pda);
   1189   if (nPQNodes==2)
   1190     {
   1191       pda++;
   1192       INIT_DISK_NODE(rpNodes+1,"Rp");
   1193       RF_ASSERT(pda);
   1194       DISK_NODE_PARAMS(rpNodes[1],pda);
   1195       pda++;
   1196       INIT_DISK_NODE( rqNodes+1,redundantReadNodeName );
   1197       RF_ASSERT(pda);
   1198       DISK_NODE_PARAMS(rqNodes[1],pda);
   1199     }
   1200 
   1201   /* fill in recovery node params */
   1202   for (i=0; i < nReadNodes; i++)
   1203     recoveryNode->params[i] = rudNodes[i].params[0]; /* pda */
   1204   recoveryNode->params[i++].p = (void *) raidPtr;
   1205   recoveryNode->params[i++].p = (void *) asmap;
   1206   recoveryNode->results[0] = failedPDA;
   1207   if (asmap->numDataFailed ==2 )
   1208     recoveryNode->results[1] = failedPDAtwo;
   1209 
   1210   /* zero fill the target data buffers? */
   1211 }
   1212