Home | History | Annotate | Line # | Download | only in raidframe
rf_pqdegdags.c revision 1.1
      1  1.1  oster /*	$NetBSD: rf_pqdegdags.c,v 1.1 1998/11/13 04:20:32 oster Exp $	*/
      2  1.1  oster /*
      3  1.1  oster  * Copyright (c) 1995 Carnegie-Mellon University.
      4  1.1  oster  * All rights reserved.
      5  1.1  oster  *
      6  1.1  oster  * Author: Daniel Stodolsky
      7  1.1  oster  *
      8  1.1  oster  * Permission to use, copy, modify and distribute this software and
      9  1.1  oster  * its documentation is hereby granted, provided that both the copyright
     10  1.1  oster  * notice and this permission notice appear in all copies of the
     11  1.1  oster  * software, derivative works or modified versions, and any portions
     12  1.1  oster  * thereof, and that both notices appear in supporting documentation.
     13  1.1  oster  *
     14  1.1  oster  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     15  1.1  oster  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     16  1.1  oster  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     17  1.1  oster  *
     18  1.1  oster  * Carnegie Mellon requests users of this software to return to
     19  1.1  oster  *
     20  1.1  oster  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     21  1.1  oster  *  School of Computer Science
     22  1.1  oster  *  Carnegie Mellon University
     23  1.1  oster  *  Pittsburgh PA 15213-3890
     24  1.1  oster  *
     25  1.1  oster  * any improvements or extensions that they make and grant Carnegie the
     26  1.1  oster  * rights to redistribute these changes.
     27  1.1  oster  */
     28  1.1  oster 
     29  1.1  oster /*
     30  1.1  oster  * rf_pqdegdags.c
     31  1.1  oster  * Degraded mode dags for double fault cases.
     32  1.1  oster */
     33  1.1  oster 
     34  1.1  oster /*
     35  1.1  oster  * :
     36  1.1  oster  * Log: rf_pqdegdags.c,v
     37  1.1  oster  * Revision 1.31  1996/11/05 21:10:40  jimz
     38  1.1  oster  * failed pda generalization
     39  1.1  oster  *
     40  1.1  oster  * Revision 1.30  1996/07/31  16:30:05  jimz
     41  1.1  oster  * asm/asmap fix
     42  1.1  oster  *
     43  1.1  oster  * Revision 1.29  1996/07/31  15:35:15  jimz
     44  1.1  oster  * evenodd changes; bugfixes for double-degraded archs, generalize
     45  1.1  oster  * some formerly PQ-only functions
     46  1.1  oster  *
     47  1.1  oster  * Revision 1.28  1996/07/28  20:31:39  jimz
     48  1.1  oster  * i386netbsd port
     49  1.1  oster  * true/false fixup
     50  1.1  oster  *
     51  1.1  oster  * Revision 1.27  1996/07/27  23:36:08  jimz
     52  1.1  oster  * Solaris port of simulator
     53  1.1  oster  *
     54  1.1  oster  * Revision 1.26  1996/07/22  19:52:16  jimz
     55  1.1  oster  * switched node params to RF_DagParam_t, a union of
     56  1.1  oster  * a 64-bit int and a void *, for better portability
     57  1.1  oster  * attempted hpux port, but failed partway through for
     58  1.1  oster  * lack of a single C compiler capable of compiling all
     59  1.1  oster  * source files
     60  1.1  oster  *
     61  1.1  oster  * Revision 1.25  1996/06/09  02:36:46  jimz
     62  1.1  oster  * lots of little crufty cleanup- fixup whitespace
     63  1.1  oster  * issues, comment #ifdefs, improve typing in some
     64  1.1  oster  * places (esp size-related)
     65  1.1  oster  *
     66  1.1  oster  * Revision 1.24  1996/06/07  22:26:27  jimz
     67  1.1  oster  * type-ify which_ru (RF_ReconUnitNum_t)
     68  1.1  oster  *
     69  1.1  oster  * Revision 1.23  1996/06/07  21:33:04  jimz
     70  1.1  oster  * begin using consistent types for sector numbers,
     71  1.1  oster  * stripe numbers, row+col numbers, recon unit numbers
     72  1.1  oster  *
     73  1.1  oster  * Revision 1.22  1996/06/02  17:31:48  jimz
     74  1.1  oster  * Moved a lot of global stuff into array structure, where it belongs.
     75  1.1  oster  * Fixed up paritylogging, pss modules in this manner. Some general
     76  1.1  oster  * code cleanup. Removed lots of dead code, some dead files.
     77  1.1  oster  *
     78  1.1  oster  * Revision 1.21  1996/05/31  22:26:54  jimz
     79  1.1  oster  * fix a lot of mapping problems, memory allocation problems
     80  1.1  oster  * found some weird lock issues, fixed 'em
     81  1.1  oster  * more code cleanup
     82  1.1  oster  *
     83  1.1  oster  * Revision 1.20  1996/05/30  12:59:18  jimz
     84  1.1  oster  * make etimer happier, more portable
     85  1.1  oster  *
     86  1.1  oster  * Revision 1.19  1996/05/30  11:29:41  jimz
     87  1.1  oster  * Numerous bug fixes. Stripe lock release code disagreed with the taking code
     88  1.1  oster  * about when stripes should be locked (I made it consistent: no parity, no lock)
     89  1.1  oster  * There was a lot of extra serialization of I/Os which I've removed- a lot of
     90  1.1  oster  * it was to calculate values for the cache code, which is no longer with us.
     91  1.1  oster  * More types, function, macro cleanup. Added code to properly quiesce the array
     92  1.1  oster  * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
     93  1.1  oster  * before. Fixed memory allocation, freeing bugs.
     94  1.1  oster  *
     95  1.1  oster  * Revision 1.18  1996/05/27  18:56:37  jimz
     96  1.1  oster  * more code cleanup
     97  1.1  oster  * better typing
     98  1.1  oster  * compiles in all 3 environments
     99  1.1  oster  *
    100  1.1  oster  * Revision 1.17  1996/05/24  22:17:04  jimz
    101  1.1  oster  * continue code + namespace cleanup
    102  1.1  oster  * typed a bunch of flags
    103  1.1  oster  *
    104  1.1  oster  * Revision 1.16  1996/05/24  04:28:55  jimz
    105  1.1  oster  * release cleanup ckpt
    106  1.1  oster  *
    107  1.1  oster  * Revision 1.15  1996/05/23  21:46:35  jimz
    108  1.1  oster  * checkpoint in code cleanup (release prep)
    109  1.1  oster  * lots of types, function names have been fixed
    110  1.1  oster  *
    111  1.1  oster  * Revision 1.14  1996/05/23  00:33:23  jimz
    112  1.1  oster  * code cleanup: move all debug decls to rf_options.c, all extern
    113  1.1  oster  * debug decls to rf_options.h, all debug vars preceded by rf_
    114  1.1  oster  *
    115  1.1  oster  * Revision 1.13  1996/05/18  19:51:34  jimz
    116  1.1  oster  * major code cleanup- fix syntax, make some types consistent,
    117  1.1  oster  * add prototypes, clean out dead code, et cetera
    118  1.1  oster  *
    119  1.1  oster  * Revision 1.12  1996/05/08  21:01:24  jimz
    120  1.1  oster  * fixed up enum type names that were conflicting with other
    121  1.1  oster  * enums and function names (ie, "panic")
    122  1.1  oster  * future naming trends will be towards RF_ and rf_ for
    123  1.1  oster  * everything raidframe-related
    124  1.1  oster  *
    125  1.1  oster  * Revision 1.11  1996/05/03  19:47:50  wvcii
    126  1.1  oster  * removed include of rf_redstripe.h
    127  1.1  oster  *
    128  1.1  oster  * Revision 1.10  1995/12/12  18:10:06  jimz
    129  1.1  oster  * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
    130  1.1  oster  * fix 80-column brain damage in comments
    131  1.1  oster  *
    132  1.1  oster  * Revision 1.9  1995/11/30  16:17:57  wvcii
    133  1.1  oster  * added copyright info
    134  1.1  oster  *
    135  1.1  oster  * Revision 1.8  1995/11/07  15:33:25  wvcii
    136  1.1  oster  * dag creation routines now generate term node
    137  1.1  oster  * added asserts
    138  1.1  oster  * encoded commit point nodes, antecedence types into dags
    139  1.1  oster  * didn't add commit barrier - the code is a mess and needs to
    140  1.1  oster  * be cleand up first
    141  1.1  oster  *
    142  1.1  oster  */
    143  1.1  oster 
    144  1.1  oster #include "rf_archs.h"
    145  1.1  oster 
    146  1.1  oster #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
    147  1.1  oster 
    148  1.1  oster #include "rf_types.h"
    149  1.1  oster #include "rf_raid.h"
    150  1.1  oster #include "rf_dag.h"
    151  1.1  oster #include "rf_dagfuncs.h"
    152  1.1  oster #include "rf_dagutils.h"
    153  1.1  oster #include "rf_etimer.h"
    154  1.1  oster #include "rf_acctrace.h"
    155  1.1  oster #include "rf_general.h"
    156  1.1  oster #include "rf_pqdegdags.h"
    157  1.1  oster #include "rf_pq.h"
    158  1.1  oster #include "rf_sys.h"
    159  1.1  oster 
    160  1.1  oster static void applyPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda, RF_PhysDiskAddr_t *ppda,
    161  1.1  oster 	RF_PhysDiskAddr_t *qpda, void *bp);
    162  1.1  oster 
    163  1.1  oster /*
    164  1.1  oster    Two data drives have failed, and we are doing a read that covers one of them.
    165  1.1  oster    We may also be reading some of the surviving drives.
    166  1.1  oster 
    167  1.1  oster 
    168  1.1  oster  *****************************************************************************************
    169  1.1  oster  *
    170  1.1  oster  * creates a DAG to perform a degraded-mode read of data within one stripe.
    171  1.1  oster  * This DAG is as follows:
    172  1.1  oster  *
    173  1.1  oster  *                                      Hdr
    174  1.1  oster  *                                       |
    175  1.1  oster  *                                     Block
    176  1.1  oster  *                       /         /           \         \     \   \
    177  1.1  oster  *                      Rud  ...  Rud         Rrd  ...  Rrd    Rp  Rq
    178  1.1  oster  *                      | \       | \         | \       | \    | \ | \
    179  1.1  oster  *
    180  1.1  oster  *                                 |                 |
    181  1.1  oster  *                              Unblock              X
    182  1.1  oster  *                                  \               /
    183  1.1  oster  *                                   ------ T ------
    184  1.1  oster  *
    185  1.1  oster  * Each R node is a successor of the L node
    186  1.1  oster  * One successor arc from each R node goes to U, and the other to X
    187  1.1  oster  * There is one Rud for each chunk of surviving user data requested by the user,
    188  1.1  oster  * and one Rrd for each chunk of surviving user data _not_ being read by the user
    189  1.1  oster  * R = read, ud = user data, rd = recovery (surviving) data, p = P data, q = Qdata
    190  1.1  oster  * X = pq recovery node, T = terminate
    191  1.1  oster  *
    192  1.1  oster  * The block & unblock nodes are leftovers from a previous version.  They
    193  1.1  oster  * do nothing, but I haven't deleted them because it would be a tremendous
    194  1.1  oster  * effort to put them back in.
    195  1.1  oster  *
    196  1.1  oster  * Note:  The target buffer for the XOR node is set to the actual user buffer where the
    197  1.1  oster  * failed data is supposed to end up.  This buffer is zero'd by the code here.  Thus,
    198  1.1  oster  * if you create a degraded read dag, use it, and then re-use, you have to be sure to
    199  1.1  oster  * zero the target buffer prior to the re-use.
    200  1.1  oster  *
    201  1.1  oster  * Every buffer read is passed to the pq recovery node, whose job it is to sort out what's
    202  1.1  oster  * needed and what's not.
    203  1.1  oster  ****************************************************************************************/
/*
 * Init a disk read node with 2 successors and one predecessor.
 *
 * The node's succedents are set to unblockNode and recoveryNode, its single
 * antecedent to blockNode (antecedent type rf_control).  The expansion refers
 * to dag_h, allocList, blockNode, unblockNode and recoveryNode, all of which
 * must be in scope where the macro is used.
 *
 * Wrapped in do { } while (0) so that the macro behaves as one statement,
 * e.g. as the body of an unbraced if.
 */
#define INIT_DISK_NODE(node,name) \
do { \
  rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \
  (node)->succedents[0] = unblockNode; \
  (node)->succedents[1] = recoveryNode; \
  (node)->antecedents[0] = blockNode; \
  (node)->antType[0] = rf_control; \
} while (0)

/*
 * Fill in the four parameters of a disk node: the physical disk address _p_,
 * its buffer, the parity stripe ID and the packed priority/reconstruction
 * unit word.  _node_ is the node object itself (not a pointer).  The
 * expansion refers to parityStripeID and which_ru, which must be in scope.
 * Note that _p_ is evaluated twice.
 */
#define DISK_NODE_PARAMS(_node_,_p_) \
do { \
  (_node_).params[0].p = (_p_); \
  (_node_).params[1].p = (_p_)->bufPtr; \
  (_node_).params[2].v = parityStripeID; \
  (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); \
} while (0)

/* Fetch the physical disk address stored as parameter 0 of a disk node. */
#define DISK_NODE_PDA(node)  ((node)->params[0].p)
    219  1.1  oster 
    220  1.1  oster RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
    221  1.1  oster {
    222  1.1  oster   rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
    223  1.1  oster     "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
    224  1.1  oster }
    225  1.1  oster 
    226  1.1  oster static void applyPDA(raidPtr,pda,ppda,qpda, bp)
    227  1.1  oster   RF_Raid_t          *raidPtr;
    228  1.1  oster   RF_PhysDiskAddr_t  *pda;
    229  1.1  oster   RF_PhysDiskAddr_t  *ppda;
    230  1.1  oster   RF_PhysDiskAddr_t  *qpda;
    231  1.1  oster   void               *bp;
    232  1.1  oster {
    233  1.1  oster   RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    234  1.1  oster   RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
    235  1.1  oster   RF_SectorCount_t s0len = ppda->numSector, len;
    236  1.1  oster   RF_SectorNum_t suoffset;
    237  1.1  oster   unsigned coeff;
    238  1.1  oster   char *pbuf = ppda->bufPtr;
    239  1.1  oster   char *qbuf = qpda->bufPtr;
    240  1.1  oster   char *buf;
    241  1.1  oster   int delta;
    242  1.1  oster 
    243  1.1  oster   suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
    244  1.1  oster   len = pda->numSector;
    245  1.1  oster   /* see if pda intersects a recovery pda */
    246  1.1  oster   if ((suoffset < s0off+s0len) && ( suoffset+len > s0off))
    247  1.1  oster     {
    248  1.1  oster       buf = pda->bufPtr;
    249  1.1  oster       coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),pda->raidAddress);
    250  1.1  oster       coeff = (coeff % raidPtr->Layout.numDataCol);
    251  1.1  oster 
    252  1.1  oster       if (suoffset < s0off)
    253  1.1  oster 	{
    254  1.1  oster 	  delta = s0off - suoffset;
    255  1.1  oster 	  buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),delta);
    256  1.1  oster 	  suoffset = s0off;
    257  1.1  oster 	  len -= delta;
    258  1.1  oster 	}
    259  1.1  oster       if (suoffset > s0off)
    260  1.1  oster 	{
    261  1.1  oster 	  delta = suoffset - s0off;
    262  1.1  oster 	  pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),delta);
    263  1.1  oster 	  qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),delta);
    264  1.1  oster 	}
    265  1.1  oster       if ((suoffset + len) > (s0len + s0off))
    266  1.1  oster 	len = s0len + s0off - suoffset;
    267  1.1  oster 
    268  1.1  oster       /* src, dest, len */
    269  1.1  oster       rf_bxor(buf,pbuf,rf_RaidAddressToByte(raidPtr,len), bp);
    270  1.1  oster 
    271  1.1  oster       /* dest, src, len, coeff */
    272  1.1  oster       rf_IncQ((unsigned long *)qbuf,(unsigned long *)buf,rf_RaidAddressToByte(raidPtr,len),coeff);
    273  1.1  oster     }
    274  1.1  oster }
    275  1.1  oster /*
    276  1.1  oster    Recover data in the case of a double failure. There can be two
    277  1.1  oster    result buffers, one for each chunk of data trying to be recovered.
    278  1.1  oster    The params are pda's that have not been range restricted or otherwise
    279  1.1  oster    politely massaged - this should be done here. The last params are the
    280  1.1  oster    pdas of P and Q, followed by the raidPtr. The list can look like
    281  1.1  oster 
    282  1.1  oster    pda, pda, ... , p pda, q pda, raidptr, asm
    283  1.1  oster 
    284  1.1  oster    or
    285  1.1  oster 
    286  1.1  oster    pda, pda, ... , p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm
    287  1.1  oster 
    288  1.1  oster    depending on whether two chunks of recovery data were required.
    289  1.1  oster 
    290  1.1  oster    The second condition only arises if there are two failed buffers
    291  1.1  oster    whose lengths do not add up to a stripe unit.
    292  1.1  oster */
    293  1.1  oster 
    294  1.1  oster 
    295  1.1  oster int rf_PQDoubleRecoveryFunc(node)
    296  1.1  oster   RF_DagNode_t  *node;
    297  1.1  oster {
    298  1.1  oster   int np = node->numParams;
    299  1.1  oster   RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np-1].p;
    300  1.1  oster   RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-2].p;
    301  1.1  oster   RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout);
    302  1.1  oster   int d, i;
    303  1.1  oster   unsigned coeff;
    304  1.1  oster   RF_RaidAddr_t sosAddr, suoffset;
    305  1.1  oster   RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
    306  1.1  oster   int two = 0;
    307  1.1  oster   RF_PhysDiskAddr_t *ppda,*ppda2,*qpda,*qpda2,*pda,npda;
    308  1.1  oster   char *buf;
    309  1.1  oster   int numDataCol = layoutPtr->numDataCol;
    310  1.1  oster   RF_Etimer_t timer;
    311  1.1  oster   RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    312  1.1  oster 
    313  1.1  oster   RF_ETIMER_START(timer);
    314  1.1  oster 
    315  1.1  oster   if (asmap->failedPDAs[1] &&
    316  1.1  oster       (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU))
    317  1.1  oster     {
    318  1.1  oster       RF_ASSERT(0);
    319  1.1  oster       ppda  = node->params[np-6].p;
    320  1.1  oster       ppda2 = node->params[np-5].p;
    321  1.1  oster       qpda  = node->params[np-4].p;
    322  1.1  oster       qpda2 = node->params[np-3].p;
    323  1.1  oster       d = (np-6);
    324  1.1  oster       two = 1;
    325  1.1  oster     }
    326  1.1  oster   else
    327  1.1  oster     {
    328  1.1  oster       ppda = node->params[np-4].p;
    329  1.1  oster       qpda = node->params[np-3].p;
    330  1.1  oster       d = (np-4);
    331  1.1  oster     }
    332  1.1  oster 
    333  1.1  oster   for (i=0; i < d; i++)
    334  1.1  oster     {
    335  1.1  oster       pda = node->params[i].p;
    336  1.1  oster       buf = pda->bufPtr;
    337  1.1  oster       suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
    338  1.1  oster       len = pda->numSector;
    339  1.1  oster       coeff = rf_RaidAddressToStripeUnitID(layoutPtr,pda->raidAddress);
    340  1.1  oster       /* compute the data unit offset within the column */
    341  1.1  oster       coeff = (coeff % raidPtr->Layout.numDataCol);
    342  1.1  oster       /* see if pda intersects a recovery pda */
    343  1.1  oster       applyPDA(raidPtr,pda,ppda,qpda,node->dagHdr->bp);
    344  1.1  oster       if (two)
    345  1.1  oster 	applyPDA(raidPtr,pda,ppda,qpda,node->dagHdr->bp);
    346  1.1  oster     }
    347  1.1  oster 
    348  1.1  oster   /* ok, we got the parity back to the point where we can recover.
    349  1.1  oster      We now need to determine the coeff of the columns that need to be
    350  1.1  oster      recovered. We can also only need to recover a single stripe unit.
    351  1.1  oster      */
    352  1.1  oster 
    353  1.1  oster   if (asmap->failedPDAs[1] == NULL)
    354  1.1  oster     { /* only a single stripe unit to recover. */
    355  1.1  oster       pda = asmap->failedPDAs[0];
    356  1.1  oster       sosAddr      = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
    357  1.1  oster       /* need to determine the column of the other failed disk */
    358  1.1  oster       coeff = rf_RaidAddressToStripeUnitID(layoutPtr,pda->raidAddress);
    359  1.1  oster       /* compute the data unit offset within the column */
    360  1.1  oster       coeff = (coeff % raidPtr->Layout.numDataCol);
    361  1.1  oster       for (i=0; i < numDataCol; i++)
    362  1.1  oster 	{
    363  1.1  oster 	  npda.raidAddress = sosAddr + (i * secPerSU);
    364  1.1  oster 	  (raidPtr->Layout.map->MapSector)(raidPtr,npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
    365  1.1  oster 	  /* skip over dead disks */
    366  1.1  oster 	  if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
    367  1.1  oster 	    if (i != coeff) break;
    368  1.1  oster 	}
    369  1.1  oster       RF_ASSERT (i < numDataCol);
    370  1.1  oster       RF_ASSERT (two==0);
    371  1.1  oster       /* recover the data. Since we need only want to recover one column, we overwrite the
    372  1.1  oster 	 parity with the other one. */
    373  1.1  oster       if (coeff < i) /* recovering 'a' */
    374  1.1  oster 	rf_PQ_recover((unsigned long *)ppda->bufPtr,(unsigned long *)qpda->bufPtr,(unsigned long *)pda->bufPtr,(unsigned long *)ppda->bufPtr,rf_RaidAddressToByte(raidPtr,pda->numSector), coeff, i);
    375  1.1  oster       else /* recovering 'b' */
    376  1.1  oster 	rf_PQ_recover((unsigned long *)ppda->bufPtr,(unsigned long *)qpda->bufPtr,(unsigned long *)ppda->bufPtr,(unsigned long *)pda->bufPtr,rf_RaidAddressToByte(raidPtr,pda->numSector), i, coeff);
    377  1.1  oster     }
    378  1.1  oster   else
    379  1.1  oster     RF_PANIC();
    380  1.1  oster 
    381  1.1  oster   RF_ETIMER_STOP(timer);
    382  1.1  oster   RF_ETIMER_EVAL(timer);
    383  1.1  oster   if (tracerec)
    384  1.1  oster     tracerec->q_us += RF_ETIMER_VAL_US(timer);
    385  1.1  oster   rf_GenericWakeupFunc(node,0);
    386  1.1  oster   return(0);
    387  1.1  oster }
    388  1.1  oster 
    389  1.1  oster int rf_PQWriteDoubleRecoveryFunc(node)
    390  1.1  oster   RF_DagNode_t  *node;
    391  1.1  oster {
    392  1.1  oster   /* The situation:
    393  1.1  oster 
    394  1.1  oster          We are doing a write that hits only one
    395  1.1  oster 	 failed data unit.
    396  1.1  oster 	 The other failed data unit is not being overwritten, so
    397  1.1  oster 	 we need to generate it.
    398  1.1  oster 
    399  1.1  oster 	 For the moment, we assume all the nonfailed data being
    400  1.1  oster 	 written is in the shadow of the failed data unit.
    401  1.1  oster 	 (i.e,, either a single data unit write or the entire
    402  1.1  oster 	 failed stripe unit is being overwritten. )
    403  1.1  oster 
    404  1.1  oster 	 Recovery strategy:
    405  1.1  oster 	     apply the recovery data to the parity and q.
    406  1.1  oster 	     Use P & Q to recover the second failed data unit in P.
    407  1.1  oster 	     Zero fill Q, then apply the recovered data to p.
    408  1.1  oster 	     Then apply the data being written to the failed drive.
    409  1.1  oster 	     Then walk through the surviving drives, applying new data
    410  1.1  oster 	     when it exists, othewise the recovery data. Quite a mess.
    411  1.1  oster 
    412  1.1  oster 
    413  1.1  oster 	The params
    414  1.1  oster 
    415  1.1  oster 	read pda0, read pda1, ... read pda (numDataCol-3),
    416  1.1  oster 	write pda0, ... , write pda (numStripeUnitAccess - numDataFailed),
    417  1.1  oster 	failed pda, raidPtr, asmap
    418  1.1  oster    */
    419  1.1  oster 
    420  1.1  oster   int np = node->numParams;
    421  1.1  oster   RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np-1].p;
    422  1.1  oster   RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-2].p;
    423  1.1  oster   RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout);
    424  1.1  oster   int i;
    425  1.1  oster   RF_RaidAddr_t sosAddr;
    426  1.1  oster   unsigned coeff;
    427  1.1  oster   RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
    428  1.1  oster   RF_PhysDiskAddr_t *ppda,*qpda,*pda,npda;
    429  1.1  oster   int numDataCol = layoutPtr->numDataCol;
    430  1.1  oster   RF_Etimer_t timer;
    431  1.1  oster   RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    432  1.1  oster 
    433  1.1  oster   RF_ASSERT(node->numResults == 2);
    434  1.1  oster   RF_ASSERT(asmap->failedPDAs[1] == NULL);
    435  1.1  oster   RF_ETIMER_START(timer);
    436  1.1  oster   ppda = node->results[0];
    437  1.1  oster   qpda = node->results[1];
    438  1.1  oster   /* apply the recovery data */
    439  1.1  oster   for (i=0; i < numDataCol-2; i++)
    440  1.1  oster     applyPDA(raidPtr,node->params[i].p,ppda,qpda, node->dagHdr->bp);
    441  1.1  oster 
    442  1.1  oster   /* determine the other failed data unit */
    443  1.1  oster   pda = asmap->failedPDAs[0];
    444  1.1  oster   sosAddr      = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
    445  1.1  oster   /* need to determine the column of the other failed disk */
    446  1.1  oster   coeff = rf_RaidAddressToStripeUnitID(layoutPtr,pda->raidAddress);
    447  1.1  oster   /* compute the data unit offset within the column */
    448  1.1  oster   coeff = (coeff % raidPtr->Layout.numDataCol);
    449  1.1  oster   for (i=0; i < numDataCol; i++)
    450  1.1  oster     {
    451  1.1  oster       npda.raidAddress = sosAddr + (i * secPerSU);
    452  1.1  oster       (raidPtr->Layout.map->MapSector)(raidPtr,npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
    453  1.1  oster       /* skip over dead disks */
    454  1.1  oster       if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
    455  1.1  oster 	if (i != coeff) break;
    456  1.1  oster     }
    457  1.1  oster   RF_ASSERT (i < numDataCol);
    458  1.1  oster   /* recover the data. The column we want to recover we write over the parity.
    459  1.1  oster      The column we don't care about we dump in q. */
    460  1.1  oster   if (coeff < i) /* recovering 'a' */
    461  1.1  oster     rf_PQ_recover((unsigned long *)ppda->bufPtr,(unsigned long *)qpda->bufPtr,(unsigned long *)ppda->bufPtr,(unsigned long *)qpda->bufPtr,rf_RaidAddressToByte(raidPtr,pda->numSector), coeff, i);
    462  1.1  oster   else /* recovering 'b' */
    463  1.1  oster     rf_PQ_recover((unsigned long *)ppda->bufPtr,(unsigned long *)qpda->bufPtr,(unsigned long *)qpda->bufPtr,(unsigned long *)ppda->bufPtr,rf_RaidAddressToByte(raidPtr,pda->numSector), i, coeff);
    464  1.1  oster 
    465  1.1  oster   /* OK. The valid data is in P. Zero fill Q, then inc it into it. */
    466  1.1  oster   bzero(qpda->bufPtr,rf_RaidAddressToByte(raidPtr,qpda->numSector));
    467  1.1  oster   rf_IncQ((unsigned long *)qpda->bufPtr,(unsigned long *)ppda->bufPtr,rf_RaidAddressToByte(raidPtr,qpda->numSector),i);
    468  1.1  oster 
    469  1.1  oster   /* now apply all the write data to the buffer */
    470  1.1  oster   /* single stripe unit write case: the failed data is only thing we are writing. */
    471  1.1  oster   RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
    472  1.1  oster   /* dest, src, len, coeff */
    473  1.1  oster   rf_IncQ((unsigned long *)qpda->bufPtr,(unsigned long *)asmap->failedPDAs[0]->bufPtr,rf_RaidAddressToByte(raidPtr,qpda->numSector),coeff);
    474  1.1  oster   rf_bxor(asmap->failedPDAs[0]->bufPtr,ppda->bufPtr,rf_RaidAddressToByte(raidPtr,ppda->numSector),node->dagHdr->bp);
    475  1.1  oster 
    476  1.1  oster   /* now apply all the recovery data */
    477  1.1  oster   for (i=0; i < numDataCol-2; i++)
    478  1.1  oster     applyPDA(raidPtr,node->params[i].p,ppda,qpda, node->dagHdr->bp);
    479  1.1  oster 
    480  1.1  oster   RF_ETIMER_STOP(timer);
    481  1.1  oster   RF_ETIMER_EVAL(timer);
    482  1.1  oster   if (tracerec)
    483  1.1  oster     tracerec->q_us += RF_ETIMER_VAL_US(timer);
    484  1.1  oster 
    485  1.1  oster   rf_GenericWakeupFunc(node,0);
    486  1.1  oster   return(0);
    487  1.1  oster }
    488  1.1  oster 
/*
 * DAG creation routine for a write that completely covers both failed data
 * units.  Not implemented: it panics unconditionally.
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
{
  /* no large-write DAG exists for this case */
  RF_PANIC();
}
    493  1.1  oster 
    494  1.1  oster /*
    495  1.1  oster    Two lost data unit write case.
    496  1.1  oster 
    497  1.1  oster    There are really two cases here:
    498  1.1  oster 
    499  1.1  oster    (1) The write completely covers the two lost data units.
    500  1.1  oster        In that case, a reconstruct write that doesn't write the
    501  1.1  oster        failed data units will do the correct thing. So in this case,
    502  1.1  oster        the dag looks like
    503  1.1  oster 
    504  1.1  oster             full stripe read of surviving data units (not being overwritten)
    505  1.1  oster 	    write new data (ignoring failed units)   compute P&Q
    506  1.1  oster 	                                             write P&Q
    507  1.1  oster 
    508  1.1  oster 
    509  1.1  oster    (2) The write does not completely cover both failed data units
    510  1.1  oster        (but touches at least one of them). Then we need to do the
    511  1.1  oster        equivalent of a reconstruct read to recover the missing data
    512  1.1  oster        unit from the other stripe.
    513  1.1  oster 
    514  1.1  oster        For any data we are writing that is not in the "shadow"
    515  1.1  oster        of the failed units, we need to do a four cycle update.
    516  1.1  oster        PANIC on this case. for now
    517  1.1  oster 
    518  1.1  oster */
    519  1.1  oster 
    520  1.1  oster RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
    521  1.1  oster {
    522  1.1  oster   RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    523  1.1  oster   RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
    524  1.1  oster   int sum;
    525  1.1  oster   int nf = asmap->numDataFailed;
    526  1.1  oster 
    527  1.1  oster   sum = asmap->failedPDAs[0]->numSector;
    528  1.1  oster   if (nf == 2)
    529  1.1  oster     sum += asmap->failedPDAs[1]->numSector;
    530  1.1  oster 
    531  1.1  oster   if ((nf == 2) && ( sum == (2*sectorsPerSU)))
    532  1.1  oster     {
    533  1.1  oster       /* large write case */
    534  1.1  oster       rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
    535  1.1  oster       return;
    536  1.1  oster     }
    537  1.1  oster 
    538  1.1  oster 
    539  1.1  oster   if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU))
    540  1.1  oster     {
    541  1.1  oster       /* small write case, no user data not in shadow */
    542  1.1  oster       rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
    543  1.1  oster       return;
    544  1.1  oster     }
    545  1.1  oster   RF_PANIC();
    546  1.1  oster }
    547  1.1  oster 
    548  1.1  oster RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
    549  1.1  oster {
    550  1.1  oster   rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
    551  1.1  oster }
    552  1.1  oster 
    553  1.1  oster #endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */
    554