Home | History | Annotate | Line # | Download | only in raidframe
rf_pqdegdags.c revision 1.2
      1 /*	$NetBSD: rf_pqdegdags.c,v 1.2 1999/01/26 02:34:00 oster Exp $	*/
      2 /*
      3  * Copyright (c) 1995 Carnegie-Mellon University.
      4  * All rights reserved.
      5  *
      6  * Author: Daniel Stodolsky
      7  *
      8  * Permission to use, copy, modify and distribute this software and
      9  * its documentation is hereby granted, provided that both the copyright
     10  * notice and this permission notice appear in all copies of the
     11  * software, derivative works or modified versions, and any portions
     12  * thereof, and that both notices appear in supporting documentation.
     13  *
     14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     17  *
     18  * Carnegie Mellon requests users of this software to return to
     19  *
     20  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     21  *  School of Computer Science
     22  *  Carnegie Mellon University
     23  *  Pittsburgh PA 15213-3890
     24  *
     25  * any improvements or extensions that they make and grant Carnegie the
     26  * rights to redistribute these changes.
     27  */
     28 
     29 /*
     30  * rf_pqdegdags.c
     31  * Degraded mode dags for double fault cases.
     32 */
     33 
     34 
     35 #include "rf_archs.h"
     36 
     37 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
     38 
     39 #include "rf_types.h"
     40 #include "rf_raid.h"
     41 #include "rf_dag.h"
     42 #include "rf_dagfuncs.h"
     43 #include "rf_dagutils.h"
     44 #include "rf_etimer.h"
     45 #include "rf_acctrace.h"
     46 #include "rf_general.h"
     47 #include "rf_pqdegdags.h"
     48 #include "rf_pq.h"
     49 #include "rf_sys.h"
     50 
     51 static void applyPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda, RF_PhysDiskAddr_t *ppda,
     52 	RF_PhysDiskAddr_t *qpda, void *bp);
     53 
     54 /*
     55    Two data drives have failed, and we are doing a read that covers one of them.
     56    We may also be reading some of the surviving drives.
     57 
     58 
     59  *****************************************************************************************
     60  *
     61  * creates a DAG to perform a degraded-mode read of data within one stripe.
     62  * This DAG is as follows:
     63  *
     64  *                                      Hdr
     65  *                                       |
     66  *                                     Block
     67  *                       /         /           \         \     \   \
     68  *                      Rud  ...  Rud         Rrd  ...  Rrd    Rp  Rq
     69  *                      | \       | \         | \       | \    | \ | \
     70  *
     71  *                                 |                 |
     72  *                              Unblock              X
     73  *                                  \               /
     74  *                                   ------ T ------
     75  *
     76  * Each R node is a successor of the L node
     77  * One successor arc from each R node goes to U, and the other to X
     78  * There is one Rud for each chunk of surviving user data requested by the user,
     79  * and one Rrd for each chunk of surviving user data _not_ being read by the user
     80  * R = read, ud = user data, rd = recovery (surviving) data, p = P data, q = Qdata
     81  * X = pq recovery node, T = terminate
     82  *
     83  * The block & unblock nodes are leftovers from a previous version.  They
     84  * do nothing, but I haven't deleted them because it would be a tremendous
     85  * effort to put them back in.
     86  *
     87  * Note:  The target buffer for the XOR node is set to the actual user buffer where the
     88  * failed data is supposed to end up.  This buffer is zero'd by the code here.  Thus,
     89  * if you create a degraded read dag, use it, and then re-use, you have to be sure to
     90  * zero the target buffer prior to the re-use.
     91  *
     92  * Every buffer read is passed to the pq recovery node, whose job it is to sort out what's
     93  * needed and what's not.
     94  ****************************************************************************************/
/*
 * Initialize a disk-read node with two successors and one predecessor.
 *
 * rf_InitNode() is called with rf_DiskReadFunc / rf_DiskReadUndoFunc /
 * rf_GenericWakeupFunc; the "2,1,4,0" arguments are presumably the
 * succedent, antecedent, param and result counts (confirm against
 * rf_InitNode()).  The node is then wired up by hand:
 *   succedents[0]  = unblockNode
 *   succedents[1]  = recoveryNode
 *   antecedents[0] = blockNode (control dependency)
 *
 * dag_h, allocList, blockNode, unblockNode and recoveryNode are taken from
 * the scope in which the macro is expanded.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement; without it, using the macro as the unbraced body of an
 * if/for would guard only the rf_InitNode() call.
 */
#define INIT_DISK_NODE(node,name) \
do { \
  rf_InitNode((node), rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, (name), allocList); \
  (node)->succedents[0] = unblockNode; \
  (node)->succedents[1] = recoveryNode; \
  (node)->antecedents[0] = blockNode; \
  (node)->antType[0] = rf_control; \
} while (0)
    102 
/*
 * Fill in the four parameters of a disk node from a physical disk address.
 *
 *   params[0].p = the pda itself
 *   params[1].p = the pda's buffer (bufPtr)
 *   params[2].v = parityStripeID
 *   params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru)
 *
 * _node_ is a node object (accessed with '.'), _p_ is a pointer to the pda.
 * parityStripeID and which_ru are taken from the expanding scope.
 *
 * Wrapped in do { } while (0) so the four assignments form one statement,
 * and _p_ is parenthesized at every use so that an expression argument
 * cannot be re-associated by the expansion.
 */
#define DISK_NODE_PARAMS(_node_,_p_) \
do { \
  (_node_).params[0].p = (_p_); \
  (_node_).params[1].p = (_p_)->bufPtr; \
  (_node_).params[2].v = parityStripeID; \
  (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); \
} while (0)
    108 
/* Fetch parameter 0 of a disk node (given by pointer); DISK_NODE_PARAMS stores the pda there. */
#define DISK_NODE_PDA(_node_)  ((_node_)->params[0].p)
    110 
    111 RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
    112 {
    113   rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
    114     "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
    115 }
    116 
    117 static void applyPDA(raidPtr,pda,ppda,qpda, bp)
    118   RF_Raid_t          *raidPtr;
    119   RF_PhysDiskAddr_t  *pda;
    120   RF_PhysDiskAddr_t  *ppda;
    121   RF_PhysDiskAddr_t  *qpda;
    122   void               *bp;
    123 {
    124   RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    125   RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
    126   RF_SectorCount_t s0len = ppda->numSector, len;
    127   RF_SectorNum_t suoffset;
    128   unsigned coeff;
    129   char *pbuf = ppda->bufPtr;
    130   char *qbuf = qpda->bufPtr;
    131   char *buf;
    132   int delta;
    133 
    134   suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
    135   len = pda->numSector;
    136   /* see if pda intersects a recovery pda */
    137   if ((suoffset < s0off+s0len) && ( suoffset+len > s0off))
    138     {
    139       buf = pda->bufPtr;
    140       coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),pda->raidAddress);
    141       coeff = (coeff % raidPtr->Layout.numDataCol);
    142 
    143       if (suoffset < s0off)
    144 	{
    145 	  delta = s0off - suoffset;
    146 	  buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),delta);
    147 	  suoffset = s0off;
    148 	  len -= delta;
    149 	}
    150       if (suoffset > s0off)
    151 	{
    152 	  delta = suoffset - s0off;
    153 	  pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),delta);
    154 	  qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),delta);
    155 	}
    156       if ((suoffset + len) > (s0len + s0off))
    157 	len = s0len + s0off - suoffset;
    158 
    159       /* src, dest, len */
    160       rf_bxor(buf,pbuf,rf_RaidAddressToByte(raidPtr,len), bp);
    161 
    162       /* dest, src, len, coeff */
    163       rf_IncQ((unsigned long *)qbuf,(unsigned long *)buf,rf_RaidAddressToByte(raidPtr,len),coeff);
    164     }
    165 }
    166 /*
    167    Recover data in the case of a double failure. There can be two
    168    result buffers, one for each chunk of data trying to be recovered.
    169    The params are pda's that have not been range restricted or otherwise
    170    politely massaged - this should be done here. The last params are the
    171    pdas of P and Q, followed by the raidPtr. The list can look like
    172 
    173    pda, pda, ... , p pda, q pda, raidptr, asm
    174 
    175    or
    176 
    177    pda, pda, ... , p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm
    178 
    179    depending on whether two chunks of recovery data were required.
    180 
    181    The second condition only arises if there are two failed buffers
    182    whose lengths do not add up to a stripe unit.
    183 */
    184 
    185 
    186 int rf_PQDoubleRecoveryFunc(node)
    187   RF_DagNode_t  *node;
    188 {
    189   int np = node->numParams;
    190   RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np-1].p;
    191   RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-2].p;
    192   RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout);
    193   int d, i;
    194   unsigned coeff;
    195   RF_RaidAddr_t sosAddr, suoffset;
    196   RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
    197   int two = 0;
    198   RF_PhysDiskAddr_t *ppda,*ppda2,*qpda,*qpda2,*pda,npda;
    199   char *buf;
    200   int numDataCol = layoutPtr->numDataCol;
    201   RF_Etimer_t timer;
    202   RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    203 
    204   RF_ETIMER_START(timer);
    205 
    206   if (asmap->failedPDAs[1] &&
    207       (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU))
    208     {
    209       RF_ASSERT(0);
    210       ppda  = node->params[np-6].p;
    211       ppda2 = node->params[np-5].p;
    212       qpda  = node->params[np-4].p;
    213       qpda2 = node->params[np-3].p;
    214       d = (np-6);
    215       two = 1;
    216     }
    217   else
    218     {
    219       ppda = node->params[np-4].p;
    220       qpda = node->params[np-3].p;
    221       d = (np-4);
    222     }
    223 
    224   for (i=0; i < d; i++)
    225     {
    226       pda = node->params[i].p;
    227       buf = pda->bufPtr;
    228       suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
    229       len = pda->numSector;
    230       coeff = rf_RaidAddressToStripeUnitID(layoutPtr,pda->raidAddress);
    231       /* compute the data unit offset within the column */
    232       coeff = (coeff % raidPtr->Layout.numDataCol);
    233       /* see if pda intersects a recovery pda */
    234       applyPDA(raidPtr,pda,ppda,qpda,node->dagHdr->bp);
    235       if (two)
    236 	applyPDA(raidPtr,pda,ppda,qpda,node->dagHdr->bp);
    237     }
    238 
    239   /* ok, we got the parity back to the point where we can recover.
    240      We now need to determine the coeff of the columns that need to be
    241      recovered. We can also only need to recover a single stripe unit.
    242      */
    243 
    244   if (asmap->failedPDAs[1] == NULL)
    245     { /* only a single stripe unit to recover. */
    246       pda = asmap->failedPDAs[0];
    247       sosAddr      = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
    248       /* need to determine the column of the other failed disk */
    249       coeff = rf_RaidAddressToStripeUnitID(layoutPtr,pda->raidAddress);
    250       /* compute the data unit offset within the column */
    251       coeff = (coeff % raidPtr->Layout.numDataCol);
    252       for (i=0; i < numDataCol; i++)
    253 	{
    254 	  npda.raidAddress = sosAddr + (i * secPerSU);
    255 	  (raidPtr->Layout.map->MapSector)(raidPtr,npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
    256 	  /* skip over dead disks */
    257 	  if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
    258 	    if (i != coeff) break;
    259 	}
    260       RF_ASSERT (i < numDataCol);
    261       RF_ASSERT (two==0);
    262       /* recover the data. Since we need only want to recover one column, we overwrite the
    263 	 parity with the other one. */
    264       if (coeff < i) /* recovering 'a' */
    265 	rf_PQ_recover((unsigned long *)ppda->bufPtr,(unsigned long *)qpda->bufPtr,(unsigned long *)pda->bufPtr,(unsigned long *)ppda->bufPtr,rf_RaidAddressToByte(raidPtr,pda->numSector), coeff, i);
    266       else /* recovering 'b' */
    267 	rf_PQ_recover((unsigned long *)ppda->bufPtr,(unsigned long *)qpda->bufPtr,(unsigned long *)ppda->bufPtr,(unsigned long *)pda->bufPtr,rf_RaidAddressToByte(raidPtr,pda->numSector), i, coeff);
    268     }
    269   else
    270     RF_PANIC();
    271 
    272   RF_ETIMER_STOP(timer);
    273   RF_ETIMER_EVAL(timer);
    274   if (tracerec)
    275     tracerec->q_us += RF_ETIMER_VAL_US(timer);
    276   rf_GenericWakeupFunc(node,0);
    277   return(0);
    278 }
    279 
    280 int rf_PQWriteDoubleRecoveryFunc(node)
    281   RF_DagNode_t  *node;
    282 {
    283   /* The situation:
    284 
    285          We are doing a write that hits only one
    286 	 failed data unit.
    287 	 The other failed data unit is not being overwritten, so
    288 	 we need to generate it.
    289 
    290 	 For the moment, we assume all the nonfailed data being
    291 	 written is in the shadow of the failed data unit.
    292 	 (i.e,, either a single data unit write or the entire
    293 	 failed stripe unit is being overwritten. )
    294 
    295 	 Recovery strategy:
    296 	     apply the recovery data to the parity and q.
    297 	     Use P & Q to recover the second failed data unit in P.
    298 	     Zero fill Q, then apply the recovered data to p.
    299 	     Then apply the data being written to the failed drive.
    300 	     Then walk through the surviving drives, applying new data
    301 	     when it exists, othewise the recovery data. Quite a mess.
    302 
    303 
    304 	The params
    305 
    306 	read pda0, read pda1, ... read pda (numDataCol-3),
    307 	write pda0, ... , write pda (numStripeUnitAccess - numDataFailed),
    308 	failed pda, raidPtr, asmap
    309    */
    310 
    311   int np = node->numParams;
    312   RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np-1].p;
    313   RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-2].p;
    314   RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout);
    315   int i;
    316   RF_RaidAddr_t sosAddr;
    317   unsigned coeff;
    318   RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
    319   RF_PhysDiskAddr_t *ppda,*qpda,*pda,npda;
    320   int numDataCol = layoutPtr->numDataCol;
    321   RF_Etimer_t timer;
    322   RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    323 
    324   RF_ASSERT(node->numResults == 2);
    325   RF_ASSERT(asmap->failedPDAs[1] == NULL);
    326   RF_ETIMER_START(timer);
    327   ppda = node->results[0];
    328   qpda = node->results[1];
    329   /* apply the recovery data */
    330   for (i=0; i < numDataCol-2; i++)
    331     applyPDA(raidPtr,node->params[i].p,ppda,qpda, node->dagHdr->bp);
    332 
    333   /* determine the other failed data unit */
    334   pda = asmap->failedPDAs[0];
    335   sosAddr      = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
    336   /* need to determine the column of the other failed disk */
    337   coeff = rf_RaidAddressToStripeUnitID(layoutPtr,pda->raidAddress);
    338   /* compute the data unit offset within the column */
    339   coeff = (coeff % raidPtr->Layout.numDataCol);
    340   for (i=0; i < numDataCol; i++)
    341     {
    342       npda.raidAddress = sosAddr + (i * secPerSU);
    343       (raidPtr->Layout.map->MapSector)(raidPtr,npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
    344       /* skip over dead disks */
    345       if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
    346 	if (i != coeff) break;
    347     }
    348   RF_ASSERT (i < numDataCol);
    349   /* recover the data. The column we want to recover we write over the parity.
    350      The column we don't care about we dump in q. */
    351   if (coeff < i) /* recovering 'a' */
    352     rf_PQ_recover((unsigned long *)ppda->bufPtr,(unsigned long *)qpda->bufPtr,(unsigned long *)ppda->bufPtr,(unsigned long *)qpda->bufPtr,rf_RaidAddressToByte(raidPtr,pda->numSector), coeff, i);
    353   else /* recovering 'b' */
    354     rf_PQ_recover((unsigned long *)ppda->bufPtr,(unsigned long *)qpda->bufPtr,(unsigned long *)qpda->bufPtr,(unsigned long *)ppda->bufPtr,rf_RaidAddressToByte(raidPtr,pda->numSector), i, coeff);
    355 
    356   /* OK. The valid data is in P. Zero fill Q, then inc it into it. */
    357   bzero(qpda->bufPtr,rf_RaidAddressToByte(raidPtr,qpda->numSector));
    358   rf_IncQ((unsigned long *)qpda->bufPtr,(unsigned long *)ppda->bufPtr,rf_RaidAddressToByte(raidPtr,qpda->numSector),i);
    359 
    360   /* now apply all the write data to the buffer */
    361   /* single stripe unit write case: the failed data is only thing we are writing. */
    362   RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
    363   /* dest, src, len, coeff */
    364   rf_IncQ((unsigned long *)qpda->bufPtr,(unsigned long *)asmap->failedPDAs[0]->bufPtr,rf_RaidAddressToByte(raidPtr,qpda->numSector),coeff);
    365   rf_bxor(asmap->failedPDAs[0]->bufPtr,ppda->bufPtr,rf_RaidAddressToByte(raidPtr,ppda->numSector),node->dagHdr->bp);
    366 
    367   /* now apply all the recovery data */
    368   for (i=0; i < numDataCol-2; i++)
    369     applyPDA(raidPtr,node->params[i].p,ppda,qpda, node->dagHdr->bp);
    370 
    371   RF_ETIMER_STOP(timer);
    372   RF_ETIMER_EVAL(timer);
    373   if (tracerec)
    374     tracerec->q_us += RF_ETIMER_VAL_US(timer);
    375 
    376   rf_GenericWakeupFunc(node,0);
    377   return(0);
    378 }
    379 
/*
 * DAG creation routine for a double-degraded large write.
 * Not implemented: the body does nothing except call RF_PANIC().
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
{
  /* no DAG is built for this case */
  RF_PANIC();
}
    384 
    385 /*
    386    Two lost data unit write case.
    387 
    388    There are really two cases here:
    389 
    390    (1) The write completely covers the two lost data units.
    391        In that case, a reconstruct write that doesn't write the
    392        failed data units will do the correct thing. So in this case,
    393        the dag looks like
    394 
    395             full stripe read of surviving data units (not being overwritten)
    396 	    write new data (ignoring failed units)   compute P&Q
    397 	                                             write P&Q
    398 
    399 
    400    (2) The write does not completely cover both failed data units
    401        (but touches at least one of them). Then we need to do the
    402        equivalent of a reconstruct read to recover the missing data
    403        unit from the other stripe.
    404 
    405        For any data we are writing that is not in the "shadow"
    406        of the failed units, we need to do a four cycle update.
    407        PANIC on this case. for now
    408 
    409 */
    410 
    411 RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
    412 {
    413   RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    414   RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
    415   int sum;
    416   int nf = asmap->numDataFailed;
    417 
    418   sum = asmap->failedPDAs[0]->numSector;
    419   if (nf == 2)
    420     sum += asmap->failedPDAs[1]->numSector;
    421 
    422   if ((nf == 2) && ( sum == (2*sectorsPerSU)))
    423     {
    424       /* large write case */
    425       rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
    426       return;
    427     }
    428 
    429 
    430   if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU))
    431     {
    432       /* small write case, no user data not in shadow */
    433       rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
    434       return;
    435     }
    436   RF_PANIC();
    437 }
    438 
    439 RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
    440 {
    441   rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
    442 }
    443 
    444 #endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */
    445