Home | History | Annotate | Line # | Download | only in raidframe
rf_pqdegdags.c revision 1.4
      1 /*	$NetBSD: rf_pqdegdags.c,v 1.4 1999/08/13 03:41:57 oster Exp $	*/
      2 /*
      3  * Copyright (c) 1995 Carnegie-Mellon University.
      4  * All rights reserved.
      5  *
      6  * Author: Daniel Stodolsky
      7  *
      8  * Permission to use, copy, modify and distribute this software and
      9  * its documentation is hereby granted, provided that both the copyright
     10  * notice and this permission notice appear in all copies of the
     11  * software, derivative works or modified versions, and any portions
     12  * thereof, and that both notices appear in supporting documentation.
     13  *
     14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     17  *
     18  * Carnegie Mellon requests users of this software to return to
     19  *
     20  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     21  *  School of Computer Science
     22  *  Carnegie Mellon University
     23  *  Pittsburgh PA 15213-3890
     24  *
     25  * any improvements or extensions that they make and grant Carnegie the
     26  * rights to redistribute these changes.
     27  */
     28 
     29 /*
     30  * rf_pqdegdags.c
     31  * Degraded mode dags for double fault cases.
     32 */
     33 
     34 
     35 #include "rf_archs.h"
     36 
     37 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
     38 
     39 #include "rf_types.h"
     40 #include "rf_raid.h"
     41 #include "rf_dag.h"
     42 #include "rf_dagfuncs.h"
     43 #include "rf_dagutils.h"
     44 #include "rf_etimer.h"
     45 #include "rf_acctrace.h"
     46 #include "rf_general.h"
     47 #include "rf_pqdegdags.h"
     48 #include "rf_pq.h"
     49 
     50 static void
     51 applyPDA(RF_Raid_t * raidPtr, RF_PhysDiskAddr_t * pda, RF_PhysDiskAddr_t * ppda,
     52     RF_PhysDiskAddr_t * qpda, void *bp);
     53 
     54 /*
     55    Two data drives have failed, and we are doing a read that covers one of them.
     56    We may also be reading some of the surviving drives.
     57 
     58 
     59  *****************************************************************************************
     60  *
     61  * creates a DAG to perform a degraded-mode read of data within one stripe.
     62  * This DAG is as follows:
     63  *
     64  *                                      Hdr
     65  *                                       |
     66  *                                     Block
     67  *                       /         /           \         \     \   \
     68  *                      Rud  ...  Rud         Rrd  ...  Rrd    Rp  Rq
     69  *                      | \       | \         | \       | \    | \ | \
     70  *
     71  *                                 |                 |
     72  *                              Unblock              X
     73  *                                  \               /
     74  *                                   ------ T ------
     75  *
     76  * Each R node is a successor of the L node
     77  * One successor arc from each R node goes to U, and the other to X
     78  * There is one Rud for each chunk of surviving user data requested by the user,
     79  * and one Rrd for each chunk of surviving user data _not_ being read by the user
     80  * R = read, ud = user data, rd = recovery (surviving) data, p = P data, q = Qdata
     81  * X = pq recovery node, T = terminate
     82  *
     83  * The block & unblock nodes are leftovers from a previous version.  They
     84  * do nothing, but I haven't deleted them because it would be a tremendous
     85  * effort to put them back in.
     86  *
     87  * Note:  The target buffer for the XOR node is set to the actual user buffer where the
     88  * failed data is supposed to end up.  This buffer is zero'd by the code here.  Thus,
     89  * if you create a degraded read dag, use it, and then re-use, you have to be sure to
     90  * zero the target buffer prior to the re-use.
     91  *
 * Every buffer read is passed to the pq recovery node, whose job it is to sort
 * out what's needed and what's not.
     94  ****************************************************************************************/
     95 /*   init a disk node with 2 successors and one predecessor */
     96 #define INIT_DISK_NODE(node,name) \
     97 rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \
     98 (node)->succedents[0] = unblockNode; \
     99 (node)->succedents[1] = recoveryNode; \
    100 (node)->antecedents[0] = blockNode; \
    101 (node)->antType[0] = rf_control
    102 
    103 #define DISK_NODE_PARAMS(_node_,_p_) \
    104   (_node_).params[0].p = _p_ ; \
    105   (_node_).params[1].p = (_p_)->bufPtr; \
    106   (_node_).params[2].v = parityStripeID; \
    107   (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru)
    108 
    109 #define DISK_NODE_PDA(node)  ((node)->params[0].p)
    110 
    111 RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
    112 {
    113 	rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
    114 	    "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
    115 }
    116 
    117 static void
    118 applyPDA(raidPtr, pda, ppda, qpda, bp)
    119 	RF_Raid_t *raidPtr;
    120 	RF_PhysDiskAddr_t *pda;
    121 	RF_PhysDiskAddr_t *ppda;
    122 	RF_PhysDiskAddr_t *qpda;
    123 	void   *bp;
    124 {
    125 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    126 	RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
    127 	RF_SectorCount_t s0len = ppda->numSector, len;
    128 	RF_SectorNum_t suoffset;
    129 	unsigned coeff;
    130 	char   *pbuf = ppda->bufPtr;
    131 	char   *qbuf = qpda->bufPtr;
    132 	char   *buf;
    133 	int     delta;
    134 
    135 	suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
    136 	len = pda->numSector;
    137 	/* see if pda intersects a recovery pda */
    138 	if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) {
    139 		buf = pda->bufPtr;
    140 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
    141 		coeff = (coeff % raidPtr->Layout.numDataCol);
    142 
    143 		if (suoffset < s0off) {
    144 			delta = s0off - suoffset;
    145 			buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
    146 			suoffset = s0off;
    147 			len -= delta;
    148 		}
    149 		if (suoffset > s0off) {
    150 			delta = suoffset - s0off;
    151 			pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
    152 			qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
    153 		}
    154 		if ((suoffset + len) > (s0len + s0off))
    155 			len = s0len + s0off - suoffset;
    156 
    157 		/* src, dest, len */
    158 		rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp);
    159 
    160 		/* dest, src, len, coeff */
    161 		rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf, rf_RaidAddressToByte(raidPtr, len), coeff);
    162 	}
    163 }
    164 /*
    165    Recover data in the case of a double failure. There can be two
    166    result buffers, one for each chunk of data trying to be recovered.
    167    The params are pda's that have not been range restricted or otherwise
    168    politely massaged - this should be done here. The last params are the
    169    pdas of P and Q, followed by the raidPtr. The list can look like
    170 
    171    pda, pda, ... , p pda, q pda, raidptr, asm
    172 
    173    or
    174 
    175    pda, pda, ... , p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm
    176 
   depending on whether two chunks of recovery data were required.

   The second condition only arises if there are two failed buffers
   whose lengths do not add up to a stripe unit.
    181 */
    182 
    183 
    184 int
    185 rf_PQDoubleRecoveryFunc(node)
    186 	RF_DagNode_t *node;
    187 {
    188 	int     np = node->numParams;
    189 	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
    190 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
    191 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
    192 	int     d, i;
    193 	unsigned coeff;
    194 	RF_RaidAddr_t sosAddr, suoffset;
    195 	RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
    196 	int     two = 0;
    197 	RF_PhysDiskAddr_t *ppda, *ppda2, *qpda, *qpda2, *pda, npda;
    198 	char   *buf;
    199 	int     numDataCol = layoutPtr->numDataCol;
    200 	RF_Etimer_t timer;
    201 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    202 
    203 	RF_ETIMER_START(timer);
    204 
    205 	if (asmap->failedPDAs[1] &&
    206 	    (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
    207 		RF_ASSERT(0);
    208 		ppda = node->params[np - 6].p;
    209 		ppda2 = node->params[np - 5].p;
    210 		qpda = node->params[np - 4].p;
    211 		qpda2 = node->params[np - 3].p;
    212 		d = (np - 6);
    213 		two = 1;
    214 	} else {
    215 		ppda = node->params[np - 4].p;
    216 		qpda = node->params[np - 3].p;
    217 		d = (np - 4);
    218 	}
    219 
    220 	for (i = 0; i < d; i++) {
    221 		pda = node->params[i].p;
    222 		buf = pda->bufPtr;
    223 		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
    224 		len = pda->numSector;
    225 		coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
    226 		/* compute the data unit offset within the column */
    227 		coeff = (coeff % raidPtr->Layout.numDataCol);
    228 		/* see if pda intersects a recovery pda */
    229 		applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
    230 		if (two)
    231 			applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
    232 	}
    233 
    234 	/* ok, we got the parity back to the point where we can recover. We
    235 	 * now need to determine the coeff of the columns that need to be
    236 	 * recovered. We can also only need to recover a single stripe unit. */
    237 
    238 	if (asmap->failedPDAs[1] == NULL) {	/* only a single stripe unit
    239 						 * to recover. */
    240 		pda = asmap->failedPDAs[0];
    241 		sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
    242 		/* need to determine the column of the other failed disk */
    243 		coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
    244 		/* compute the data unit offset within the column */
    245 		coeff = (coeff % raidPtr->Layout.numDataCol);
    246 		for (i = 0; i < numDataCol; i++) {
    247 			npda.raidAddress = sosAddr + (i * secPerSU);
    248 			(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
    249 			/* skip over dead disks */
    250 			if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
    251 				if (i != coeff)
    252 					break;
    253 		}
    254 		RF_ASSERT(i < numDataCol);
    255 		RF_ASSERT(two == 0);
    256 		/* recover the data. Since we need only want to recover one
    257 		 * column, we overwrite the parity with the other one. */
    258 		if (coeff < i)	/* recovering 'a' */
    259 			rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) pda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
    260 		else		/* recovering 'b' */
    261 			rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) pda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
    262 	} else
    263 		RF_PANIC();
    264 
    265 	RF_ETIMER_STOP(timer);
    266 	RF_ETIMER_EVAL(timer);
    267 	if (tracerec)
    268 		tracerec->q_us += RF_ETIMER_VAL_US(timer);
    269 	rf_GenericWakeupFunc(node, 0);
    270 	return (0);
    271 }
    272 
    273 int
    274 rf_PQWriteDoubleRecoveryFunc(node)
    275 	RF_DagNode_t *node;
    276 {
    277 	/* The situation:
    278 	 *
    279 	 * We are doing a write that hits only one failed data unit. The other
    280 	 * failed data unit is not being overwritten, so we need to generate
    281 	 * it.
    282 	 *
    283 	 * For the moment, we assume all the nonfailed data being written is in
    284 	 * the shadow of the failed data unit. (i.e,, either a single data
    285 	 * unit write or the entire failed stripe unit is being overwritten. )
    286 	 *
    287 	 * Recovery strategy: apply the recovery data to the parity and q. Use P
    288 	 * & Q to recover the second failed data unit in P. Zero fill Q, then
    289 	 * apply the recovered data to p. Then apply the data being written to
    290 	 * the failed drive. Then walk through the surviving drives, applying
    291 	 * new data when it exists, othewise the recovery data. Quite a mess.
    292 	 *
    293 	 *
    294 	 * The params
    295 	 *
    296 	 * read pda0, read pda1, ... read pda (numDataCol-3), write pda0, ... ,
    297 	 * write pda (numStripeUnitAccess - numDataFailed), failed pda,
    298 	 * raidPtr, asmap */
    299 
    300 	int     np = node->numParams;
    301 	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
    302 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
    303 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
    304 	int     i;
    305 	RF_RaidAddr_t sosAddr;
    306 	unsigned coeff;
    307 	RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
    308 	RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
    309 	int     numDataCol = layoutPtr->numDataCol;
    310 	RF_Etimer_t timer;
    311 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    312 
    313 	RF_ASSERT(node->numResults == 2);
    314 	RF_ASSERT(asmap->failedPDAs[1] == NULL);
    315 	RF_ETIMER_START(timer);
    316 	ppda = node->results[0];
    317 	qpda = node->results[1];
    318 	/* apply the recovery data */
    319 	for (i = 0; i < numDataCol - 2; i++)
    320 		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);
    321 
    322 	/* determine the other failed data unit */
    323 	pda = asmap->failedPDAs[0];
    324 	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
    325 	/* need to determine the column of the other failed disk */
    326 	coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
    327 	/* compute the data unit offset within the column */
    328 	coeff = (coeff % raidPtr->Layout.numDataCol);
    329 	for (i = 0; i < numDataCol; i++) {
    330 		npda.raidAddress = sosAddr + (i * secPerSU);
    331 		(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
    332 		/* skip over dead disks */
    333 		if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
    334 			if (i != coeff)
    335 				break;
    336 	}
    337 	RF_ASSERT(i < numDataCol);
    338 	/* recover the data. The column we want to recover we write over the
    339 	 * parity. The column we don't care about we dump in q. */
    340 	if (coeff < i)		/* recovering 'a' */
    341 		rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
    342 	else			/* recovering 'b' */
    343 		rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
    344 
    345 	/* OK. The valid data is in P. Zero fill Q, then inc it into it. */
    346 	bzero(qpda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector));
    347 	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), i);
    348 
    349 	/* now apply all the write data to the buffer */
    350 	/* single stripe unit write case: the failed data is only thing we are
    351 	 * writing. */
    352 	RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
    353 	/* dest, src, len, coeff */
    354 	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) asmap->failedPDAs[0]->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
    355 	rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr, rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp);
    356 
    357 	/* now apply all the recovery data */
    358 	for (i = 0; i < numDataCol - 2; i++)
    359 		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);
    360 
    361 	RF_ETIMER_STOP(timer);
    362 	RF_ETIMER_EVAL(timer);
    363 	if (tracerec)
    364 		tracerec->q_us += RF_ETIMER_VAL_US(timer);
    365 
    366 	rf_GenericWakeupFunc(node, 0);
    367 	return (0);
    368 }
    369 RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
    370 {
    371 	RF_PANIC();
    372 }
    373 /*
    374    Two lost data unit write case.
    375 
    376    There are really two cases here:
    377 
    378    (1) The write completely covers the two lost data units.
    379        In that case, a reconstruct write that doesn't write the
    380        failed data units will do the correct thing. So in this case,
    381        the dag looks like
    382 
            full stripe read of surviving data units (not being overwritten)
    384 	    write new data (ignoring failed units)   compute P&Q
    385 	                                             write P&Q
    386 
    387 
    388    (2) The write does not completely cover both failed data units
    389        (but touches at least one of them). Then we need to do the
    390        equivalent of a reconstruct read to recover the missing data
    391        unit from the other stripe.
    392 
    393        For any data we are writing that is not in the "shadow"
    394        of the failed units, we need to do a four cycle update.
    395        PANIC on this case. for now
    396 
    397 */
    398 
    399 RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
    400 {
    401 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    402 	RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
    403 	int     sum;
    404 	int     nf = asmap->numDataFailed;
    405 
    406 	sum = asmap->failedPDAs[0]->numSector;
    407 	if (nf == 2)
    408 		sum += asmap->failedPDAs[1]->numSector;
    409 
    410 	if ((nf == 2) && (sum == (2 * sectorsPerSU))) {
    411 		/* large write case */
    412 		rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
    413 		return;
    414 	}
    415 	if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) {
    416 		/* small write case, no user data not in shadow */
    417 		rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
    418 		return;
    419 	}
    420 	RF_PANIC();
    421 }
    422 RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
    423 {
    424 	rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
    425 }
    426 #endif				/* (RF_INCLUDE_DECL_PQ > 0) ||
    427 				 * (RF_INCLUDE_RAID6 > 0) */
    428