Home | History | Annotate | Line # | Download | only in raidframe
rf_pqdegdags.c revision 1.3
      1 /*	$NetBSD: rf_pqdegdags.c,v 1.3 1999/02/05 00:06:15 oster Exp $	*/
      2 /*
      3  * Copyright (c) 1995 Carnegie-Mellon University.
      4  * All rights reserved.
      5  *
      6  * Author: Daniel Stodolsky
      7  *
      8  * Permission to use, copy, modify and distribute this software and
      9  * its documentation is hereby granted, provided that both the copyright
     10  * notice and this permission notice appear in all copies of the
     11  * software, derivative works or modified versions, and any portions
     12  * thereof, and that both notices appear in supporting documentation.
     13  *
     14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     17  *
     18  * Carnegie Mellon requests users of this software to return to
     19  *
     20  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     21  *  School of Computer Science
     22  *  Carnegie Mellon University
     23  *  Pittsburgh PA 15213-3890
     24  *
     25  * any improvements or extensions that they make and grant Carnegie the
     26  * rights to redistribute these changes.
     27  */
     28 
     29 /*
     30  * rf_pqdegdags.c
     31  * Degraded mode dags for double fault cases.
     32 */
     33 
     34 
     35 #include "rf_archs.h"
     36 
     37 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
     38 
     39 #include "rf_types.h"
     40 #include "rf_raid.h"
     41 #include "rf_dag.h"
     42 #include "rf_dagfuncs.h"
     43 #include "rf_dagutils.h"
     44 #include "rf_etimer.h"
     45 #include "rf_acctrace.h"
     46 #include "rf_general.h"
     47 #include "rf_pqdegdags.h"
     48 #include "rf_pq.h"
     49 #include "rf_sys.h"
     50 
/*
 * Forward declaration.  applyPDA folds the part of one data pda that
 * overlaps the recovery range into the P buffer (via rf_bxor) and the
 * Q buffer (via rf_IncQ); the definition appears further down.
 */
static void
applyPDA(RF_Raid_t * raidPtr, RF_PhysDiskAddr_t * pda, RF_PhysDiskAddr_t * ppda,
    RF_PhysDiskAddr_t * qpda, void *bp);
     54 
     55 /*
     56    Two data drives have failed, and we are doing a read that covers one of them.
     57    We may also be reading some of the surviving drives.
     58 
     59 
     60  *****************************************************************************************
     61  *
     62  * creates a DAG to perform a degraded-mode read of data within one stripe.
     63  * This DAG is as follows:
     64  *
     65  *                                      Hdr
     66  *                                       |
     67  *                                     Block
     68  *                       /         /           \         \     \   \
     69  *                      Rud  ...  Rud         Rrd  ...  Rrd    Rp  Rq
     70  *                      | \       | \         | \       | \    | \ | \
     71  *
     72  *                                 |                 |
     73  *                              Unblock              X
     74  *                                  \               /
     75  *                                   ------ T ------
     76  *
     77  * Each R node is a successor of the L node
     78  * One successor arc from each R node goes to U, and the other to X
     79  * There is one Rud for each chunk of surviving user data requested by the user,
     80  * and one Rrd for each chunk of surviving user data _not_ being read by the user
     81  * R = read, ud = user data, rd = recovery (surviving) data, p = P data, q = Qdata
     82  * X = pq recovery node, T = terminate
     83  *
     84  * The block & unblock nodes are leftovers from a previous version.  They
     85  * do nothing, but I haven't deleted them because it would be a tremendous
     86  * effort to put them back in.
     87  *
     88  * Note:  The target buffer for the XOR node is set to the actual user buffer where the
     89  * failed data is supposed to end up.  This buffer is zero'd by the code here.  Thus,
     90  * if you create a degraded read dag, use it, and then re-use, you have to be sure to
     91  * zero the target buffer prior to the re-use.
     92  *
     93  * Every buffer read is passed to the pq recovery node, whose job it is to sort out what's
     94  * needed and what's not.
     95  ****************************************************************************************/
/*
 * Init a disk-read node with 2 successors and one predecessor.
 *
 * The node is initialised with rf_InitNode and then wired so that its
 * successors are unblockNode and recoveryNode and its single (control)
 * antecedent is blockNode.
 *
 * The expansion picks up dag_h, allocList, blockNode, unblockNode and
 * recoveryNode from the scope in which the macro is used.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement; without it only the rf_InitNode call would be
 * governed by an unbraced if/else around the macro invocation.
 */
#define INIT_DISK_NODE(node,name) \
do { \
	rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \
	(node)->succedents[0] = unblockNode; \
	(node)->succedents[1] = recoveryNode; \
	(node)->antecedents[0] = blockNode; \
	(node)->antType[0] = rf_control; \
} while (0)
    103 
/*
 * Fill in the four parameters of a disk node from a pda:
 *
 *   params[0].p  the pda itself
 *   params[1].p  the pda's data buffer (bufPtr)
 *   params[2].v  parityStripeID
 *   params[3].v  value built by RF_CREATE_PARAM3 from
 *                RF_IO_NORMAL_PRIORITY and which_ru
 *
 * _node_ is a node structure (accessed with '.'), not a pointer.
 * parityStripeID and which_ru are taken from the scope in which the
 * macro is used.
 *
 * Wrapped in do { } while (0) so the four assignments form a single
 * statement, and _p_ is parenthesized everywhere it is expanded.
 */
#define DISK_NODE_PARAMS(_node_,_p_) \
do { \
	(_node_).params[0].p = (_p_); \
	(_node_).params[1].p = (_p_)->bufPtr; \
	(_node_).params[2].v = parityStripeID; \
	(_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); \
} while (0)
    109 
/*
 * Fetch params[0].p of a disk node, i.e. the slot in which
 * DISK_NODE_PARAMS stores the pda.  Here `node' is a pointer to the node.
 */
#define DISK_NODE_PDA(node)  ((node)->params[0].p)
    111 
/*
 * DAG creation routine for the P+Q double-degraded read.
 *
 * All the work is delegated to rf_DoubleDegRead; this wrapper only
 * supplies the PQ-specific pieces: the node names "Rq" and "PQ Recovery"
 * and rf_PQDoubleRecoveryFunc as the recovery function.
 *
 * NOTE(review): raidPtr, asmap, dag_h, bp, flags and allocList are not
 * declared here; presumably RF_CREATE_DAG_FUNC_DECL supplies them as the
 * function's parameters -- confirm against its definition.
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
{
	rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
	    "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
}
    117 
    118 static void
    119 applyPDA(raidPtr, pda, ppda, qpda, bp)
    120 	RF_Raid_t *raidPtr;
    121 	RF_PhysDiskAddr_t *pda;
    122 	RF_PhysDiskAddr_t *ppda;
    123 	RF_PhysDiskAddr_t *qpda;
    124 	void   *bp;
    125 {
    126 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    127 	RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
    128 	RF_SectorCount_t s0len = ppda->numSector, len;
    129 	RF_SectorNum_t suoffset;
    130 	unsigned coeff;
    131 	char   *pbuf = ppda->bufPtr;
    132 	char   *qbuf = qpda->bufPtr;
    133 	char   *buf;
    134 	int     delta;
    135 
    136 	suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
    137 	len = pda->numSector;
    138 	/* see if pda intersects a recovery pda */
    139 	if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) {
    140 		buf = pda->bufPtr;
    141 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
    142 		coeff = (coeff % raidPtr->Layout.numDataCol);
    143 
    144 		if (suoffset < s0off) {
    145 			delta = s0off - suoffset;
    146 			buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
    147 			suoffset = s0off;
    148 			len -= delta;
    149 		}
    150 		if (suoffset > s0off) {
    151 			delta = suoffset - s0off;
    152 			pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
    153 			qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
    154 		}
    155 		if ((suoffset + len) > (s0len + s0off))
    156 			len = s0len + s0off - suoffset;
    157 
    158 		/* src, dest, len */
    159 		rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp);
    160 
    161 		/* dest, src, len, coeff */
    162 		rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf, rf_RaidAddressToByte(raidPtr, len), coeff);
    163 	}
    164 }
    165 /*
    166    Recover data in the case of a double failure. There can be two
    167    result buffers, one for each chunk of data trying to be recovered.
    168    The params are pda's that have not been range restricted or otherwise
    169    politely massaged - this should be done here. The last params are the
    170    pdas of P and Q, followed by the raidPtr. The list can look like
    171 
    172    pda, pda, ... , p pda, q pda, raidptr, asm
    173 
    174    or
    175 
    176    pda, pda, ... , p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm
    177 
    178    depending on whether two chunks of recovery data were required.
    179 
    180    The second condition only arises if there are two failed buffers
    181    whose lengths do not add up to a stripe unit.
    182 */
    183 
    184 
    185 int
    186 rf_PQDoubleRecoveryFunc(node)
    187 	RF_DagNode_t *node;
    188 {
    189 	int     np = node->numParams;
    190 	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
    191 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
    192 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
    193 	int     d, i;
    194 	unsigned coeff;
    195 	RF_RaidAddr_t sosAddr, suoffset;
    196 	RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
    197 	int     two = 0;
    198 	RF_PhysDiskAddr_t *ppda, *ppda2, *qpda, *qpda2, *pda, npda;
    199 	char   *buf;
    200 	int     numDataCol = layoutPtr->numDataCol;
    201 	RF_Etimer_t timer;
    202 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    203 
    204 	RF_ETIMER_START(timer);
    205 
    206 	if (asmap->failedPDAs[1] &&
    207 	    (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
    208 		RF_ASSERT(0);
    209 		ppda = node->params[np - 6].p;
    210 		ppda2 = node->params[np - 5].p;
    211 		qpda = node->params[np - 4].p;
    212 		qpda2 = node->params[np - 3].p;
    213 		d = (np - 6);
    214 		two = 1;
    215 	} else {
    216 		ppda = node->params[np - 4].p;
    217 		qpda = node->params[np - 3].p;
    218 		d = (np - 4);
    219 	}
    220 
    221 	for (i = 0; i < d; i++) {
    222 		pda = node->params[i].p;
    223 		buf = pda->bufPtr;
    224 		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
    225 		len = pda->numSector;
    226 		coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
    227 		/* compute the data unit offset within the column */
    228 		coeff = (coeff % raidPtr->Layout.numDataCol);
    229 		/* see if pda intersects a recovery pda */
    230 		applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
    231 		if (two)
    232 			applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
    233 	}
    234 
    235 	/* ok, we got the parity back to the point where we can recover. We
    236 	 * now need to determine the coeff of the columns that need to be
    237 	 * recovered. We can also only need to recover a single stripe unit. */
    238 
    239 	if (asmap->failedPDAs[1] == NULL) {	/* only a single stripe unit
    240 						 * to recover. */
    241 		pda = asmap->failedPDAs[0];
    242 		sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
    243 		/* need to determine the column of the other failed disk */
    244 		coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
    245 		/* compute the data unit offset within the column */
    246 		coeff = (coeff % raidPtr->Layout.numDataCol);
    247 		for (i = 0; i < numDataCol; i++) {
    248 			npda.raidAddress = sosAddr + (i * secPerSU);
    249 			(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
    250 			/* skip over dead disks */
    251 			if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
    252 				if (i != coeff)
    253 					break;
    254 		}
    255 		RF_ASSERT(i < numDataCol);
    256 		RF_ASSERT(two == 0);
    257 		/* recover the data. Since we need only want to recover one
    258 		 * column, we overwrite the parity with the other one. */
    259 		if (coeff < i)	/* recovering 'a' */
    260 			rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) pda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
    261 		else		/* recovering 'b' */
    262 			rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) pda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
    263 	} else
    264 		RF_PANIC();
    265 
    266 	RF_ETIMER_STOP(timer);
    267 	RF_ETIMER_EVAL(timer);
    268 	if (tracerec)
    269 		tracerec->q_us += RF_ETIMER_VAL_US(timer);
    270 	rf_GenericWakeupFunc(node, 0);
    271 	return (0);
    272 }
    273 
    274 int
    275 rf_PQWriteDoubleRecoveryFunc(node)
    276 	RF_DagNode_t *node;
    277 {
    278 	/* The situation:
    279 	 *
    280 	 * We are doing a write that hits only one failed data unit. The other
    281 	 * failed data unit is not being overwritten, so we need to generate
    282 	 * it.
    283 	 *
    284 	 * For the moment, we assume all the nonfailed data being written is in
    285 	 * the shadow of the failed data unit. (i.e,, either a single data
    286 	 * unit write or the entire failed stripe unit is being overwritten. )
    287 	 *
    288 	 * Recovery strategy: apply the recovery data to the parity and q. Use P
    289 	 * & Q to recover the second failed data unit in P. Zero fill Q, then
    290 	 * apply the recovered data to p. Then apply the data being written to
    291 	 * the failed drive. Then walk through the surviving drives, applying
    292 	 * new data when it exists, othewise the recovery data. Quite a mess.
    293 	 *
    294 	 *
    295 	 * The params
    296 	 *
    297 	 * read pda0, read pda1, ... read pda (numDataCol-3), write pda0, ... ,
    298 	 * write pda (numStripeUnitAccess - numDataFailed), failed pda,
    299 	 * raidPtr, asmap */
    300 
    301 	int     np = node->numParams;
    302 	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
    303 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
    304 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
    305 	int     i;
    306 	RF_RaidAddr_t sosAddr;
    307 	unsigned coeff;
    308 	RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
    309 	RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
    310 	int     numDataCol = layoutPtr->numDataCol;
    311 	RF_Etimer_t timer;
    312 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    313 
    314 	RF_ASSERT(node->numResults == 2);
    315 	RF_ASSERT(asmap->failedPDAs[1] == NULL);
    316 	RF_ETIMER_START(timer);
    317 	ppda = node->results[0];
    318 	qpda = node->results[1];
    319 	/* apply the recovery data */
    320 	for (i = 0; i < numDataCol - 2; i++)
    321 		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);
    322 
    323 	/* determine the other failed data unit */
    324 	pda = asmap->failedPDAs[0];
    325 	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
    326 	/* need to determine the column of the other failed disk */
    327 	coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
    328 	/* compute the data unit offset within the column */
    329 	coeff = (coeff % raidPtr->Layout.numDataCol);
    330 	for (i = 0; i < numDataCol; i++) {
    331 		npda.raidAddress = sosAddr + (i * secPerSU);
    332 		(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
    333 		/* skip over dead disks */
    334 		if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
    335 			if (i != coeff)
    336 				break;
    337 	}
    338 	RF_ASSERT(i < numDataCol);
    339 	/* recover the data. The column we want to recover we write over the
    340 	 * parity. The column we don't care about we dump in q. */
    341 	if (coeff < i)		/* recovering 'a' */
    342 		rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
    343 	else			/* recovering 'b' */
    344 		rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
    345 
    346 	/* OK. The valid data is in P. Zero fill Q, then inc it into it. */
    347 	bzero(qpda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector));
    348 	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), i);
    349 
    350 	/* now apply all the write data to the buffer */
    351 	/* single stripe unit write case: the failed data is only thing we are
    352 	 * writing. */
    353 	RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
    354 	/* dest, src, len, coeff */
    355 	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) asmap->failedPDAs[0]->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
    356 	rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr, rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp);
    357 
    358 	/* now apply all the recovery data */
    359 	for (i = 0; i < numDataCol - 2; i++)
    360 		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);
    361 
    362 	RF_ETIMER_STOP(timer);
    363 	RF_ETIMER_EVAL(timer);
    364 	if (tracerec)
    365 		tracerec->q_us += RF_ETIMER_VAL_US(timer);
    366 
    367 	rf_GenericWakeupFunc(node, 0);
    368 	return (0);
    369 }
/*
 * DAG creation routine for the double-degraded large write, which
 * rf_PQ_200_CreateWriteDAG selects when two data units have failed and
 * the failed pdas together span two full stripe units.
 *
 * Not implemented: calling it panics.
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
{
	RF_PANIC();
}
    374 /*
    375    Two lost data unit write case.
    376 
    377    There are really two cases here:
    378 
    379    (1) The write completely covers the two lost data units.
    380        In that case, a reconstruct write that doesn't write the
    381        failed data units will do the correct thing. So in this case,
    382        the dag looks like
    383 
    384             full stripe read of surviving data units (not being overwritten)
    385 	    write new data (ignoring failed units)   compute P&Q
    386 	                                             write P&Q
    387 
    388 
    389    (2) The write does not completely cover both failed data units
    390        (but touches at least one of them). Then we need to do the
    391        equivalent of a reconstruct read to recover the missing data
    392        unit from the other stripe.
    393 
    394        For any data we are writing that is not in the "shadow"
    395        of the failed units, we need to do a four cycle update.
    396        PANIC on this case, for now.
    397 
    398 */
    399 
    400 RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
    401 {
    402 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    403 	RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
    404 	int     sum;
    405 	int     nf = asmap->numDataFailed;
    406 
    407 	sum = asmap->failedPDAs[0]->numSector;
    408 	if (nf == 2)
    409 		sum += asmap->failedPDAs[1]->numSector;
    410 
    411 	if ((nf == 2) && (sum == (2 * sectorsPerSU))) {
    412 		/* large write case */
    413 		rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
    414 		return;
    415 	}
    416 	if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) {
    417 		/* small write case, no user data not in shadow */
    418 		rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
    419 		return;
    420 	}
    421 	RF_PANIC();
    422 }
/*
 * DAG creation routine for the double-degraded simple small write.
 *
 * Delegates to rf_DoubleDegSmallWrite, supplying the PQ-specific node
 * names ("Rq", "Wq", "PQ Recovery") and rf_PQWriteDoubleRecoveryFunc as
 * the recovery function.
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
{
	rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
}
    427 #endif				/* (RF_INCLUDE_DECL_PQ > 0) ||
    428 				 * (RF_INCLUDE_RAID6 > 0) */
    429