Home | History | Annotate | Line # | Download | only in raidframe
rf_pqdegdags.c revision 1.7
      1 /*	$NetBSD: rf_pqdegdags.c,v 1.7 2001/10/04 15:58:55 oster Exp $	*/
      2 /*
      3  * Copyright (c) 1995 Carnegie-Mellon University.
      4  * All rights reserved.
      5  *
      6  * Author: Daniel Stodolsky
      7  *
      8  * Permission to use, copy, modify and distribute this software and
      9  * its documentation is hereby granted, provided that both the copyright
     10  * notice and this permission notice appear in all copies of the
     11  * software, derivative works or modified versions, and any portions
     12  * thereof, and that both notices appear in supporting documentation.
     13  *
     14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     17  *
     18  * Carnegie Mellon requests users of this software to return to
     19  *
     20  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     21  *  School of Computer Science
     22  *  Carnegie Mellon University
     23  *  Pittsburgh PA 15213-3890
     24  *
     25  * any improvements or extensions that they make and grant Carnegie the
     26  * rights to redistribute these changes.
     27  */
     28 
     29 /*
     30  * rf_pqdegdags.c
     31  * Degraded mode dags for double fault cases.
     32 */
     33 
     34 
     35 #include "rf_archs.h"
     36 
     37 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
     38 
     39 #include <dev/raidframe/raidframevar.h>
     40 
     41 #include "rf_raid.h"
     42 #include "rf_dag.h"
     43 #include "rf_dagdegrd.h"
     44 #include "rf_dagdegwr.h"
     45 #include "rf_dagfuncs.h"
     46 #include "rf_dagutils.h"
     47 #include "rf_etimer.h"
     48 #include "rf_acctrace.h"
     49 #include "rf_general.h"
     50 #include "rf_pqdegdags.h"
     51 #include "rf_pq.h"
     52 
     53 static void
     54 applyPDA(RF_Raid_t * raidPtr, RF_PhysDiskAddr_t * pda, RF_PhysDiskAddr_t * ppda,
     55     RF_PhysDiskAddr_t * qpda, void *bp);
     56 
     57 /*
     58    Two data drives have failed, and we are doing a read that covers one of them.
     59    We may also be reading some of the surviving drives.
     60 
     61 
     62  *****************************************************************************************
     63  *
     64  * creates a DAG to perform a degraded-mode read of data within one stripe.
     65  * This DAG is as follows:
     66  *
     67  *                                      Hdr
     68  *                                       |
     69  *                                     Block
     70  *                       /         /           \         \     \   \
     71  *                      Rud  ...  Rud         Rrd  ...  Rrd    Rp  Rq
     72  *                      | \       | \         | \       | \    | \ | \
     73  *
     74  *                                 |                 |
     75  *                              Unblock              X
     76  *                                  \               /
     77  *                                   ------ T ------
     78  *
     79  * Each R node is a successor of the L node
     80  * One successor arc from each R node goes to U, and the other to X
     81  * There is one Rud for each chunk of surviving user data requested by the user,
     82  * and one Rrd for each chunk of surviving user data _not_ being read by the user
     83  * R = read, ud = user data, rd = recovery (surviving) data, p = P data, q = Qdata
     84  * X = pq recovery node, T = terminate
     85  *
     86  * The block & unblock nodes are leftovers from a previous version.  They
     87  * do nothing, but I haven't deleted them because it would be a tremendous
     88  * effort to put them back in.
     89  *
     90  * Note:  The target buffer for the XOR node is set to the actual user buffer where the
     91  * failed data is supposed to end up.  This buffer is zero'd by the code here.  Thus,
     92  * if you create a degraded read dag, use it, and then re-use, you have to be sure to
     93  * zero the target buffer prior to the re-use.
     94  *
     95  * Every buffer read is passed to the pq recovery node, whose job it is to sort out what's
     96  * needed and what's not.
     97  ****************************************************************************************/
/*
 * Init a disk read node with 2 successors and one predecessor.
 *
 * The node is wired so that its successors are unblockNode and recoveryNode
 * and its single (control) antecedent is blockNode.  The expansion relies on
 * the following names being visible at the point of use: dag_h, allocList,
 * unblockNode, recoveryNode and blockNode.
 *
 * The body is wrapped in do { ... } while (0) so that the macro expands to
 * exactly one statement; without the wrapper only the rf_InitNode() call
 * would be governed by an enclosing unbraced if/else/for.
 */
#define INIT_DISK_NODE(node,name) \
do { \
	rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \
	(node)->succedents[0] = unblockNode; \
	(node)->succedents[1] = recoveryNode; \
	(node)->antecedents[0] = blockNode; \
	(node)->antType[0] = rf_control; \
} while (0)
    105 
/*
 * Fill in the four parameters of a disk node from a physical disk address.
 *
 *   params[0].p  the pda itself
 *   params[1].p  the pda's data buffer
 *   params[2].v  parityStripeID (taken from the expansion scope)
 *   params[3].v  normal I/O priority combined with which_ru (also taken from
 *                the expansion scope) via RF_CREATE_PARAM3
 *
 * Note that _node_ is a node object (accessed with '.'), not a pointer.
 * Wrapped in do { ... } while (0) so the four assignments form one statement,
 * and _p_ is parenthesized so that any expression may be passed safely.
 */
#define DISK_NODE_PARAMS(_node_,_p_) \
do { \
	(_node_).params[0].p = (_p_); \
	(_node_).params[1].p = (_p_)->bufPtr; \
	(_node_).params[2].v = parityStripeID; \
	(_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); \
} while (0)

/* The pda stored in a disk node's first parameter; node is a pointer here. */
#define DISK_NODE_PDA(node)  ((node)->params[0].p)
    113 
    114 RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
    115 {
    116 	rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
    117 	    "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
    118 }
    119 
    120 static void
    121 applyPDA(raidPtr, pda, ppda, qpda, bp)
    122 	RF_Raid_t *raidPtr;
    123 	RF_PhysDiskAddr_t *pda;
    124 	RF_PhysDiskAddr_t *ppda;
    125 	RF_PhysDiskAddr_t *qpda;
    126 	void   *bp;
    127 {
    128 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    129 	RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
    130 	RF_SectorCount_t s0len = ppda->numSector, len;
    131 	RF_SectorNum_t suoffset;
    132 	unsigned coeff;
    133 	char   *pbuf = ppda->bufPtr;
    134 	char   *qbuf = qpda->bufPtr;
    135 	char   *buf;
    136 	int     delta;
    137 
    138 	suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
    139 	len = pda->numSector;
    140 	/* see if pda intersects a recovery pda */
    141 	if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) {
    142 		buf = pda->bufPtr;
    143 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
    144 		coeff = (coeff % raidPtr->Layout.numDataCol);
    145 
    146 		if (suoffset < s0off) {
    147 			delta = s0off - suoffset;
    148 			buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
    149 			suoffset = s0off;
    150 			len -= delta;
    151 		}
    152 		if (suoffset > s0off) {
    153 			delta = suoffset - s0off;
    154 			pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
    155 			qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
    156 		}
    157 		if ((suoffset + len) > (s0len + s0off))
    158 			len = s0len + s0off - suoffset;
    159 
    160 		/* src, dest, len */
    161 		rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp);
    162 
    163 		/* dest, src, len, coeff */
    164 		rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf, rf_RaidAddressToByte(raidPtr, len), coeff);
    165 	}
    166 }
    167 /*
    168    Recover data in the case of a double failure. There can be two
    169    result buffers, one for each chunk of data trying to be recovered.
    170    The params are pda's that have not been range restricted or otherwise
    171    politely massaged - this should be done here. The last params are the
    172    pdas of P and Q, followed by the raidPtr. The list can look like
    173 
    174    pda, pda, ... , p pda, q pda, raidptr, asm
    175 
    176    or
    177 
    178    pda, pda, ... , p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm
    179 
    180    depending on whether two chunks of recovery data were required.
    181 
    182    The second condition only arises if there are two failed buffers
    183    whose lengths do not add up to a stripe unit.
    184 */
    185 
    186 
    187 int
    188 rf_PQDoubleRecoveryFunc(node)
    189 	RF_DagNode_t *node;
    190 {
    191 	int     np = node->numParams;
    192 	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
    193 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
    194 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
    195 	int     d, i;
    196 	unsigned coeff;
    197 	RF_RaidAddr_t sosAddr, suoffset;
    198 	RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
    199 	int     two = 0;
    200 	RF_PhysDiskAddr_t *ppda, *ppda2, *qpda, *qpda2, *pda, npda;
    201 	char   *buf;
    202 	int     numDataCol = layoutPtr->numDataCol;
    203 	RF_Etimer_t timer;
    204 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    205 
    206 	RF_ETIMER_START(timer);
    207 
    208 	if (asmap->failedPDAs[1] &&
    209 	    (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
    210 		RF_ASSERT(0);
    211 		ppda = node->params[np - 6].p;
    212 		ppda2 = node->params[np - 5].p;
    213 		qpda = node->params[np - 4].p;
    214 		qpda2 = node->params[np - 3].p;
    215 		d = (np - 6);
    216 		two = 1;
    217 	} else {
    218 		ppda = node->params[np - 4].p;
    219 		qpda = node->params[np - 3].p;
    220 		d = (np - 4);
    221 	}
    222 
    223 	for (i = 0; i < d; i++) {
    224 		pda = node->params[i].p;
    225 		buf = pda->bufPtr;
    226 		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
    227 		len = pda->numSector;
    228 		coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
    229 		/* compute the data unit offset within the column */
    230 		coeff = (coeff % raidPtr->Layout.numDataCol);
    231 		/* see if pda intersects a recovery pda */
    232 		applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
    233 		if (two)
    234 			applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
    235 	}
    236 
    237 	/* ok, we got the parity back to the point where we can recover. We
    238 	 * now need to determine the coeff of the columns that need to be
    239 	 * recovered. We can also only need to recover a single stripe unit. */
    240 
    241 	if (asmap->failedPDAs[1] == NULL) {	/* only a single stripe unit
    242 						 * to recover. */
    243 		pda = asmap->failedPDAs[0];
    244 		sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
    245 		/* need to determine the column of the other failed disk */
    246 		coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
    247 		/* compute the data unit offset within the column */
    248 		coeff = (coeff % raidPtr->Layout.numDataCol);
    249 		for (i = 0; i < numDataCol; i++) {
    250 			npda.raidAddress = sosAddr + (i * secPerSU);
    251 			(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
    252 			/* skip over dead disks */
    253 			if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
    254 				if (i != coeff)
    255 					break;
    256 		}
    257 		RF_ASSERT(i < numDataCol);
    258 		RF_ASSERT(two == 0);
    259 		/* recover the data. Since we need only want to recover one
    260 		 * column, we overwrite the parity with the other one. */
    261 		if (coeff < i)	/* recovering 'a' */
    262 			rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) pda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
    263 		else		/* recovering 'b' */
    264 			rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) pda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
    265 	} else
    266 		RF_PANIC();
    267 
    268 	RF_ETIMER_STOP(timer);
    269 	RF_ETIMER_EVAL(timer);
    270 	if (tracerec)
    271 		tracerec->q_us += RF_ETIMER_VAL_US(timer);
    272 	rf_GenericWakeupFunc(node, 0);
    273 	return (0);
    274 }
    275 
    276 int
    277 rf_PQWriteDoubleRecoveryFunc(node)
    278 	RF_DagNode_t *node;
    279 {
    280 	/* The situation:
    281 	 *
    282 	 * We are doing a write that hits only one failed data unit. The other
    283 	 * failed data unit is not being overwritten, so we need to generate
    284 	 * it.
    285 	 *
    286 	 * For the moment, we assume all the nonfailed data being written is in
    287 	 * the shadow of the failed data unit. (i.e,, either a single data
    288 	 * unit write or the entire failed stripe unit is being overwritten. )
    289 	 *
    290 	 * Recovery strategy: apply the recovery data to the parity and q. Use P
    291 	 * & Q to recover the second failed data unit in P. Zero fill Q, then
    292 	 * apply the recovered data to p. Then apply the data being written to
    293 	 * the failed drive. Then walk through the surviving drives, applying
    294 	 * new data when it exists, othewise the recovery data. Quite a mess.
    295 	 *
    296 	 *
    297 	 * The params
    298 	 *
    299 	 * read pda0, read pda1, ... read pda (numDataCol-3), write pda0, ... ,
    300 	 * write pda (numStripeUnitAccess - numDataFailed), failed pda,
    301 	 * raidPtr, asmap */
    302 
    303 	int     np = node->numParams;
    304 	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
    305 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
    306 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
    307 	int     i;
    308 	RF_RaidAddr_t sosAddr;
    309 	unsigned coeff;
    310 	RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
    311 	RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
    312 	int     numDataCol = layoutPtr->numDataCol;
    313 	RF_Etimer_t timer;
    314 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    315 
    316 	RF_ASSERT(node->numResults == 2);
    317 	RF_ASSERT(asmap->failedPDAs[1] == NULL);
    318 	RF_ETIMER_START(timer);
    319 	ppda = node->results[0];
    320 	qpda = node->results[1];
    321 	/* apply the recovery data */
    322 	for (i = 0; i < numDataCol - 2; i++)
    323 		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);
    324 
    325 	/* determine the other failed data unit */
    326 	pda = asmap->failedPDAs[0];
    327 	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
    328 	/* need to determine the column of the other failed disk */
    329 	coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
    330 	/* compute the data unit offset within the column */
    331 	coeff = (coeff % raidPtr->Layout.numDataCol);
    332 	for (i = 0; i < numDataCol; i++) {
    333 		npda.raidAddress = sosAddr + (i * secPerSU);
    334 		(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
    335 		/* skip over dead disks */
    336 		if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
    337 			if (i != coeff)
    338 				break;
    339 	}
    340 	RF_ASSERT(i < numDataCol);
    341 	/* recover the data. The column we want to recover we write over the
    342 	 * parity. The column we don't care about we dump in q. */
    343 	if (coeff < i)		/* recovering 'a' */
    344 		rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
    345 	else			/* recovering 'b' */
    346 		rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
    347 
    348 	/* OK. The valid data is in P. Zero fill Q, then inc it into it. */
    349 	memset(qpda->bufPtr, 0, rf_RaidAddressToByte(raidPtr, qpda->numSector));
    350 	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), i);
    351 
    352 	/* now apply all the write data to the buffer */
    353 	/* single stripe unit write case: the failed data is only thing we are
    354 	 * writing. */
    355 	RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
    356 	/* dest, src, len, coeff */
    357 	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) asmap->failedPDAs[0]->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
    358 	rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr, rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp);
    359 
    360 	/* now apply all the recovery data */
    361 	for (i = 0; i < numDataCol - 2; i++)
    362 		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);
    363 
    364 	RF_ETIMER_STOP(timer);
    365 	RF_ETIMER_EVAL(timer);
    366 	if (tracerec)
    367 		tracerec->q_us += RF_ETIMER_VAL_US(timer);
    368 
    369 	rf_GenericWakeupFunc(node, 0);
    370 	return (0);
    371 }
/*
 * DAG creation routine for a write that completely covers both failed data
 * units.  Not implemented: the body does nothing but panic.
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
{
	RF_PANIC();
}
    376 /*
    377    Two lost data unit write case.
    378 
    379    There are really two cases here:
    380 
    381    (1) The write completely covers the two lost data units.
    382        In that case, a reconstruct write that doesn't write the
    383        failed data units will do the correct thing. So in this case,
    384        the dag looks like
    385 
    386             full stripe read of surviving data units (not being overwritten)
    387 	    write new data (ignoring failed units)   compute P&Q
    388 	                                             write P&Q
    389 
    390 
    391    (2) The write does not completely cover both failed data units
    392        (but touches at least one of them). Then we need to do the
    393        equivalent of a reconstruct read to recover the missing data
    394        unit from the other stripe.
    395 
    396        For any data we are writing that is not in the "shadow"
    397        of the failed units, we need to do a four cycle update.
    398        PANIC on this case. for now
    399 
    400 */
    401 
    402 RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
    403 {
    404 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    405 	RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
    406 	int     sum;
    407 	int     nf = asmap->numDataFailed;
    408 
    409 	sum = asmap->failedPDAs[0]->numSector;
    410 	if (nf == 2)
    411 		sum += asmap->failedPDAs[1]->numSector;
    412 
    413 	if ((nf == 2) && (sum == (2 * sectorsPerSU))) {
    414 		/* large write case */
    415 		rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
    416 		return;
    417 	}
    418 	if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) {
    419 		/* small write case, no user data not in shadow */
    420 		rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
    421 		return;
    422 	}
    423 	RF_PANIC();
    424 }
    425 RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
    426 {
    427 	rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
    428 }
    429 #endif				/* (RF_INCLUDE_DECL_PQ > 0) ||
    430 				 * (RF_INCLUDE_RAID6 > 0) */
    431