Home | History | Annotate | Line # | Download | only in raidframe
rf_pqdegdags.c revision 1.8
      1 /*	$NetBSD: rf_pqdegdags.c,v 1.8 2001/11/13 07:11:16 lukem Exp $	*/
      2 /*
      3  * Copyright (c) 1995 Carnegie-Mellon University.
      4  * All rights reserved.
      5  *
      6  * Author: Daniel Stodolsky
      7  *
      8  * Permission to use, copy, modify and distribute this software and
      9  * its documentation is hereby granted, provided that both the copyright
     10  * notice and this permission notice appear in all copies of the
     11  * software, derivative works or modified versions, and any portions
     12  * thereof, and that both notices appear in supporting documentation.
     13  *
     14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     17  *
     18  * Carnegie Mellon requests users of this software to return to
     19  *
     20  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     21  *  School of Computer Science
     22  *  Carnegie Mellon University
     23  *  Pittsburgh PA 15213-3890
     24  *
     25  * any improvements or extensions that they make and grant Carnegie the
     26  * rights to redistribute these changes.
     27  */
     28 
     29 /*
     30  * rf_pqdegdags.c
     31  * Degraded mode dags for double fault cases.
     32 */
     33 
     34 
     35 #include <sys/cdefs.h>
     36 __KERNEL_RCSID(0, "$NetBSD: rf_pqdegdags.c,v 1.8 2001/11/13 07:11:16 lukem Exp $");
     37 
     38 #include "rf_archs.h"
     39 
     40 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
     41 
     42 #include <dev/raidframe/raidframevar.h>
     43 
     44 #include "rf_raid.h"
     45 #include "rf_dag.h"
     46 #include "rf_dagdegrd.h"
     47 #include "rf_dagdegwr.h"
     48 #include "rf_dagfuncs.h"
     49 #include "rf_dagutils.h"
     50 #include "rf_etimer.h"
     51 #include "rf_acctrace.h"
     52 #include "rf_general.h"
     53 #include "rf_pqdegdags.h"
     54 #include "rf_pq.h"
     55 
     56 static void
     57 applyPDA(RF_Raid_t * raidPtr, RF_PhysDiskAddr_t * pda, RF_PhysDiskAddr_t * ppda,
     58     RF_PhysDiskAddr_t * qpda, void *bp);
     59 
     60 /*
     61    Two data drives have failed, and we are doing a read that covers one of them.
     62    We may also be reading some of the surviving drives.
     63 
     64 
     65  *****************************************************************************************
     66  *
     67  * creates a DAG to perform a degraded-mode read of data within one stripe.
     68  * This DAG is as follows:
     69  *
     70  *                                      Hdr
     71  *                                       |
     72  *                                     Block
     73  *                       /         /           \         \     \   \
     74  *                      Rud  ...  Rud         Rrd  ...  Rrd    Rp  Rq
     75  *                      | \       | \         | \       | \    | \ | \
     76  *
     77  *                                 |                 |
     78  *                              Unblock              X
     79  *                                  \               /
     80  *                                   ------ T ------
     81  *
     82  * Each R node is a successor of the L node
     83  * One successor arc from each R node goes to U, and the other to X
     84  * There is one Rud for each chunk of surviving user data requested by the user,
     85  * and one Rrd for each chunk of surviving user data _not_ being read by the user
     86  * R = read, ud = user data, rd = recovery (surviving) data, p = P data, q = Qdata
     87  * X = pq recovery node, T = terminate
     88  *
     89  * The block & unblock nodes are leftovers from a previous version.  They
     90  * do nothing, but I haven't deleted them because it would be a tremendous
     91  * effort to put them back in.
     92  *
     93  * Note:  The target buffer for the XOR node is set to the actual user buffer where the
     94  * failed data is supposed to end up.  This buffer is zero'd by the code here.  Thus,
     95  * if you create a degraded read dag, use it, and then re-use, you have to be sure to
     96  * zero the target buffer prior to the re-use.
     97  *
     98  * Every buffer read is passed to the pq recovery node, whose job it is to sort out what is
     99  * needed and what is not.
    100  ****************************************************************************************/
/*
 * INIT_DISK_NODE: initialise a disk-read node with two successors
 * (unblockNode, recoveryNode) and one control antecedent (blockNode).
 *
 * The expansion refers to dag_h, allocList, unblockNode, recoveryNode and
 * blockNode, which must all be in scope at the point of use.  The body is
 * wrapped in do { } while (0) so the macro behaves as a single statement
 * (e.g. under an unbraced if/else).
 */
#define INIT_DISK_NODE(node,name) do { \
	rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \
	(node)->succedents[0] = unblockNode; \
	(node)->succedents[1] = recoveryNode; \
	(node)->antecedents[0] = blockNode; \
	(node)->antType[0] = rf_control; \
} while (0)

/*
 * DISK_NODE_PARAMS: fill the four parameters of a disk node (passed as an
 * lvalue, not a pointer) from the physical disk address _p_.
 *
 * The expansion refers to parityStripeID and which_ru, which must be in
 * scope at the point of use.
 */
#define DISK_NODE_PARAMS(_node_,_p_) do { \
	(_node_).params[0].p = (_p_); \
	(_node_).params[1].p = (_p_)->bufPtr; \
	(_node_).params[2].v = parityStripeID; \
	(_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); \
} while (0)

/* DISK_NODE_PDA: the PDA stored in parameter 0 of a disk node (pointer). */
#define DISK_NODE_PDA(node)  ((node)->params[0].p)
    116 
    117 RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
    118 {
    119 	rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
    120 	    "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
    121 }
    122 
    123 static void
    124 applyPDA(raidPtr, pda, ppda, qpda, bp)
    125 	RF_Raid_t *raidPtr;
    126 	RF_PhysDiskAddr_t *pda;
    127 	RF_PhysDiskAddr_t *ppda;
    128 	RF_PhysDiskAddr_t *qpda;
    129 	void   *bp;
    130 {
    131 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    132 	RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
    133 	RF_SectorCount_t s0len = ppda->numSector, len;
    134 	RF_SectorNum_t suoffset;
    135 	unsigned coeff;
    136 	char   *pbuf = ppda->bufPtr;
    137 	char   *qbuf = qpda->bufPtr;
    138 	char   *buf;
    139 	int     delta;
    140 
    141 	suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
    142 	len = pda->numSector;
    143 	/* see if pda intersects a recovery pda */
    144 	if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) {
    145 		buf = pda->bufPtr;
    146 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
    147 		coeff = (coeff % raidPtr->Layout.numDataCol);
    148 
    149 		if (suoffset < s0off) {
    150 			delta = s0off - suoffset;
    151 			buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
    152 			suoffset = s0off;
    153 			len -= delta;
    154 		}
    155 		if (suoffset > s0off) {
    156 			delta = suoffset - s0off;
    157 			pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
    158 			qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta);
    159 		}
    160 		if ((suoffset + len) > (s0len + s0off))
    161 			len = s0len + s0off - suoffset;
    162 
    163 		/* src, dest, len */
    164 		rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp);
    165 
    166 		/* dest, src, len, coeff */
    167 		rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf, rf_RaidAddressToByte(raidPtr, len), coeff);
    168 	}
    169 }
    170 /*
    171    Recover data in the case of a double failure. There can be two
    172    result buffers, one for each chunk of data trying to be recovered.
    173    The params are pda's that have not been range restricted or otherwise
    174    politely massaged - this should be done here. The last params are the
    175    pdas of P and Q, followed by the raidPtr. The list can look like
    176 
    177    pda, pda, ... , p pda, q pda, raidptr, asm
    178 
    179    or
    180 
    181    pda, pda, ... , p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm
    182 
    183    depending on whether two chunks of recovery data were required.
    184 
    185    The second condition only arises if there are two failed buffers
    186    whose lengths do not add up to a stripe unit.
    187 */
    188 
    189 
    190 int
    191 rf_PQDoubleRecoveryFunc(node)
    192 	RF_DagNode_t *node;
    193 {
    194 	int     np = node->numParams;
    195 	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
    196 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
    197 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
    198 	int     d, i;
    199 	unsigned coeff;
    200 	RF_RaidAddr_t sosAddr, suoffset;
    201 	RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
    202 	int     two = 0;
    203 	RF_PhysDiskAddr_t *ppda, *ppda2, *qpda, *qpda2, *pda, npda;
    204 	char   *buf;
    205 	int     numDataCol = layoutPtr->numDataCol;
    206 	RF_Etimer_t timer;
    207 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    208 
    209 	RF_ETIMER_START(timer);
    210 
    211 	if (asmap->failedPDAs[1] &&
    212 	    (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
    213 		RF_ASSERT(0);
    214 		ppda = node->params[np - 6].p;
    215 		ppda2 = node->params[np - 5].p;
    216 		qpda = node->params[np - 4].p;
    217 		qpda2 = node->params[np - 3].p;
    218 		d = (np - 6);
    219 		two = 1;
    220 	} else {
    221 		ppda = node->params[np - 4].p;
    222 		qpda = node->params[np - 3].p;
    223 		d = (np - 4);
    224 	}
    225 
    226 	for (i = 0; i < d; i++) {
    227 		pda = node->params[i].p;
    228 		buf = pda->bufPtr;
    229 		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
    230 		len = pda->numSector;
    231 		coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
    232 		/* compute the data unit offset within the column */
    233 		coeff = (coeff % raidPtr->Layout.numDataCol);
    234 		/* see if pda intersects a recovery pda */
    235 		applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
    236 		if (two)
    237 			applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
    238 	}
    239 
    240 	/* ok, we got the parity back to the point where we can recover. We
    241 	 * now need to determine the coeff of the columns that need to be
    242 	 * recovered. We can also only need to recover a single stripe unit. */
    243 
    244 	if (asmap->failedPDAs[1] == NULL) {	/* only a single stripe unit
    245 						 * to recover. */
    246 		pda = asmap->failedPDAs[0];
    247 		sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
    248 		/* need to determine the column of the other failed disk */
    249 		coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
    250 		/* compute the data unit offset within the column */
    251 		coeff = (coeff % raidPtr->Layout.numDataCol);
    252 		for (i = 0; i < numDataCol; i++) {
    253 			npda.raidAddress = sosAddr + (i * secPerSU);
    254 			(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
    255 			/* skip over dead disks */
    256 			if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
    257 				if (i != coeff)
    258 					break;
    259 		}
    260 		RF_ASSERT(i < numDataCol);
    261 		RF_ASSERT(two == 0);
    262 		/* recover the data. Since we need only want to recover one
    263 		 * column, we overwrite the parity with the other one. */
    264 		if (coeff < i)	/* recovering 'a' */
    265 			rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) pda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
    266 		else		/* recovering 'b' */
    267 			rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) pda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
    268 	} else
    269 		RF_PANIC();
    270 
    271 	RF_ETIMER_STOP(timer);
    272 	RF_ETIMER_EVAL(timer);
    273 	if (tracerec)
    274 		tracerec->q_us += RF_ETIMER_VAL_US(timer);
    275 	rf_GenericWakeupFunc(node, 0);
    276 	return (0);
    277 }
    278 
    279 int
    280 rf_PQWriteDoubleRecoveryFunc(node)
    281 	RF_DagNode_t *node;
    282 {
    283 	/* The situation:
    284 	 *
    285 	 * We are doing a write that hits only one failed data unit. The other
    286 	 * failed data unit is not being overwritten, so we need to generate
    287 	 * it.
    288 	 *
    289 	 * For the moment, we assume all the nonfailed data being written is in
    290 	 * the shadow of the failed data unit. (i.e,, either a single data
    291 	 * unit write or the entire failed stripe unit is being overwritten. )
    292 	 *
    293 	 * Recovery strategy: apply the recovery data to the parity and q. Use P
    294 	 * & Q to recover the second failed data unit in P. Zero fill Q, then
    295 	 * apply the recovered data to p. Then apply the data being written to
    296 	 * the failed drive. Then walk through the surviving drives, applying
    297 	 * new data when it exists, othewise the recovery data. Quite a mess.
    298 	 *
    299 	 *
    300 	 * The params
    301 	 *
    302 	 * read pda0, read pda1, ... read pda (numDataCol-3), write pda0, ... ,
    303 	 * write pda (numStripeUnitAccess - numDataFailed), failed pda,
    304 	 * raidPtr, asmap */
    305 
    306 	int     np = node->numParams;
    307 	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
    308 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
    309 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
    310 	int     i;
    311 	RF_RaidAddr_t sosAddr;
    312 	unsigned coeff;
    313 	RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
    314 	RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
    315 	int     numDataCol = layoutPtr->numDataCol;
    316 	RF_Etimer_t timer;
    317 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    318 
    319 	RF_ASSERT(node->numResults == 2);
    320 	RF_ASSERT(asmap->failedPDAs[1] == NULL);
    321 	RF_ETIMER_START(timer);
    322 	ppda = node->results[0];
    323 	qpda = node->results[1];
    324 	/* apply the recovery data */
    325 	for (i = 0; i < numDataCol - 2; i++)
    326 		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);
    327 
    328 	/* determine the other failed data unit */
    329 	pda = asmap->failedPDAs[0];
    330 	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
    331 	/* need to determine the column of the other failed disk */
    332 	coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
    333 	/* compute the data unit offset within the column */
    334 	coeff = (coeff % raidPtr->Layout.numDataCol);
    335 	for (i = 0; i < numDataCol; i++) {
    336 		npda.raidAddress = sosAddr + (i * secPerSU);
    337 		(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
    338 		/* skip over dead disks */
    339 		if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
    340 			if (i != coeff)
    341 				break;
    342 	}
    343 	RF_ASSERT(i < numDataCol);
    344 	/* recover the data. The column we want to recover we write over the
    345 	 * parity. The column we don't care about we dump in q. */
    346 	if (coeff < i)		/* recovering 'a' */
    347 		rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
    348 	else			/* recovering 'b' */
    349 		rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);
    350 
    351 	/* OK. The valid data is in P. Zero fill Q, then inc it into it. */
    352 	memset(qpda->bufPtr, 0, rf_RaidAddressToByte(raidPtr, qpda->numSector));
    353 	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), i);
    354 
    355 	/* now apply all the write data to the buffer */
    356 	/* single stripe unit write case: the failed data is only thing we are
    357 	 * writing. */
    358 	RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
    359 	/* dest, src, len, coeff */
    360 	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) asmap->failedPDAs[0]->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
    361 	rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr, rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp);
    362 
    363 	/* now apply all the recovery data */
    364 	for (i = 0; i < numDataCol - 2; i++)
    365 		applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp);
    366 
    367 	RF_ETIMER_STOP(timer);
    368 	RF_ETIMER_EVAL(timer);
    369 	if (tracerec)
    370 		tracerec->q_us += RF_ETIMER_VAL_US(timer);
    371 
    372 	rf_GenericWakeupFunc(node, 0);
    373 	return (0);
    374 }
/*
 * DAG creation function for the double-degraded large write.
 * Not implemented: the body unconditionally calls RF_PANIC().
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
{
	RF_PANIC();
}
    379 /*
    380    Two lost data unit write case.
    381 
    382    There are really two cases here:
    383 
    384    (1) The write completely covers the two lost data units.
    385        In that case, a reconstruct write that doesn't write the
    386        failed data units will do the correct thing. So in this case,
    387        the dag looks like
    388 
    389             full stripe read of surviving data units (not being overwritten)
    390 	    write new data (ignoring failed units)   compute P&Q
    391 	                                             write P&Q
    392 
    393 
    394    (2) The write does not completely cover both failed data units
    395        (but touches at least one of them). Then we need to do the
    396        equivalent of a reconstruct read to recover the missing data
    397        unit from the other stripe.
    398 
    399        For any data we are writing that is not in the "shadow"
    400        of the failed units, we need to do a four cycle update.
    401        PANIC on this case. for now
    402 
    403 */
    404 
    405 RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
    406 {
    407 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    408 	RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
    409 	int     sum;
    410 	int     nf = asmap->numDataFailed;
    411 
    412 	sum = asmap->failedPDAs[0]->numSector;
    413 	if (nf == 2)
    414 		sum += asmap->failedPDAs[1]->numSector;
    415 
    416 	if ((nf == 2) && (sum == (2 * sectorsPerSU))) {
    417 		/* large write case */
    418 		rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
    419 		return;
    420 	}
    421 	if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) {
    422 		/* small write case, no user data not in shadow */
    423 		rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
    424 		return;
    425 	}
    426 	RF_PANIC();
    427 }
    428 RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
    429 {
    430 	rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
    431 }
    432 #endif				/* (RF_INCLUDE_DECL_PQ > 0) ||
    433 				 * (RF_INCLUDE_RAID6 > 0) */
    434