Home | History | Annotate | Line # | Download | only in raidframe
rf_pq.c revision 1.6.2.1
      1 /*	$NetBSD: rf_pq.c,v 1.6.2.1 2000/11/20 11:42:57 bouyer Exp $	*/
      2 /*
      3  * Copyright (c) 1995 Carnegie-Mellon University.
      4  * All rights reserved.
      5  *
      6  * Author: Daniel Stodolsky
      7  *
      8  * Permission to use, copy, modify and distribute this software and
      9  * its documentation is hereby granted, provided that both the copyright
     10  * notice and this permission notice appear in all copies of the
     11  * software, derivative works or modified versions, and any portions
     12  * thereof, and that both notices appear in supporting documentation.
     13  *
     14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     17  *
     18  * Carnegie Mellon requests users of this software to return to
     19  *
     20  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     21  *  School of Computer Science
     22  *  Carnegie Mellon University
     23  *  Pittsburgh PA 15213-3890
     24  *
     25  * any improvements or extensions that they make and grant Carnegie the
     26  * rights to redistribute these changes.
     27  */
     28 
     29 /*
     30  * Code for RAID level 6 (P + Q) disk array architecture.
     31  */
     32 
     33 #include "rf_archs.h"
     34 #include "rf_types.h"
     35 #include "rf_raid.h"
     36 #include "rf_dag.h"
     37 #include "rf_dagffrd.h"
     38 #include "rf_dagffwr.h"
     39 #include "rf_dagdegrd.h"
     40 #include "rf_dagdegwr.h"
     41 #include "rf_dagutils.h"
     42 #include "rf_dagfuncs.h"
     43 #include "rf_etimer.h"
     44 #include "rf_pqdeg.h"
     45 #include "rf_general.h"
     46 #include "rf_map.h"
     47 #include "rf_pq.h"
     48 
     49 RF_RedFuncs_t rf_pFuncs = {rf_RegularONPFunc, "Regular Old-New P", rf_SimpleONPFunc, "Simple Old-New P"};
     50 RF_RedFuncs_t rf_pRecoveryFuncs = {rf_RecoveryPFunc, "Recovery P Func", rf_RecoveryPFunc, "Recovery P Func"};
     51 
     52 int
     53 rf_RegularONPFunc(node)
     54 	RF_DagNode_t *node;
     55 {
     56 	return (rf_RegularXorFunc(node));
     57 }
     58 /*
     59    same as simpleONQ func, but the coefficient is always 1
     60 */
     61 
     62 int
     63 rf_SimpleONPFunc(node)
     64 	RF_DagNode_t *node;
     65 {
     66 	return (rf_SimpleXorFunc(node));
     67 }
     68 
     69 int
     70 rf_RecoveryPFunc(node)
     71 	RF_DagNode_t *node;
     72 {
     73 	return (rf_RecoveryXorFunc(node));
     74 }
     75 
     76 int
     77 rf_RegularPFunc(node)
     78 	RF_DagNode_t *node;
     79 {
     80 	return (rf_RegularXorFunc(node));
     81 }
     82 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
     83 
     84 static void
     85 QDelta(char *dest, char *obuf, char *nbuf, unsigned length,
     86     unsigned char coeff);
     87 static void
     88 rf_InvertQ(unsigned long *qbuf, unsigned long *abuf,
     89     unsigned length, unsigned coeff);
     90 
     91 RF_RedFuncs_t rf_qFuncs = {rf_RegularONQFunc, "Regular Old-New Q", rf_SimpleONQFunc, "Simple Old-New Q"};
     92 RF_RedFuncs_t rf_qRecoveryFuncs = {rf_RecoveryQFunc, "Recovery Q Func", rf_RecoveryQFunc, "Recovery Q Func"};
     93 RF_RedFuncs_t rf_pqRecoveryFuncs = {rf_RecoveryPQFunc, "Recovery PQ Func", rf_RecoveryPQFunc, "Recovery PQ Func"};
     94 
     95 void
     96 rf_PQDagSelect(
     97     RF_Raid_t * raidPtr,
     98     RF_IoType_t type,
     99     RF_AccessStripeMap_t * asmap,
    100     RF_VoidFuncPtr * createFunc)
    101 {
    102 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    103 	unsigned ndfail = asmap->numDataFailed;
    104 	unsigned npfail = asmap->numParityFailed;
    105 	unsigned ntfail = npfail + ndfail;
    106 
    107 	RF_ASSERT(RF_IO_IS_R_OR_W(type));
    108 	if (ntfail > 2) {
    109 		RF_ERRORMSG("more than two disks failed in a single group!  Aborting I/O operation.\n");
    110 		 /* *infoFunc = */ *createFunc = NULL;
    111 		return;
    112 	}
    113 	/* ok, we can do this I/O */
    114 	if (type == RF_IO_TYPE_READ) {
    115 		switch (ndfail) {
    116 		case 0:
    117 			/* fault free read */
    118 			*createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;	/* same as raid 5 */
    119 			break;
    120 		case 1:
    121 			/* lost a single data unit */
    122 			/* two cases: (1) parity is not lost. do a normal raid
    123 			 * 5 reconstruct read. (2) parity is lost. do a
    124 			 * reconstruct read using "q". */
    125 			if (ntfail == 2) {	/* also lost redundancy */
    126 				if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY)
    127 					*createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateReadDAG;
    128 				else
    129 					*createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateReadDAG;
    130 			} else {
    131 				/* P and Q are ok. But is there a failure in
    132 				 * some unaccessed data unit? */
    133 				if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
    134 					*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
    135 				else
    136 					*createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateReadDAG;
    137 			}
    138 			break;
    139 		case 2:
    140 			/* lost two data units */
    141 			/* *infoFunc = PQOneTwo; */
    142 			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
    143 			break;
    144 		}
    145 		return;
    146 	}
    147 	/* a write */
    148 	switch (ntfail) {
    149 	case 0:		/* fault free */
    150 		if (rf_suppressLocksAndLargeWrites ||
    151 		    (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) ||
    152 			(asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
    153 
    154 			*createFunc = (RF_VoidFuncPtr) rf_PQCreateSmallWriteDAG;
    155 		} else {
    156 			*createFunc = (RF_VoidFuncPtr) rf_PQCreateLargeWriteDAG;
    157 		}
    158 		break;
    159 
    160 	case 1:		/* single disk fault */
    161 		if (npfail == 1) {
    162 			RF_ASSERT((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q));
    163 			if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) {	/* q died, treat like
    164 										 * normal mode raid5
    165 										 * write. */
    166 				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
    167 				    || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
    168 					*createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateSmallWriteDAG;
    169 				else
    170 					*createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateLargeWriteDAG;
    171 			} else {/* parity died, small write only updating Q */
    172 				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
    173 				    || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
    174 					*createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateSmallWriteDAG;
    175 				else
    176 					*createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateLargeWriteDAG;
    177 			}
    178 		} else {	/* data missing. Do a P reconstruct write if
    179 				 * only a single data unit is lost in the
    180 				 * stripe, otherwise a PQ reconstruct write. */
    181 			if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
    182 				*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
    183 			else
    184 				*createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateWriteDAG;
    185 		}
    186 		break;
    187 
    188 	case 2:		/* two disk faults */
    189 		switch (npfail) {
    190 		case 2:	/* both p and q dead */
    191 			*createFunc = (RF_VoidFuncPtr) rf_PQ_011_CreateWriteDAG;
    192 			break;
    193 		case 1:	/* either p or q and dead data */
    194 			RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA);
    195 			RF_ASSERT((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q));
    196 			if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)
    197 				*createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateWriteDAG;
    198 			else
    199 				*createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateWriteDAG;
    200 			break;
    201 		case 0:	/* double data loss */
    202 			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
    203 			break;
    204 		}
    205 		break;
    206 
    207 	default:		/* more than 2 disk faults */
    208 		*createFunc = NULL;
    209 		RF_PANIC();
    210 	}
    211 	return;
    212 }
    213 /*
    214    Used as a stop gap info function
    215 */
#if 0
/* Disabled stop-gap info function: reports one successor, one antecedent. */
static void
PQOne(RF_Raid_t *raidPtr, int *nSucc, int *nAnte, RF_AccessStripeMap_t *asmap)
{
	*nAnte = 1;
	*nSucc = 1;
}

/* Disabled stop-gap info function: reports one successor, two antecedents. */
static void
PQOneTwo(RF_Raid_t *raidPtr, int *nSucc, int *nAnte, RF_AccessStripeMap_t *asmap)
{
	*nSucc = 1;
	*nAnte = 2;
}
#endif
    238 
    239 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG)
    240 {
    241 	rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2,
    242 	    rf_RegularPQFunc, RF_FALSE);
    243 }
    244 
    245 int
    246 rf_RegularONQFunc(node)
    247 	RF_DagNode_t *node;
    248 {
    249 	int     np = node->numParams;
    250 	int     d;
    251 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    252 	int     i;
    253 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    254 	RF_Etimer_t timer;
    255 	char   *qbuf, *qpbuf;
    256 	char   *obuf, *nbuf;
    257 	RF_PhysDiskAddr_t *old, *new;
    258 	unsigned long coeff;
    259 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    260 
    261 	RF_ETIMER_START(timer);
    262 
    263 	d = (np - 3) / 4;
    264 	RF_ASSERT(4 * d + 3 == np);
    265 	qbuf = (char *) node->params[2 * d + 1].p;	/* q buffer */
    266 	for (i = 0; i < d; i++) {
    267 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    268 		obuf = (char *) node->params[2 * i + 1].p;
    269 		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
    270 		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
    271 		RF_ASSERT(new->numSector == old->numSector);
    272 		RF_ASSERT(new->raidAddress == old->raidAddress);
    273 		/* the stripe unit within the stripe tells us the coefficient
    274 		 * to use for the multiply. */
    275 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
    276 		/* compute the data unit offset within the column, then add
    277 		 * one */
    278 		coeff = (coeff % raidPtr->Layout.numDataCol);
    279 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
    280 		QDelta(qpbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    281 	}
    282 
    283 	RF_ETIMER_STOP(timer);
    284 	RF_ETIMER_EVAL(timer);
    285 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    286 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
    287 					 * I/O in this node */
    288 	return (0);
    289 }
    290 /*
    291    See the SimpleXORFunc for the difference between a simple and regular func.
    292    These Q functions should be used for
    293 
    294          new q = Q(data,old data,old q)
    295 
    296    style updates and not for
    297 
    298          q = ( new data, new data, .... )
    299 
    300    computations.
    301 
    302    The simple q takes 2(2d+1)+1 params, where d is the number
    303    of stripes written. The order of params is
    304    old data pda_0, old data buffer_0, old data pda_1, old data buffer_1, ... old data pda_d, old data buffer_d
    305    [2d] old q pda_0, old q buffer
    306    [2d_2] new data pda_0, new data buffer_0, ...                                    new data pda_d, new data buffer_d
    307    raidPtr
    308 */
    309 
    310 int
    311 rf_SimpleONQFunc(node)
    312 	RF_DagNode_t *node;
    313 {
    314 	int     np = node->numParams;
    315 	int     d;
    316 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    317 	int     i;
    318 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    319 	RF_Etimer_t timer;
    320 	char   *qbuf;
    321 	char   *obuf, *nbuf;
    322 	RF_PhysDiskAddr_t *old, *new;
    323 	unsigned long coeff;
    324 
    325 	RF_ETIMER_START(timer);
    326 
    327 	d = (np - 3) / 4;
    328 	RF_ASSERT(4 * d + 3 == np);
    329 	qbuf = (char *) node->params[2 * d + 1].p;	/* q buffer */
    330 	for (i = 0; i < d; i++) {
    331 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    332 		obuf = (char *) node->params[2 * i + 1].p;
    333 		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
    334 		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
    335 		RF_ASSERT(new->numSector == old->numSector);
    336 		RF_ASSERT(new->raidAddress == old->raidAddress);
    337 		/* the stripe unit within the stripe tells us the coefficient
    338 		 * to use for the multiply. */
    339 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
    340 		/* compute the data unit offset within the column, then add
    341 		 * one */
    342 		coeff = (coeff % raidPtr->Layout.numDataCol);
    343 		QDelta(qbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    344 	}
    345 
    346 	RF_ETIMER_STOP(timer);
    347 	RF_ETIMER_EVAL(timer);
    348 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    349 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
    350 					 * I/O in this node */
    351 	return (0);
    352 }
    353 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG)
    354 {
    355 	rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_qFuncs);
    356 }
    357 
    358 static void RegularQSubr(RF_DagNode_t *node, char   *qbuf);
    359 
    360 static void
    361 RegularQSubr(node, qbuf)
    362 	RF_DagNode_t *node;
    363 	char   *qbuf;
    364 {
    365 	int     np = node->numParams;
    366 	int     d;
    367 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    368 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    369 	int     i;
    370 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    371 	RF_Etimer_t timer;
    372 	char   *obuf, *qpbuf;
    373 	RF_PhysDiskAddr_t *old;
    374 	unsigned long coeff;
    375 
    376 	RF_ETIMER_START(timer);
    377 
    378 	d = (np - 1) / 2;
    379 	RF_ASSERT(2 * d + 1 == np);
    380 	for (i = 0; i < d; i++) {
    381 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    382 		obuf = (char *) node->params[2 * i + 1].p;
    383 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
    384 		/* compute the data unit offset within the column, then add
    385 		 * one */
    386 		coeff = (coeff % raidPtr->Layout.numDataCol);
    387 		/* the input buffers may not all be aligned with the start of
    388 		 * the stripe. so shift by their sector offset within the
    389 		 * stripe unit */
    390 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
    391 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    392 	}
    393 
    394 	RF_ETIMER_STOP(timer);
    395 	RF_ETIMER_EVAL(timer);
    396 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    397 }
    398 /*
    399    used in degraded writes.
    400 */
    401 
    402 static void DegrQSubr(RF_DagNode_t *node);
    403 
    404 static void
    405 DegrQSubr(node)
    406 	RF_DagNode_t *node;
    407 {
    408 	int     np = node->numParams;
    409 	int     d;
    410 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    411 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    412 	int     i;
    413 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    414 	RF_Etimer_t timer;
    415 	char   *qbuf = node->results[1];
    416 	char   *obuf, *qpbuf;
    417 	RF_PhysDiskAddr_t *old;
    418 	unsigned long coeff;
    419 	unsigned fail_start;
    420 	int     j;
    421 
    422 	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
    423 	fail_start = old->startSector % secPerSU;
    424 
    425 	RF_ETIMER_START(timer);
    426 
    427 	d = (np - 2) / 2;
    428 	RF_ASSERT(2 * d + 2 == np);
    429 	for (i = 0; i < d; i++) {
    430 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    431 		obuf = (char *) node->params[2 * i + 1].p;
    432 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
    433 		/* compute the data unit offset within the column, then add
    434 		 * one */
    435 		coeff = (coeff % raidPtr->Layout.numDataCol);
    436 		/* the input buffers may not all be aligned with the start of
    437 		 * the stripe. so shift by their sector offset within the
    438 		 * stripe unit */
    439 		j = old->startSector % secPerSU;
    440 		RF_ASSERT(j >= fail_start);
    441 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
    442 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    443 	}
    444 
    445 	RF_ETIMER_STOP(timer);
    446 	RF_ETIMER_EVAL(timer);
    447 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    448 }
    449 /*
    450    Called by large write code to compute the new parity and the new q.
    451 
    452    structure of the params:
    453 
     454    pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d ( d = numDataCol )
    455    raidPtr
    456 
    457    for a total of 2d+1 arguments.
    458    The result buffers results[0], results[1] are the buffers for the p and q,
    459    respectively.
    460 
    461    We compute Q first, then compute P. The P calculation may try to reuse
    462    one of the input buffers for its output, so if we computed P first, we would
    463    corrupt the input for the q calculation.
    464 */
    465 
    466 int
    467 rf_RegularPQFunc(node)
    468 	RF_DagNode_t *node;
    469 {
    470 	RegularQSubr(node, node->results[1]);
    471 	return (rf_RegularXorFunc(node));	/* does the wakeup */
    472 }
    473 
    474 int
    475 rf_RegularQFunc(node)
    476 	RF_DagNode_t *node;
    477 {
    478 	/* Almost ... adjust Qsubr args */
    479 	RegularQSubr(node, node->results[0]);
    480 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
    481 					 * I/O in this node */
    482 	return (0);
    483 }
    484 /*
    485    Called by singly degraded write code to compute the new parity and the new q.
    486 
    487    structure of the params:
    488 
    489    pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d
    490    failedPDA raidPtr
    491 
    492    for a total of 2d+2 arguments.
    493    The result buffers results[0], results[1] are the buffers for the parity and q,
    494    respectively.
    495 
    496    We compute Q first, then compute parity. The parity calculation may try to reuse
    497    one of the input buffers for its output, so if we computed parity first, we would
    498    corrupt the input for the q calculation.
    499 
    500    We treat this identically to the regularPQ case, ignoring the failedPDA extra argument.
    501 */
    502 
    503 void
    504 rf_Degraded_100_PQFunc(node)
    505 	RF_DagNode_t *node;
    506 {
    507 	int     np = node->numParams;
    508 
    509 	RF_ASSERT(np >= 2);
    510 	DegrQSubr(node);
    511 	rf_RecoveryXorFunc(node);
    512 }
    513 
    514 
    515 /*
    516    The two below are used when reading a stripe with a single lost data unit.
    517    The parameters are
    518 
    519    pda_0, buffer_0, .... pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr
    520 
    521    and results[0] contains the data buffer. Which is originally zero-filled.
    522 
    523 */
    524 
    525 /* this Q func is used by the degraded-mode dag functions to recover lost data.
    526  * the second-to-last parameter is the PDA for the failed portion of the access.
    527  * the code here looks at this PDA and assumes that the xor target buffer is
    528  * equal in size to the number of sectors in the failed PDA.  It then uses
    529  * the other PDAs in the parameter list to determine where within the target
    530  * buffer the corresponding data should be xored.
    531  *
    532  * Recall the basic equation is
    533  *
    534  *     Q = ( data_1 + 2 * data_2 ... + k * data_k  ) mod 256
    535  *
    536  * so to recover data_j we need
    537  *
    538  *    J data_j = (Q - data_1 - 2 data_2 ....- k* data_k) mod 256
    539  *
    540  * So the coefficient for each buffer is (255 - data_col), and j should be initialized by
    541  * copying Q into it. Then we need to do a table lookup to convert to solve
    542  *   data_j /= J
    543  *
    544  *
    545  */
    546 int
    547 rf_RecoveryQFunc(node)
    548 	RF_DagNode_t *node;
    549 {
    550 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
    551 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
    552 	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
    553 	int     i;
    554 	RF_PhysDiskAddr_t *pda;
    555 	RF_RaidAddr_t suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
    556 	char   *srcbuf, *destbuf;
    557 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    558 	RF_Etimer_t timer;
    559 	unsigned long coeff;
    560 
    561 	RF_ETIMER_START(timer);
    562 	/* start by copying Q into the buffer */
    563 	bcopy(node->params[node->numParams - 3].p, node->results[0],
    564 	    rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
    565 	for (i = 0; i < node->numParams - 4; i += 2) {
    566 		RF_ASSERT(node->params[i + 1].p != node->results[0]);
    567 		pda = (RF_PhysDiskAddr_t *) node->params[i].p;
    568 		srcbuf = (char *) node->params[i + 1].p;
    569 		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
    570 		destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
    571 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
    572 		/* compute the data unit offset within the column */
    573 		coeff = (coeff % raidPtr->Layout.numDataCol);
    574 		rf_IncQ((unsigned long *) destbuf, (unsigned long *) srcbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
    575 	}
    576 	/* Do the nasty inversion now */
    577 	coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), failedPDA->startSector) % raidPtr->Layout.numDataCol);
    578 	rf_InvertQ(node->results[0], node->results[0], rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
    579 	RF_ETIMER_STOP(timer);
    580 	RF_ETIMER_EVAL(timer);
    581 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    582 	rf_GenericWakeupFunc(node, 0);
    583 	return (0);
    584 }
    585 
    586 int
    587 rf_RecoveryPQFunc(node)
    588 	RF_DagNode_t *node;
    589 {
    590 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
    591 	printf("raid%d: Recovery from PQ not implemented.\n",raidPtr->raidid);
    592 	return (1);
    593 }
    594 /*
    595    Degraded write Q subroutine.
    596    Used when P is dead.
    597    Large-write style Q computation.
    598    Parameters
    599 
    600    (pda,buf),(pda,buf),.....,(failedPDA,bufPtr),failedPDA,raidPtr.
    601 
    602    We ignore failedPDA.
    603 
    604    This is a "simple style" recovery func.
    605 */
    606 
    607 void
    608 rf_PQ_DegradedWriteQFunc(node)
    609 	RF_DagNode_t *node;
    610 {
    611 	int     np = node->numParams;
    612 	int     d;
    613 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    614 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    615 	int     i;
    616 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    617 	RF_Etimer_t timer;
    618 	char   *qbuf = node->results[0];
    619 	char   *obuf, *qpbuf;
    620 	RF_PhysDiskAddr_t *old;
    621 	unsigned long coeff;
    622 	int     fail_start, j;
    623 
    624 	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
    625 	fail_start = old->startSector % secPerSU;
    626 
    627 	RF_ETIMER_START(timer);
    628 
    629 	d = (np - 2) / 2;
    630 	RF_ASSERT(2 * d + 2 == np);
    631 
    632 	for (i = 0; i < d; i++) {
    633 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    634 		obuf = (char *) node->params[2 * i + 1].p;
    635 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
    636 		/* compute the data unit offset within the column, then add
    637 		 * one */
    638 		coeff = (coeff % raidPtr->Layout.numDataCol);
    639 		j = old->startSector % secPerSU;
    640 		RF_ASSERT(j >= fail_start);
    641 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
    642 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    643 	}
    644 
    645 	RF_ETIMER_STOP(timer);
    646 	RF_ETIMER_EVAL(timer);
    647 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    648 	rf_GenericWakeupFunc(node, 0);
    649 }
    650 
    651 
    652 
    653 
    654 /* Q computations */
    655 
    656 /*
    657    coeff - colummn;
    658 
    659    compute  dest ^= qfor[28-coeff][rn[coeff+1] a]
    660 
    661    on 5-bit basis;
    662    length in bytes;
    663 */
    664 
    665 void
    666 rf_IncQ(dest, buf, length, coeff)
    667 	unsigned long *dest;
    668 	unsigned long *buf;
    669 	unsigned length;
    670 	unsigned coeff;
    671 {
    672 	unsigned long a, d, new;
    673 	unsigned long a1, a2;
    674 	unsigned int *q = &(rf_qfor[28 - coeff][0]);
    675 	unsigned r = rf_rn[coeff + 1];
    676 
    677 #define EXTRACT(a,i) ((a >> (5L*i)) & 0x1f)
    678 #define INSERT(a,i) (a << (5L*i))
    679 
    680 	length /= 8;
    681 	/* 13 5 bit quants in a 64 bit word */
    682 	while (length) {
    683 		a = *buf++;
    684 		d = *dest;
    685 		a1 = EXTRACT(a, 0) ^ r;
    686 		a2 = EXTRACT(a, 1) ^ r;
    687 		new = INSERT(a2, 1) | a1;
    688 		a1 = EXTRACT(a, 2) ^ r;
    689 		a2 = EXTRACT(a, 3) ^ r;
    690 		a1 = q[a1];
    691 		a2 = q[a2];
    692 		new = new | INSERT(a1, 2) | INSERT(a2, 3);
    693 		a1 = EXTRACT(a, 4) ^ r;
    694 		a2 = EXTRACT(a, 5) ^ r;
    695 		a1 = q[a1];
    696 		a2 = q[a2];
    697 		new = new | INSERT(a1, 4) | INSERT(a2, 5);
    698 		a1 = EXTRACT(a, 5) ^ r;
    699 		a2 = EXTRACT(a, 6) ^ r;
    700 		a1 = q[a1];
    701 		a2 = q[a2];
    702 		new = new | INSERT(a1, 5) | INSERT(a2, 6);
    703 #if RF_LONGSHIFT > 2
    704 		a1 = EXTRACT(a, 7) ^ r;
    705 		a2 = EXTRACT(a, 8) ^ r;
    706 		a1 = q[a1];
    707 		a2 = q[a2];
    708 		new = new | INSERT(a1, 7) | INSERT(a2, 8);
    709 		a1 = EXTRACT(a, 9) ^ r;
    710 		a2 = EXTRACT(a, 10) ^ r;
    711 		a1 = q[a1];
    712 		a2 = q[a2];
    713 		new = new | INSERT(a1, 9) | INSERT(a2, 10);
    714 		a1 = EXTRACT(a, 11) ^ r;
    715 		a2 = EXTRACT(a, 12) ^ r;
    716 		a1 = q[a1];
    717 		a2 = q[a2];
    718 		new = new | INSERT(a1, 11) | INSERT(a2, 12);
    719 #endif				/* RF_LONGSHIFT > 2 */
    720 		d ^= new;
    721 		*dest++ = d;
    722 		length--;
    723 	}
    724 }
    725 /*
    726    compute
    727 
    728    dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] (old^new) ]
    729 
    730    on a five bit basis.
    731    optimization: compute old ^ new on 64 bit basis.
    732 
    733    length in bytes.
    734 */
    735 
    736 static void
    737 QDelta(
    738     char *dest,
    739     char *obuf,
    740     char *nbuf,
    741     unsigned length,
    742     unsigned char coeff)
    743 {
    744 	unsigned long a, d, new;
    745 	unsigned long a1, a2;
    746 	unsigned int *q = &(rf_qfor[28 - coeff][0]);
    747 	unsigned int r = rf_rn[coeff + 1];
    748 
    749 	r = a1 = a2 = new = d = a = 0; /* XXX for now... */
    750 	q = NULL; /* XXX for now */
    751 
    752 #ifdef _KERNEL
    753 	/* PQ in kernel currently not supported because the encoding/decoding
    754 	 * table is not present */
    755 	bzero(dest, length);
    756 #else				/* KERNEL */
    757 	/* this code probably doesn't work and should be rewritten  -wvcii */
    758 	/* 13 5 bit quants in a 64 bit word */
    759 	length /= 8;
    760 	while (length) {
    761 		a = *obuf++;	/* XXX need to reorg to avoid cache conflicts */
    762 		a ^= *nbuf++;
    763 		d = *dest;
    764 		a1 = EXTRACT(a, 0) ^ r;
    765 		a2 = EXTRACT(a, 1) ^ r;
    766 		a1 = q[a1];
    767 		a2 = q[a2];
    768 		new = INSERT(a2, 1) | a1;
    769 		a1 = EXTRACT(a, 2) ^ r;
    770 		a2 = EXTRACT(a, 3) ^ r;
    771 		a1 = q[a1];
    772 		a2 = q[a2];
    773 		new = new | INSERT(a1, 2) | INSERT(a2, 3);
    774 		a1 = EXTRACT(a, 4) ^ r;
    775 		a2 = EXTRACT(a, 5) ^ r;
    776 		a1 = q[a1];
    777 		a2 = q[a2];
    778 		new = new | INSERT(a1, 4) | INSERT(a2, 5);
    779 		a1 = EXTRACT(a, 5) ^ r;
    780 		a2 = EXTRACT(a, 6) ^ r;
    781 		a1 = q[a1];
    782 		a2 = q[a2];
    783 		new = new | INSERT(a1, 5) | INSERT(a2, 6);
    784 #if RF_LONGSHIFT > 2
    785 		a1 = EXTRACT(a, 7) ^ r;
    786 		a2 = EXTRACT(a, 8) ^ r;
    787 		a1 = q[a1];
    788 		a2 = q[a2];
    789 		new = new | INSERT(a1, 7) | INSERT(a2, 8);
    790 		a1 = EXTRACT(a, 9) ^ r;
    791 		a2 = EXTRACT(a, 10) ^ r;
    792 		a1 = q[a1];
    793 		a2 = q[a2];
    794 		new = new | INSERT(a1, 9) | INSERT(a2, 10);
    795 		a1 = EXTRACT(a, 11) ^ r;
    796 		a2 = EXTRACT(a, 12) ^ r;
    797 		a1 = q[a1];
    798 		a2 = q[a2];
    799 		new = new | INSERT(a1, 11) | INSERT(a2, 12);
    800 #endif				/* RF_LONGSHIFT > 2 */
    801 		d ^= new;
    802 		*dest++ = d;
    803 		length--;
    804 	}
    805 #endif				/* _KERNEL */
    806 }
    807 /*
    808    recover columns a and b from the given p and q into
    809    bufs abuf and bbuf. All bufs are word aligned.
    810    Length is in bytes.
    811 */
    812 
    813 
    814 /*
    815  * XXX
    816  *
    817  * Everything about this seems wrong.
    818  */
    819 void
    820 rf_PQ_recover(pbuf, qbuf, abuf, bbuf, length, coeff_a, coeff_b)
    821 	unsigned long *pbuf;
    822 	unsigned long *qbuf;
    823 	unsigned long *abuf;
    824 	unsigned long *bbuf;
    825 	unsigned length;
    826 	unsigned coeff_a;
    827 	unsigned coeff_b;
    828 {
    829 	unsigned long p, q, a, a0, a1;
    830 	int     col = (29 * coeff_a) + coeff_b;
    831 	unsigned char *q0 = &(rf_qinv[col][0]);
    832 
    833 	length /= 8;
    834 	while (length) {
    835 		p = *pbuf++;
    836 		q = *qbuf++;
    837 		a0 = EXTRACT(p, 0);
    838 		a1 = EXTRACT(q, 0);
    839 		a = q0[a0 << 5 | a1];
    840 #define MF(i) \
    841       a0 = EXTRACT(p,i); \
    842       a1 = EXTRACT(q,i); \
    843       a  = a | INSERT(q0[a0<<5 | a1],i)
    844 
    845 		MF(1);
    846 		MF(2);
    847 		MF(3);
    848 		MF(4);
    849 		MF(5);
    850 		MF(6);
    851 #if 0
    852 		MF(7);
    853 		MF(8);
    854 		MF(9);
    855 		MF(10);
    856 		MF(11);
    857 		MF(12);
    858 #endif				/* 0 */
    859 		*abuf++ = a;
    860 		*bbuf++ = a ^ p;
    861 		length--;
    862 	}
    863 }
    864 /*
    865    Lost parity and a data column. Recover that data column.
    866    Assume col coeff is lost. Let q the contents of Q after
    867    all surviving data columns have been q-xored out of it.
    868    Then we have the equation
    869 
    870    q[28-coeff][a_i ^ r_i+1] = q
    871 
    872    but q is cyclic with period 31.
    873    So q[3+coeff][q[28-coeff][a_i ^ r_{i+1}]] =
    874       q[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} .
    875 
    876    so a_i = r_{coeff+1} ^ q[3+coeff][q]
    877 
    878    The routine is passed q buffer and the buffer
    879    the data is to be recoverd into. They can be the same.
    880 */
    881 
    882 
    883 
    884 static void
    885 rf_InvertQ(
    886     unsigned long *qbuf,
    887     unsigned long *abuf,
    888     unsigned length,
    889     unsigned coeff)
    890 {
    891 	unsigned long a, new;
    892 	unsigned long a1, a2;
    893 	unsigned int *q = &(rf_qfor[3 + coeff][0]);
    894 	unsigned r = rf_rn[coeff + 1];
    895 
    896 	/* 13 5 bit quants in a 64 bit word */
    897 	length /= 8;
    898 	while (length) {
    899 		a = *qbuf++;
    900 		a1 = EXTRACT(a, 0);
    901 		a2 = EXTRACT(a, 1);
    902 		a1 = r ^ q[a1];
    903 		a2 = r ^ q[a2];
    904 		new = INSERT(a2, 1) | a1;
    905 #define M(i,j) \
    906       a1 = EXTRACT(a,i); \
    907       a2 = EXTRACT(a,j); \
    908       a1 = r ^ q[a1]; \
    909       a2 = r ^ q[a2]; \
    910       new = new | INSERT(a1,i) | INSERT(a2,j)
    911 
    912 		M(2, 3);
    913 		M(4, 5);
    914 		M(5, 6);
    915 #if RF_LONGSHIFT > 2
    916 		M(7, 8);
    917 		M(9, 10);
    918 		M(11, 12);
    919 #endif				/* RF_LONGSHIFT > 2 */
    920 		*abuf++ = new;
    921 		length--;
    922 	}
    923 }
    924 #endif				/* (RF_INCLUDE_DECL_PQ > 0) ||
    925 				 * (RF_INCLUDE_RAID6 > 0) */
    926