Home | History | Annotate | Line # | Download | only in raidframe
rf_pq.c revision 1.6
      1 /*	$NetBSD: rf_pq.c,v 1.6 1999/08/15 03:44:46 oster Exp $	*/
      2 /*
      3  * Copyright (c) 1995 Carnegie-Mellon University.
      4  * All rights reserved.
      5  *
      6  * Author: Daniel Stodolsky
      7  *
      8  * Permission to use, copy, modify and distribute this software and
      9  * its documentation is hereby granted, provided that both the copyright
     10  * notice and this permission notice appear in all copies of the
     11  * software, derivative works or modified versions, and any portions
     12  * thereof, and that both notices appear in supporting documentation.
     13  *
     14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     17  *
     18  * Carnegie Mellon requests users of this software to return to
     19  *
     20  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     21  *  School of Computer Science
     22  *  Carnegie Mellon University
     23  *  Pittsburgh PA 15213-3890
     24  *
     25  * any improvements or extensions that they make and grant Carnegie the
     26  * rights to redistribute these changes.
     27  */
     28 
     29 /*
     30  * Code for RAID level 6 (P + Q) disk array architecture.
     31  */
     32 
     33 #include "rf_archs.h"
     34 #include "rf_types.h"
     35 #include "rf_raid.h"
     36 #include "rf_dag.h"
     37 #include "rf_dagffrd.h"
     38 #include "rf_dagffwr.h"
     39 #include "rf_dagdegrd.h"
     40 #include "rf_dagdegwr.h"
     41 #include "rf_dagutils.h"
     42 #include "rf_dagfuncs.h"
     43 #include "rf_threadid.h"
     44 #include "rf_etimer.h"
     45 #include "rf_pqdeg.h"
     46 #include "rf_general.h"
     47 #include "rf_map.h"
     48 #include "rf_pq.h"
     49 
     50 RF_RedFuncs_t rf_pFuncs = {rf_RegularONPFunc, "Regular Old-New P", rf_SimpleONPFunc, "Simple Old-New P"};
     51 RF_RedFuncs_t rf_pRecoveryFuncs = {rf_RecoveryPFunc, "Recovery P Func", rf_RecoveryPFunc, "Recovery P Func"};
     52 
     53 int
     54 rf_RegularONPFunc(node)
     55 	RF_DagNode_t *node;
     56 {
     57 	return (rf_RegularXorFunc(node));
     58 }
     59 /*
     60    same as simpleONQ func, but the coefficient is always 1
     61 */
     62 
     63 int
     64 rf_SimpleONPFunc(node)
     65 	RF_DagNode_t *node;
     66 {
     67 	return (rf_SimpleXorFunc(node));
     68 }
     69 
     70 int
     71 rf_RecoveryPFunc(node)
     72 	RF_DagNode_t *node;
     73 {
     74 	return (rf_RecoveryXorFunc(node));
     75 }
     76 
     77 int
     78 rf_RegularPFunc(node)
     79 	RF_DagNode_t *node;
     80 {
     81 	return (rf_RegularXorFunc(node));
     82 }
     83 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
     84 
/* Forward declarations of the file-local Q helpers defined further down. */
static void QDelta(char *dest, char *obuf, char *nbuf, unsigned length,
    unsigned char coeff);
static void rf_InvertQ(unsigned long *qbuf, unsigned long *abuf,
    unsigned length, unsigned coeff);
     91 
     92 RF_RedFuncs_t rf_qFuncs = {rf_RegularONQFunc, "Regular Old-New Q", rf_SimpleONQFunc, "Simple Old-New Q"};
     93 RF_RedFuncs_t rf_qRecoveryFuncs = {rf_RecoveryQFunc, "Recovery Q Func", rf_RecoveryQFunc, "Recovery Q Func"};
     94 RF_RedFuncs_t rf_pqRecoveryFuncs = {rf_RecoveryPQFunc, "Recovery PQ Func", rf_RecoveryPQFunc, "Recovery PQ Func"};
     95 
     96 void
     97 rf_PQDagSelect(
     98     RF_Raid_t * raidPtr,
     99     RF_IoType_t type,
    100     RF_AccessStripeMap_t * asmap,
    101     RF_VoidFuncPtr * createFunc)
    102 {
    103 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    104 	unsigned ndfail = asmap->numDataFailed;
    105 	unsigned npfail = asmap->numParityFailed;
    106 	unsigned ntfail = npfail + ndfail;
    107 
    108 	RF_ASSERT(RF_IO_IS_R_OR_W(type));
    109 	if (ntfail > 2) {
    110 		RF_ERRORMSG("more than two disks failed in a single group!  Aborting I/O operation.\n");
    111 		 /* *infoFunc = */ *createFunc = NULL;
    112 		return;
    113 	}
    114 	/* ok, we can do this I/O */
    115 	if (type == RF_IO_TYPE_READ) {
    116 		switch (ndfail) {
    117 		case 0:
    118 			/* fault free read */
    119 			*createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;	/* same as raid 5 */
    120 			break;
    121 		case 1:
    122 			/* lost a single data unit */
    123 			/* two cases: (1) parity is not lost. do a normal raid
    124 			 * 5 reconstruct read. (2) parity is lost. do a
    125 			 * reconstruct read using "q". */
    126 			if (ntfail == 2) {	/* also lost redundancy */
    127 				if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY)
    128 					*createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateReadDAG;
    129 				else
    130 					*createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateReadDAG;
    131 			} else {
    132 				/* P and Q are ok. But is there a failure in
    133 				 * some unaccessed data unit? */
    134 				if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
    135 					*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
    136 				else
    137 					*createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateReadDAG;
    138 			}
    139 			break;
    140 		case 2:
    141 			/* lost two data units */
    142 			/* *infoFunc = PQOneTwo; */
    143 			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
    144 			break;
    145 		}
    146 		return;
    147 	}
    148 	/* a write */
    149 	switch (ntfail) {
    150 	case 0:		/* fault free */
    151 		if (rf_suppressLocksAndLargeWrites ||
    152 		    (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) ||
    153 			(asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
    154 
    155 			*createFunc = (RF_VoidFuncPtr) rf_PQCreateSmallWriteDAG;
    156 		} else {
    157 			*createFunc = (RF_VoidFuncPtr) rf_PQCreateLargeWriteDAG;
    158 		}
    159 		break;
    160 
    161 	case 1:		/* single disk fault */
    162 		if (npfail == 1) {
    163 			RF_ASSERT((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q));
    164 			if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) {	/* q died, treat like
    165 										 * normal mode raid5
    166 										 * write. */
    167 				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
    168 				    || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
    169 					*createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateSmallWriteDAG;
    170 				else
    171 					*createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateLargeWriteDAG;
    172 			} else {/* parity died, small write only updating Q */
    173 				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
    174 				    || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
    175 					*createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateSmallWriteDAG;
    176 				else
    177 					*createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateLargeWriteDAG;
    178 			}
    179 		} else {	/* data missing. Do a P reconstruct write if
    180 				 * only a single data unit is lost in the
    181 				 * stripe, otherwise a PQ reconstruct write. */
    182 			if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
    183 				*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
    184 			else
    185 				*createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateWriteDAG;
    186 		}
    187 		break;
    188 
    189 	case 2:		/* two disk faults */
    190 		switch (npfail) {
    191 		case 2:	/* both p and q dead */
    192 			*createFunc = (RF_VoidFuncPtr) rf_PQ_011_CreateWriteDAG;
    193 			break;
    194 		case 1:	/* either p or q and dead data */
    195 			RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA);
    196 			RF_ASSERT((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q));
    197 			if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)
    198 				*createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateWriteDAG;
    199 			else
    200 				*createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateWriteDAG;
    201 			break;
    202 		case 0:	/* double data loss */
    203 			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
    204 			break;
    205 		}
    206 		break;
    207 
    208 	default:		/* more than 2 disk faults */
    209 		*createFunc = NULL;
    210 		RF_PANIC();
    211 	}
    212 	return;
    213 }
    214 /*
    215    Used as a stop gap info function
    216 */
#if 0
/* Disabled stop-gap info function: one successor, one antecedent. */
static void
PQOne(RF_Raid_t *raidPtr, int *nSucc, int *nAnte, RF_AccessStripeMap_t *asmap)
{
	*nSucc = 1;
	*nAnte = 1;
}

/* Disabled stop-gap info function: one successor, two antecedents. */
static void
PQOneTwo(RF_Raid_t *raidPtr, int *nSucc, int *nAnte, RF_AccessStripeMap_t *asmap)
{
	*nSucc = 1;
	*nAnte = 2;
}
#endif
    239 
    240 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG)
    241 {
    242 	rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2,
    243 	    rf_RegularPQFunc, RF_FALSE);
    244 }
    245 
    246 int
    247 rf_RegularONQFunc(node)
    248 	RF_DagNode_t *node;
    249 {
    250 	int     np = node->numParams;
    251 	int     d;
    252 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    253 	int     i;
    254 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    255 	RF_Etimer_t timer;
    256 	char   *qbuf, *qpbuf;
    257 	char   *obuf, *nbuf;
    258 	RF_PhysDiskAddr_t *old, *new;
    259 	unsigned long coeff;
    260 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    261 
    262 	RF_ETIMER_START(timer);
    263 
    264 	d = (np - 3) / 4;
    265 	RF_ASSERT(4 * d + 3 == np);
    266 	qbuf = (char *) node->params[2 * d + 1].p;	/* q buffer */
    267 	for (i = 0; i < d; i++) {
    268 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    269 		obuf = (char *) node->params[2 * i + 1].p;
    270 		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
    271 		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
    272 		RF_ASSERT(new->numSector == old->numSector);
    273 		RF_ASSERT(new->raidAddress == old->raidAddress);
    274 		/* the stripe unit within the stripe tells us the coefficient
    275 		 * to use for the multiply. */
    276 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
    277 		/* compute the data unit offset within the column, then add
    278 		 * one */
    279 		coeff = (coeff % raidPtr->Layout.numDataCol);
    280 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
    281 		QDelta(qpbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    282 	}
    283 
    284 	RF_ETIMER_STOP(timer);
    285 	RF_ETIMER_EVAL(timer);
    286 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    287 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
    288 					 * I/O in this node */
    289 	return (0);
    290 }
    291 /*
    292    See the SimpleXORFunc for the difference between a simple and regular func.
    293    These Q functions should be used for
    294 
    295          new q = Q(data,old data,old q)
    296 
    297    style updates and not for
    298 
    299          q = ( new data, new data, .... )
    300 
    301    computations.
    302 
    303    The simple q takes 2(2d+1)+1 = 4d+3 params, where d is the number
    304    of data units written. The order of params is
    305    [0]    old data pda_0, old data buffer_0, old data pda_1, old data buffer_1, ... old data pda_(d-1), old data buffer_(d-1)
    306    [2d]   old q pda, old q buffer
    307    [2d+2] new data pda_0, new data buffer_0, ... new data pda_(d-1), new data buffer_(d-1)
    308    raidPtr
    309 */
    310 
    311 int
    312 rf_SimpleONQFunc(node)
    313 	RF_DagNode_t *node;
    314 {
    315 	int     np = node->numParams;
    316 	int     d;
    317 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    318 	int     i;
    319 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    320 	RF_Etimer_t timer;
    321 	char   *qbuf;
    322 	char   *obuf, *nbuf;
    323 	RF_PhysDiskAddr_t *old, *new;
    324 	unsigned long coeff;
    325 
    326 	RF_ETIMER_START(timer);
    327 
    328 	d = (np - 3) / 4;
    329 	RF_ASSERT(4 * d + 3 == np);
    330 	qbuf = (char *) node->params[2 * d + 1].p;	/* q buffer */
    331 	for (i = 0; i < d; i++) {
    332 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    333 		obuf = (char *) node->params[2 * i + 1].p;
    334 		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
    335 		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
    336 		RF_ASSERT(new->numSector == old->numSector);
    337 		RF_ASSERT(new->raidAddress == old->raidAddress);
    338 		/* the stripe unit within the stripe tells us the coefficient
    339 		 * to use for the multiply. */
    340 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
    341 		/* compute the data unit offset within the column, then add
    342 		 * one */
    343 		coeff = (coeff % raidPtr->Layout.numDataCol);
    344 		QDelta(qbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    345 	}
    346 
    347 	RF_ETIMER_STOP(timer);
    348 	RF_ETIMER_EVAL(timer);
    349 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    350 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
    351 					 * I/O in this node */
    352 	return (0);
    353 }
    354 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG)
    355 {
    356 	rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_qFuncs);
    357 }
    358 
    359 static void RegularQSubr(RF_DagNode_t *node, char   *qbuf);
    360 
    361 static void
    362 RegularQSubr(node, qbuf)
    363 	RF_DagNode_t *node;
    364 	char   *qbuf;
    365 {
    366 	int     np = node->numParams;
    367 	int     d;
    368 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    369 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    370 	int     i;
    371 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    372 	RF_Etimer_t timer;
    373 	char   *obuf, *qpbuf;
    374 	RF_PhysDiskAddr_t *old;
    375 	unsigned long coeff;
    376 
    377 	RF_ETIMER_START(timer);
    378 
    379 	d = (np - 1) / 2;
    380 	RF_ASSERT(2 * d + 1 == np);
    381 	for (i = 0; i < d; i++) {
    382 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    383 		obuf = (char *) node->params[2 * i + 1].p;
    384 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
    385 		/* compute the data unit offset within the column, then add
    386 		 * one */
    387 		coeff = (coeff % raidPtr->Layout.numDataCol);
    388 		/* the input buffers may not all be aligned with the start of
    389 		 * the stripe. so shift by their sector offset within the
    390 		 * stripe unit */
    391 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
    392 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    393 	}
    394 
    395 	RF_ETIMER_STOP(timer);
    396 	RF_ETIMER_EVAL(timer);
    397 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    398 }
    399 /*
    400    used in degraded writes.
    401 */
    402 
    403 static void DegrQSubr(RF_DagNode_t *node);
    404 
    405 static void
    406 DegrQSubr(node)
    407 	RF_DagNode_t *node;
    408 {
    409 	int     np = node->numParams;
    410 	int     d;
    411 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    412 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    413 	int     i;
    414 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    415 	RF_Etimer_t timer;
    416 	char   *qbuf = node->results[1];
    417 	char   *obuf, *qpbuf;
    418 	RF_PhysDiskAddr_t *old;
    419 	unsigned long coeff;
    420 	unsigned fail_start;
    421 	int     j;
    422 
    423 	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
    424 	fail_start = old->startSector % secPerSU;
    425 
    426 	RF_ETIMER_START(timer);
    427 
    428 	d = (np - 2) / 2;
    429 	RF_ASSERT(2 * d + 2 == np);
    430 	for (i = 0; i < d; i++) {
    431 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    432 		obuf = (char *) node->params[2 * i + 1].p;
    433 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
    434 		/* compute the data unit offset within the column, then add
    435 		 * one */
    436 		coeff = (coeff % raidPtr->Layout.numDataCol);
    437 		/* the input buffers may not all be aligned with the start of
    438 		 * the stripe. so shift by their sector offset within the
    439 		 * stripe unit */
    440 		j = old->startSector % secPerSU;
    441 		RF_ASSERT(j >= fail_start);
    442 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
    443 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    444 	}
    445 
    446 	RF_ETIMER_STOP(timer);
    447 	RF_ETIMER_EVAL(timer);
    448 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    449 }
    450 /*
    451    Called by large write code to compute the new parity and the new q.
    452 
    453    structure of the params:
    454 
    455    pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d ( d = numDataCol )
    456    raidPtr
    457 
    458    for a total of 2d+1 arguments.
    459    The result buffers results[0], results[1] are the buffers for the p and q,
    460    respectively.
    461 
    462    We compute Q first, then compute P. The P calculation may try to reuse
    463    one of the input buffers for its output, so if we computed P first, we would
    464    corrupt the input for the q calculation.
    465 */
    466 
    467 int
    468 rf_RegularPQFunc(node)
    469 	RF_DagNode_t *node;
    470 {
    471 	RegularQSubr(node, node->results[1]);
    472 	return (rf_RegularXorFunc(node));	/* does the wakeup */
    473 }
    474 
    475 int
    476 rf_RegularQFunc(node)
    477 	RF_DagNode_t *node;
    478 {
    479 	/* Almost ... adjust Qsubr args */
    480 	RegularQSubr(node, node->results[0]);
    481 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
    482 					 * I/O in this node */
    483 	return (0);
    484 }
    485 /*
    486    Called by singly degraded write code to compute the new parity and the new q.
    487 
    488    structure of the params:
    489 
    490    pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d
    491    failedPDA raidPtr
    492 
    493    for a total of 2d+2 arguments.
    494    The result buffers results[0], results[1] are the buffers for the parity and q,
    495    respectively.
    496 
    497    We compute Q first, then compute parity. The parity calculation may try to reuse
    498    one of the input buffers for its output, so if we computed parity first, we would
    499    corrupt the input for the q calculation.
    500 
    501    We treat this identically to the regularPQ case, ignoring the failedPDA extra argument.
    502 */
    503 
    504 void
    505 rf_Degraded_100_PQFunc(node)
    506 	RF_DagNode_t *node;
    507 {
    508 	int     np = node->numParams;
    509 
    510 	RF_ASSERT(np >= 2);
    511 	DegrQSubr(node);
    512 	rf_RecoveryXorFunc(node);
    513 }
    514 
    515 
    516 /*
    517    The two below are used when reading a stripe with a single lost data unit.
    518    The parameters are
    519 
    520    pda_0, buffer_0, .... pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr
    521 
    522    and results[0] contains the data buffer. Which is originally zero-filled.
    523 
    524 */
    525 
    526 /* this Q func is used by the degraded-mode dag functions to recover lost data.
    527  * the second-to-last parameter is the PDA for the failed portion of the access.
    528  * the code here looks at this PDA and assumes that the xor target buffer is
    529  * equal in size to the number of sectors in the failed PDA.  It then uses
    530  * the other PDAs in the parameter list to determine where within the target
    531  * buffer the corresponding data should be xored.
    532  *
    533  * Recall the basic equation is
    534  *
    535  *     Q = ( data_1 + 2 * data_2 ... + k * data_k  ) mod 256
    536  *
    537  * so to recover data_j we need
    538  *
    539  *    J data_j = (Q - data_1 - 2 data_2 ....- k* data_k) mod 256
    540  *
    541  * So the coefficient for each buffer is (255 - data_col), and j should be initialized by
    542  * copying Q into it. Then we need to do a table lookup to convert to solve
    543  *   data_j /= J
    544  *
    545  *
    546  */
    547 int
    548 rf_RecoveryQFunc(node)
    549 	RF_DagNode_t *node;
    550 {
    551 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
    552 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
    553 	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
    554 	int     i;
    555 	RF_PhysDiskAddr_t *pda;
    556 	RF_RaidAddr_t suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
    557 	char   *srcbuf, *destbuf;
    558 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    559 	RF_Etimer_t timer;
    560 	unsigned long coeff;
    561 
    562 	RF_ETIMER_START(timer);
    563 	/* start by copying Q into the buffer */
    564 	bcopy(node->params[node->numParams - 3].p, node->results[0],
    565 	    rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
    566 	for (i = 0; i < node->numParams - 4; i += 2) {
    567 		RF_ASSERT(node->params[i + 1].p != node->results[0]);
    568 		pda = (RF_PhysDiskAddr_t *) node->params[i].p;
    569 		srcbuf = (char *) node->params[i + 1].p;
    570 		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
    571 		destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
    572 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
    573 		/* compute the data unit offset within the column */
    574 		coeff = (coeff % raidPtr->Layout.numDataCol);
    575 		rf_IncQ((unsigned long *) destbuf, (unsigned long *) srcbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
    576 	}
    577 	/* Do the nasty inversion now */
    578 	coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), failedPDA->startSector) % raidPtr->Layout.numDataCol);
    579 	rf_InvertQ(node->results[0], node->results[0], rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
    580 	RF_ETIMER_STOP(timer);
    581 	RF_ETIMER_EVAL(timer);
    582 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    583 	rf_GenericWakeupFunc(node, 0);
    584 	return (0);
    585 }
    586 
    587 int
    588 rf_RecoveryPQFunc(node)
    589 	RF_DagNode_t *node;
    590 {
    591 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
    592 	printf("raid%d: Recovery from PQ not implemented.\n",raidPtr->raidid);
    593 	return (1);
    594 }
    595 /*
    596    Degraded write Q subroutine.
    597    Used when P is dead.
    598    Large-write style Q computation.
    599    Parameters
    600 
    601    (pda,buf),(pda,buf),.....,(failedPDA,bufPtr),failedPDA,raidPtr.
    602 
    603    We ignore failedPDA.
    604 
    605    This is a "simple style" recovery func.
    606 */
    607 
    608 void
    609 rf_PQ_DegradedWriteQFunc(node)
    610 	RF_DagNode_t *node;
    611 {
    612 	int     np = node->numParams;
    613 	int     d;
    614 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    615 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    616 	int     i;
    617 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    618 	RF_Etimer_t timer;
    619 	char   *qbuf = node->results[0];
    620 	char   *obuf, *qpbuf;
    621 	RF_PhysDiskAddr_t *old;
    622 	unsigned long coeff;
    623 	int     fail_start, j;
    624 
    625 	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
    626 	fail_start = old->startSector % secPerSU;
    627 
    628 	RF_ETIMER_START(timer);
    629 
    630 	d = (np - 2) / 2;
    631 	RF_ASSERT(2 * d + 2 == np);
    632 
    633 	for (i = 0; i < d; i++) {
    634 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    635 		obuf = (char *) node->params[2 * i + 1].p;
    636 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
    637 		/* compute the data unit offset within the column, then add
    638 		 * one */
    639 		coeff = (coeff % raidPtr->Layout.numDataCol);
    640 		j = old->startSector % secPerSU;
    641 		RF_ASSERT(j >= fail_start);
    642 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
    643 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    644 	}
    645 
    646 	RF_ETIMER_STOP(timer);
    647 	RF_ETIMER_EVAL(timer);
    648 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    649 	rf_GenericWakeupFunc(node, 0);
    650 }
    651 
    652 
    653 
    654 
    655 /* Q computations */
    656 
    657 /*
    658    coeff - column;
    659 
    660    compute  dest ^= qfor[28-coeff][rn[coeff+1] a]
    661 
    662    on 5-bit basis;
    663    length in bytes;
    664 */
    665 
    666 void
    667 rf_IncQ(dest, buf, length, coeff)
    668 	unsigned long *dest;
    669 	unsigned long *buf;
    670 	unsigned length;
    671 	unsigned coeff;
    672 {
    673 	unsigned long a, d, new;
    674 	unsigned long a1, a2;
    675 	unsigned int *q = &(rf_qfor[28 - coeff][0]);
    676 	unsigned r = rf_rn[coeff + 1];
    677 
    678 #define EXTRACT(a,i) ((a >> (5L*i)) & 0x1f)
    679 #define INSERT(a,i) (a << (5L*i))
    680 
    681 	length /= 8;
    682 	/* 13 5 bit quants in a 64 bit word */
    683 	while (length) {
    684 		a = *buf++;
    685 		d = *dest;
    686 		a1 = EXTRACT(a, 0) ^ r;
    687 		a2 = EXTRACT(a, 1) ^ r;
    688 		new = INSERT(a2, 1) | a1;
    689 		a1 = EXTRACT(a, 2) ^ r;
    690 		a2 = EXTRACT(a, 3) ^ r;
    691 		a1 = q[a1];
    692 		a2 = q[a2];
    693 		new = new | INSERT(a1, 2) | INSERT(a2, 3);
    694 		a1 = EXTRACT(a, 4) ^ r;
    695 		a2 = EXTRACT(a, 5) ^ r;
    696 		a1 = q[a1];
    697 		a2 = q[a2];
    698 		new = new | INSERT(a1, 4) | INSERT(a2, 5);
    699 		a1 = EXTRACT(a, 5) ^ r;
    700 		a2 = EXTRACT(a, 6) ^ r;
    701 		a1 = q[a1];
    702 		a2 = q[a2];
    703 		new = new | INSERT(a1, 5) | INSERT(a2, 6);
    704 #if RF_LONGSHIFT > 2
    705 		a1 = EXTRACT(a, 7) ^ r;
    706 		a2 = EXTRACT(a, 8) ^ r;
    707 		a1 = q[a1];
    708 		a2 = q[a2];
    709 		new = new | INSERT(a1, 7) | INSERT(a2, 8);
    710 		a1 = EXTRACT(a, 9) ^ r;
    711 		a2 = EXTRACT(a, 10) ^ r;
    712 		a1 = q[a1];
    713 		a2 = q[a2];
    714 		new = new | INSERT(a1, 9) | INSERT(a2, 10);
    715 		a1 = EXTRACT(a, 11) ^ r;
    716 		a2 = EXTRACT(a, 12) ^ r;
    717 		a1 = q[a1];
    718 		a2 = q[a2];
    719 		new = new | INSERT(a1, 11) | INSERT(a2, 12);
    720 #endif				/* RF_LONGSHIFT > 2 */
    721 		d ^= new;
    722 		*dest++ = d;
    723 		length--;
    724 	}
    725 }
    726 /*
    727    compute
    728 
    729    dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] (old^new) ]
    730 
    731    on a five bit basis.
    732    optimization: compute old ^ new on 64 bit basis.
    733 
    734    length in bytes.
    735 */
    736 
    737 static void
    738 QDelta(
    739     char *dest,
    740     char *obuf,
    741     char *nbuf,
    742     unsigned length,
    743     unsigned char coeff)
    744 {
    745 	unsigned long a, d, new;
    746 	unsigned long a1, a2;
    747 	unsigned int *q = &(rf_qfor[28 - coeff][0]);
    748 	unsigned int r = rf_rn[coeff + 1];
    749 
    750 	r = a1 = a2 = new = d = a = 0; /* XXX for now... */
    751 	q = NULL; /* XXX for now */
    752 
    753 #ifdef _KERNEL
    754 	/* PQ in kernel currently not supported because the encoding/decoding
    755 	 * table is not present */
    756 	bzero(dest, length);
    757 #else				/* KERNEL */
    758 	/* this code probably doesn't work and should be rewritten  -wvcii */
    759 	/* 13 5 bit quants in a 64 bit word */
    760 	length /= 8;
    761 	while (length) {
    762 		a = *obuf++;	/* XXX need to reorg to avoid cache conflicts */
    763 		a ^= *nbuf++;
    764 		d = *dest;
    765 		a1 = EXTRACT(a, 0) ^ r;
    766 		a2 = EXTRACT(a, 1) ^ r;
    767 		a1 = q[a1];
    768 		a2 = q[a2];
    769 		new = INSERT(a2, 1) | a1;
    770 		a1 = EXTRACT(a, 2) ^ r;
    771 		a2 = EXTRACT(a, 3) ^ r;
    772 		a1 = q[a1];
    773 		a2 = q[a2];
    774 		new = new | INSERT(a1, 2) | INSERT(a2, 3);
    775 		a1 = EXTRACT(a, 4) ^ r;
    776 		a2 = EXTRACT(a, 5) ^ r;
    777 		a1 = q[a1];
    778 		a2 = q[a2];
    779 		new = new | INSERT(a1, 4) | INSERT(a2, 5);
    780 		a1 = EXTRACT(a, 5) ^ r;
    781 		a2 = EXTRACT(a, 6) ^ r;
    782 		a1 = q[a1];
    783 		a2 = q[a2];
    784 		new = new | INSERT(a1, 5) | INSERT(a2, 6);
    785 #if RF_LONGSHIFT > 2
    786 		a1 = EXTRACT(a, 7) ^ r;
    787 		a2 = EXTRACT(a, 8) ^ r;
    788 		a1 = q[a1];
    789 		a2 = q[a2];
    790 		new = new | INSERT(a1, 7) | INSERT(a2, 8);
    791 		a1 = EXTRACT(a, 9) ^ r;
    792 		a2 = EXTRACT(a, 10) ^ r;
    793 		a1 = q[a1];
    794 		a2 = q[a2];
    795 		new = new | INSERT(a1, 9) | INSERT(a2, 10);
    796 		a1 = EXTRACT(a, 11) ^ r;
    797 		a2 = EXTRACT(a, 12) ^ r;
    798 		a1 = q[a1];
    799 		a2 = q[a2];
    800 		new = new | INSERT(a1, 11) | INSERT(a2, 12);
    801 #endif				/* RF_LONGSHIFT > 2 */
    802 		d ^= new;
    803 		*dest++ = d;
    804 		length--;
    805 	}
    806 #endif				/* _KERNEL */
    807 }
    808 /*
    809    recover columns a and b from the given p and q into
    810    bufs abuf and bbuf. All bufs are word aligned.
    811    Length is in bytes.
    812 */
    813 
    814 
    815 /*
    816  * XXX
    817  *
    818  * Everything about this seems wrong.
    819  */
    820 void
    821 rf_PQ_recover(pbuf, qbuf, abuf, bbuf, length, coeff_a, coeff_b)
    822 	unsigned long *pbuf;
    823 	unsigned long *qbuf;
    824 	unsigned long *abuf;
    825 	unsigned long *bbuf;
    826 	unsigned length;
    827 	unsigned coeff_a;
    828 	unsigned coeff_b;
    829 {
    830 	unsigned long p, q, a, a0, a1;
    831 	int     col = (29 * coeff_a) + coeff_b;
    832 	unsigned char *q0 = &(rf_qinv[col][0]);
    833 
    834 	length /= 8;
    835 	while (length) {
    836 		p = *pbuf++;
    837 		q = *qbuf++;
    838 		a0 = EXTRACT(p, 0);
    839 		a1 = EXTRACT(q, 0);
    840 		a = q0[a0 << 5 | a1];
    841 #define MF(i) \
    842       a0 = EXTRACT(p,i); \
    843       a1 = EXTRACT(q,i); \
    844       a  = a | INSERT(q0[a0<<5 | a1],i)
    845 
    846 		MF(1);
    847 		MF(2);
    848 		MF(3);
    849 		MF(4);
    850 		MF(5);
    851 		MF(6);
    852 #if 0
    853 		MF(7);
    854 		MF(8);
    855 		MF(9);
    856 		MF(10);
    857 		MF(11);
    858 		MF(12);
    859 #endif				/* 0 */
    860 		*abuf++ = a;
    861 		*bbuf++ = a ^ p;
    862 		length--;
    863 	}
    864 }
    865 /*
    866    Lost parity and a data column. Recover that data column.
    867    Assume col coeff is lost. Let q the contents of Q after
    868    all surviving data columns have been q-xored out of it.
    869    Then we have the equation
    870 
    871    q[28-coeff][a_i ^ r_i+1] = q
    872 
    873    but q is cyclic with period 31.
    874    So q[3+coeff][q[28-coeff][a_i ^ r_{i+1}]] =
    875       q[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} .
    876 
    877    so a_i = r_{coeff+1} ^ q[3+coeff][q]
    878 
    879    The routine is passed q buffer and the buffer
    880    the data is to be recovered into. They can be the same.
    881 */
    882 
    883 
    884 
    885 static void
    886 rf_InvertQ(
    887     unsigned long *qbuf,
    888     unsigned long *abuf,
    889     unsigned length,
    890     unsigned coeff)
    891 {
    892 	unsigned long a, new;
    893 	unsigned long a1, a2;
    894 	unsigned int *q = &(rf_qfor[3 + coeff][0]);
    895 	unsigned r = rf_rn[coeff + 1];
    896 
    897 	/* 13 5 bit quants in a 64 bit word */
    898 	length /= 8;
    899 	while (length) {
    900 		a = *qbuf++;
    901 		a1 = EXTRACT(a, 0);
    902 		a2 = EXTRACT(a, 1);
    903 		a1 = r ^ q[a1];
    904 		a2 = r ^ q[a2];
    905 		new = INSERT(a2, 1) | a1;
    906 #define M(i,j) \
    907       a1 = EXTRACT(a,i); \
    908       a2 = EXTRACT(a,j); \
    909       a1 = r ^ q[a1]; \
    910       a2 = r ^ q[a2]; \
    911       new = new | INSERT(a1,i) | INSERT(a2,j)
    912 
    913 		M(2, 3);
    914 		M(4, 5);
    915 		M(5, 6);
    916 #if RF_LONGSHIFT > 2
    917 		M(7, 8);
    918 		M(9, 10);
    919 		M(11, 12);
    920 #endif				/* RF_LONGSHIFT > 2 */
    921 		*abuf++ = new;
    922 		length--;
    923 	}
    924 }
    925 #endif				/* (RF_INCLUDE_DECL_PQ > 0) ||
    926 				 * (RF_INCLUDE_RAID6 > 0) */
    927