Home | History | Annotate | Line # | Download | only in raidframe
rf_pq.c revision 1.3
      1 /*	$NetBSD: rf_pq.c,v 1.3 1999/02/05 00:06:14 oster Exp $	*/
      2 /*
      3  * Copyright (c) 1995 Carnegie-Mellon University.
      4  * All rights reserved.
      5  *
      6  * Author: Daniel Stodolsky
      7  *
      8  * Permission to use, copy, modify and distribute this software and
      9  * its documentation is hereby granted, provided that both the copyright
     10  * notice and this permission notice appear in all copies of the
     11  * software, derivative works or modified versions, and any portions
     12  * thereof, and that both notices appear in supporting documentation.
     13  *
     14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     17  *
     18  * Carnegie Mellon requests users of this software to return to
     19  *
     20  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     21  *  School of Computer Science
     22  *  Carnegie Mellon University
     23  *  Pittsburgh PA 15213-3890
     24  *
     25  * any improvements or extensions that they make and grant Carnegie the
     26  * rights to redistribute these changes.
     27  */
     28 
     29 /*
     30  * Code for RAID level 6 (P + Q) disk array architecture.
     31  */
     32 
     33 #include "rf_archs.h"
     34 #include "rf_types.h"
     35 #include "rf_raid.h"
     36 #include "rf_dag.h"
     37 #include "rf_dagffrd.h"
     38 #include "rf_dagffwr.h"
     39 #include "rf_dagdegrd.h"
     40 #include "rf_dagdegwr.h"
     41 #include "rf_dagutils.h"
     42 #include "rf_dagfuncs.h"
     43 #include "rf_threadid.h"
     44 #include "rf_etimer.h"
     45 #include "rf_pqdeg.h"
     46 #include "rf_general.h"
     47 #include "rf_map.h"
     48 #include "rf_pq.h"
     49 #include "rf_sys.h"
     50 
/*
 * Redundancy-function table for the P unit.  The initializer lists a
 * function followed by its printable name, twice; going by the name
 * strings, the first pair is the "regular" variant and the second the
 * "simple" variant.  (RF_RedFuncs_t itself is declared elsewhere.)
 */
RF_RedFuncs_t rf_pFuncs = {rf_RegularONPFunc, "Regular Old-New P", rf_SimpleONPFunc, "Simple Old-New P"};
/* Recovery table for P: the same function/name pair fills both slots. */
RF_RedFuncs_t rf_pRecoveryFuncs = {rf_RecoveryPFunc, "Recovery P Func", rf_RecoveryPFunc, "Recovery P Func"};
     53 
     54 int
     55 rf_RegularONPFunc(node)
     56 	RF_DagNode_t *node;
     57 {
     58 	return (rf_RegularXorFunc(node));
     59 }
     60 /*
     61    same as simpleONQ func, but the coefficient is always 1
     62 */
     63 
     64 int
     65 rf_SimpleONPFunc(node)
     66 	RF_DagNode_t *node;
     67 {
     68 	return (rf_SimpleXorFunc(node));
     69 }
     70 
     71 int
     72 rf_RecoveryPFunc(node)
     73 	RF_DagNode_t *node;
     74 {
     75 	return (rf_RecoveryXorFunc(node));
     76 }
     77 
     78 int
     79 rf_RegularPFunc(node)
     80 	RF_DagNode_t *node;
     81 {
     82 	return (rf_RegularXorFunc(node));
     83 }
     84 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
     85 
/*
 * Forward declarations of the file-local Q helpers defined near the end
 * of this file.
 */

/* Fold the Q-encoded difference of obuf and nbuf into dest; length is in
 * bytes, coeff selects the encoding table row. */
static void
QDelta(char *dest, char *obuf, char *nbuf, unsigned length,
    unsigned char coeff);
/* Undo the Q encoding for column coeff: reads qbuf, writes abuf; length is
 * in bytes. */
static void
rf_InvertQ(unsigned long *qbuf, unsigned long *abuf,
    unsigned length, unsigned coeff);
     92 
/*
 * Redundancy-function tables for the Q unit.  Each initializer lists a
 * function followed by its printable name, twice; going by the name
 * strings, rf_qFuncs holds the "regular" and "simple" old-new Q updates.
 */
RF_RedFuncs_t rf_qFuncs = {rf_RegularONQFunc, "Regular Old-New Q", rf_SimpleONQFunc, "Simple Old-New Q"};
/* Q recovery: the same function/name pair fills both slots. */
RF_RedFuncs_t rf_qRecoveryFuncs = {rf_RecoveryQFunc, "Recovery Q Func", rf_RecoveryQFunc, "Recovery Q Func"};
/* Combined P+Q recovery; rf_RecoveryPQFunc in this file only panics. */
RF_RedFuncs_t rf_pqRecoveryFuncs = {rf_RecoveryPQFunc, "Recovery PQ Func", rf_RecoveryPQFunc, "Recovery PQ Func"};
     96 
     97 void
     98 rf_PQDagSelect(
     99     RF_Raid_t * raidPtr,
    100     RF_IoType_t type,
    101     RF_AccessStripeMap_t * asmap,
    102     RF_VoidFuncPtr * createFunc)
    103 {
    104 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    105 	unsigned ndfail = asmap->numDataFailed;
    106 	unsigned npfail = asmap->numParityFailed;
    107 	unsigned ntfail = npfail + ndfail;
    108 
    109 	RF_ASSERT(RF_IO_IS_R_OR_W(type));
    110 	if (ntfail > 2) {
    111 		RF_ERRORMSG("more than two disks failed in a single group!  Aborting I/O operation.\n");
    112 		 /* *infoFunc = */ *createFunc = NULL;
    113 		return;
    114 	}
    115 	/* ok, we can do this I/O */
    116 	if (type == RF_IO_TYPE_READ) {
    117 		switch (ndfail) {
    118 		case 0:
    119 			/* fault free read */
    120 			*createFunc = rf_CreateFaultFreeReadDAG;	/* same as raid 5 */
    121 			break;
    122 		case 1:
    123 			/* lost a single data unit */
    124 			/* two cases: (1) parity is not lost. do a normal raid
    125 			 * 5 reconstruct read. (2) parity is lost. do a
    126 			 * reconstruct read using "q". */
    127 			if (ntfail == 2) {	/* also lost redundancy */
    128 				if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY)
    129 					*createFunc = rf_PQ_110_CreateReadDAG;
    130 				else
    131 					*createFunc = rf_PQ_101_CreateReadDAG;
    132 			} else {
    133 				/* P and Q are ok. But is there a failure in
    134 				 * some unaccessed data unit? */
    135 				if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
    136 					*createFunc = rf_PQ_200_CreateReadDAG;
    137 				else
    138 					*createFunc = rf_PQ_100_CreateReadDAG;
    139 			}
    140 			break;
    141 		case 2:
    142 			/* lost two data units */
    143 			/* *infoFunc = PQOneTwo; */
    144 			*createFunc = rf_PQ_200_CreateReadDAG;
    145 			break;
    146 		}
    147 		return;
    148 	}
    149 	/* a write */
    150 	switch (ntfail) {
    151 	case 0:		/* fault free */
    152 		if (rf_suppressLocksAndLargeWrites ||
    153 		    (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) ||
    154 			(asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
    155 
    156 			*createFunc = rf_PQCreateSmallWriteDAG;
    157 		} else {
    158 			*createFunc = rf_PQCreateLargeWriteDAG;
    159 		}
    160 		break;
    161 
    162 	case 1:		/* single disk fault */
    163 		if (npfail == 1) {
    164 			RF_ASSERT((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q));
    165 			if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) {	/* q died, treat like
    166 										 * normal mode raid5
    167 										 * write. */
    168 				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
    169 				    || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
    170 					*createFunc = rf_PQ_001_CreateSmallWriteDAG;
    171 				else
    172 					*createFunc = rf_PQ_001_CreateLargeWriteDAG;
    173 			} else {/* parity died, small write only updating Q */
    174 				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
    175 				    || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
    176 					*createFunc = rf_PQ_010_CreateSmallWriteDAG;
    177 				else
    178 					*createFunc = rf_PQ_010_CreateLargeWriteDAG;
    179 			}
    180 		} else {	/* data missing. Do a P reconstruct write if
    181 				 * only a single data unit is lost in the
    182 				 * stripe, otherwise a PQ reconstruct write. */
    183 			if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
    184 				*createFunc = rf_PQ_200_CreateWriteDAG;
    185 			else
    186 				*createFunc = rf_PQ_100_CreateWriteDAG;
    187 		}
    188 		break;
    189 
    190 	case 2:		/* two disk faults */
    191 		switch (npfail) {
    192 		case 2:	/* both p and q dead */
    193 			*createFunc = rf_PQ_011_CreateWriteDAG;
    194 			break;
    195 		case 1:	/* either p or q and dead data */
    196 			RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA);
    197 			RF_ASSERT((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q));
    198 			if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)
    199 				*createFunc = rf_PQ_101_CreateWriteDAG;
    200 			else
    201 				*createFunc = rf_PQ_110_CreateWriteDAG;
    202 			break;
    203 		case 0:	/* double data loss */
    204 			*createFunc = rf_PQ_200_CreateWriteDAG;
    205 			break;
    206 		}
    207 		break;
    208 
    209 	default:		/* more than 2 disk faults */
    210 		*createFunc = NULL;
    211 		RF_PANIC();
    212 	}
    213 	return;
    214 }
    215 /*
    216    Used as a stop gap info function
    217 */
    218 static void
    219 PQOne(raidPtr, nSucc, nAnte, asmap)
    220 	RF_Raid_t *raidPtr;
    221 	int    *nSucc;
    222 	int    *nAnte;
    223 	RF_AccessStripeMap_t *asmap;
    224 {
    225 	*nSucc = *nAnte = 1;
    226 }
    227 
    228 static void
    229 PQOneTwo(raidPtr, nSucc, nAnte, asmap)
    230 	RF_Raid_t *raidPtr;
    231 	int    *nSucc;
    232 	int    *nAnte;
    233 	RF_AccessStripeMap_t *asmap;
    234 {
    235 	*nSucc = 1;
    236 	*nAnte = 2;
    237 }
/*
 * Large-write DAG creator for P+Q.  The parameter list (raidPtr, asmap,
 * dag_h, bp, flags, allocList) is supplied by RF_CREATE_DAG_FUNC_DECL; all
 * of it is handed to the common large-write builder together with
 * rf_RegularPQFunc as the redundancy function.
 * NOTE(review): the literal 2 is presumably the number of redundancy units
 * (P and Q) -- confirm against rf_CommonCreateLargeWriteDAG.
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG)
{
	rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2,
	    rf_RegularPQFunc, RF_FALSE);
}
    243 
    244 int
    245 rf_RegularONQFunc(node)
    246 	RF_DagNode_t *node;
    247 {
    248 	int     np = node->numParams;
    249 	int     d;
    250 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    251 	int     i;
    252 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    253 	RF_Etimer_t timer;
    254 	char   *qbuf, *qpbuf;
    255 	char   *obuf, *nbuf;
    256 	RF_PhysDiskAddr_t *old, *new;
    257 	unsigned long coeff;
    258 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    259 
    260 	RF_ETIMER_START(timer);
    261 
    262 	d = (np - 3) / 4;
    263 	RF_ASSERT(4 * d + 3 == np);
    264 	qbuf = (char *) node->params[2 * d + 1].p;	/* q buffer */
    265 	for (i = 0; i < d; i++) {
    266 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    267 		obuf = (char *) node->params[2 * i + 1].p;
    268 		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
    269 		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
    270 		RF_ASSERT(new->numSector == old->numSector);
    271 		RF_ASSERT(new->raidAddress == old->raidAddress);
    272 		/* the stripe unit within the stripe tells us the coefficient
    273 		 * to use for the multiply. */
    274 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
    275 		/* compute the data unit offset within the column, then add
    276 		 * one */
    277 		coeff = (coeff % raidPtr->Layout.numDataCol);
    278 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
    279 		QDelta(qpbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    280 	}
    281 
    282 	RF_ETIMER_STOP(timer);
    283 	RF_ETIMER_EVAL(timer);
    284 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    285 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
    286 					 * I/O in this node */
    287 	return (0);
    288 }
    289 /*
    290    See the SimpleXORFunc for the difference between a simple and regular func.
    291    These Q functions should be used for
    292 
    293          new q = Q(data,old data,old q)
    294 
    295    style updates and not for
    296 
    297          q = ( new data, new data, .... )
    298 
    299    computations.
    300 
   The simple q takes 2(2d+1)+1 = 4d+3 params, where d is the number
   of old/new data PDA pairs. The order of params is
   [0]    old data pda_0, old data buffer_0, ... old data pda_{d-1}, old data buffer_{d-1}
   [2d]   old q pda, old q buffer
   [2d+2] new data pda_0, new data buffer_0, ... new data pda_{d-1}, new data buffer_{d-1}
   [4d+2] raidPtr
    307 */
    308 
    309 int
    310 rf_SimpleONQFunc(node)
    311 	RF_DagNode_t *node;
    312 {
    313 	int     np = node->numParams;
    314 	int     d;
    315 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    316 	int     i;
    317 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    318 	RF_Etimer_t timer;
    319 	char   *qbuf;
    320 	char   *obuf, *nbuf;
    321 	RF_PhysDiskAddr_t *old, *new;
    322 	unsigned long coeff;
    323 
    324 	RF_ETIMER_START(timer);
    325 
    326 	d = (np - 3) / 4;
    327 	RF_ASSERT(4 * d + 3 == np);
    328 	qbuf = (char *) node->params[2 * d + 1].p;	/* q buffer */
    329 	for (i = 0; i < d; i++) {
    330 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    331 		obuf = (char *) node->params[2 * i + 1].p;
    332 		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
    333 		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
    334 		RF_ASSERT(new->numSector == old->numSector);
    335 		RF_ASSERT(new->raidAddress == old->raidAddress);
    336 		/* the stripe unit within the stripe tells us the coefficient
    337 		 * to use for the multiply. */
    338 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
    339 		/* compute the data unit offset within the column, then add
    340 		 * one */
    341 		coeff = (coeff % raidPtr->Layout.numDataCol);
    342 		QDelta(qbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    343 	}
    344 
    345 	RF_ETIMER_STOP(timer);
    346 	RF_ETIMER_EVAL(timer);
    347 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    348 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
    349 					 * I/O in this node */
    350 	return (0);
    351 }
/*
 * Small-write DAG creator for P+Q.  The parameter list (raidPtr, asmap,
 * dag_h, bp, flags, allocList) is supplied by RF_CREATE_DAG_FUNC_DECL; all
 * of it is handed to the common small-write builder together with the P
 * and Q old-new function tables defined in this file.
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG)
{
	rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_qFuncs);
}
    356 
    357 static void
    358 RegularQSubr(node, qbuf)
    359 	RF_DagNode_t *node;
    360 	char   *qbuf;
    361 {
    362 	int     np = node->numParams;
    363 	int     d;
    364 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    365 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    366 	int     i;
    367 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    368 	RF_Etimer_t timer;
    369 	char   *obuf, *qpbuf;
    370 	RF_PhysDiskAddr_t *old;
    371 	unsigned long coeff;
    372 
    373 	RF_ETIMER_START(timer);
    374 
    375 	d = (np - 1) / 2;
    376 	RF_ASSERT(2 * d + 1 == np);
    377 	for (i = 0; i < d; i++) {
    378 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    379 		obuf = (char *) node->params[2 * i + 1].p;
    380 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
    381 		/* compute the data unit offset within the column, then add
    382 		 * one */
    383 		coeff = (coeff % raidPtr->Layout.numDataCol);
    384 		/* the input buffers may not all be aligned with the start of
    385 		 * the stripe. so shift by their sector offset within the
    386 		 * stripe unit */
    387 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
    388 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    389 	}
    390 
    391 	RF_ETIMER_STOP(timer);
    392 	RF_ETIMER_EVAL(timer);
    393 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    394 }
    395 /*
    396    used in degraded writes.
    397 */
    398 
    399 static void
    400 DegrQSubr(node)
    401 	RF_DagNode_t *node;
    402 {
    403 	int     np = node->numParams;
    404 	int     d;
    405 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    406 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    407 	int     i;
    408 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    409 	RF_Etimer_t timer;
    410 	char   *qbuf = node->results[1];
    411 	char   *obuf, *qpbuf;
    412 	RF_PhysDiskAddr_t *old;
    413 	unsigned long coeff;
    414 	unsigned fail_start;
    415 	int     j;
    416 
    417 	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
    418 	fail_start = old->startSector % secPerSU;
    419 
    420 	RF_ETIMER_START(timer);
    421 
    422 	d = (np - 2) / 2;
    423 	RF_ASSERT(2 * d + 2 == np);
    424 	for (i = 0; i < d; i++) {
    425 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    426 		obuf = (char *) node->params[2 * i + 1].p;
    427 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
    428 		/* compute the data unit offset within the column, then add
    429 		 * one */
    430 		coeff = (coeff % raidPtr->Layout.numDataCol);
    431 		/* the input buffers may not all be aligned with the start of
    432 		 * the stripe. so shift by their sector offset within the
    433 		 * stripe unit */
    434 		j = old->startSector % secPerSU;
    435 		RF_ASSERT(j >= fail_start);
    436 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
    437 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    438 	}
    439 
    440 	RF_ETIMER_STOP(timer);
    441 	RF_ETIMER_EVAL(timer);
    442 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    443 }
    444 /*
    445    Called by large write code to compute the new parity and the new q.
    446 
    447    structure of the params:
    448 
   pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d ( d = numDataCol )
    450    raidPtr
    451 
    452    for a total of 2d+1 arguments.
    453    The result buffers results[0], results[1] are the buffers for the p and q,
    454    respectively.
    455 
    456    We compute Q first, then compute P. The P calculation may try to reuse
    457    one of the input buffers for its output, so if we computed P first, we would
    458    corrupt the input for the q calculation.
    459 */
    460 
    461 int
    462 rf_RegularPQFunc(node)
    463 	RF_DagNode_t *node;
    464 {
    465 	RegularQSubr(node, node->results[1]);
    466 	return (rf_RegularXorFunc(node));	/* does the wakeup */
    467 }
    468 
    469 int
    470 rf_RegularQFunc(node)
    471 	RF_DagNode_t *node;
    472 {
    473 	/* Almost ... adjust Qsubr args */
    474 	RegularQSubr(node, node->results[0]);
    475 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
    476 					 * I/O in this node */
    477 	return (0);
    478 }
    479 /*
    480    Called by singly degraded write code to compute the new parity and the new q.
    481 
    482    structure of the params:
    483 
    484    pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d
    485    failedPDA raidPtr
    486 
    487    for a total of 2d+2 arguments.
    488    The result buffers results[0], results[1] are the buffers for the parity and q,
    489    respectively.
    490 
    491    We compute Q first, then compute parity. The parity calculation may try to reuse
    492    one of the input buffers for its output, so if we computed parity first, we would
    493    corrupt the input for the q calculation.
    494 
    495    We treat this identically to the regularPQ case, ignoring the failedPDA extra argument.
    496 */
    497 
    498 void
    499 rf_Degraded_100_PQFunc(node)
    500 	RF_DagNode_t *node;
    501 {
    502 	int     np = node->numParams;
    503 
    504 	RF_ASSERT(np >= 2);
    505 	DegrQSubr(node);
    506 	rf_RecoveryXorFunc(node);
    507 }
    508 
    509 
    510 /*
    511    The two below are used when reading a stripe with a single lost data unit.
    512    The parameters are
    513 
    514    pda_0, buffer_0, .... pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr
    515 
    516    and results[0] contains the data buffer. Which is originally zero-filled.
    517 
    518 */
    519 
    520 /* this Q func is used by the degraded-mode dag functions to recover lost data.
    521  * the second-to-last parameter is the PDA for the failed portion of the access.
    522  * the code here looks at this PDA and assumes that the xor target buffer is
    523  * equal in size to the number of sectors in the failed PDA.  It then uses
    524  * the other PDAs in the parameter list to determine where within the target
    525  * buffer the corresponding data should be xored.
    526  *
    527  * Recall the basic equation is
    528  *
    529  *     Q = ( data_1 + 2 * data_2 ... + k * data_k  ) mod 256
    530  *
    531  * so to recover data_j we need
    532  *
    533  *    J data_j = (Q - data_1 - 2 data_2 ....- k* data_k) mod 256
    534  *
    535  * So the coefficient for each buffer is (255 - data_col), and j should be initialized by
    536  * copying Q into it. Then we need to do a table lookup to convert to solve
    537  *   data_j /= J
    538  *
    539  *
    540  */
    541 int
    542 rf_RecoveryQFunc(node)
    543 	RF_DagNode_t *node;
    544 {
    545 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
    546 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
    547 	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
    548 	int     i;
    549 	RF_PhysDiskAddr_t *pda;
    550 	RF_RaidAddr_t suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
    551 	char   *srcbuf, *destbuf;
    552 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    553 	RF_Etimer_t timer;
    554 	unsigned long coeff;
    555 
    556 	RF_ETIMER_START(timer);
    557 	/* start by copying Q into the buffer */
    558 	bcopy(node->params[node->numParams - 3].p, node->results[0],
    559 	    rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
    560 	for (i = 0; i < node->numParams - 4; i += 2) {
    561 		RF_ASSERT(node->params[i + 1].p != node->results[0]);
    562 		pda = (RF_PhysDiskAddr_t *) node->params[i].p;
    563 		srcbuf = (char *) node->params[i + 1].p;
    564 		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
    565 		destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
    566 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
    567 		/* compute the data unit offset within the column */
    568 		coeff = (coeff % raidPtr->Layout.numDataCol);
    569 		rf_IncQ((unsigned long *) destbuf, (unsigned long *) srcbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
    570 	}
    571 	/* Do the nasty inversion now */
    572 	coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), failedPDA->startSector) % raidPtr->Layout.numDataCol);
    573 	rf_InvertQ(node->results[0], node->results[0], rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
    574 	RF_ETIMER_STOP(timer);
    575 	RF_ETIMER_EVAL(timer);
    576 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    577 	rf_GenericWakeupFunc(node, 0);
    578 	return (0);
    579 }
    580 
    581 int
    582 rf_RecoveryPQFunc(node)
    583 	RF_DagNode_t *node;
    584 {
    585 	RF_PANIC();
    586 	return (1);
    587 }
    588 /*
    589    Degraded write Q subroutine.
    590    Used when P is dead.
    591    Large-write style Q computation.
    592    Parameters
    593 
    594    (pda,buf),(pda,buf),.....,(failedPDA,bufPtr),failedPDA,raidPtr.
    595 
    596    We ignore failedPDA.
    597 
    598    This is a "simple style" recovery func.
    599 */
    600 
    601 void
    602 rf_PQ_DegradedWriteQFunc(node)
    603 	RF_DagNode_t *node;
    604 {
    605 	int     np = node->numParams;
    606 	int     d;
    607 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    608 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    609 	int     i;
    610 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    611 	RF_Etimer_t timer;
    612 	char   *qbuf = node->results[0];
    613 	char   *obuf, *qpbuf;
    614 	RF_PhysDiskAddr_t *old;
    615 	unsigned long coeff;
    616 	int     fail_start, j;
    617 
    618 	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
    619 	fail_start = old->startSector % secPerSU;
    620 
    621 	RF_ETIMER_START(timer);
    622 
    623 	d = (np - 2) / 2;
    624 	RF_ASSERT(2 * d + 2 == np);
    625 
    626 	for (i = 0; i < d; i++) {
    627 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    628 		obuf = (char *) node->params[2 * i + 1].p;
    629 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
    630 		/* compute the data unit offset within the column, then add
    631 		 * one */
    632 		coeff = (coeff % raidPtr->Layout.numDataCol);
    633 		j = old->startSector % secPerSU;
    634 		RF_ASSERT(j >= fail_start);
    635 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
    636 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    637 	}
    638 
    639 	RF_ETIMER_STOP(timer);
    640 	RF_ETIMER_EVAL(timer);
    641 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    642 	rf_GenericWakeupFunc(node, 0);
    643 }
    644 
    645 
    646 
    647 
    648 /* Q computations */
    649 
    650 /*
   coeff - column;
    652 
    653    compute  dest ^= qfor[28-coeff][rn[coeff+1] a]
    654 
    655    on 5-bit basis;
    656    length in bytes;
    657 */
    658 
    659 void
    660 rf_IncQ(dest, buf, length, coeff)
    661 	unsigned long *dest;
    662 	unsigned long *buf;
    663 	unsigned length;
    664 	unsigned coeff;
    665 {
    666 	unsigned long a, d, new;
    667 	unsigned long a1, a2;
    668 	unsigned int *q = &(rf_qfor[28 - coeff][0]);
    669 	unsigned r = rf_rn[coeff + 1];
    670 
    671 #define EXTRACT(a,i) ((a >> (5L*i)) & 0x1f)
    672 #define INSERT(a,i) (a << (5L*i))
    673 
    674 	length /= 8;
    675 	/* 13 5 bit quants in a 64 bit word */
    676 	while (length) {
    677 		a = *buf++;
    678 		d = *dest;
    679 		a1 = EXTRACT(a, 0) ^ r;
    680 		a2 = EXTRACT(a, 1) ^ r;
    681 		new = INSERT(a2, 1) | a1;
    682 		a1 = EXTRACT(a, 2) ^ r;
    683 		a2 = EXTRACT(a, 3) ^ r;
    684 		a1 = q[a1];
    685 		a2 = q[a2];
    686 		new = new | INSERT(a1, 2) | INSERT(a2, 3);
    687 		a1 = EXTRACT(a, 4) ^ r;
    688 		a2 = EXTRACT(a, 5) ^ r;
    689 		a1 = q[a1];
    690 		a2 = q[a2];
    691 		new = new | INSERT(a1, 4) | INSERT(a2, 5);
    692 		a1 = EXTRACT(a, 5) ^ r;
    693 		a2 = EXTRACT(a, 6) ^ r;
    694 		a1 = q[a1];
    695 		a2 = q[a2];
    696 		new = new | INSERT(a1, 5) | INSERT(a2, 6);
    697 #if RF_LONGSHIFT > 2
    698 		a1 = EXTRACT(a, 7) ^ r;
    699 		a2 = EXTRACT(a, 8) ^ r;
    700 		a1 = q[a1];
    701 		a2 = q[a2];
    702 		new = new | INSERT(a1, 7) | INSERT(a2, 8);
    703 		a1 = EXTRACT(a, 9) ^ r;
    704 		a2 = EXTRACT(a, 10) ^ r;
    705 		a1 = q[a1];
    706 		a2 = q[a2];
    707 		new = new | INSERT(a1, 9) | INSERT(a2, 10);
    708 		a1 = EXTRACT(a, 11) ^ r;
    709 		a2 = EXTRACT(a, 12) ^ r;
    710 		a1 = q[a1];
    711 		a2 = q[a2];
    712 		new = new | INSERT(a1, 11) | INSERT(a2, 12);
    713 #endif				/* RF_LONGSHIFT > 2 */
    714 		d ^= new;
    715 		*dest++ = d;
    716 		length--;
    717 	}
    718 }
    719 /*
    720    compute
    721 
    722    dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] (old^new) ]
    723 
    724    on a five bit basis.
    725    optimization: compute old ^ new on 64 bit basis.
    726 
    727    length in bytes.
    728 */
    729 
    730 static void
    731 QDelta(
    732     char *dest,
    733     char *obuf,
    734     char *nbuf,
    735     unsigned length,
    736     unsigned char coeff)
    737 {
    738 	unsigned long a, d, new;
    739 	unsigned long a1, a2;
    740 	unsigned int *q = &(rf_qfor[28 - coeff][0]);
    741 	unsigned r = rf_rn[coeff + 1];
    742 
    743 #ifdef _KERNEL
    744 	/* PQ in kernel currently not supported because the encoding/decoding
    745 	 * table is not present */
    746 	bzero(dest, length);
    747 #else				/* KERNEL */
    748 	/* this code probably doesn't work and should be rewritten  -wvcii */
    749 	/* 13 5 bit quants in a 64 bit word */
    750 	length /= 8;
    751 	while (length) {
    752 		a = *obuf++;	/* XXX need to reorg to avoid cache conflicts */
    753 		a ^= *nbuf++;
    754 		d = *dest;
    755 		a1 = EXTRACT(a, 0) ^ r;
    756 		a2 = EXTRACT(a, 1) ^ r;
    757 		a1 = q[a1];
    758 		a2 = q[a2];
    759 		new = INSERT(a2, 1) | a1;
    760 		a1 = EXTRACT(a, 2) ^ r;
    761 		a2 = EXTRACT(a, 3) ^ r;
    762 		a1 = q[a1];
    763 		a2 = q[a2];
    764 		new = new | INSERT(a1, 2) | INSERT(a2, 3);
    765 		a1 = EXTRACT(a, 4) ^ r;
    766 		a2 = EXTRACT(a, 5) ^ r;
    767 		a1 = q[a1];
    768 		a2 = q[a2];
    769 		new = new | INSERT(a1, 4) | INSERT(a2, 5);
    770 		a1 = EXTRACT(a, 5) ^ r;
    771 		a2 = EXTRACT(a, 6) ^ r;
    772 		a1 = q[a1];
    773 		a2 = q[a2];
    774 		new = new | INSERT(a1, 5) | INSERT(a2, 6);
    775 #if RF_LONGSHIFT > 2
    776 		a1 = EXTRACT(a, 7) ^ r;
    777 		a2 = EXTRACT(a, 8) ^ r;
    778 		a1 = q[a1];
    779 		a2 = q[a2];
    780 		new = new | INSERT(a1, 7) | INSERT(a2, 8);
    781 		a1 = EXTRACT(a, 9) ^ r;
    782 		a2 = EXTRACT(a, 10) ^ r;
    783 		a1 = q[a1];
    784 		a2 = q[a2];
    785 		new = new | INSERT(a1, 9) | INSERT(a2, 10);
    786 		a1 = EXTRACT(a, 11) ^ r;
    787 		a2 = EXTRACT(a, 12) ^ r;
    788 		a1 = q[a1];
    789 		a2 = q[a2];
    790 		new = new | INSERT(a1, 11) | INSERT(a2, 12);
    791 #endif				/* RF_LONGSHIFT > 2 */
    792 		d ^= new;
    793 		*dest++ = d;
    794 		length--;
    795 	}
    796 #endif				/* _KERNEL */
    797 }
    798 /*
    799    recover columns a and b from the given p and q into
    800    bufs abuf and bbuf. All bufs are word aligned.
    801    Length is in bytes.
    802 */
    803 
    804 
    805 /*
    806  * XXX
    807  *
    808  * Everything about this seems wrong.
    809  */
    810 void
    811 rf_PQ_recover(pbuf, qbuf, abuf, bbuf, length, coeff_a, coeff_b)
    812 	unsigned long *pbuf;
    813 	unsigned long *qbuf;
    814 	unsigned long *abuf;
    815 	unsigned long *bbuf;
    816 	unsigned length;
    817 	unsigned coeff_a;
    818 	unsigned coeff_b;
    819 {
    820 	unsigned long p, q, a, a0, a1;
    821 	int     col = (29 * coeff_a) + coeff_b;
    822 	unsigned char *q0 = &(rf_qinv[col][0]);
    823 
    824 	length /= 8;
    825 	while (length) {
    826 		p = *pbuf++;
    827 		q = *qbuf++;
    828 		a0 = EXTRACT(p, 0);
    829 		a1 = EXTRACT(q, 0);
    830 		a = q0[a0 << 5 | a1];
    831 #define MF(i) \
    832       a0 = EXTRACT(p,i); \
    833       a1 = EXTRACT(q,i); \
    834       a  = a | INSERT(q0[a0<<5 | a1],i)
    835 
    836 		MF(1);
    837 		MF(2);
    838 		MF(3);
    839 		MF(4);
    840 		MF(5);
    841 		MF(6);
    842 #if 0
    843 		MF(7);
    844 		MF(8);
    845 		MF(9);
    846 		MF(10);
    847 		MF(11);
    848 		MF(12);
    849 #endif				/* 0 */
    850 		*abuf++ = a;
    851 		*bbuf++ = a ^ p;
    852 		length--;
    853 	}
    854 }
    855 /*
    856    Lost parity and a data column. Recover that data column.
    857    Assume col coeff is lost. Let q the contents of Q after
    858    all surviving data columns have been q-xored out of it.
    859    Then we have the equation
    860 
    861    q[28-coeff][a_i ^ r_i+1] = q
    862 
    863    but q is cyclic with period 31.
    864    So q[3+coeff][q[28-coeff][a_i ^ r_{i+1}]] =
    865       q[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} .
    866 
    867    so a_i = r_{coeff+1} ^ q[3+coeff][q]
    868 
    869    The routine is passed q buffer and the buffer
   the data is to be recovered into. They can be the same.
    871 */
    872 
    873 
    874 
    875 static void
    876 rf_InvertQ(
    877     unsigned long *qbuf,
    878     unsigned long *abuf,
    879     unsigned length,
    880     unsigned coeff)
    881 {
    882 	unsigned long a, new;
    883 	unsigned long a1, a2;
    884 	unsigned int *q = &(rf_qfor[3 + coeff][0]);
    885 	unsigned r = rf_rn[coeff + 1];
    886 
    887 	/* 13 5 bit quants in a 64 bit word */
    888 	length /= 8;
    889 	while (length) {
    890 		a = *qbuf++;
    891 		a1 = EXTRACT(a, 0);
    892 		a2 = EXTRACT(a, 1);
    893 		a1 = r ^ q[a1];
    894 		a2 = r ^ q[a2];
    895 		new = INSERT(a2, 1) | a1;
    896 #define M(i,j) \
    897       a1 = EXTRACT(a,i); \
    898       a2 = EXTRACT(a,j); \
    899       a1 = r ^ q[a1]; \
    900       a2 = r ^ q[a2]; \
    901       new = new | INSERT(a1,i) | INSERT(a2,j)
    902 
    903 		M(2, 3);
    904 		M(4, 5);
    905 		M(5, 6);
    906 #if RF_LONGSHIFT > 2
    907 		M(7, 8);
    908 		M(9, 10);
    909 		M(11, 12);
    910 #endif				/* RF_LONGSHIFT > 2 */
    911 		*abuf++ = new;
    912 		length--;
    913 	}
    914 }
    915 #endif				/* (RF_INCLUDE_DECL_PQ > 0) ||
    916 				 * (RF_INCLUDE_RAID6 > 0) */
    917