Home | History | Annotate | Line # | Download | only in raidframe
rf_pq.c revision 1.12
      1 /*	$NetBSD: rf_pq.c,v 1.12 2002/05/22 15:40:51 wiz Exp $	*/
      2 /*
      3  * Copyright (c) 1995 Carnegie-Mellon University.
      4  * All rights reserved.
      5  *
      6  * Author: Daniel Stodolsky
      7  *
      8  * Permission to use, copy, modify and distribute this software and
      9  * its documentation is hereby granted, provided that both the copyright
     10  * notice and this permission notice appear in all copies of the
     11  * software, derivative works or modified versions, and any portions
     12  * thereof, and that both notices appear in supporting documentation.
     13  *
     14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     17  *
     18  * Carnegie Mellon requests users of this software to return to
     19  *
     20  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     21  *  School of Computer Science
     22  *  Carnegie Mellon University
     23  *  Pittsburgh PA 15213-3890
     24  *
     25  * any improvements or extensions that they make and grant Carnegie the
     26  * rights to redistribute these changes.
     27  */
     28 
     29 /*
     30  * Code for RAID level 6 (P + Q) disk array architecture.
     31  */
     32 
     33 #include <sys/cdefs.h>
     34 __KERNEL_RCSID(0, "$NetBSD: rf_pq.c,v 1.12 2002/05/22 15:40:51 wiz Exp $");
     35 
     36 #include "rf_archs.h"
     37 
     38 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0)
     39 
     40 #include <dev/raidframe/raidframevar.h>
     41 
     42 #include "rf_raid.h"
     43 #include "rf_dag.h"
     44 #include "rf_dagffrd.h"
     45 #include "rf_dagffwr.h"
     46 #include "rf_dagdegrd.h"
     47 #include "rf_dagdegwr.h"
     48 #include "rf_dagutils.h"
     49 #include "rf_dagfuncs.h"
     50 #include "rf_etimer.h"
     51 #include "rf_pqdeg.h"
     52 #include "rf_general.h"
     53 #include "rf_map.h"
     54 #include "rf_pq.h"
     55 
     56 RF_RedFuncs_t rf_pFuncs = {rf_RegularONPFunc, "Regular Old-New P", rf_SimpleONPFunc, "Simple Old-New P"};
     57 RF_RedFuncs_t rf_pRecoveryFuncs = {rf_RecoveryPFunc, "Recovery P Func", rf_RecoveryPFunc, "Recovery P Func"};
     58 
     59 int
     60 rf_RegularONPFunc(node)
     61 	RF_DagNode_t *node;
     62 {
     63 	return (rf_RegularXorFunc(node));
     64 }
     65 /*
     66    same as simpleONQ func, but the coefficient is always 1
     67 */
     68 
     69 int
     70 rf_SimpleONPFunc(node)
     71 	RF_DagNode_t *node;
     72 {
     73 	return (rf_SimpleXorFunc(node));
     74 }
     75 
     76 int
     77 rf_RecoveryPFunc(node)
     78 	RF_DagNode_t *node;
     79 {
     80 	return (rf_RecoveryXorFunc(node));
     81 }
     82 
     83 int
     84 rf_RegularPFunc(node)
     85 	RF_DagNode_t *node;
     86 {
     87 	return (rf_RegularXorFunc(node));
     88 }
     89 #endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0) */
     90 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
     91 
     92 static void
     93 QDelta(char *dest, char *obuf, char *nbuf, unsigned length,
     94     unsigned char coeff);
     95 static void
     96 rf_InvertQ(unsigned long *qbuf, unsigned long *abuf,
     97     unsigned length, unsigned coeff);
     98 
     99 RF_RedFuncs_t rf_qFuncs = {rf_RegularONQFunc, "Regular Old-New Q", rf_SimpleONQFunc, "Simple Old-New Q"};
    100 RF_RedFuncs_t rf_qRecoveryFuncs = {rf_RecoveryQFunc, "Recovery Q Func", rf_RecoveryQFunc, "Recovery Q Func"};
    101 RF_RedFuncs_t rf_pqRecoveryFuncs = {rf_RecoveryPQFunc, "Recovery PQ Func", rf_RecoveryPQFunc, "Recovery PQ Func"};
    102 
    103 void
    104 rf_PQDagSelect(
    105     RF_Raid_t * raidPtr,
    106     RF_IoType_t type,
    107     RF_AccessStripeMap_t * asmap,
    108     RF_VoidFuncPtr * createFunc)
    109 {
    110 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    111 	unsigned ndfail = asmap->numDataFailed;
    112 	unsigned npfail = asmap->numParityFailed;
    113 	unsigned ntfail = npfail + ndfail;
    114 
    115 	RF_ASSERT(RF_IO_IS_R_OR_W(type));
    116 	if (ntfail > 2) {
    117 		RF_ERRORMSG("more than two disks failed in a single group!  Aborting I/O operation.\n");
    118 		 /* *infoFunc = */ *createFunc = NULL;
    119 		return;
    120 	}
    121 	/* ok, we can do this I/O */
    122 	if (type == RF_IO_TYPE_READ) {
    123 		switch (ndfail) {
    124 		case 0:
    125 			/* fault free read */
    126 			*createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;	/* same as raid 5 */
    127 			break;
    128 		case 1:
    129 			/* lost a single data unit */
    130 			/* two cases: (1) parity is not lost. do a normal raid
    131 			 * 5 reconstruct read. (2) parity is lost. do a
    132 			 * reconstruct read using "q". */
    133 			if (ntfail == 2) {	/* also lost redundancy */
    134 				if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY)
    135 					*createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateReadDAG;
    136 				else
    137 					*createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateReadDAG;
    138 			} else {
    139 				/* P and Q are ok. But is there a failure in
    140 				 * some unaccessed data unit? */
    141 				if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
    142 					*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
    143 				else
    144 					*createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateReadDAG;
    145 			}
    146 			break;
    147 		case 2:
    148 			/* lost two data units */
    149 			/* *infoFunc = PQOneTwo; */
    150 			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
    151 			break;
    152 		}
    153 		return;
    154 	}
    155 	/* a write */
    156 	switch (ntfail) {
    157 	case 0:		/* fault free */
    158 		if (rf_suppressLocksAndLargeWrites ||
    159 		    (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) ||
    160 			(asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
    161 
    162 			*createFunc = (RF_VoidFuncPtr) rf_PQCreateSmallWriteDAG;
    163 		} else {
    164 			*createFunc = (RF_VoidFuncPtr) rf_PQCreateLargeWriteDAG;
    165 		}
    166 		break;
    167 
    168 	case 1:		/* single disk fault */
    169 		if (npfail == 1) {
    170 			RF_ASSERT((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q));
    171 			if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) {	/* q died, treat like
    172 										 * normal mode raid5
    173 										 * write. */
    174 				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
    175 				    || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
    176 					*createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateSmallWriteDAG;
    177 				else
    178 					*createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateLargeWriteDAG;
    179 			} else {/* parity died, small write only updating Q */
    180 				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
    181 				    || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
    182 					*createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateSmallWriteDAG;
    183 				else
    184 					*createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateLargeWriteDAG;
    185 			}
    186 		} else {	/* data missing. Do a P reconstruct write if
    187 				 * only a single data unit is lost in the
    188 				 * stripe, otherwise a PQ reconstruct write. */
    189 			if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
    190 				*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
    191 			else
    192 				*createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateWriteDAG;
    193 		}
    194 		break;
    195 
    196 	case 2:		/* two disk faults */
    197 		switch (npfail) {
    198 		case 2:	/* both p and q dead */
    199 			*createFunc = (RF_VoidFuncPtr) rf_PQ_011_CreateWriteDAG;
    200 			break;
    201 		case 1:	/* either p or q and dead data */
    202 			RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA);
    203 			RF_ASSERT((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q));
    204 			if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)
    205 				*createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateWriteDAG;
    206 			else
    207 				*createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateWriteDAG;
    208 			break;
    209 		case 0:	/* double data loss */
    210 			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
    211 			break;
    212 		}
    213 		break;
    214 
    215 	default:		/* more than 2 disk faults */
    216 		*createFunc = NULL;
    217 		RF_PANIC();
    218 	}
    219 	return;
    220 }
    221 /*
    222    Used as a stop gap info function
    223 */
    224 #if 0
    225 static void
    226 PQOne(raidPtr, nSucc, nAnte, asmap)
    227 	RF_Raid_t *raidPtr;
    228 	int    *nSucc;
    229 	int    *nAnte;
    230 	RF_AccessStripeMap_t *asmap;
    231 {
    232 	*nSucc = *nAnte = 1;
    233 }
    234 
    235 static void
    236 PQOneTwo(raidPtr, nSucc, nAnte, asmap)
    237 	RF_Raid_t *raidPtr;
    238 	int    *nSucc;
    239 	int    *nAnte;
    240 	RF_AccessStripeMap_t *asmap;
    241 {
    242 	*nSucc = 1;
    243 	*nAnte = 2;
    244 }
    245 #endif
    246 
    247 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG)
    248 {
    249 	rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2,
    250 	    rf_RegularPQFunc, RF_FALSE);
    251 }
    252 
    253 int
    254 rf_RegularONQFunc(node)
    255 	RF_DagNode_t *node;
    256 {
    257 	int     np = node->numParams;
    258 	int     d;
    259 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    260 	int     i;
    261 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    262 	RF_Etimer_t timer;
    263 	char   *qbuf, *qpbuf;
    264 	char   *obuf, *nbuf;
    265 	RF_PhysDiskAddr_t *old, *new;
    266 	unsigned long coeff;
    267 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    268 
    269 	RF_ETIMER_START(timer);
    270 
    271 	d = (np - 3) / 4;
    272 	RF_ASSERT(4 * d + 3 == np);
    273 	qbuf = (char *) node->params[2 * d + 1].p;	/* q buffer */
    274 	for (i = 0; i < d; i++) {
    275 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    276 		obuf = (char *) node->params[2 * i + 1].p;
    277 		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
    278 		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
    279 		RF_ASSERT(new->numSector == old->numSector);
    280 		RF_ASSERT(new->raidAddress == old->raidAddress);
    281 		/* the stripe unit within the stripe tells us the coefficient
    282 		 * to use for the multiply. */
    283 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
    284 		/* compute the data unit offset within the column, then add
    285 		 * one */
    286 		coeff = (coeff % raidPtr->Layout.numDataCol);
    287 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
    288 		QDelta(qpbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    289 	}
    290 
    291 	RF_ETIMER_STOP(timer);
    292 	RF_ETIMER_EVAL(timer);
    293 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    294 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
    295 					 * I/O in this node */
    296 	return (0);
    297 }
    298 /*
    299    See the SimpleXORFunc for the difference between a simple and regular func.
    300    These Q functions should be used for
    301 
    302          new q = Q(data,old data,old q)
    303 
    304    style updates and not for
    305 
    306          q = ( new data, new data, .... )
    307 
    308    computations.
    309 
    310    The simple q takes 2(2d+1)+1 params, where d is the number
    311    of stripes written. The order of params is
    312    old data pda_0, old data buffer_0, old data pda_1, old data buffer_1, ... old data pda_d, old data buffer_d
    313    [2d] old q pda_0, old q buffer
    314    [2d+2] new data pda_0, new data buffer_0, ...                                    new data pda_d, new data buffer_d
    315    raidPtr
    316 */
    317 
    318 int
    319 rf_SimpleONQFunc(node)
    320 	RF_DagNode_t *node;
    321 {
    322 	int     np = node->numParams;
    323 	int     d;
    324 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    325 	int     i;
    326 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    327 	RF_Etimer_t timer;
    328 	char   *qbuf;
    329 	char   *obuf, *nbuf;
    330 	RF_PhysDiskAddr_t *old, *new;
    331 	unsigned long coeff;
    332 
    333 	RF_ETIMER_START(timer);
    334 
    335 	d = (np - 3) / 4;
    336 	RF_ASSERT(4 * d + 3 == np);
    337 	qbuf = (char *) node->params[2 * d + 1].p;	/* q buffer */
    338 	for (i = 0; i < d; i++) {
    339 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    340 		obuf = (char *) node->params[2 * i + 1].p;
    341 		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
    342 		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
    343 		RF_ASSERT(new->numSector == old->numSector);
    344 		RF_ASSERT(new->raidAddress == old->raidAddress);
    345 		/* the stripe unit within the stripe tells us the coefficient
    346 		 * to use for the multiply. */
    347 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
    348 		/* compute the data unit offset within the column, then add
    349 		 * one */
    350 		coeff = (coeff % raidPtr->Layout.numDataCol);
    351 		QDelta(qbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    352 	}
    353 
    354 	RF_ETIMER_STOP(timer);
    355 	RF_ETIMER_EVAL(timer);
    356 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    357 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
    358 					 * I/O in this node */
    359 	return (0);
    360 }
    361 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG)
    362 {
    363 	rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_qFuncs);
    364 }
    365 
    366 static void RegularQSubr(RF_DagNode_t *node, char   *qbuf);
    367 
    368 static void
    369 RegularQSubr(node, qbuf)
    370 	RF_DagNode_t *node;
    371 	char   *qbuf;
    372 {
    373 	int     np = node->numParams;
    374 	int     d;
    375 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    376 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    377 	int     i;
    378 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    379 	RF_Etimer_t timer;
    380 	char   *obuf, *qpbuf;
    381 	RF_PhysDiskAddr_t *old;
    382 	unsigned long coeff;
    383 
    384 	RF_ETIMER_START(timer);
    385 
    386 	d = (np - 1) / 2;
    387 	RF_ASSERT(2 * d + 1 == np);
    388 	for (i = 0; i < d; i++) {
    389 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    390 		obuf = (char *) node->params[2 * i + 1].p;
    391 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
    392 		/* compute the data unit offset within the column, then add
    393 		 * one */
    394 		coeff = (coeff % raidPtr->Layout.numDataCol);
    395 		/* the input buffers may not all be aligned with the start of
    396 		 * the stripe. so shift by their sector offset within the
    397 		 * stripe unit */
    398 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
    399 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    400 	}
    401 
    402 	RF_ETIMER_STOP(timer);
    403 	RF_ETIMER_EVAL(timer);
    404 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    405 }
    406 /*
    407    used in degraded writes.
    408 */
    409 
    410 static void DegrQSubr(RF_DagNode_t *node);
    411 
    412 static void
    413 DegrQSubr(node)
    414 	RF_DagNode_t *node;
    415 {
    416 	int     np = node->numParams;
    417 	int     d;
    418 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    419 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    420 	int     i;
    421 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    422 	RF_Etimer_t timer;
    423 	char   *qbuf = node->results[1];
    424 	char   *obuf, *qpbuf;
    425 	RF_PhysDiskAddr_t *old;
    426 	unsigned long coeff;
    427 	unsigned fail_start;
    428 	int     j;
    429 
    430 	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
    431 	fail_start = old->startSector % secPerSU;
    432 
    433 	RF_ETIMER_START(timer);
    434 
    435 	d = (np - 2) / 2;
    436 	RF_ASSERT(2 * d + 2 == np);
    437 	for (i = 0; i < d; i++) {
    438 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    439 		obuf = (char *) node->params[2 * i + 1].p;
    440 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
    441 		/* compute the data unit offset within the column, then add
    442 		 * one */
    443 		coeff = (coeff % raidPtr->Layout.numDataCol);
    444 		/* the input buffers may not all be aligned with the start of
    445 		 * the stripe. so shift by their sector offset within the
    446 		 * stripe unit */
    447 		j = old->startSector % secPerSU;
    448 		RF_ASSERT(j >= fail_start);
    449 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
    450 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    451 	}
    452 
    453 	RF_ETIMER_STOP(timer);
    454 	RF_ETIMER_EVAL(timer);
    455 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    456 }
    457 /*
    458    Called by large write code to compute the new parity and the new q.
    459 
    460    structure of the params:
    461 
    462    pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d ( d = numDataCol )
    463    raidPtr
    464 
    465    for a total of 2d+1 arguments.
    466    The result buffers results[0], results[1] are the buffers for the p and q,
    467    respectively.
    468 
    469    We compute Q first, then compute P. The P calculation may try to reuse
    470    one of the input buffers for its output, so if we computed P first, we would
    471    corrupt the input for the q calculation.
    472 */
    473 
    474 int
    475 rf_RegularPQFunc(node)
    476 	RF_DagNode_t *node;
    477 {
    478 	RegularQSubr(node, node->results[1]);
    479 	return (rf_RegularXorFunc(node));	/* does the wakeup */
    480 }
    481 
    482 int
    483 rf_RegularQFunc(node)
    484 	RF_DagNode_t *node;
    485 {
    486 	/* Almost ... adjust Qsubr args */
    487 	RegularQSubr(node, node->results[0]);
    488 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
    489 					 * I/O in this node */
    490 	return (0);
    491 }
    492 /*
    493    Called by singly degraded write code to compute the new parity and the new q.
    494 
    495    structure of the params:
    496 
    497    pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d
    498    failedPDA raidPtr
    499 
    500    for a total of 2d+2 arguments.
    501    The result buffers results[0], results[1] are the buffers for the parity and q,
    502    respectively.
    503 
    504    We compute Q first, then compute parity. The parity calculation may try to reuse
    505    one of the input buffers for its output, so if we computed parity first, we would
    506    corrupt the input for the q calculation.
    507 
    508    We treat this identically to the regularPQ case, ignoring the failedPDA extra argument.
    509 */
    510 
    511 void
    512 rf_Degraded_100_PQFunc(node)
    513 	RF_DagNode_t *node;
    514 {
    515 	int     np = node->numParams;
    516 
    517 	RF_ASSERT(np >= 2);
    518 	DegrQSubr(node);
    519 	rf_RecoveryXorFunc(node);
    520 }
    521 
    522 
    523 /*
    524    The two below are used when reading a stripe with a single lost data unit.
    525    The parameters are
    526 
    527    pda_0, buffer_0, .... pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr
    528 
    529    and results[0] contains the data buffer. Which is originally zero-filled.
    530 
    531 */
    532 
    533 /* this Q func is used by the degraded-mode dag functions to recover lost data.
    534  * the second-to-last parameter is the PDA for the failed portion of the access.
    535  * the code here looks at this PDA and assumes that the xor target buffer is
    536  * equal in size to the number of sectors in the failed PDA.  It then uses
    537  * the other PDAs in the parameter list to determine where within the target
    538  * buffer the corresponding data should be xored.
    539  *
    540  * Recall the basic equation is
    541  *
    542  *     Q = ( data_1 + 2 * data_2 ... + k * data_k  ) mod 256
    543  *
    544  * so to recover data_j we need
    545  *
    546  *    J data_j = (Q - data_1 - 2 data_2 ....- k* data_k) mod 256
    547  *
    548  * So the coefficient for each buffer is (255 - data_col), and j should be initialized by
    549  * copying Q into it. Then we need to do a table lookup to convert to solve
    550  *   data_j /= J
    551  *
    552  *
    553  */
    554 int
    555 rf_RecoveryQFunc(node)
    556 	RF_DagNode_t *node;
    557 {
    558 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
    559 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
    560 	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
    561 	int     i;
    562 	RF_PhysDiskAddr_t *pda;
    563 	RF_RaidAddr_t suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
    564 	char   *srcbuf, *destbuf;
    565 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    566 	RF_Etimer_t timer;
    567 	unsigned long coeff;
    568 
    569 	RF_ETIMER_START(timer);
    570 	/* start by copying Q into the buffer */
    571 	memcpy(node->results[0], node->params[node->numParams - 3].p,
    572 	    rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
    573 	for (i = 0; i < node->numParams - 4; i += 2) {
    574 		RF_ASSERT(node->params[i + 1].p != node->results[0]);
    575 		pda = (RF_PhysDiskAddr_t *) node->params[i].p;
    576 		srcbuf = (char *) node->params[i + 1].p;
    577 		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
    578 		destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
    579 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
    580 		/* compute the data unit offset within the column */
    581 		coeff = (coeff % raidPtr->Layout.numDataCol);
    582 		rf_IncQ((unsigned long *) destbuf, (unsigned long *) srcbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
    583 	}
    584 	/* Do the nasty inversion now */
    585 	coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), failedPDA->startSector) % raidPtr->Layout.numDataCol);
    586 	rf_InvertQ(node->results[0], node->results[0], rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
    587 	RF_ETIMER_STOP(timer);
    588 	RF_ETIMER_EVAL(timer);
    589 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    590 	rf_GenericWakeupFunc(node, 0);
    591 	return (0);
    592 }
    593 
    594 int
    595 rf_RecoveryPQFunc(node)
    596 	RF_DagNode_t *node;
    597 {
    598 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
    599 	printf("raid%d: Recovery from PQ not implemented.\n",raidPtr->raidid);
    600 	return (1);
    601 }
    602 /*
    603    Degraded write Q subroutine.
    604    Used when P is dead.
    605    Large-write style Q computation.
    606    Parameters
    607 
    608    (pda,buf),(pda,buf),.....,(failedPDA,bufPtr),failedPDA,raidPtr.
    609 
    610    We ignore failedPDA.
    611 
    612    This is a "simple style" recovery func.
    613 */
    614 
    615 void
    616 rf_PQ_DegradedWriteQFunc(node)
    617 	RF_DagNode_t *node;
    618 {
    619 	int     np = node->numParams;
    620 	int     d;
    621 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    622 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    623 	int     i;
    624 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    625 	RF_Etimer_t timer;
    626 	char   *qbuf = node->results[0];
    627 	char   *obuf, *qpbuf;
    628 	RF_PhysDiskAddr_t *old;
    629 	unsigned long coeff;
    630 	int     fail_start, j;
    631 
    632 	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
    633 	fail_start = old->startSector % secPerSU;
    634 
    635 	RF_ETIMER_START(timer);
    636 
    637 	d = (np - 2) / 2;
    638 	RF_ASSERT(2 * d + 2 == np);
    639 
    640 	for (i = 0; i < d; i++) {
    641 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    642 		obuf = (char *) node->params[2 * i + 1].p;
    643 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
    644 		/* compute the data unit offset within the column, then add
    645 		 * one */
    646 		coeff = (coeff % raidPtr->Layout.numDataCol);
    647 		j = old->startSector % secPerSU;
    648 		RF_ASSERT(j >= fail_start);
    649 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
    650 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    651 	}
    652 
    653 	RF_ETIMER_STOP(timer);
    654 	RF_ETIMER_EVAL(timer);
    655 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    656 	rf_GenericWakeupFunc(node, 0);
    657 }
    658 
    659 
    660 
    661 
    662 /* Q computations */
    663 
    664 /*
    665    coeff - column;
    666 
    667    compute  dest ^= qfor[28-coeff][rn[coeff+1] a]
    668 
    669    on 5-bit basis;
    670    length in bytes;
    671 */
    672 
    673 void
    674 rf_IncQ(dest, buf, length, coeff)
    675 	unsigned long *dest;
    676 	unsigned long *buf;
    677 	unsigned length;
    678 	unsigned coeff;
    679 {
    680 	unsigned long a, d, new;
    681 	unsigned long a1, a2;
    682 	unsigned int *q = &(rf_qfor[28 - coeff][0]);
    683 	unsigned r = rf_rn[coeff + 1];
    684 
    685 #define EXTRACT(a,i) ((a >> (5L*i)) & 0x1f)
    686 #define INSERT(a,i) (a << (5L*i))
    687 
    688 	length /= 8;
    689 	/* 13 5 bit quants in a 64 bit word */
    690 	while (length) {
    691 		a = *buf++;
    692 		d = *dest;
    693 		a1 = EXTRACT(a, 0) ^ r;
    694 		a2 = EXTRACT(a, 1) ^ r;
    695 		new = INSERT(a2, 1) | a1;
    696 		a1 = EXTRACT(a, 2) ^ r;
    697 		a2 = EXTRACT(a, 3) ^ r;
    698 		a1 = q[a1];
    699 		a2 = q[a2];
    700 		new = new | INSERT(a1, 2) | INSERT(a2, 3);
    701 		a1 = EXTRACT(a, 4) ^ r;
    702 		a2 = EXTRACT(a, 5) ^ r;
    703 		a1 = q[a1];
    704 		a2 = q[a2];
    705 		new = new | INSERT(a1, 4) | INSERT(a2, 5);
    706 		a1 = EXTRACT(a, 5) ^ r;
    707 		a2 = EXTRACT(a, 6) ^ r;
    708 		a1 = q[a1];
    709 		a2 = q[a2];
    710 		new = new | INSERT(a1, 5) | INSERT(a2, 6);
    711 #if RF_LONGSHIFT > 2
    712 		a1 = EXTRACT(a, 7) ^ r;
    713 		a2 = EXTRACT(a, 8) ^ r;
    714 		a1 = q[a1];
    715 		a2 = q[a2];
    716 		new = new | INSERT(a1, 7) | INSERT(a2, 8);
    717 		a1 = EXTRACT(a, 9) ^ r;
    718 		a2 = EXTRACT(a, 10) ^ r;
    719 		a1 = q[a1];
    720 		a2 = q[a2];
    721 		new = new | INSERT(a1, 9) | INSERT(a2, 10);
    722 		a1 = EXTRACT(a, 11) ^ r;
    723 		a2 = EXTRACT(a, 12) ^ r;
    724 		a1 = q[a1];
    725 		a2 = q[a2];
    726 		new = new | INSERT(a1, 11) | INSERT(a2, 12);
    727 #endif				/* RF_LONGSHIFT > 2 */
    728 		d ^= new;
    729 		*dest++ = d;
    730 		length--;
    731 	}
    732 }
    733 /*
    734    compute
    735 
    736    dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] (old^new) ]
    737 
    738    on a five bit basis.
    739    optimization: compute old ^ new on 64 bit basis.
    740 
    741    length in bytes.
    742 */
    743 
/*
 * Apply the Q delta of an old/new data pair to a Q buffer.
 *
 * dest   - Q buffer region to update
 * obuf   - old data
 * nbuf   - new data
 * length - size in bytes
 * coeff  - column coefficient
 *
 * As written this routine is effectively disabled: with _KERNEL defined
 * it only zero-fills `length` bytes of dest.
 */
static void
QDelta(
    char *dest,
    char *obuf,
    char *nbuf,
    unsigned length,
    unsigned char coeff)
{
	unsigned long a, d, new;
	unsigned long a1, a2;
	unsigned int *q = &(rf_qfor[28 - coeff][0]);
	unsigned int r = rf_rn[coeff + 1];

	/* The table pointer and mask fetched above are discarded again:
	 * every local is zeroed and q is set to NULL. */
	r = a1 = a2 = new = d = a = 0; /* XXX for now... */
	q = NULL; /* XXX for now */

#ifdef _KERNEL
	/* PQ in kernel currently not supported because the encoding/decoding
	 * table is not present */
	memset(dest, 0, length);
#else				/* KERNEL */
	/* this code probably doesn't work and should be rewritten  -wvcii */
	/* NOTE(review): q is NULL at this point, so the q[...] lookups below
	 * dereference a null pointer whenever length >= 8.  Also, obuf, nbuf
	 * and dest are char pointers, so each iteration reads and writes a
	 * single char although the loop count and the quant extraction are
	 * laid out for 64-bit words. */
	/* 13 5 bit quants in a 64 bit word */
	length /= 8;
	while (length) {
		a = *obuf++;	/* XXX need to reorg to avoid cache conflicts */
		a ^= *nbuf++;
		d = *dest;
		a1 = EXTRACT(a, 0) ^ r;
		a2 = EXTRACT(a, 1) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = INSERT(a2, 1) | a1;
		a1 = EXTRACT(a, 2) ^ r;
		a2 = EXTRACT(a, 3) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = new | INSERT(a1, 2) | INSERT(a2, 3);
		a1 = EXTRACT(a, 4) ^ r;
		a2 = EXTRACT(a, 5) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = new | INSERT(a1, 4) | INSERT(a2, 5);
		/* quant 5 is recomputed here; OR-ing the same value in again
		 * leaves `new` unchanged */
		a1 = EXTRACT(a, 5) ^ r;
		a2 = EXTRACT(a, 6) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = new | INSERT(a1, 5) | INSERT(a2, 6);
#if RF_LONGSHIFT > 2
		a1 = EXTRACT(a, 7) ^ r;
		a2 = EXTRACT(a, 8) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = new | INSERT(a1, 7) | INSERT(a2, 8);
		a1 = EXTRACT(a, 9) ^ r;
		a2 = EXTRACT(a, 10) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = new | INSERT(a1, 9) | INSERT(a2, 10);
		a1 = EXTRACT(a, 11) ^ r;
		a2 = EXTRACT(a, 12) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = new | INSERT(a1, 11) | INSERT(a2, 12);
#endif				/* RF_LONGSHIFT > 2 */
		d ^= new;
		*dest++ = d;
		length--;
	}
#endif				/* _KERNEL */
}
    815 /*
    816    recover columns a and b from the given p and q into
    817    bufs abuf and bbuf. All bufs are word aligned.
    818    Length is in bytes.
    819 */
    820 
    821 
    822 /*
    823  * XXX
    824  *
    825  * Everything about this seems wrong.
    826  */
/*
 * Recover two columns from P and Q.
 *
 * pbuf, qbuf       - P and Q input buffers
 * abuf, bbuf       - output buffers for the two recovered columns
 * length           - size in bytes; the loop runs length / 8 times, one
 *                    unsigned long per iteration
 * coeff_a, coeff_b - column coefficients; together they select the row
 *                    29 * coeff_a + coeff_b of rf_qinv
 *
 * For each 5-bit quantity the (p, q) pair indexes the selected rf_qinv
 * row to give the value for column a; column b is then a ^ p.
 */
void
rf_PQ_recover(pbuf, qbuf, abuf, bbuf, length, coeff_a, coeff_b)
	unsigned long *pbuf;
	unsigned long *qbuf;
	unsigned long *abuf;
	unsigned long *bbuf;
	unsigned length;
	unsigned coeff_a;
	unsigned coeff_b;
{
	unsigned long p, q, a, a0, a1;
	int     col = (29 * coeff_a) + coeff_b;
	unsigned char *q0 = &(rf_qinv[col][0]);

	length /= 8;
	while (length) {
		p = *pbuf++;
		q = *qbuf++;
		/* quant 0 seeds the result word */
		a0 = EXTRACT(p, 0);
		a1 = EXTRACT(q, 0);
		a = q0[a0 << 5 | a1];
		/* MF(i): look up quant i of (p, q) and OR it into `a` at
		 * position i */
#define MF(i) \
      a0 = EXTRACT(p,i); \
      a1 = EXTRACT(q,i); \
      a  = a | INSERT(q0[a0<<5 | a1],i)

		MF(1);
		MF(2);
		MF(3);
		MF(4);
		MF(5);
		MF(6);
		/* quants 7..12 are compiled out, so only quants 0..6 of each
		 * word are recovered.
		 * NOTE(review): q0[] is unsigned char, so INSERT shifts a
		 * promoted int -- check shift widths before enabling these */
#if 0
		MF(7);
		MF(8);
		MF(9);
		MF(10);
		MF(11);
		MF(12);
#endif				/* 0 */
		*abuf++ = a;
		*bbuf++ = a ^ p;
		length--;
	}
}
    872 /*
    873    Lost parity and a data column. Recover that data column.
    874    Assume col coeff is lost. Let q the contents of Q after
    875    all surviving data columns have been q-xored out of it.
    876    Then we have the equation
    877 
    878    q[28-coeff][a_i ^ r_i+1] = q
    879 
    880    but q is cyclic with period 31.
    881    So q[3+coeff][q[28-coeff][a_i ^ r_{i+1}]] =
    882       q[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} .
    883 
    884    so a_i = r_{coeff+1} ^ q[3+coeff][q]
    885 
    886    The routine is passed q buffer and the buffer
    887    the data is to be recovered into. They can be the same.
    888 */
    889 
    890 
    891 
    892 static void
    893 rf_InvertQ(
    894     unsigned long *qbuf,
    895     unsigned long *abuf,
    896     unsigned length,
    897     unsigned coeff)
    898 {
    899 	unsigned long a, new;
    900 	unsigned long a1, a2;
    901 	unsigned int *q = &(rf_qfor[3 + coeff][0]);
    902 	unsigned r = rf_rn[coeff + 1];
    903 
    904 	/* 13 5 bit quants in a 64 bit word */
    905 	length /= 8;
    906 	while (length) {
    907 		a = *qbuf++;
    908 		a1 = EXTRACT(a, 0);
    909 		a2 = EXTRACT(a, 1);
    910 		a1 = r ^ q[a1];
    911 		a2 = r ^ q[a2];
    912 		new = INSERT(a2, 1) | a1;
    913 #define M(i,j) \
    914       a1 = EXTRACT(a,i); \
    915       a2 = EXTRACT(a,j); \
    916       a1 = r ^ q[a1]; \
    917       a2 = r ^ q[a2]; \
    918       new = new | INSERT(a1,i) | INSERT(a2,j)
    919 
    920 		M(2, 3);
    921 		M(4, 5);
    922 		M(5, 6);
    923 #if RF_LONGSHIFT > 2
    924 		M(7, 8);
    925 		M(9, 10);
    926 		M(11, 12);
    927 #endif				/* RF_LONGSHIFT > 2 */
    928 		*abuf++ = new;
    929 		length--;
    930 	}
    931 }
    932 #endif				/* (RF_INCLUDE_DECL_PQ > 0) ||
    933 				 * (RF_INCLUDE_RAID6 > 0) */
    934