Home | History | Annotate | Line # | Download | only in raidframe
rf_pq.c revision 1.15.74.1
      1 /*	$NetBSD: rf_pq.c,v 1.15.74.1 2009/05/04 08:13:16 yamt Exp $	*/
      2 /*
      3  * Copyright (c) 1995 Carnegie-Mellon University.
      4  * All rights reserved.
      5  *
      6  * Author: Daniel Stodolsky
      7  *
      8  * Permission to use, copy, modify and distribute this software and
      9  * its documentation is hereby granted, provided that both the copyright
     10  * notice and this permission notice appear in all copies of the
     11  * software, derivative works or modified versions, and any portions
     12  * thereof, and that both notices appear in supporting documentation.
     13  *
     14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     17  *
     18  * Carnegie Mellon requests users of this software to return to
     19  *
     20  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     21  *  School of Computer Science
     22  *  Carnegie Mellon University
     23  *  Pittsburgh PA 15213-3890
     24  *
     25  * any improvements or extensions that they make and grant Carnegie the
     26  * rights to redistribute these changes.
     27  */
     28 
     29 /*
     30  * Code for RAID level 6 (P + Q) disk array architecture.
     31  */
     32 
     33 #include <sys/cdefs.h>
     34 __KERNEL_RCSID(0, "$NetBSD: rf_pq.c,v 1.15.74.1 2009/05/04 08:13:16 yamt Exp $");
     35 
     36 #include "rf_archs.h"
     37 
     38 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0)
     39 
     40 #include <dev/raidframe/raidframevar.h>
     41 
     42 #include "rf_raid.h"
     43 #include "rf_dag.h"
     44 #include "rf_dagffrd.h"
     45 #include "rf_dagffwr.h"
     46 #include "rf_dagdegrd.h"
     47 #include "rf_dagdegwr.h"
     48 #include "rf_dagutils.h"
     49 #include "rf_dagfuncs.h"
     50 #include "rf_etimer.h"
     51 #include "rf_pqdeg.h"
     52 #include "rf_general.h"
     53 #include "rf_map.h"
     54 #include "rf_pq.h"
     55 
     56 RF_RedFuncs_t rf_pFuncs = {rf_RegularONPFunc, "Regular Old-New P", rf_SimpleONPFunc, "Simple Old-New P"};
     57 RF_RedFuncs_t rf_pRecoveryFuncs = {rf_RecoveryPFunc, "Recovery P Func", rf_RecoveryPFunc, "Recovery P Func"};
     58 
     59 int
     60 rf_RegularONPFunc(RF_DagNode_t *node)
     61 {
     62 	return (rf_RegularXorFunc(node));
     63 }
     64 /*
     65    same as simpleONQ func, but the coefficient is always 1
     66 */
     67 
     68 int
     69 rf_SimpleONPFunc(RF_DagNode_t *node)
     70 {
     71 	return (rf_SimpleXorFunc(node));
     72 }
     73 
     74 int
     75 rf_RecoveryPFunc(RF_DagNode_t *node)
     76 {
     77 	return (rf_RecoveryXorFunc(node));
     78 }
     79 
     80 int
     81 rf_RegularPFunc(RF_DagNode_t *node)
     82 {
     83 	return (rf_RegularXorFunc(node));
     84 }
     85 #endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0) */
     86 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
     87 
     88 static void
     89 QDelta(char *dest, char *obuf, char *nbuf, unsigned length,
     90     unsigned char coeff);
     91 static void
     92 rf_InvertQ(unsigned long *qbuf, unsigned long *abuf,
     93     unsigned length, unsigned coeff);
     94 
     95 RF_RedFuncs_t rf_qFuncs = {rf_RegularONQFunc, "Regular Old-New Q", rf_SimpleONQFunc, "Simple Old-New Q"};
     96 RF_RedFuncs_t rf_qRecoveryFuncs = {rf_RecoveryQFunc, "Recovery Q Func", rf_RecoveryQFunc, "Recovery Q Func"};
     97 RF_RedFuncs_t rf_pqRecoveryFuncs = {rf_RecoveryPQFunc, "Recovery PQ Func", rf_RecoveryPQFunc, "Recovery PQ Func"};
     98 
     99 void
    100 rf_PQDagSelect(
    101     RF_Raid_t * raidPtr,
    102     RF_IoType_t type,
    103     RF_AccessStripeMap_t * asmap,
    104     RF_VoidFuncPtr * createFunc)
    105 {
    106 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    107 	unsigned ndfail = asmap->numDataFailed;
    108 	unsigned npfail = asmap->numParityFailed;
    109 	unsigned ntfail = npfail + ndfail;
    110 
    111 	RF_ASSERT(RF_IO_IS_R_OR_W(type));
    112 	if (ntfail > 2) {
    113 		RF_ERRORMSG("more than two disks failed in a single group!  Aborting I/O operation.\n");
    114 		*createFunc = NULL;
    115 		return;
    116 	}
    117 	/* ok, we can do this I/O */
    118 	if (type == RF_IO_TYPE_READ) {
    119 		switch (ndfail) {
    120 		case 0:
    121 			/* fault free read */
    122 			*createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;	/* same as raid 5 */
    123 			break;
    124 		case 1:
    125 			/* lost a single data unit */
    126 			/* two cases: (1) parity is not lost. do a normal raid
    127 			 * 5 reconstruct read. (2) parity is lost. do a
    128 			 * reconstruct read using "q". */
    129 			if (ntfail == 2) {	/* also lost redundancy */
    130 				if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY)
    131 					*createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateReadDAG;
    132 				else
    133 					*createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateReadDAG;
    134 			} else {
    135 				/* P and Q are ok. But is there a failure in
    136 				 * some unaccessed data unit? */
    137 				if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
    138 					*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
    139 				else
    140 					*createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateReadDAG;
    141 			}
    142 			break;
    143 		case 2:
    144 			/* lost two data units */
    145 			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
    146 			break;
    147 		}
    148 		return;
    149 	}
    150 	/* a write */
    151 	switch (ntfail) {
    152 	case 0:		/* fault free */
    153 		if (rf_suppressLocksAndLargeWrites ||
    154 		    (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) ||
    155 			(asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
    156 
    157 			*createFunc = (RF_VoidFuncPtr) rf_PQCreateSmallWriteDAG;
    158 		} else {
    159 			*createFunc = (RF_VoidFuncPtr) rf_PQCreateLargeWriteDAG;
    160 		}
    161 		break;
    162 
    163 	case 1:		/* single disk fault */
    164 		if (npfail == 1) {
    165 			RF_ASSERT((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q));
    166 			if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) {	/* q died, treat like
    167 										 * normal mode raid5
    168 										 * write. */
    169 				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
    170 				    || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
    171 					*createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateSmallWriteDAG;
    172 				else
    173 					*createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateLargeWriteDAG;
    174 			} else {/* parity died, small write only updating Q */
    175 				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
    176 				    || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
    177 					*createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateSmallWriteDAG;
    178 				else
    179 					*createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateLargeWriteDAG;
    180 			}
    181 		} else {	/* data missing. Do a P reconstruct write if
    182 				 * only a single data unit is lost in the
    183 				 * stripe, otherwise a PQ reconstruct write. */
    184 			if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
    185 				*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
    186 			else
    187 				*createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateWriteDAG;
    188 		}
    189 		break;
    190 
    191 	case 2:		/* two disk faults */
    192 		switch (npfail) {
    193 		case 2:	/* both p and q dead */
    194 			*createFunc = (RF_VoidFuncPtr) rf_PQ_011_CreateWriteDAG;
    195 			break;
    196 		case 1:	/* either p or q and dead data */
    197 			RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA);
    198 			RF_ASSERT((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q));
    199 			if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)
    200 				*createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateWriteDAG;
    201 			else
    202 				*createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateWriteDAG;
    203 			break;
    204 		case 0:	/* double data loss */
    205 			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
    206 			break;
    207 		}
    208 		break;
    209 
    210 	default:		/* more than 2 disk faults */
    211 		*createFunc = NULL;
    212 		RF_PANIC();
    213 	}
    214 	return;
    215 }
    216 /*
    217    Used as a stop gap info function
    218 */
    219 #if 0
    220 static void
    221 PQOne(RF_Raid_t *raidPtr, int *nSucc, int *nAnte, RF_AccessStripeMap_t *asmap)
    222 {
    223 	*nSucc = *nAnte = 1;
    224 }
    225 
    226 static void
    227 PQOneTwo(RF_Raid_t *raidPtr, int *nSucc, int *nAnte, RF_AccessStripeMap_t *asmap)
    228 {
    229 	*nSucc = 1;
    230 	*nAnte = 2;
    231 }
    232 #endif
    233 
    234 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG)
    235 {
    236 	rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2,
    237 	    rf_RegularPQFunc, RF_FALSE);
    238 }
    239 
    240 int
    241 rf_RegularONQFunc(RF_DagNode_t *node)
    242 {
    243 	int     np = node->numParams;
    244 	int     d;
    245 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    246 	int     i;
    247 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    248 	RF_Etimer_t timer;
    249 	char   *qbuf, *qpbuf;
    250 	char   *obuf, *nbuf;
    251 	RF_PhysDiskAddr_t *old, *new;
    252 	unsigned long coeff;
    253 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    254 
    255 	RF_ETIMER_START(timer);
    256 
    257 	d = (np - 3) / 4;
    258 	RF_ASSERT(4 * d + 3 == np);
    259 	qbuf = (char *) node->params[2 * d + 1].p;	/* q buffer */
    260 	for (i = 0; i < d; i++) {
    261 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    262 		obuf = (char *) node->params[2 * i + 1].p;
    263 		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
    264 		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
    265 		RF_ASSERT(new->numSector == old->numSector);
    266 		RF_ASSERT(new->raidAddress == old->raidAddress);
    267 		/* the stripe unit within the stripe tells us the coefficient
    268 		 * to use for the multiply. */
    269 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
    270 		/* compute the data unit offset within the column, then add
    271 		 * one */
    272 		coeff = (coeff % raidPtr->Layout.numDataCol);
    273 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
    274 		QDelta(qpbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    275 	}
    276 
    277 	RF_ETIMER_STOP(timer);
    278 	RF_ETIMER_EVAL(timer);
    279 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    280 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
    281 					 * I/O in this node */
    282 	return (0);
    283 }
    284 /*
    285    See the SimpleXORFunc for the difference between a simple and regular func.
    286    These Q functions should be used for
    287 
    288          new q = Q(data,old data,old q)
    289 
    290    style updates and not for
    291 
    292          q = ( new data, new data, .... )
    293 
    294    computations.
    295 
    296    The simple q takes 2(2d+1)+1 params, where d is the number
    297    of stripes written. The order of params is
    298    old data pda_0, old data buffer_0, old data pda_1, old data buffer_1, ... old data pda_d, old data buffer_d
    299    [2d] old q pda_0, old q buffer
    300    [2d_2] new data pda_0, new data buffer_0, ...                                    new data pda_d, new data buffer_d
    301    raidPtr
    302 */
    303 
    304 int
    305 rf_SimpleONQFunc(RF_DagNode_t *node)
    306 {
    307 	int     np = node->numParams;
    308 	int     d;
    309 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    310 	int     i;
    311 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    312 	RF_Etimer_t timer;
    313 	char   *qbuf;
    314 	char   *obuf, *nbuf;
    315 	RF_PhysDiskAddr_t *old, *new;
    316 	unsigned long coeff;
    317 
    318 	RF_ETIMER_START(timer);
    319 
    320 	d = (np - 3) / 4;
    321 	RF_ASSERT(4 * d + 3 == np);
    322 	qbuf = (char *) node->params[2 * d + 1].p;	/* q buffer */
    323 	for (i = 0; i < d; i++) {
    324 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    325 		obuf = (char *) node->params[2 * i + 1].p;
    326 		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
    327 		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
    328 		RF_ASSERT(new->numSector == old->numSector);
    329 		RF_ASSERT(new->raidAddress == old->raidAddress);
    330 		/* the stripe unit within the stripe tells us the coefficient
    331 		 * to use for the multiply. */
    332 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
    333 		/* compute the data unit offset within the column, then add
    334 		 * one */
    335 		coeff = (coeff % raidPtr->Layout.numDataCol);
    336 		QDelta(qbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    337 	}
    338 
    339 	RF_ETIMER_STOP(timer);
    340 	RF_ETIMER_EVAL(timer);
    341 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    342 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
    343 					 * I/O in this node */
    344 	return (0);
    345 }
    346 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG)
    347 {
    348 	rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_qFuncs);
    349 }
    350 
    351 static void RegularQSubr(RF_DagNode_t *node, char   *qbuf);
    352 
    353 static void
    354 RegularQSubr(RF_DagNode_t *node, char *qbuf)
    355 {
    356 	int     np = node->numParams;
    357 	int     d;
    358 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    359 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    360 	int     i;
    361 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    362 	RF_Etimer_t timer;
    363 	char   *obuf, *qpbuf;
    364 	RF_PhysDiskAddr_t *old;
    365 	unsigned long coeff;
    366 
    367 	RF_ETIMER_START(timer);
    368 
    369 	d = (np - 1) / 2;
    370 	RF_ASSERT(2 * d + 1 == np);
    371 	for (i = 0; i < d; i++) {
    372 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    373 		obuf = (char *) node->params[2 * i + 1].p;
    374 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
    375 		/* compute the data unit offset within the column, then add
    376 		 * one */
    377 		coeff = (coeff % raidPtr->Layout.numDataCol);
    378 		/* the input buffers may not all be aligned with the start of
    379 		 * the stripe. so shift by their sector offset within the
    380 		 * stripe unit */
    381 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
    382 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    383 	}
    384 
    385 	RF_ETIMER_STOP(timer);
    386 	RF_ETIMER_EVAL(timer);
    387 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    388 }
    389 /*
    390    used in degraded writes.
    391 */
    392 
    393 static void DegrQSubr(RF_DagNode_t *node);
    394 
    395 static void
    396 DegrQSubr(RF_DagNode_t *node)
    397 {
    398 	int     np = node->numParams;
    399 	int     d;
    400 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    401 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    402 	int     i;
    403 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    404 	RF_Etimer_t timer;
    405 	char   *qbuf = node->results[1];
    406 	char   *obuf, *qpbuf;
    407 	RF_PhysDiskAddr_t *old;
    408 	unsigned long coeff;
    409 	unsigned fail_start;
    410 	int     j;
    411 
    412 	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
    413 	fail_start = old->startSector % secPerSU;
    414 
    415 	RF_ETIMER_START(timer);
    416 
    417 	d = (np - 2) / 2;
    418 	RF_ASSERT(2 * d + 2 == np);
    419 	for (i = 0; i < d; i++) {
    420 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    421 		obuf = (char *) node->params[2 * i + 1].p;
    422 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
    423 		/* compute the data unit offset within the column, then add
    424 		 * one */
    425 		coeff = (coeff % raidPtr->Layout.numDataCol);
    426 		/* the input buffers may not all be aligned with the start of
    427 		 * the stripe. so shift by their sector offset within the
    428 		 * stripe unit */
    429 		j = old->startSector % secPerSU;
    430 		RF_ASSERT(j >= fail_start);
    431 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
    432 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    433 	}
    434 
    435 	RF_ETIMER_STOP(timer);
    436 	RF_ETIMER_EVAL(timer);
    437 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    438 }
    439 /*
    440    Called by large write code to compute the new parity and the new q.
    441 
    442    structure of the params:
    443 
    444    pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d ( d = numDataCol
    445    raidPtr
    446 
    447    for a total of 2d+1 arguments.
    448    The result buffers results[0], results[1] are the buffers for the p and q,
    449    respectively.
    450 
    451    We compute Q first, then compute P. The P calculation may try to reuse
    452    one of the input buffers for its output, so if we computed P first, we would
    453    corrupt the input for the q calculation.
    454 */
    455 
    456 int
    457 rf_RegularPQFunc(RF_DagNode_t *node)
    458 {
    459 	RegularQSubr(node, node->results[1]);
    460 	return (rf_RegularXorFunc(node));	/* does the wakeup */
    461 }
    462 
    463 int
    464 rf_RegularQFunc(RF_DagNode_t *node)
    465 {
    466 	/* Almost ... adjust Qsubr args */
    467 	RegularQSubr(node, node->results[0]);
    468 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
    469 					 * I/O in this node */
    470 	return (0);
    471 }
    472 /*
    473    Called by singly degraded write code to compute the new parity and the new q.
    474 
    475    structure of the params:
    476 
    477    pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d
    478    failedPDA raidPtr
    479 
    480    for a total of 2d+2 arguments.
    481    The result buffers results[0], results[1] are the buffers for the parity and q,
    482    respectively.
    483 
    484    We compute Q first, then compute parity. The parity calculation may try to reuse
    485    one of the input buffers for its output, so if we computed parity first, we would
    486    corrupt the input for the q calculation.
    487 
    488    We treat this identically to the regularPQ case, ignoring the failedPDA extra argument.
    489 */
    490 
    491 void
    492 rf_Degraded_100_PQFunc(RF_DagNode_t *node)
    493 {
    494 	int     np = node->numParams;
    495 
    496 	RF_ASSERT(np >= 2);
    497 	DegrQSubr(node);
    498 	rf_RecoveryXorFunc(node);
    499 }
    500 
    501 
    502 /*
    503    The two below are used when reading a stripe with a single lost data unit.
    504    The parameters are
    505 
    506    pda_0, buffer_0, .... pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr
    507 
    508    and results[0] contains the data buffer. Which is originally zero-filled.
    509 
    510 */
    511 
    512 /* this Q func is used by the degraded-mode dag functions to recover lost data.
    513  * the second-to-last parameter is the PDA for the failed portion of the access.
    514  * the code here looks at this PDA and assumes that the xor target buffer is
    515  * equal in size to the number of sectors in the failed PDA.  It then uses
    516  * the other PDAs in the parameter list to determine where within the target
    517  * buffer the corresponding data should be xored.
    518  *
    519  * Recall the basic equation is
    520  *
    521  *     Q = ( data_1 + 2 * data_2 ... + k * data_k  ) mod 256
    522  *
    523  * so to recover data_j we need
    524  *
    525  *    J data_j = (Q - data_1 - 2 data_2 ....- k* data_k) mod 256
    526  *
    527  * So the coefficient for each buffer is (255 - data_col), and j should be initialized by
    528  * copying Q into it. Then we need to do a table lookup to convert to solve
    529  *   data_j /= J
    530  *
    531  *
    532  */
    533 int
    534 rf_RecoveryQFunc(RF_DagNode_t *node)
    535 {
    536 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
    537 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
    538 	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
    539 	int     i;
    540 	RF_PhysDiskAddr_t *pda;
    541 	RF_RaidAddr_t suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
    542 	char   *srcbuf, *destbuf;
    543 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    544 	RF_Etimer_t timer;
    545 	unsigned long coeff;
    546 
    547 	RF_ETIMER_START(timer);
    548 	/* start by copying Q into the buffer */
    549 	memcpy(node->results[0], node->params[node->numParams - 3].p,
    550 	    rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
    551 	for (i = 0; i < node->numParams - 4; i += 2) {
    552 		RF_ASSERT(node->params[i + 1].p != node->results[0]);
    553 		pda = (RF_PhysDiskAddr_t *) node->params[i].p;
    554 		srcbuf = (char *) node->params[i + 1].p;
    555 		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
    556 		destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
    557 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
    558 		/* compute the data unit offset within the column */
    559 		coeff = (coeff % raidPtr->Layout.numDataCol);
    560 		rf_IncQ((unsigned long *) destbuf, (unsigned long *) srcbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
    561 	}
    562 	/* Do the nasty inversion now */
    563 	coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), failedPDA->startSector) % raidPtr->Layout.numDataCol);
    564 	rf_InvertQ(node->results[0], node->results[0], rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
    565 	RF_ETIMER_STOP(timer);
    566 	RF_ETIMER_EVAL(timer);
    567 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    568 	rf_GenericWakeupFunc(node, 0);
    569 	return (0);
    570 }
    571 
    572 int
    573 rf_RecoveryPQFunc(RF_DagNode_t *node)
    574 {
    575 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
    576 	printf("raid%d: Recovery from PQ not implemented.\n",raidPtr->raidid);
    577 	return (1);
    578 }
    579 /*
    580    Degraded write Q subroutine.
    581    Used when P is dead.
    582    Large-write style Q computation.
    583    Parameters
    584 
    585    (pda,buf),(pda,buf),.....,(failedPDA,bufPtr),failedPDA,raidPtr.
    586 
    587    We ignore failedPDA.
    588 
    589    This is a "simple style" recovery func.
    590 */
    591 
    592 void
    593 rf_PQ_DegradedWriteQFunc(RF_DagNode_t *node)
    594 {
    595 	int     np = node->numParams;
    596 	int     d;
    597 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    598 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    599 	int     i;
    600 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    601 	RF_Etimer_t timer;
    602 	char   *qbuf = node->results[0];
    603 	char   *obuf, *qpbuf;
    604 	RF_PhysDiskAddr_t *old;
    605 	unsigned long coeff;
    606 	int     fail_start, j;
    607 
    608 	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
    609 	fail_start = old->startSector % secPerSU;
    610 
    611 	RF_ETIMER_START(timer);
    612 
    613 	d = (np - 2) / 2;
    614 	RF_ASSERT(2 * d + 2 == np);
    615 
    616 	for (i = 0; i < d; i++) {
    617 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    618 		obuf = (char *) node->params[2 * i + 1].p;
    619 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
    620 		/* compute the data unit offset within the column, then add
    621 		 * one */
    622 		coeff = (coeff % raidPtr->Layout.numDataCol);
    623 		j = old->startSector % secPerSU;
    624 		RF_ASSERT(j >= fail_start);
    625 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
    626 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    627 	}
    628 
    629 	RF_ETIMER_STOP(timer);
    630 	RF_ETIMER_EVAL(timer);
    631 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    632 	rf_GenericWakeupFunc(node, 0);
    633 }
    634 
    635 
    636 
    637 
    638 /* Q computations */
    639 
    640 /*
    641    coeff - colummn;
    642 
    643    compute  dest ^= qfor[28-coeff][rn[coeff+1] a]
    644 
    645    on 5-bit basis;
    646    length in bytes;
    647 */
    648 
    649 void
    650 rf_IncQ(unsigned long *dest, unsigned long *buf, unsigned length, unsigned coeff)
    651 {
    652 	unsigned long a, d, new;
    653 	unsigned long a1, a2;
    654 	unsigned int *q = &(rf_qfor[28 - coeff][0]);
    655 	unsigned r = rf_rn[coeff + 1];
    656 
    657 #define EXTRACT(a,i) ((a >> (5L*i)) & 0x1f)
    658 #define INSERT(a,i) (a << (5L*i))
    659 
    660 	length /= 8;
    661 	/* 13 5 bit quants in a 64 bit word */
    662 	while (length) {
    663 		a = *buf++;
    664 		d = *dest;
    665 		a1 = EXTRACT(a, 0) ^ r;
    666 		a2 = EXTRACT(a, 1) ^ r;
    667 		new = INSERT(a2, 1) | a1;
    668 		a1 = EXTRACT(a, 2) ^ r;
    669 		a2 = EXTRACT(a, 3) ^ r;
    670 		a1 = q[a1];
    671 		a2 = q[a2];
    672 		new = new | INSERT(a1, 2) | INSERT(a2, 3);
    673 		a1 = EXTRACT(a, 4) ^ r;
    674 		a2 = EXTRACT(a, 5) ^ r;
    675 		a1 = q[a1];
    676 		a2 = q[a2];
    677 		new = new | INSERT(a1, 4) | INSERT(a2, 5);
    678 		a1 = EXTRACT(a, 5) ^ r;
    679 		a2 = EXTRACT(a, 6) ^ r;
    680 		a1 = q[a1];
    681 		a2 = q[a2];
    682 		new = new | INSERT(a1, 5) | INSERT(a2, 6);
    683 #if RF_LONGSHIFT > 2
    684 		a1 = EXTRACT(a, 7) ^ r;
    685 		a2 = EXTRACT(a, 8) ^ r;
    686 		a1 = q[a1];
    687 		a2 = q[a2];
    688 		new = new | INSERT(a1, 7) | INSERT(a2, 8);
    689 		a1 = EXTRACT(a, 9) ^ r;
    690 		a2 = EXTRACT(a, 10) ^ r;
    691 		a1 = q[a1];
    692 		a2 = q[a2];
    693 		new = new | INSERT(a1, 9) | INSERT(a2, 10);
    694 		a1 = EXTRACT(a, 11) ^ r;
    695 		a2 = EXTRACT(a, 12) ^ r;
    696 		a1 = q[a1];
    697 		a2 = q[a2];
    698 		new = new | INSERT(a1, 11) | INSERT(a2, 12);
    699 #endif				/* RF_LONGSHIFT > 2 */
    700 		d ^= new;
    701 		*dest++ = d;
    702 		length--;
    703 	}
    704 }
    705 /*
    706    compute
    707 
    708    dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] (old^new) ]
    709 
    710    on a five bit basis.
    711    optimization: compute old ^ new on 64 bit basis.
    712 
    713    length in bytes.
    714 */
    715 
    716 static void
    717 QDelta(
    718     char *dest,
    719     char *obuf,
    720     char *nbuf,
    721     unsigned length,
    722     unsigned char coeff)
    723 {
    724 	unsigned long a, d, new;
    725 	unsigned long a1, a2;
    726 	unsigned int *q = &(rf_qfor[28 - coeff][0]);
    727 	unsigned int r = rf_rn[coeff + 1];
    728 
    729 	r = a1 = a2 = new = d = a = 0; /* XXX for now... */
    730 	q = NULL; /* XXX for now */
    731 
    732 #ifdef _KERNEL
    733 	/* PQ in kernel currently not supported because the encoding/decoding
    734 	 * table is not present */
    735 	memset(dest, 0, length);
    736 #else				/* KERNEL */
    737 	/* this code probably doesn't work and should be rewritten  -wvcii */
    738 	/* 13 5 bit quants in a 64 bit word */
    739 	length /= 8;
    740 	while (length) {
    741 		a = *obuf++;	/* XXX need to reorg to avoid cache conflicts */
    742 		a ^= *nbuf++;
    743 		d = *dest;
    744 		a1 = EXTRACT(a, 0) ^ r;
    745 		a2 = EXTRACT(a, 1) ^ r;
    746 		a1 = q[a1];
    747 		a2 = q[a2];
    748 		new = INSERT(a2, 1) | a1;
    749 		a1 = EXTRACT(a, 2) ^ r;
    750 		a2 = EXTRACT(a, 3) ^ r;
    751 		a1 = q[a1];
    752 		a2 = q[a2];
    753 		new = new | INSERT(a1, 2) | INSERT(a2, 3);
    754 		a1 = EXTRACT(a, 4) ^ r;
    755 		a2 = EXTRACT(a, 5) ^ r;
    756 		a1 = q[a1];
    757 		a2 = q[a2];
    758 		new = new | INSERT(a1, 4) | INSERT(a2, 5);
    759 		a1 = EXTRACT(a, 5) ^ r;
    760 		a2 = EXTRACT(a, 6) ^ r;
    761 		a1 = q[a1];
    762 		a2 = q[a2];
    763 		new = new | INSERT(a1, 5) | INSERT(a2, 6);
    764 #if RF_LONGSHIFT > 2
    765 		a1 = EXTRACT(a, 7) ^ r;
    766 		a2 = EXTRACT(a, 8) ^ r;
    767 		a1 = q[a1];
    768 		a2 = q[a2];
    769 		new = new | INSERT(a1, 7) | INSERT(a2, 8);
    770 		a1 = EXTRACT(a, 9) ^ r;
    771 		a2 = EXTRACT(a, 10) ^ r;
    772 		a1 = q[a1];
    773 		a2 = q[a2];
    774 		new = new | INSERT(a1, 9) | INSERT(a2, 10);
    775 		a1 = EXTRACT(a, 11) ^ r;
    776 		a2 = EXTRACT(a, 12) ^ r;
    777 		a1 = q[a1];
    778 		a2 = q[a2];
    779 		new = new | INSERT(a1, 11) | INSERT(a2, 12);
    780 #endif				/* RF_LONGSHIFT > 2 */
    781 		d ^= new;
    782 		*dest++ = d;
    783 		length--;
    784 	}
    785 #endif				/* _KERNEL */
    786 }
    787 /*
    788    recover columns a and b from the given p and q into
    789    bufs abuf and bbuf. All bufs are word aligned.
    790    Length is in bytes.
    791 */
    792 
    793 
    794 /*
    795  * XXX
    796  *
    797  * Everything about this seems wrong.
    798  */
    799 void
    800 rf_PQ_recover(unsigned long *pbuf, unsigned long *qbuf, unsigned long *abuf, unsigned long *bbuf, unsigned length, unsigned coeff_a, unsigned coeff_b)
    801 {
    802 	unsigned long p, q, a, a0, a1;
    803 	int     col = (29 * coeff_a) + coeff_b;
    804 	unsigned char *q0 = &(rf_qinv[col][0]);
    805 
    806 	length /= 8;
    807 	while (length) {
    808 		p = *pbuf++;
    809 		q = *qbuf++;
    810 		a0 = EXTRACT(p, 0);
    811 		a1 = EXTRACT(q, 0);
    812 		a = q0[a0 << 5 | a1];
    813 #define MF(i) \
    814       a0 = EXTRACT(p,i); \
    815       a1 = EXTRACT(q,i); \
    816       a  = a | INSERT(q0[a0<<5 | a1],i)
    817 
    818 		MF(1);
    819 		MF(2);
    820 		MF(3);
    821 		MF(4);
    822 		MF(5);
    823 		MF(6);
    824 #if 0
    825 		MF(7);
    826 		MF(8);
    827 		MF(9);
    828 		MF(10);
    829 		MF(11);
    830 		MF(12);
    831 #endif				/* 0 */
    832 		*abuf++ = a;
    833 		*bbuf++ = a ^ p;
    834 		length--;
    835 	}
    836 }
    837 /*
    838    Lost parity and a data column. Recover that data column.
    839    Assume col coeff is lost. Let q the contents of Q after
    840    all surviving data columns have been q-xored out of it.
    841    Then we have the equation
    842 
    843    q[28-coeff][a_i ^ r_i+1] = q
    844 
    845    but q is cyclic with period 31.
    846    So q[3+coeff][q[28-coeff][a_i ^ r_{i+1}]] =
    847       q[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} .
    848 
    849    so a_i = r_{coeff+1} ^ q[3+coeff][q]
    850 
    851    The routine is passed q buffer and the buffer
    852    the data is to be recoverd into. They can be the same.
    853 */
    854 
    855 
    856 
    857 static void
    858 rf_InvertQ(
    859     unsigned long *qbuf,
    860     unsigned long *abuf,
    861     unsigned length,
    862     unsigned coeff)
    863 {
    864 	unsigned long a, new;
    865 	unsigned long a1, a2;
    866 	unsigned int *q = &(rf_qfor[3 + coeff][0]);
    867 	unsigned r = rf_rn[coeff + 1];
    868 
    869 	/* 13 5 bit quants in a 64 bit word */
    870 	length /= 8;
    871 	while (length) {
    872 		a = *qbuf++;
    873 		a1 = EXTRACT(a, 0);
    874 		a2 = EXTRACT(a, 1);
    875 		a1 = r ^ q[a1];
    876 		a2 = r ^ q[a2];
    877 		new = INSERT(a2, 1) | a1;
    878 #define M(i,j) \
    879       a1 = EXTRACT(a,i); \
    880       a2 = EXTRACT(a,j); \
    881       a1 = r ^ q[a1]; \
    882       a2 = r ^ q[a2]; \
    883       new = new | INSERT(a1,i) | INSERT(a2,j)
    884 
    885 		M(2, 3);
    886 		M(4, 5);
    887 		M(5, 6);
    888 #if RF_LONGSHIFT > 2
    889 		M(7, 8);
    890 		M(9, 10);
    891 		M(11, 12);
    892 #endif				/* RF_LONGSHIFT > 2 */
    893 		*abuf++ = new;
    894 		length--;
    895 	}
    896 }
    897 #endif				/* (RF_INCLUDE_DECL_PQ > 0) ||
    898 				 * (RF_INCLUDE_RAID6 > 0) */
    899