Home | History | Annotate | Line # | Download | only in raidframe
      1 /*	$NetBSD: rf_pq.c,v 1.18 2023/10/15 18:15:20 oster Exp $	*/
      2 /*
      3  * Copyright (c) 1995 Carnegie-Mellon University.
      4  * All rights reserved.
      5  *
      6  * Author: Daniel Stodolsky
      7  *
      8  * Permission to use, copy, modify and distribute this software and
      9  * its documentation is hereby granted, provided that both the copyright
     10  * notice and this permission notice appear in all copies of the
     11  * software, derivative works or modified versions, and any portions
     12  * thereof, and that both notices appear in supporting documentation.
     13  *
     14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     17  *
     18  * Carnegie Mellon requests users of this software to return to
     19  *
     20  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     21  *  School of Computer Science
     22  *  Carnegie Mellon University
     23  *  Pittsburgh PA 15213-3890
     24  *
     25  * any improvements or extensions that they make and grant Carnegie the
     26  * rights to redistribute these changes.
     27  */
     28 
     29 /*
     30  * Code for RAID level 6 (P + Q) disk array architecture.
     31  */
     32 
     33 #include <sys/cdefs.h>
     34 __KERNEL_RCSID(0, "$NetBSD: rf_pq.c,v 1.18 2023/10/15 18:15:20 oster Exp $");
     35 
     36 #include "rf_archs.h"
     37 
     38 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0)
     39 
     40 #include <dev/raidframe/raidframevar.h>
     41 
     42 #include "rf_raid.h"
     43 #include "rf_dag.h"
     44 #include "rf_dagffrd.h"
     45 #include "rf_dagffwr.h"
     46 #include "rf_dagdegrd.h"
     47 #include "rf_dagdegwr.h"
     48 #include "rf_dagutils.h"
     49 #include "rf_dagfuncs.h"
     50 #include "rf_etimer.h"
     51 #include "rf_pqdeg.h"
     52 #include "rf_general.h"
     53 #include "rf_map.h"
     54 #include "rf_pq.h"
     55 
     56 RF_RedFuncs_t rf_pFuncs = {rf_RegularONPFunc, "Regular Old-New P", rf_SimpleONPFunc, "Simple Old-New P"};
     57 RF_RedFuncs_t rf_pRecoveryFuncs = {rf_RecoveryPFunc, "Recovery P Func", rf_RecoveryPFunc, "Recovery P Func"};
     58 
     59 void
     60 rf_RegularONPFunc(RF_DagNode_t *node)
     61 {
     62 	rf_RegularXorFunc(node);
     63 }
     64 /*
     65    same as simpleONQ func, but the coefficient is always 1
     66 */
     67 
     68 void
     69 rf_SimpleONPFunc(RF_DagNode_t *node)
     70 {
     71 	rf_SimpleXorFunc(node);
     72 }
     73 
     74 void
     75 rf_RecoveryPFunc(RF_DagNode_t *node)
     76 {
     77 	rf_RecoveryXorFunc(node);
     78 }
     79 
     80 void
     81 rf_RegularPFunc(RF_DagNode_t *node)
     82 {
     83 	rf_RegularXorFunc(node);
     84 }
     85 #endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0) */
     86 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
     87 
/* Forward declarations of the file-local Q helpers defined further below. */
static void QDelta(char *dest, char *obuf, char *nbuf, unsigned length,
    unsigned char coeff);
static void rf_InvertQ(unsigned long *qbuf, unsigned long *abuf,
    unsigned length, unsigned coeff);
     94 
     95 RF_RedFuncs_t rf_qFuncs = {rf_RegularONQFunc, "Regular Old-New Q", rf_SimpleONQFunc, "Simple Old-New Q"};
     96 RF_RedFuncs_t rf_qRecoveryFuncs = {rf_RecoveryQFunc, "Recovery Q Func", rf_RecoveryQFunc, "Recovery Q Func"};
     97 RF_RedFuncs_t rf_pqRecoveryFuncs = {rf_RecoveryPQFunc, "Recovery PQ Func", rf_RecoveryPQFunc, "Recovery PQ Func"};
     98 
     99 void
    100 rf_PQDagSelect(
    101     RF_Raid_t * raidPtr,
    102     RF_IoType_t type,
    103     RF_AccessStripeMap_t * asmap,
    104     RF_VoidFuncPtr * createFunc)
    105 {
    106 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    107 	unsigned ndfail = asmap->numDataFailed;
    108 	unsigned npfail = asmap->numParityFailed;
    109 	unsigned ntfail = npfail + ndfail;
    110 
    111 	RF_ASSERT(RF_IO_IS_R_OR_W(type));
    112 	if (ntfail > 2) {
    113 		RF_ERRORMSG("more than two disks failed in a single group!  Aborting I/O operation.\n");
    114 		*createFunc = NULL;
    115 		return;
    116 	}
    117 	/* ok, we can do this I/O */
    118 	if (type == RF_IO_TYPE_READ) {
    119 		switch (ndfail) {
    120 		case 0:
    121 			/* fault free read */
    122 			*createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;	/* same as raid 5 */
    123 			break;
    124 		case 1:
    125 			/* lost a single data unit */
    126 			/* two cases: (1) parity is not lost. do a normal raid
    127 			 * 5 reconstruct read. (2) parity is lost. do a
    128 			 * reconstruct read using "q". */
    129 			if (ntfail == 2) {	/* also lost redundancy */
    130 				if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY)
    131 					*createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateReadDAG;
    132 				else
    133 					*createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateReadDAG;
    134 			} else {
    135 				/* P and Q are ok. But is there a failure in
    136 				 * some unaccessed data unit? */
    137 				if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
    138 					*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
    139 				else
    140 					*createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateReadDAG;
    141 			}
    142 			break;
    143 		case 2:
    144 			/* lost two data units */
    145 			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
    146 			break;
    147 		}
    148 		return;
    149 	}
    150 	/* a write */
    151 	switch (ntfail) {
    152 	case 0:		/* fault free */
    153 		if (rf_suppressLocksAndLargeWrites ||
    154 		    (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) ||
    155 			(asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
    156 
    157 			*createFunc = (RF_VoidFuncPtr) rf_PQCreateSmallWriteDAG;
    158 		} else {
    159 			*createFunc = (RF_VoidFuncPtr) rf_PQCreateLargeWriteDAG;
    160 		}
    161 		break;
    162 
    163 	case 1:		/* single disk fault */
    164 		if (npfail == 1) {
    165 			RF_ASSERT((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q));
    166 			if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) {	/* q died, treat like
    167 										 * normal mode raid5
    168 										 * write. */
    169 				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
    170 				    || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
    171 					*createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateSmallWriteDAG;
    172 				else
    173 					*createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateLargeWriteDAG;
    174 			} else {/* parity died, small write only updating Q */
    175 				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
    176 				    || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
    177 					*createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateSmallWriteDAG;
    178 				else
    179 					*createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateLargeWriteDAG;
    180 			}
    181 		} else {	/* data missing. Do a P reconstruct write if
    182 				 * only a single data unit is lost in the
    183 				 * stripe, otherwise a PQ reconstruct write. */
    184 			if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
    185 				*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
    186 			else
    187 				*createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateWriteDAG;
    188 		}
    189 		break;
    190 
    191 	case 2:		/* two disk faults */
    192 		switch (npfail) {
    193 		case 2:	/* both p and q dead */
    194 			*createFunc = (RF_VoidFuncPtr) rf_PQ_011_CreateWriteDAG;
    195 			break;
    196 		case 1:	/* either p or q and dead data */
    197 			RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA);
    198 			RF_ASSERT((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q));
    199 			if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)
    200 				*createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateWriteDAG;
    201 			else
    202 				*createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateWriteDAG;
    203 			break;
    204 		case 0:	/* double data loss */
    205 			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
    206 			break;
    207 		}
    208 		break;
    209 
    210 	default:		/* more than 2 disk faults */
    211 		*createFunc = NULL;
    212 		RF_PANIC();
    213 	}
    214 	return;
    215 }
    216 /*
    217    Used as a stop gap info function
    218 */
    219 #if 0
    220 static void
    221 PQOne(RF_Raid_t *raidPtr, int *nSucc, int *nAnte, RF_AccessStripeMap_t *asmap)
    222 {
    223 	*nSucc = *nAnte = 1;
    224 }
    225 
    226 static void
    227 PQOneTwo(RF_Raid_t *raidPtr, int *nSucc, int *nAnte, RF_AccessStripeMap_t *asmap)
    228 {
    229 	*nSucc = 1;
    230 	*nAnte = 2;
    231 }
    232 #endif
    233 
    234 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG)
    235 {
    236 	rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2,
    237 	    rf_RegularPQFunc, RF_FALSE);
    238 }
    239 
    240 void
    241 rf_RegularONQFunc(RF_DagNode_t *node)
    242 {
    243 	int     np = node->numParams;
    244 	int     d;
    245 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    246 	int     i;
    247 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    248 	RF_Etimer_t timer;
    249 	char   *qbuf, *qpbuf;
    250 	char   *obuf, *nbuf;
    251 	RF_PhysDiskAddr_t *old, *new;
    252 	unsigned long coeff;
    253 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    254 
    255 	RF_ETIMER_START(timer);
    256 
    257 	d = (np - 3) / 4;
    258 	RF_ASSERT(4 * d + 3 == np);
    259 	qbuf = (char *) node->params[2 * d + 1].p;	/* q buffer */
    260 	for (i = 0; i < d; i++) {
    261 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    262 		obuf = (char *) node->params[2 * i + 1].p;
    263 		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
    264 		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
    265 		RF_ASSERT(new->numSector == old->numSector);
    266 		RF_ASSERT(new->raidAddress == old->raidAddress);
    267 		/* the stripe unit within the stripe tells us the coefficient
    268 		 * to use for the multiply. */
    269 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
    270 		/* compute the data unit offset within the column, then add
    271 		 * one */
    272 		coeff = (coeff % raidPtr->Layout.numDataCol);
    273 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
    274 		QDelta(qpbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    275 	}
    276 
    277 	RF_ETIMER_STOP(timer);
    278 	RF_ETIMER_EVAL(timer);
    279 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    280 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
    281 					 * I/O in this node */
    282 }
    283 /*
    284    See the SimpleXORFunc for the difference between a simple and regular func.
    285    These Q functions should be used for
    286 
    287          new q = Q(data,old data,old q)
    288 
    289    style updates and not for
    290 
    291          q = ( new data, new data, .... )
    292 
    293    computations.
    294 
    295    The simple q takes 2(2d+1)+1 params, where d is the number
    296    of stripes written. The order of params is
   old data pda_0, old data buffer_0, old data pda_1, old data buffer_1, ... old data pda_(d-1), old data buffer_(d-1)
   [2d] old q pda_0, old q buffer
   [2d+2] new data pda_0, new data buffer_0, ...                                    new data pda_(d-1), new data buffer_(d-1)
    300    raidPtr
    301 */
    302 
    303 void
    304 rf_SimpleONQFunc(RF_DagNode_t *node)
    305 {
    306 	int     np = node->numParams;
    307 	int     d;
    308 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    309 	int     i;
    310 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    311 	RF_Etimer_t timer;
    312 	char   *qbuf;
    313 	char   *obuf, *nbuf;
    314 	RF_PhysDiskAddr_t *old, *new;
    315 	unsigned long coeff;
    316 
    317 	RF_ETIMER_START(timer);
    318 
    319 	d = (np - 3) / 4;
    320 	RF_ASSERT(4 * d + 3 == np);
    321 	qbuf = (char *) node->params[2 * d + 1].p;	/* q buffer */
    322 	for (i = 0; i < d; i++) {
    323 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    324 		obuf = (char *) node->params[2 * i + 1].p;
    325 		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
    326 		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
    327 		RF_ASSERT(new->numSector == old->numSector);
    328 		RF_ASSERT(new->raidAddress == old->raidAddress);
    329 		/* the stripe unit within the stripe tells us the coefficient
    330 		 * to use for the multiply. */
    331 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
    332 		/* compute the data unit offset within the column, then add
    333 		 * one */
    334 		coeff = (coeff % raidPtr->Layout.numDataCol);
    335 		QDelta(qbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    336 	}
    337 
    338 	RF_ETIMER_STOP(timer);
    339 	RF_ETIMER_EVAL(timer);
    340 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    341 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
    342 					 * I/O in this node */
    343 }
    344 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG)
    345 {
    346 	rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_qFuncs);
    347 }
    348 
    349 static void RegularQSubr(RF_DagNode_t *node, char   *qbuf);
    350 
    351 static void
    352 RegularQSubr(RF_DagNode_t *node, char *qbuf)
    353 {
    354 	int     np = node->numParams;
    355 	int     d;
    356 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    357 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    358 	int     i;
    359 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    360 	RF_Etimer_t timer;
    361 	char   *obuf, *qpbuf;
    362 	RF_PhysDiskAddr_t *old;
    363 	unsigned long coeff;
    364 
    365 	RF_ETIMER_START(timer);
    366 
    367 	d = (np - 1) / 2;
    368 	RF_ASSERT(2 * d + 1 == np);
    369 	for (i = 0; i < d; i++) {
    370 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    371 		obuf = (char *) node->params[2 * i + 1].p;
    372 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
    373 		/* compute the data unit offset within the column, then add
    374 		 * one */
    375 		coeff = (coeff % raidPtr->Layout.numDataCol);
    376 		/* the input buffers may not all be aligned with the start of
    377 		 * the stripe. so shift by their sector offset within the
    378 		 * stripe unit */
    379 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
    380 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    381 	}
    382 
    383 	RF_ETIMER_STOP(timer);
    384 	RF_ETIMER_EVAL(timer);
    385 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    386 }
    387 /*
    388    used in degraded writes.
    389 */
    390 
    391 static void DegrQSubr(RF_DagNode_t *node);
    392 
    393 static void
    394 DegrQSubr(RF_DagNode_t *node)
    395 {
    396 	int     np = node->numParams;
    397 	int     d;
    398 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    399 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    400 	int     i;
    401 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    402 	RF_Etimer_t timer;
    403 	char   *qbuf = node->results[1];
    404 	char   *obuf, *qpbuf;
    405 	RF_PhysDiskAddr_t *old;
    406 	unsigned long coeff;
    407 	unsigned fail_start;
    408 	int     j;
    409 
    410 	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
    411 	fail_start = old->startSector % secPerSU;
    412 
    413 	RF_ETIMER_START(timer);
    414 
    415 	d = (np - 2) / 2;
    416 	RF_ASSERT(2 * d + 2 == np);
    417 	for (i = 0; i < d; i++) {
    418 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    419 		obuf = (char *) node->params[2 * i + 1].p;
    420 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
    421 		/* compute the data unit offset within the column, then add
    422 		 * one */
    423 		coeff = (coeff % raidPtr->Layout.numDataCol);
    424 		/* the input buffers may not all be aligned with the start of
    425 		 * the stripe. so shift by their sector offset within the
    426 		 * stripe unit */
    427 		j = old->startSector % secPerSU;
    428 		RF_ASSERT(j >= fail_start);
    429 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
    430 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    431 	}
    432 
    433 	RF_ETIMER_STOP(timer);
    434 	RF_ETIMER_EVAL(timer);
    435 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    436 }
    437 /*
    438    Called by large write code to compute the new parity and the new q.
    439 
    440    structure of the params:
    441 
   pda_0, buffer_0, pda_1 , buffer_1, ... , pda_(d-1), buffer_(d-1) ( d = numDataCol )
    443    raidPtr
    444 
    445    for a total of 2d+1 arguments.
    446    The result buffers results[0], results[1] are the buffers for the p and q,
    447    respectively.
    448 
    449    We compute Q first, then compute P. The P calculation may try to reuse
    450    one of the input buffers for its output, so if we computed P first, we would
    451    corrupt the input for the q calculation.
    452 */
    453 
    454 void
    455 rf_RegularPQFunc(RF_DagNode_t *node)
    456 {
    457 	RegularQSubr(node, node->results[1]);
    458 	rf_RegularXorFunc(node);	/* does the wakeup */
    459 }
    460 
    461 void
    462 rf_RegularQFunc(RF_DagNode_t *node)
    463 {
    464 	/* Almost ... adjust Qsubr args */
    465 	RegularQSubr(node, node->results[0]);
    466 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
    467 					 * I/O in this node */
    468 }
    469 /*
    470    Called by singly degraded write code to compute the new parity and the new q.
    471 
    472    structure of the params:
    473 
    474    pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d
    475    failedPDA raidPtr
    476 
    477    for a total of 2d+2 arguments.
    478    The result buffers results[0], results[1] are the buffers for the parity and q,
    479    respectively.
    480 
    481    We compute Q first, then compute parity. The parity calculation may try to reuse
    482    one of the input buffers for its output, so if we computed parity first, we would
    483    corrupt the input for the q calculation.
    484 
    485    We treat this identically to the regularPQ case, ignoring the failedPDA extra argument.
    486 */
    487 
    488 void
    489 rf_Degraded_100_PQFunc(RF_DagNode_t *node)
    490 {
    491 	int     np = node->numParams;
    492 
    493 	RF_ASSERT(np >= 2);
    494 	DegrQSubr(node);
    495 	rf_RecoveryXorFunc(node);
    496 }
    497 
    498 
    499 /*
    500    The two below are used when reading a stripe with a single lost data unit.
    501    The parameters are
    502 
    503    pda_0, buffer_0, .... pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr
    504 
    505    and results[0] contains the data buffer. Which is originally zero-filled.
    506 
    507 */
    508 
    509 /* this Q func is used by the degraded-mode dag functions to recover lost data.
    510  * the second-to-last parameter is the PDA for the failed portion of the access.
    511  * the code here looks at this PDA and assumes that the xor target buffer is
    512  * equal in size to the number of sectors in the failed PDA.  It then uses
    513  * the other PDAs in the parameter list to determine where within the target
    514  * buffer the corresponding data should be xored.
    515  *
    516  * Recall the basic equation is
    517  *
    518  *     Q = ( data_1 + 2 * data_2 ... + k * data_k  ) mod 256
    519  *
    520  * so to recover data_j we need
    521  *
    522  *    J data_j = (Q - data_1 - 2 data_2 ....- k* data_k) mod 256
    523  *
    524  * So the coefficient for each buffer is (255 - data_col), and j should be initialized by
    525  * copying Q into it. Then we need to do a table lookup to convert to solve
    526  *   data_j /= J
    527  *
    528  *
    529  */
    530 void
    531 rf_RecoveryQFunc(RF_DagNode_t *node)
    532 {
    533 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
    534 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
    535 	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
    536 	int     i;
    537 	RF_PhysDiskAddr_t *pda = NULL;
    538 	RF_RaidAddr_t suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
    539 	char   *srcbuf, *destbuf;
    540 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    541 	RF_Etimer_t timer;
    542 	unsigned long coeff;
    543 
    544 	RF_ETIMER_START(timer);
    545 	/* start by copying Q into the buffer */
    546 	memcpy(node->results[0], node->params[node->numParams - 3].p,
    547 	    rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
    548 	for (i = 0; i < node->numParams - 4; i += 2) {
    549 		RF_ASSERT(node->params[i + 1].p != node->results[0]);
    550 		pda = (RF_PhysDiskAddr_t *) node->params[i].p;
    551 		srcbuf = (char *) node->params[i + 1].p;
    552 		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
    553 		destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
    554 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
    555 		/* compute the data unit offset within the column */
    556 		coeff = (coeff % raidPtr->Layout.numDataCol);
    557 		rf_IncQ((unsigned long *) destbuf, (unsigned long *) srcbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
    558 	}
    559 	/* Do the nasty inversion now */
    560 	coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), failedPDA->startSector) % raidPtr->Layout.numDataCol);
    561 	rf_InvertQ(node->results[0], node->results[0], rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
    562 	RF_ETIMER_STOP(timer);
    563 	RF_ETIMER_EVAL(timer);
    564 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    565 	rf_GenericWakeupFunc(node, 0);
    566 }
    567 
    568 void
    569 rf_RecoveryPQFunc(RF_DagNode_t *node)
    570 {
    571 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
    572 	printf("raid%d: Recovery from PQ not implemented.\n",raidPtr->raidid);
    573 	/* XXX: Was: */
    574 	/* return (1); */
    575 }
    576 /*
    577    Degraded write Q subroutine.
    578    Used when P is dead.
    579    Large-write style Q computation.
    580    Parameters
    581 
    582    (pda,buf),(pda,buf),.....,(failedPDA,bufPtr),failedPDA,raidPtr.
    583 
    584    We ignore failedPDA.
    585 
    586    This is a "simple style" recovery func.
    587 */
    588 
    589 void
    590 rf_PQ_DegradedWriteQFunc(RF_DagNode_t *node)
    591 {
    592 	int     np = node->numParams;
    593 	int     d;
    594 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    595 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    596 	int     i;
    597 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    598 	RF_Etimer_t timer;
    599 	char   *qbuf = node->results[0];
    600 	char   *obuf, *qpbuf;
    601 	RF_PhysDiskAddr_t *old;
    602 	unsigned long coeff;
    603 	int     fail_start, j;
    604 
    605 	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
    606 	fail_start = old->startSector % secPerSU;
    607 
    608 	RF_ETIMER_START(timer);
    609 
    610 	d = (np - 2) / 2;
    611 	RF_ASSERT(2 * d + 2 == np);
    612 
    613 	for (i = 0; i < d; i++) {
    614 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    615 		obuf = (char *) node->params[2 * i + 1].p;
    616 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
    617 		/* compute the data unit offset within the column, then add
    618 		 * one */
    619 		coeff = (coeff % raidPtr->Layout.numDataCol);
    620 		j = old->startSector % secPerSU;
    621 		RF_ASSERT(j >= fail_start);
    622 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
    623 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    624 	}
    625 
    626 	RF_ETIMER_STOP(timer);
    627 	RF_ETIMER_EVAL(timer);
    628 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    629 	rf_GenericWakeupFunc(node, 0);
    630 }
    631 
    632 
    633 
    634 
    635 /* Q computations */
    636 
    637 /*
   coeff - column;
    639 
    640    compute  dest ^= qfor[28-coeff][rn[coeff+1] a]
    641 
    642    on 5-bit basis;
    643    length in bytes;
    644 */
    645 
    646 void
    647 rf_IncQ(unsigned long *dest, unsigned long *buf, unsigned length, unsigned coeff)
    648 {
    649 	unsigned long a, d, new;
    650 	unsigned long a1, a2;
    651 	unsigned int *q = &(rf_qfor[28 - coeff][0]);
    652 	unsigned r = rf_rn[coeff + 1];
    653 
    654 #define EXTRACT(a,i) ((a >> (5L*i)) & 0x1f)
    655 #define INSERT(a,i) (a << (5L*i))
    656 
    657 	length /= 8;
    658 	/* 13 5 bit quants in a 64 bit word */
    659 	while (length) {
    660 		a = *buf++;
    661 		d = *dest;
    662 		a1 = EXTRACT(a, 0) ^ r;
    663 		a2 = EXTRACT(a, 1) ^ r;
    664 		new = INSERT(a2, 1) | a1;
    665 		a1 = EXTRACT(a, 2) ^ r;
    666 		a2 = EXTRACT(a, 3) ^ r;
    667 		a1 = q[a1];
    668 		a2 = q[a2];
    669 		new = new | INSERT(a1, 2) | INSERT(a2, 3);
    670 		a1 = EXTRACT(a, 4) ^ r;
    671 		a2 = EXTRACT(a, 5) ^ r;
    672 		a1 = q[a1];
    673 		a2 = q[a2];
    674 		new = new | INSERT(a1, 4) | INSERT(a2, 5);
    675 		a1 = EXTRACT(a, 5) ^ r;
    676 		a2 = EXTRACT(a, 6) ^ r;
    677 		a1 = q[a1];
    678 		a2 = q[a2];
    679 		new = new | INSERT(a1, 5) | INSERT(a2, 6);
    680 #if RF_LONGSHIFT > 2
    681 		a1 = EXTRACT(a, 7) ^ r;
    682 		a2 = EXTRACT(a, 8) ^ r;
    683 		a1 = q[a1];
    684 		a2 = q[a2];
    685 		new = new | INSERT(a1, 7) | INSERT(a2, 8);
    686 		a1 = EXTRACT(a, 9) ^ r;
    687 		a2 = EXTRACT(a, 10) ^ r;
    688 		a1 = q[a1];
    689 		a2 = q[a2];
    690 		new = new | INSERT(a1, 9) | INSERT(a2, 10);
    691 		a1 = EXTRACT(a, 11) ^ r;
    692 		a2 = EXTRACT(a, 12) ^ r;
    693 		a1 = q[a1];
    694 		a2 = q[a2];
    695 		new = new | INSERT(a1, 11) | INSERT(a2, 12);
    696 #endif				/* RF_LONGSHIFT > 2 */
    697 		d ^= new;
    698 		*dest++ = d;
    699 		length--;
    700 	}
    701 }
    702 /*
    703    compute
    704 
    705    dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] (old^new) ]
    706 
    707    on a five bit basis.
    708    optimization: compute old ^ new on 64 bit basis.
    709 
    710    length in bytes.
    711 */
    712 
/*
 * Intended to apply the Q change for one data buffer going from obuf (old
 * contents) to nbuf (new contents) onto dest; length is in bytes.
 *
 * As written this routine is a stub in both configurations:
 *  - with _KERNEL defined it only zero-fills dest;
 *  - without _KERNEL the table pointer q is forced to NULL (and r to 0)
 *    before the loop below indexes q[], and obuf, nbuf and dest are char
 *    pointers, so each iteration reads and writes a single byte while
 *    length has been divided by 8.
 * NOTE(review): callers therefore do not get a real Q update from this
 * function -- confirm before relying on small-write Q maintenance.
 *
 * EXTRACT and INSERT are the macros defined inside rf_IncQ().
 */
static void
QDelta(
    char *dest,
    char *obuf,
    char *nbuf,
    unsigned length,
    unsigned char coeff)
{
#ifndef _KERNEL
	unsigned long a, d, new;
	unsigned long a1, a2;
	unsigned int *q = &(rf_qfor[28 - coeff][0]);
	unsigned int r = rf_rn[coeff + 1];

	/* the initializers above are immediately overridden here */
	r = a1 = a2 = new = d = a = 0; /* XXX for now... */
	q = NULL; /* XXX for now */
#endif
#ifdef _KERNEL
	/* PQ in kernel currently not supported because the encoding/decoding
	 * table is not present */
	memset(dest, 0, length);
#else				/* KERNEL */
	/* this code probably doesn't work and should be rewritten  -wvcii */
	/* 13 5 bit quants in a 64 bit word */
	length /= 8;
	while (length) {
		a = *obuf++;	/* XXX need to reorg to avoid cache conflicts */
		a ^= *nbuf++;
		d = *dest;
		a1 = EXTRACT(a, 0) ^ r;
		a2 = EXTRACT(a, 1) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = INSERT(a2, 1) | a1;
		a1 = EXTRACT(a, 2) ^ r;
		a2 = EXTRACT(a, 3) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = new | INSERT(a1, 2) | INSERT(a2, 3);
		a1 = EXTRACT(a, 4) ^ r;
		a2 = EXTRACT(a, 5) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = new | INSERT(a1, 4) | INSERT(a2, 5);
		/* quant 5 is handled a second time here, paired with quant 6;
		 * the repeated OR contributes the same bits again */
		a1 = EXTRACT(a, 5) ^ r;
		a2 = EXTRACT(a, 6) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = new | INSERT(a1, 5) | INSERT(a2, 6);
#if RF_LONGSHIFT > 2
		a1 = EXTRACT(a, 7) ^ r;
		a2 = EXTRACT(a, 8) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = new | INSERT(a1, 7) | INSERT(a2, 8);
		a1 = EXTRACT(a, 9) ^ r;
		a2 = EXTRACT(a, 10) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = new | INSERT(a1, 9) | INSERT(a2, 10);
		a1 = EXTRACT(a, 11) ^ r;
		a2 = EXTRACT(a, 12) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = new | INSERT(a1, 11) | INSERT(a2, 12);
#endif				/* RF_LONGSHIFT > 2 */
		d ^= new;
		*dest++ = d;
		length--;
	}
#endif				/* _KERNEL */
}
    785 /*
    786    recover columns a and b from the given p and q into
    787    bufs abuf and bbuf. All bufs are word aligned.
    788    Length is in bytes.
    789 */
    790 
    791 
    792 /*
    793  * XXX
    794  *
    795  * Everything about this seems wrong.
    796  */
    797 void
    798 rf_PQ_recover(unsigned long *pbuf, unsigned long *qbuf, unsigned long *abuf, unsigned long *bbuf, unsigned length, unsigned coeff_a, unsigned coeff_b)
    799 {
    800 	unsigned long p, q, a, a0, a1;
    801 	int     col = (29 * coeff_a) + coeff_b;
    802 	unsigned char *q0 = &(rf_qinv[col][0]);
    803 
    804 	length /= 8;
    805 	while (length) {
    806 		p = *pbuf++;
    807 		q = *qbuf++;
    808 		a0 = EXTRACT(p, 0);
    809 		a1 = EXTRACT(q, 0);
    810 		a = q0[a0 << 5 | a1];
    811 #define MF(i) \
    812       a0 = EXTRACT(p,i); \
    813       a1 = EXTRACT(q,i); \
    814       a  = a | INSERT(q0[a0<<5 | a1],i)
    815 
    816 		MF(1);
    817 		MF(2);
    818 		MF(3);
    819 		MF(4);
    820 		MF(5);
    821 		MF(6);
    822 #if 0
    823 		MF(7);
    824 		MF(8);
    825 		MF(9);
    826 		MF(10);
    827 		MF(11);
    828 		MF(12);
    829 #endif				/* 0 */
    830 		*abuf++ = a;
    831 		*bbuf++ = a ^ p;
    832 		length--;
    833 	}
    834 }
    835 /*
    836    Lost parity and a data column. Recover that data column.
    837    Assume col coeff is lost. Let q the contents of Q after
    838    all surviving data columns have been q-xored out of it.
    839    Then we have the equation
    840 
    841    q[28-coeff][a_i ^ r_i+1] = q
    842 
    843    but q is cyclic with period 31.
    844    So q[3+coeff][q[28-coeff][a_i ^ r_{i+1}]] =
    845       q[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} .
    846 
    847    so a_i = r_{coeff+1} ^ q[3+coeff][q]
    848 
    849    The routine is passed q buffer and the buffer
   the data is to be recovered into. They can be the same.
    851 */
    852 
    853 
    854 
    855 static void
    856 rf_InvertQ(
    857     unsigned long *qbuf,
    858     unsigned long *abuf,
    859     unsigned length,
    860     unsigned coeff)
    861 {
    862 	unsigned long a, new;
    863 	unsigned long a1, a2;
    864 	unsigned int *q = &(rf_qfor[3 + coeff][0]);
    865 	unsigned r = rf_rn[coeff + 1];
    866 
    867 	/* 13 5 bit quants in a 64 bit word */
    868 	length /= 8;
    869 	while (length) {
    870 		a = *qbuf++;
    871 		a1 = EXTRACT(a, 0);
    872 		a2 = EXTRACT(a, 1);
    873 		a1 = r ^ q[a1];
    874 		a2 = r ^ q[a2];
    875 		new = INSERT(a2, 1) | a1;
    876 #define M(i,j) \
    877       a1 = EXTRACT(a,i); \
    878       a2 = EXTRACT(a,j); \
    879       a1 = r ^ q[a1]; \
    880       a2 = r ^ q[a2]; \
    881       new = new | INSERT(a1,i) | INSERT(a2,j)
    882 
    883 		M(2, 3);
    884 		M(4, 5);
    885 		M(5, 6);
    886 #if RF_LONGSHIFT > 2
    887 		M(7, 8);
    888 		M(9, 10);
    889 		M(11, 12);
    890 #endif				/* RF_LONGSHIFT > 2 */
    891 		*abuf++ = new;
    892 		length--;
    893 	}
    894 }
    895 #endif				/* (RF_INCLUDE_DECL_PQ > 0) ||
    896 				 * (RF_INCLUDE_RAID6 > 0) */
    897