Home | History | Annotate | Line # | Download | only in raidframe
rf_pq.c revision 1.9
      1 /*	$NetBSD: rf_pq.c,v 1.9 2001/07/18 06:45:34 thorpej Exp $	*/
      2 /*
      3  * Copyright (c) 1995 Carnegie-Mellon University.
      4  * All rights reserved.
      5  *
      6  * Author: Daniel Stodolsky
      7  *
      8  * Permission to use, copy, modify and distribute this software and
      9  * its documentation is hereby granted, provided that both the copyright
     10  * notice and this permission notice appear in all copies of the
     11  * software, derivative works or modified versions, and any portions
     12  * thereof, and that both notices appear in supporting documentation.
     13  *
     14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     17  *
     18  * Carnegie Mellon requests users of this software to return to
     19  *
     20  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     21  *  School of Computer Science
     22  *  Carnegie Mellon University
     23  *  Pittsburgh PA 15213-3890
     24  *
     25  * any improvements or extensions that they make and grant Carnegie the
     26  * rights to redistribute these changes.
     27  */
     28 
     29 /*
     30  * Code for RAID level 6 (P + Q) disk array architecture.
     31  */
     32 
     33 #include "rf_archs.h"
     34 
     35 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0)
     36 
     37 #include "rf_types.h"
     38 #include "rf_raid.h"
     39 #include "rf_dag.h"
     40 #include "rf_dagffrd.h"
     41 #include "rf_dagffwr.h"
     42 #include "rf_dagdegrd.h"
     43 #include "rf_dagdegwr.h"
     44 #include "rf_dagutils.h"
     45 #include "rf_dagfuncs.h"
     46 #include "rf_etimer.h"
     47 #include "rf_pqdeg.h"
     48 #include "rf_general.h"
     49 #include "rf_map.h"
     50 #include "rf_pq.h"
     51 
     52 RF_RedFuncs_t rf_pFuncs = {rf_RegularONPFunc, "Regular Old-New P", rf_SimpleONPFunc, "Simple Old-New P"};
     53 RF_RedFuncs_t rf_pRecoveryFuncs = {rf_RecoveryPFunc, "Recovery P Func", rf_RecoveryPFunc, "Recovery P Func"};
     54 
     55 int
     56 rf_RegularONPFunc(node)
     57 	RF_DagNode_t *node;
     58 {
     59 	return (rf_RegularXorFunc(node));
     60 }
     61 /*
     62    same as simpleONQ func, but the coefficient is always 1
     63 */
     64 
     65 int
     66 rf_SimpleONPFunc(node)
     67 	RF_DagNode_t *node;
     68 {
     69 	return (rf_SimpleXorFunc(node));
     70 }
     71 
     72 int
     73 rf_RecoveryPFunc(node)
     74 	RF_DagNode_t *node;
     75 {
     76 	return (rf_RecoveryXorFunc(node));
     77 }
     78 
     79 int
     80 rf_RegularPFunc(node)
     81 	RF_DagNode_t *node;
     82 {
     83 	return (rf_RegularXorFunc(node));
     84 }
     85 #endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0) */
     86 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
     87 
/* Forward declarations for the file-local Q helpers defined further down. */

/* Folds the difference between an old and a new data buffer into dest. */
static void QDelta(char *dest, char *obuf, char *nbuf, unsigned length, unsigned char coeff);

/* Inverts the Q encoding of qbuf for column coeff, writing into abuf. */
static void rf_InvertQ(unsigned long *qbuf, unsigned long *abuf, unsigned length, unsigned coeff);
     94 
     95 RF_RedFuncs_t rf_qFuncs = {rf_RegularONQFunc, "Regular Old-New Q", rf_SimpleONQFunc, "Simple Old-New Q"};
     96 RF_RedFuncs_t rf_qRecoveryFuncs = {rf_RecoveryQFunc, "Recovery Q Func", rf_RecoveryQFunc, "Recovery Q Func"};
     97 RF_RedFuncs_t rf_pqRecoveryFuncs = {rf_RecoveryPQFunc, "Recovery PQ Func", rf_RecoveryPQFunc, "Recovery PQ Func"};
     98 
     99 void
    100 rf_PQDagSelect(
    101     RF_Raid_t * raidPtr,
    102     RF_IoType_t type,
    103     RF_AccessStripeMap_t * asmap,
    104     RF_VoidFuncPtr * createFunc)
    105 {
    106 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    107 	unsigned ndfail = asmap->numDataFailed;
    108 	unsigned npfail = asmap->numParityFailed;
    109 	unsigned ntfail = npfail + ndfail;
    110 
    111 	RF_ASSERT(RF_IO_IS_R_OR_W(type));
    112 	if (ntfail > 2) {
    113 		RF_ERRORMSG("more than two disks failed in a single group!  Aborting I/O operation.\n");
    114 		 /* *infoFunc = */ *createFunc = NULL;
    115 		return;
    116 	}
    117 	/* ok, we can do this I/O */
    118 	if (type == RF_IO_TYPE_READ) {
    119 		switch (ndfail) {
    120 		case 0:
    121 			/* fault free read */
    122 			*createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;	/* same as raid 5 */
    123 			break;
    124 		case 1:
    125 			/* lost a single data unit */
    126 			/* two cases: (1) parity is not lost. do a normal raid
    127 			 * 5 reconstruct read. (2) parity is lost. do a
    128 			 * reconstruct read using "q". */
    129 			if (ntfail == 2) {	/* also lost redundancy */
    130 				if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY)
    131 					*createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateReadDAG;
    132 				else
    133 					*createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateReadDAG;
    134 			} else {
    135 				/* P and Q are ok. But is there a failure in
    136 				 * some unaccessed data unit? */
    137 				if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
    138 					*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
    139 				else
    140 					*createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateReadDAG;
    141 			}
    142 			break;
    143 		case 2:
    144 			/* lost two data units */
    145 			/* *infoFunc = PQOneTwo; */
    146 			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
    147 			break;
    148 		}
    149 		return;
    150 	}
    151 	/* a write */
    152 	switch (ntfail) {
    153 	case 0:		/* fault free */
    154 		if (rf_suppressLocksAndLargeWrites ||
    155 		    (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) ||
    156 			(asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
    157 
    158 			*createFunc = (RF_VoidFuncPtr) rf_PQCreateSmallWriteDAG;
    159 		} else {
    160 			*createFunc = (RF_VoidFuncPtr) rf_PQCreateLargeWriteDAG;
    161 		}
    162 		break;
    163 
    164 	case 1:		/* single disk fault */
    165 		if (npfail == 1) {
    166 			RF_ASSERT((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q));
    167 			if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) {	/* q died, treat like
    168 										 * normal mode raid5
    169 										 * write. */
    170 				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
    171 				    || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
    172 					*createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateSmallWriteDAG;
    173 				else
    174 					*createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateLargeWriteDAG;
    175 			} else {/* parity died, small write only updating Q */
    176 				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
    177 				    || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
    178 					*createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateSmallWriteDAG;
    179 				else
    180 					*createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateLargeWriteDAG;
    181 			}
    182 		} else {	/* data missing. Do a P reconstruct write if
    183 				 * only a single data unit is lost in the
    184 				 * stripe, otherwise a PQ reconstruct write. */
    185 			if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
    186 				*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
    187 			else
    188 				*createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateWriteDAG;
    189 		}
    190 		break;
    191 
    192 	case 2:		/* two disk faults */
    193 		switch (npfail) {
    194 		case 2:	/* both p and q dead */
    195 			*createFunc = (RF_VoidFuncPtr) rf_PQ_011_CreateWriteDAG;
    196 			break;
    197 		case 1:	/* either p or q and dead data */
    198 			RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA);
    199 			RF_ASSERT((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q));
    200 			if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)
    201 				*createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateWriteDAG;
    202 			else
    203 				*createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateWriteDAG;
    204 			break;
    205 		case 0:	/* double data loss */
    206 			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
    207 			break;
    208 		}
    209 		break;
    210 
    211 	default:		/* more than 2 disk faults */
    212 		*createFunc = NULL;
    213 		RF_PANIC();
    214 	}
    215 	return;
    216 }
    217 /*
    218    Used as a stop gap info function
    219 */
#if 0
/* Disabled stop-gap info function: reports one successor, one antecedent. */
static void
PQOne(RF_Raid_t *raidPtr, int *nSucc, int *nAnte, RF_AccessStripeMap_t *asmap)
{
	*nAnte = 1;
	*nSucc = *nAnte;
}

/* Disabled stop-gap info function: reports one successor, two antecedents. */
static void
PQOneTwo(RF_Raid_t *raidPtr, int *nSucc, int *nAnte, RF_AccessStripeMap_t *asmap)
{
	*nSucc = 1;
	*nAnte = 2;
}
#endif
    242 
    243 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG)
    244 {
    245 	rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2,
    246 	    rf_RegularPQFunc, RF_FALSE);
    247 }
    248 
    249 int
    250 rf_RegularONQFunc(node)
    251 	RF_DagNode_t *node;
    252 {
    253 	int     np = node->numParams;
    254 	int     d;
    255 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    256 	int     i;
    257 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    258 	RF_Etimer_t timer;
    259 	char   *qbuf, *qpbuf;
    260 	char   *obuf, *nbuf;
    261 	RF_PhysDiskAddr_t *old, *new;
    262 	unsigned long coeff;
    263 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    264 
    265 	RF_ETIMER_START(timer);
    266 
    267 	d = (np - 3) / 4;
    268 	RF_ASSERT(4 * d + 3 == np);
    269 	qbuf = (char *) node->params[2 * d + 1].p;	/* q buffer */
    270 	for (i = 0; i < d; i++) {
    271 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    272 		obuf = (char *) node->params[2 * i + 1].p;
    273 		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
    274 		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
    275 		RF_ASSERT(new->numSector == old->numSector);
    276 		RF_ASSERT(new->raidAddress == old->raidAddress);
    277 		/* the stripe unit within the stripe tells us the coefficient
    278 		 * to use for the multiply. */
    279 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
    280 		/* compute the data unit offset within the column, then add
    281 		 * one */
    282 		coeff = (coeff % raidPtr->Layout.numDataCol);
    283 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
    284 		QDelta(qpbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    285 	}
    286 
    287 	RF_ETIMER_STOP(timer);
    288 	RF_ETIMER_EVAL(timer);
    289 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    290 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
    291 					 * I/O in this node */
    292 	return (0);
    293 }
    294 /*
    295    See the SimpleXORFunc for the difference between a simple and regular func.
    296    These Q functions should be used for
    297 
    298          new q = Q(data,old data,old q)
    299 
    300    style updates and not for
    301 
    302          q = ( new data, new data, .... )
    303 
    304    computations.
    305 
    306    The simple q takes 2(2d+1)+1 params, where d is the number
    307    of stripes written. The order of params is
    308    old data pda_0, old data buffer_0, old data pda_1, old data buffer_1, ... old data pda_d, old data buffer_d
    309    [2d] old q pda_0, old q buffer
    310    [2d_2] new data pda_0, new data buffer_0, ...                                    new data pda_d, new data buffer_d
    311    raidPtr
    312 */
    313 
    314 int
    315 rf_SimpleONQFunc(node)
    316 	RF_DagNode_t *node;
    317 {
    318 	int     np = node->numParams;
    319 	int     d;
    320 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    321 	int     i;
    322 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    323 	RF_Etimer_t timer;
    324 	char   *qbuf;
    325 	char   *obuf, *nbuf;
    326 	RF_PhysDiskAddr_t *old, *new;
    327 	unsigned long coeff;
    328 
    329 	RF_ETIMER_START(timer);
    330 
    331 	d = (np - 3) / 4;
    332 	RF_ASSERT(4 * d + 3 == np);
    333 	qbuf = (char *) node->params[2 * d + 1].p;	/* q buffer */
    334 	for (i = 0; i < d; i++) {
    335 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    336 		obuf = (char *) node->params[2 * i + 1].p;
    337 		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
    338 		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
    339 		RF_ASSERT(new->numSector == old->numSector);
    340 		RF_ASSERT(new->raidAddress == old->raidAddress);
    341 		/* the stripe unit within the stripe tells us the coefficient
    342 		 * to use for the multiply. */
    343 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
    344 		/* compute the data unit offset within the column, then add
    345 		 * one */
    346 		coeff = (coeff % raidPtr->Layout.numDataCol);
    347 		QDelta(qbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    348 	}
    349 
    350 	RF_ETIMER_STOP(timer);
    351 	RF_ETIMER_EVAL(timer);
    352 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    353 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
    354 					 * I/O in this node */
    355 	return (0);
    356 }
    357 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG)
    358 {
    359 	rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_qFuncs);
    360 }
    361 
    362 static void RegularQSubr(RF_DagNode_t *node, char   *qbuf);
    363 
    364 static void
    365 RegularQSubr(node, qbuf)
    366 	RF_DagNode_t *node;
    367 	char   *qbuf;
    368 {
    369 	int     np = node->numParams;
    370 	int     d;
    371 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    372 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    373 	int     i;
    374 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    375 	RF_Etimer_t timer;
    376 	char   *obuf, *qpbuf;
    377 	RF_PhysDiskAddr_t *old;
    378 	unsigned long coeff;
    379 
    380 	RF_ETIMER_START(timer);
    381 
    382 	d = (np - 1) / 2;
    383 	RF_ASSERT(2 * d + 1 == np);
    384 	for (i = 0; i < d; i++) {
    385 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    386 		obuf = (char *) node->params[2 * i + 1].p;
    387 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
    388 		/* compute the data unit offset within the column, then add
    389 		 * one */
    390 		coeff = (coeff % raidPtr->Layout.numDataCol);
    391 		/* the input buffers may not all be aligned with the start of
    392 		 * the stripe. so shift by their sector offset within the
    393 		 * stripe unit */
    394 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
    395 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    396 	}
    397 
    398 	RF_ETIMER_STOP(timer);
    399 	RF_ETIMER_EVAL(timer);
    400 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    401 }
    402 /*
    403    used in degraded writes.
    404 */
    405 
    406 static void DegrQSubr(RF_DagNode_t *node);
    407 
    408 static void
    409 DegrQSubr(node)
    410 	RF_DagNode_t *node;
    411 {
    412 	int     np = node->numParams;
    413 	int     d;
    414 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    415 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    416 	int     i;
    417 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    418 	RF_Etimer_t timer;
    419 	char   *qbuf = node->results[1];
    420 	char   *obuf, *qpbuf;
    421 	RF_PhysDiskAddr_t *old;
    422 	unsigned long coeff;
    423 	unsigned fail_start;
    424 	int     j;
    425 
    426 	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
    427 	fail_start = old->startSector % secPerSU;
    428 
    429 	RF_ETIMER_START(timer);
    430 
    431 	d = (np - 2) / 2;
    432 	RF_ASSERT(2 * d + 2 == np);
    433 	for (i = 0; i < d; i++) {
    434 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    435 		obuf = (char *) node->params[2 * i + 1].p;
    436 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
    437 		/* compute the data unit offset within the column, then add
    438 		 * one */
    439 		coeff = (coeff % raidPtr->Layout.numDataCol);
    440 		/* the input buffers may not all be aligned with the start of
    441 		 * the stripe. so shift by their sector offset within the
    442 		 * stripe unit */
    443 		j = old->startSector % secPerSU;
    444 		RF_ASSERT(j >= fail_start);
    445 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
    446 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    447 	}
    448 
    449 	RF_ETIMER_STOP(timer);
    450 	RF_ETIMER_EVAL(timer);
    451 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    452 }
    453 /*
    454    Called by large write code to compute the new parity and the new q.
    455 
    456    structure of the params:
    457 
    458    pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d ( d = numDataCol
    459    raidPtr
    460 
    461    for a total of 2d+1 arguments.
    462    The result buffers results[0], results[1] are the buffers for the p and q,
    463    respectively.
    464 
    465    We compute Q first, then compute P. The P calculation may try to reuse
    466    one of the input buffers for its output, so if we computed P first, we would
    467    corrupt the input for the q calculation.
    468 */
    469 
    470 int
    471 rf_RegularPQFunc(node)
    472 	RF_DagNode_t *node;
    473 {
    474 	RegularQSubr(node, node->results[1]);
    475 	return (rf_RegularXorFunc(node));	/* does the wakeup */
    476 }
    477 
    478 int
    479 rf_RegularQFunc(node)
    480 	RF_DagNode_t *node;
    481 {
    482 	/* Almost ... adjust Qsubr args */
    483 	RegularQSubr(node, node->results[0]);
    484 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
    485 					 * I/O in this node */
    486 	return (0);
    487 }
    488 /*
    489    Called by singly degraded write code to compute the new parity and the new q.
    490 
    491    structure of the params:
    492 
    493    pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d
    494    failedPDA raidPtr
    495 
    496    for a total of 2d+2 arguments.
    497    The result buffers results[0], results[1] are the buffers for the parity and q,
    498    respectively.
    499 
    500    We compute Q first, then compute parity. The parity calculation may try to reuse
    501    one of the input buffers for its output, so if we computed parity first, we would
    502    corrupt the input for the q calculation.
    503 
    504    We treat this identically to the regularPQ case, ignoring the failedPDA extra argument.
    505 */
    506 
    507 void
    508 rf_Degraded_100_PQFunc(node)
    509 	RF_DagNode_t *node;
    510 {
    511 	int     np = node->numParams;
    512 
    513 	RF_ASSERT(np >= 2);
    514 	DegrQSubr(node);
    515 	rf_RecoveryXorFunc(node);
    516 }
    517 
    518 
    519 /*
    520    The two below are used when reading a stripe with a single lost data unit.
    521    The parameters are
    522 
    523    pda_0, buffer_0, .... pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr
    524 
    525    and results[0] contains the data buffer. Which is originally zero-filled.
    526 
    527 */
    528 
    529 /* this Q func is used by the degraded-mode dag functions to recover lost data.
    530  * the second-to-last parameter is the PDA for the failed portion of the access.
    531  * the code here looks at this PDA and assumes that the xor target buffer is
    532  * equal in size to the number of sectors in the failed PDA.  It then uses
    533  * the other PDAs in the parameter list to determine where within the target
    534  * buffer the corresponding data should be xored.
    535  *
    536  * Recall the basic equation is
    537  *
    538  *     Q = ( data_1 + 2 * data_2 ... + k * data_k  ) mod 256
    539  *
    540  * so to recover data_j we need
    541  *
    542  *    J data_j = (Q - data_1 - 2 data_2 ....- k* data_k) mod 256
    543  *
    544  * So the coefficient for each buffer is (255 - data_col), and j should be initialized by
    545  * copying Q into it. Then we need to do a table lookup to convert to solve
    546  *   data_j /= J
    547  *
    548  *
    549  */
    550 int
    551 rf_RecoveryQFunc(node)
    552 	RF_DagNode_t *node;
    553 {
    554 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
    555 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
    556 	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
    557 	int     i;
    558 	RF_PhysDiskAddr_t *pda;
    559 	RF_RaidAddr_t suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
    560 	char   *srcbuf, *destbuf;
    561 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    562 	RF_Etimer_t timer;
    563 	unsigned long coeff;
    564 
    565 	RF_ETIMER_START(timer);
    566 	/* start by copying Q into the buffer */
    567 	bcopy(node->params[node->numParams - 3].p, node->results[0],
    568 	    rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
    569 	for (i = 0; i < node->numParams - 4; i += 2) {
    570 		RF_ASSERT(node->params[i + 1].p != node->results[0]);
    571 		pda = (RF_PhysDiskAddr_t *) node->params[i].p;
    572 		srcbuf = (char *) node->params[i + 1].p;
    573 		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
    574 		destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
    575 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
    576 		/* compute the data unit offset within the column */
    577 		coeff = (coeff % raidPtr->Layout.numDataCol);
    578 		rf_IncQ((unsigned long *) destbuf, (unsigned long *) srcbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
    579 	}
    580 	/* Do the nasty inversion now */
    581 	coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), failedPDA->startSector) % raidPtr->Layout.numDataCol);
    582 	rf_InvertQ(node->results[0], node->results[0], rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
    583 	RF_ETIMER_STOP(timer);
    584 	RF_ETIMER_EVAL(timer);
    585 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    586 	rf_GenericWakeupFunc(node, 0);
    587 	return (0);
    588 }
    589 
    590 int
    591 rf_RecoveryPQFunc(node)
    592 	RF_DagNode_t *node;
    593 {
    594 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
    595 	printf("raid%d: Recovery from PQ not implemented.\n",raidPtr->raidid);
    596 	return (1);
    597 }
    598 /*
    599    Degraded write Q subroutine.
    600    Used when P is dead.
    601    Large-write style Q computation.
    602    Parameters
    603 
    604    (pda,buf),(pda,buf),.....,(failedPDA,bufPtr),failedPDA,raidPtr.
    605 
    606    We ignore failedPDA.
    607 
    608    This is a "simple style" recovery func.
    609 */
    610 
    611 void
    612 rf_PQ_DegradedWriteQFunc(node)
    613 	RF_DagNode_t *node;
    614 {
    615 	int     np = node->numParams;
    616 	int     d;
    617 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
    618 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
    619 	int     i;
    620 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
    621 	RF_Etimer_t timer;
    622 	char   *qbuf = node->results[0];
    623 	char   *obuf, *qpbuf;
    624 	RF_PhysDiskAddr_t *old;
    625 	unsigned long coeff;
    626 	int     fail_start, j;
    627 
    628 	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
    629 	fail_start = old->startSector % secPerSU;
    630 
    631 	RF_ETIMER_START(timer);
    632 
    633 	d = (np - 2) / 2;
    634 	RF_ASSERT(2 * d + 2 == np);
    635 
    636 	for (i = 0; i < d; i++) {
    637 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
    638 		obuf = (char *) node->params[2 * i + 1].p;
    639 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
    640 		/* compute the data unit offset within the column, then add
    641 		 * one */
    642 		coeff = (coeff % raidPtr->Layout.numDataCol);
    643 		j = old->startSector % secPerSU;
    644 		RF_ASSERT(j >= fail_start);
    645 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
    646 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
    647 	}
    648 
    649 	RF_ETIMER_STOP(timer);
    650 	RF_ETIMER_EVAL(timer);
    651 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
    652 	rf_GenericWakeupFunc(node, 0);
    653 }
    654 
    655 
    656 
    657 
    658 /* Q computations */
    659 
    660 /*
    661    coeff - colummn;
    662 
    663    compute  dest ^= qfor[28-coeff][rn[coeff+1] a]
    664 
    665    on 5-bit basis;
    666    length in bytes;
    667 */
    668 
    669 void
    670 rf_IncQ(dest, buf, length, coeff)
    671 	unsigned long *dest;
    672 	unsigned long *buf;
    673 	unsigned length;
    674 	unsigned coeff;
    675 {
    676 	unsigned long a, d, new;
    677 	unsigned long a1, a2;
    678 	unsigned int *q = &(rf_qfor[28 - coeff][0]);
    679 	unsigned r = rf_rn[coeff + 1];
    680 
    681 #define EXTRACT(a,i) ((a >> (5L*i)) & 0x1f)
    682 #define INSERT(a,i) (a << (5L*i))
    683 
    684 	length /= 8;
    685 	/* 13 5 bit quants in a 64 bit word */
    686 	while (length) {
    687 		a = *buf++;
    688 		d = *dest;
    689 		a1 = EXTRACT(a, 0) ^ r;
    690 		a2 = EXTRACT(a, 1) ^ r;
    691 		new = INSERT(a2, 1) | a1;
    692 		a1 = EXTRACT(a, 2) ^ r;
    693 		a2 = EXTRACT(a, 3) ^ r;
    694 		a1 = q[a1];
    695 		a2 = q[a2];
    696 		new = new | INSERT(a1, 2) | INSERT(a2, 3);
    697 		a1 = EXTRACT(a, 4) ^ r;
    698 		a2 = EXTRACT(a, 5) ^ r;
    699 		a1 = q[a1];
    700 		a2 = q[a2];
    701 		new = new | INSERT(a1, 4) | INSERT(a2, 5);
    702 		a1 = EXTRACT(a, 5) ^ r;
    703 		a2 = EXTRACT(a, 6) ^ r;
    704 		a1 = q[a1];
    705 		a2 = q[a2];
    706 		new = new | INSERT(a1, 5) | INSERT(a2, 6);
    707 #if RF_LONGSHIFT > 2
    708 		a1 = EXTRACT(a, 7) ^ r;
    709 		a2 = EXTRACT(a, 8) ^ r;
    710 		a1 = q[a1];
    711 		a2 = q[a2];
    712 		new = new | INSERT(a1, 7) | INSERT(a2, 8);
    713 		a1 = EXTRACT(a, 9) ^ r;
    714 		a2 = EXTRACT(a, 10) ^ r;
    715 		a1 = q[a1];
    716 		a2 = q[a2];
    717 		new = new | INSERT(a1, 9) | INSERT(a2, 10);
    718 		a1 = EXTRACT(a, 11) ^ r;
    719 		a2 = EXTRACT(a, 12) ^ r;
    720 		a1 = q[a1];
    721 		a2 = q[a2];
    722 		new = new | INSERT(a1, 11) | INSERT(a2, 12);
    723 #endif				/* RF_LONGSHIFT > 2 */
    724 		d ^= new;
    725 		*dest++ = d;
    726 		length--;
    727 	}
    728 }
    729 /*
    730    compute
    731 
    732    dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] (old^new) ]
    733 
    734    on a five bit basis.
    735    optimization: compute old ^ new on 64 bit basis.
    736 
    737    length in bytes.
    738 */
    739 
    740 static void
    741 QDelta(
    742     char *dest,
    743     char *obuf,
    744     char *nbuf,
    745     unsigned length,
    746     unsigned char coeff)
    747 {
    748 	unsigned long a, d, new;
    749 	unsigned long a1, a2;
    750 	unsigned int *q = &(rf_qfor[28 - coeff][0]);
    751 	unsigned int r = rf_rn[coeff + 1];
    752 
    753 	r = a1 = a2 = new = d = a = 0; /* XXX for now... */
    754 	q = NULL; /* XXX for now */
    755 
    756 #ifdef _KERNEL
    757 	/* PQ in kernel currently not supported because the encoding/decoding
    758 	 * table is not present */
    759 	memset(dest, 0, length);
    760 #else				/* KERNEL */
    761 	/* this code probably doesn't work and should be rewritten  -wvcii */
    762 	/* 13 5 bit quants in a 64 bit word */
    763 	length /= 8;
    764 	while (length) {
    765 		a = *obuf++;	/* XXX need to reorg to avoid cache conflicts */
    766 		a ^= *nbuf++;
    767 		d = *dest;
    768 		a1 = EXTRACT(a, 0) ^ r;
    769 		a2 = EXTRACT(a, 1) ^ r;
    770 		a1 = q[a1];
    771 		a2 = q[a2];
    772 		new = INSERT(a2, 1) | a1;
    773 		a1 = EXTRACT(a, 2) ^ r;
    774 		a2 = EXTRACT(a, 3) ^ r;
    775 		a1 = q[a1];
    776 		a2 = q[a2];
    777 		new = new | INSERT(a1, 2) | INSERT(a2, 3);
    778 		a1 = EXTRACT(a, 4) ^ r;
    779 		a2 = EXTRACT(a, 5) ^ r;
    780 		a1 = q[a1];
    781 		a2 = q[a2];
    782 		new = new | INSERT(a1, 4) | INSERT(a2, 5);
    783 		a1 = EXTRACT(a, 5) ^ r;
    784 		a2 = EXTRACT(a, 6) ^ r;
    785 		a1 = q[a1];
    786 		a2 = q[a2];
    787 		new = new | INSERT(a1, 5) | INSERT(a2, 6);
    788 #if RF_LONGSHIFT > 2
    789 		a1 = EXTRACT(a, 7) ^ r;
    790 		a2 = EXTRACT(a, 8) ^ r;
    791 		a1 = q[a1];
    792 		a2 = q[a2];
    793 		new = new | INSERT(a1, 7) | INSERT(a2, 8);
    794 		a1 = EXTRACT(a, 9) ^ r;
    795 		a2 = EXTRACT(a, 10) ^ r;
    796 		a1 = q[a1];
    797 		a2 = q[a2];
    798 		new = new | INSERT(a1, 9) | INSERT(a2, 10);
    799 		a1 = EXTRACT(a, 11) ^ r;
    800 		a2 = EXTRACT(a, 12) ^ r;
    801 		a1 = q[a1];
    802 		a2 = q[a2];
    803 		new = new | INSERT(a1, 11) | INSERT(a2, 12);
    804 #endif				/* RF_LONGSHIFT > 2 */
    805 		d ^= new;
    806 		*dest++ = d;
    807 		length--;
    808 	}
    809 #endif				/* _KERNEL */
    810 }
    811 /*
    812    recover columns a and b from the given p and q into
    813    bufs abuf and bbuf. All bufs are word aligned.
    814    Length is in bytes.
    815 */
    816 
    817 
    818 /*
    819  * XXX
    820  *
    821  * Everything about this seems wrong.
    822  */
    823 void
    824 rf_PQ_recover(pbuf, qbuf, abuf, bbuf, length, coeff_a, coeff_b)
    825 	unsigned long *pbuf;
    826 	unsigned long *qbuf;
    827 	unsigned long *abuf;
    828 	unsigned long *bbuf;
    829 	unsigned length;
    830 	unsigned coeff_a;
    831 	unsigned coeff_b;
    832 {
    833 	unsigned long p, q, a, a0, a1;
    834 	int     col = (29 * coeff_a) + coeff_b;
    835 	unsigned char *q0 = &(rf_qinv[col][0]);
    836 
    837 	length /= 8;
    838 	while (length) {
    839 		p = *pbuf++;
    840 		q = *qbuf++;
    841 		a0 = EXTRACT(p, 0);
    842 		a1 = EXTRACT(q, 0);
    843 		a = q0[a0 << 5 | a1];
    844 #define MF(i) \
    845       a0 = EXTRACT(p,i); \
    846       a1 = EXTRACT(q,i); \
    847       a  = a | INSERT(q0[a0<<5 | a1],i)
    848 
    849 		MF(1);
    850 		MF(2);
    851 		MF(3);
    852 		MF(4);
    853 		MF(5);
    854 		MF(6);
    855 #if 0
    856 		MF(7);
    857 		MF(8);
    858 		MF(9);
    859 		MF(10);
    860 		MF(11);
    861 		MF(12);
    862 #endif				/* 0 */
    863 		*abuf++ = a;
    864 		*bbuf++ = a ^ p;
    865 		length--;
    866 	}
    867 }
    868 /*
    869    Lost parity and a data column. Recover that data column.
    870    Assume col coeff is lost. Let q the contents of Q after
    871    all surviving data columns have been q-xored out of it.
    872    Then we have the equation
    873 
    874    q[28-coeff][a_i ^ r_i+1] = q
    875 
    876    but q is cyclic with period 31.
    877    So q[3+coeff][q[28-coeff][a_i ^ r_{i+1}]] =
    878       q[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} .
    879 
    880    so a_i = r_{coeff+1} ^ q[3+coeff][q]
    881 
    882    The routine is passed q buffer and the buffer
    883    the data is to be recoverd into. They can be the same.
    884 */
    885 
    886 
    887 
    888 static void
    889 rf_InvertQ(
    890     unsigned long *qbuf,
    891     unsigned long *abuf,
    892     unsigned length,
    893     unsigned coeff)
    894 {
    895 	unsigned long a, new;
    896 	unsigned long a1, a2;
    897 	unsigned int *q = &(rf_qfor[3 + coeff][0]);
    898 	unsigned r = rf_rn[coeff + 1];
    899 
    900 	/* 13 5 bit quants in a 64 bit word */
    901 	length /= 8;
    902 	while (length) {
    903 		a = *qbuf++;
    904 		a1 = EXTRACT(a, 0);
    905 		a2 = EXTRACT(a, 1);
    906 		a1 = r ^ q[a1];
    907 		a2 = r ^ q[a2];
    908 		new = INSERT(a2, 1) | a1;
    909 #define M(i,j) \
    910       a1 = EXTRACT(a,i); \
    911       a2 = EXTRACT(a,j); \
    912       a1 = r ^ q[a1]; \
    913       a2 = r ^ q[a2]; \
    914       new = new | INSERT(a1,i) | INSERT(a2,j)
    915 
    916 		M(2, 3);
    917 		M(4, 5);
    918 		M(5, 6);
    919 #if RF_LONGSHIFT > 2
    920 		M(7, 8);
    921 		M(9, 10);
    922 		M(11, 12);
    923 #endif				/* RF_LONGSHIFT > 2 */
    924 		*abuf++ = new;
    925 		length--;
    926 	}
    927 }
    928 #endif				/* (RF_INCLUDE_DECL_PQ > 0) ||
    929 				 * (RF_INCLUDE_RAID6 > 0) */
    930