rf_pq.c revision 1.9 1 /* $NetBSD: rf_pq.c,v 1.9 2001/07/18 06:45:34 thorpej Exp $ */
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Daniel Stodolsky
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29 /*
30 * Code for RAID level 6 (P + Q) disk array architecture.
31 */
32
33 #include "rf_archs.h"
34
35 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0)
36
37 #include "rf_types.h"
38 #include "rf_raid.h"
39 #include "rf_dag.h"
40 #include "rf_dagffrd.h"
41 #include "rf_dagffwr.h"
42 #include "rf_dagdegrd.h"
43 #include "rf_dagdegwr.h"
44 #include "rf_dagutils.h"
45 #include "rf_dagfuncs.h"
46 #include "rf_etimer.h"
47 #include "rf_pqdeg.h"
48 #include "rf_general.h"
49 #include "rf_map.h"
50 #include "rf_pq.h"
51
52 RF_RedFuncs_t rf_pFuncs = {rf_RegularONPFunc, "Regular Old-New P", rf_SimpleONPFunc, "Simple Old-New P"};
53 RF_RedFuncs_t rf_pRecoveryFuncs = {rf_RecoveryPFunc, "Recovery P Func", rf_RecoveryPFunc, "Recovery P Func"};
54
55 int
56 rf_RegularONPFunc(node)
57 RF_DagNode_t *node;
58 {
59 return (rf_RegularXorFunc(node));
60 }
61 /*
62 same as simpleONQ func, but the coefficient is always 1
63 */
64
65 int
66 rf_SimpleONPFunc(node)
67 RF_DagNode_t *node;
68 {
69 return (rf_SimpleXorFunc(node));
70 }
71
72 int
73 rf_RecoveryPFunc(node)
74 RF_DagNode_t *node;
75 {
76 return (rf_RecoveryXorFunc(node));
77 }
78
79 int
80 rf_RegularPFunc(node)
81 RF_DagNode_t *node;
82 {
83 return (rf_RegularXorFunc(node));
84 }
85 #endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0) */
86 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
87
88 static void
89 QDelta(char *dest, char *obuf, char *nbuf, unsigned length,
90 unsigned char coeff);
91 static void
92 rf_InvertQ(unsigned long *qbuf, unsigned long *abuf,
93 unsigned length, unsigned coeff);
94
95 RF_RedFuncs_t rf_qFuncs = {rf_RegularONQFunc, "Regular Old-New Q", rf_SimpleONQFunc, "Simple Old-New Q"};
96 RF_RedFuncs_t rf_qRecoveryFuncs = {rf_RecoveryQFunc, "Recovery Q Func", rf_RecoveryQFunc, "Recovery Q Func"};
97 RF_RedFuncs_t rf_pqRecoveryFuncs = {rf_RecoveryPQFunc, "Recovery PQ Func", rf_RecoveryPQFunc, "Recovery PQ Func"};
98
99 void
100 rf_PQDagSelect(
101 RF_Raid_t * raidPtr,
102 RF_IoType_t type,
103 RF_AccessStripeMap_t * asmap,
104 RF_VoidFuncPtr * createFunc)
105 {
106 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
107 unsigned ndfail = asmap->numDataFailed;
108 unsigned npfail = asmap->numParityFailed;
109 unsigned ntfail = npfail + ndfail;
110
111 RF_ASSERT(RF_IO_IS_R_OR_W(type));
112 if (ntfail > 2) {
113 RF_ERRORMSG("more than two disks failed in a single group! Aborting I/O operation.\n");
114 /* *infoFunc = */ *createFunc = NULL;
115 return;
116 }
117 /* ok, we can do this I/O */
118 if (type == RF_IO_TYPE_READ) {
119 switch (ndfail) {
120 case 0:
121 /* fault free read */
122 *createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG; /* same as raid 5 */
123 break;
124 case 1:
125 /* lost a single data unit */
126 /* two cases: (1) parity is not lost. do a normal raid
127 * 5 reconstruct read. (2) parity is lost. do a
128 * reconstruct read using "q". */
129 if (ntfail == 2) { /* also lost redundancy */
130 if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY)
131 *createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateReadDAG;
132 else
133 *createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateReadDAG;
134 } else {
135 /* P and Q are ok. But is there a failure in
136 * some unaccessed data unit? */
137 if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
138 *createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
139 else
140 *createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateReadDAG;
141 }
142 break;
143 case 2:
144 /* lost two data units */
145 /* *infoFunc = PQOneTwo; */
146 *createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
147 break;
148 }
149 return;
150 }
151 /* a write */
152 switch (ntfail) {
153 case 0: /* fault free */
154 if (rf_suppressLocksAndLargeWrites ||
155 (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) ||
156 (asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
157
158 *createFunc = (RF_VoidFuncPtr) rf_PQCreateSmallWriteDAG;
159 } else {
160 *createFunc = (RF_VoidFuncPtr) rf_PQCreateLargeWriteDAG;
161 }
162 break;
163
164 case 1: /* single disk fault */
165 if (npfail == 1) {
166 RF_ASSERT((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q));
167 if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) { /* q died, treat like
168 * normal mode raid5
169 * write. */
170 if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
171 || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
172 *createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateSmallWriteDAG;
173 else
174 *createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateLargeWriteDAG;
175 } else {/* parity died, small write only updating Q */
176 if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
177 || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
178 *createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateSmallWriteDAG;
179 else
180 *createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateLargeWriteDAG;
181 }
182 } else { /* data missing. Do a P reconstruct write if
183 * only a single data unit is lost in the
184 * stripe, otherwise a PQ reconstruct write. */
185 if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
186 *createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
187 else
188 *createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateWriteDAG;
189 }
190 break;
191
192 case 2: /* two disk faults */
193 switch (npfail) {
194 case 2: /* both p and q dead */
195 *createFunc = (RF_VoidFuncPtr) rf_PQ_011_CreateWriteDAG;
196 break;
197 case 1: /* either p or q and dead data */
198 RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA);
199 RF_ASSERT((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q));
200 if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)
201 *createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateWriteDAG;
202 else
203 *createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateWriteDAG;
204 break;
205 case 0: /* double data loss */
206 *createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
207 break;
208 }
209 break;
210
211 default: /* more than 2 disk faults */
212 *createFunc = NULL;
213 RF_PANIC();
214 }
215 return;
216 }
/*
 * Stop-gap info functions.  They are compiled out; kept for reference.
 */
#if 0
/* Reports one successor and one antecedent. */
static void
PQOne(RF_Raid_t *raidPtr, int *nSucc, int *nAnte, RF_AccessStripeMap_t *asmap)
{
	*nAnte = 1;
	*nSucc = *nAnte;
}

/* Reports one successor and two antecedents. */
static void
PQOneTwo(RF_Raid_t *raidPtr, int *nSucc, int *nAnte, RF_AccessStripeMap_t *asmap)
{
	*nSucc = 1;
	*nAnte = 2;
}
#endif
242
243 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG)
244 {
245 rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2,
246 rf_RegularPQFunc, RF_FALSE);
247 }
248
249 int
250 rf_RegularONQFunc(node)
251 RF_DagNode_t *node;
252 {
253 int np = node->numParams;
254 int d;
255 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
256 int i;
257 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
258 RF_Etimer_t timer;
259 char *qbuf, *qpbuf;
260 char *obuf, *nbuf;
261 RF_PhysDiskAddr_t *old, *new;
262 unsigned long coeff;
263 unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
264
265 RF_ETIMER_START(timer);
266
267 d = (np - 3) / 4;
268 RF_ASSERT(4 * d + 3 == np);
269 qbuf = (char *) node->params[2 * d + 1].p; /* q buffer */
270 for (i = 0; i < d; i++) {
271 old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
272 obuf = (char *) node->params[2 * i + 1].p;
273 new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
274 nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
275 RF_ASSERT(new->numSector == old->numSector);
276 RF_ASSERT(new->raidAddress == old->raidAddress);
277 /* the stripe unit within the stripe tells us the coefficient
278 * to use for the multiply. */
279 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
280 /* compute the data unit offset within the column, then add
281 * one */
282 coeff = (coeff % raidPtr->Layout.numDataCol);
283 qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
284 QDelta(qpbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
285 }
286
287 RF_ETIMER_STOP(timer);
288 RF_ETIMER_EVAL(timer);
289 tracerec->q_us += RF_ETIMER_VAL_US(timer);
290 rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no
291 * I/O in this node */
292 return (0);
293 }
294 /*
295 See the SimpleXORFunc for the difference between a simple and regular func.
296 These Q functions should be used for
297
298 new q = Q(data,old data,old q)
299
300 style updates and not for
301
302 q = ( new data, new data, .... )
303
304 computations.
305
306 The simple q takes 2(2d+1)+1 params, where d is the number
307 of stripes written. The order of params is
308 old data pda_0, old data buffer_0, old data pda_1, old data buffer_1, ... old data pda_d, old data buffer_d
309 [2d] old q pda_0, old q buffer
310 [2d_2] new data pda_0, new data buffer_0, ... new data pda_d, new data buffer_d
311 raidPtr
312 */
313
314 int
315 rf_SimpleONQFunc(node)
316 RF_DagNode_t *node;
317 {
318 int np = node->numParams;
319 int d;
320 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
321 int i;
322 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
323 RF_Etimer_t timer;
324 char *qbuf;
325 char *obuf, *nbuf;
326 RF_PhysDiskAddr_t *old, *new;
327 unsigned long coeff;
328
329 RF_ETIMER_START(timer);
330
331 d = (np - 3) / 4;
332 RF_ASSERT(4 * d + 3 == np);
333 qbuf = (char *) node->params[2 * d + 1].p; /* q buffer */
334 for (i = 0; i < d; i++) {
335 old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
336 obuf = (char *) node->params[2 * i + 1].p;
337 new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
338 nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
339 RF_ASSERT(new->numSector == old->numSector);
340 RF_ASSERT(new->raidAddress == old->raidAddress);
341 /* the stripe unit within the stripe tells us the coefficient
342 * to use for the multiply. */
343 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
344 /* compute the data unit offset within the column, then add
345 * one */
346 coeff = (coeff % raidPtr->Layout.numDataCol);
347 QDelta(qbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
348 }
349
350 RF_ETIMER_STOP(timer);
351 RF_ETIMER_EVAL(timer);
352 tracerec->q_us += RF_ETIMER_VAL_US(timer);
353 rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no
354 * I/O in this node */
355 return (0);
356 }
357 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG)
358 {
359 rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_qFuncs);
360 }
361
362 static void RegularQSubr(RF_DagNode_t *node, char *qbuf);
363
364 static void
365 RegularQSubr(node, qbuf)
366 RF_DagNode_t *node;
367 char *qbuf;
368 {
369 int np = node->numParams;
370 int d;
371 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
372 unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
373 int i;
374 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
375 RF_Etimer_t timer;
376 char *obuf, *qpbuf;
377 RF_PhysDiskAddr_t *old;
378 unsigned long coeff;
379
380 RF_ETIMER_START(timer);
381
382 d = (np - 1) / 2;
383 RF_ASSERT(2 * d + 1 == np);
384 for (i = 0; i < d; i++) {
385 old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
386 obuf = (char *) node->params[2 * i + 1].p;
387 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
388 /* compute the data unit offset within the column, then add
389 * one */
390 coeff = (coeff % raidPtr->Layout.numDataCol);
391 /* the input buffers may not all be aligned with the start of
392 * the stripe. so shift by their sector offset within the
393 * stripe unit */
394 qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
395 rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
396 }
397
398 RF_ETIMER_STOP(timer);
399 RF_ETIMER_EVAL(timer);
400 tracerec->q_us += RF_ETIMER_VAL_US(timer);
401 }
402 /*
403 used in degraded writes.
404 */
405
406 static void DegrQSubr(RF_DagNode_t *node);
407
408 static void
409 DegrQSubr(node)
410 RF_DagNode_t *node;
411 {
412 int np = node->numParams;
413 int d;
414 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
415 unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
416 int i;
417 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
418 RF_Etimer_t timer;
419 char *qbuf = node->results[1];
420 char *obuf, *qpbuf;
421 RF_PhysDiskAddr_t *old;
422 unsigned long coeff;
423 unsigned fail_start;
424 int j;
425
426 old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
427 fail_start = old->startSector % secPerSU;
428
429 RF_ETIMER_START(timer);
430
431 d = (np - 2) / 2;
432 RF_ASSERT(2 * d + 2 == np);
433 for (i = 0; i < d; i++) {
434 old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
435 obuf = (char *) node->params[2 * i + 1].p;
436 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
437 /* compute the data unit offset within the column, then add
438 * one */
439 coeff = (coeff % raidPtr->Layout.numDataCol);
440 /* the input buffers may not all be aligned with the start of
441 * the stripe. so shift by their sector offset within the
442 * stripe unit */
443 j = old->startSector % secPerSU;
444 RF_ASSERT(j >= fail_start);
445 qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
446 rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
447 }
448
449 RF_ETIMER_STOP(timer);
450 RF_ETIMER_EVAL(timer);
451 tracerec->q_us += RF_ETIMER_VAL_US(timer);
452 }
453 /*
454 Called by large write code to compute the new parity and the new q.
455
456 structure of the params:
457
458 pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d ( d = numDataCol
459 raidPtr
460
461 for a total of 2d+1 arguments.
462 The result buffers results[0], results[1] are the buffers for the p and q,
463 respectively.
464
465 We compute Q first, then compute P. The P calculation may try to reuse
466 one of the input buffers for its output, so if we computed P first, we would
467 corrupt the input for the q calculation.
468 */
469
470 int
471 rf_RegularPQFunc(node)
472 RF_DagNode_t *node;
473 {
474 RegularQSubr(node, node->results[1]);
475 return (rf_RegularXorFunc(node)); /* does the wakeup */
476 }
477
478 int
479 rf_RegularQFunc(node)
480 RF_DagNode_t *node;
481 {
482 /* Almost ... adjust Qsubr args */
483 RegularQSubr(node, node->results[0]);
484 rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no
485 * I/O in this node */
486 return (0);
487 }
488 /*
489 Called by singly degraded write code to compute the new parity and the new q.
490
491 structure of the params:
492
493 pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d
494 failedPDA raidPtr
495
496 for a total of 2d+2 arguments.
497 The result buffers results[0], results[1] are the buffers for the parity and q,
498 respectively.
499
500 We compute Q first, then compute parity. The parity calculation may try to reuse
501 one of the input buffers for its output, so if we computed parity first, we would
502 corrupt the input for the q calculation.
503
504 We treat this identically to the regularPQ case, ignoring the failedPDA extra argument.
505 */
506
507 void
508 rf_Degraded_100_PQFunc(node)
509 RF_DagNode_t *node;
510 {
511 int np = node->numParams;
512
513 RF_ASSERT(np >= 2);
514 DegrQSubr(node);
515 rf_RecoveryXorFunc(node);
516 }
517
518
/*
   The two functions below are used when reading a stripe with a single
   lost data unit.  The parameters are

   pda_0, buffer_0, .... pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr

   and results[0] contains the data buffer, which is originally zero-filled.

 */
528
529 /* this Q func is used by the degraded-mode dag functions to recover lost data.
530 * the second-to-last parameter is the PDA for the failed portion of the access.
531 * the code here looks at this PDA and assumes that the xor target buffer is
532 * equal in size to the number of sectors in the failed PDA. It then uses
533 * the other PDAs in the parameter list to determine where within the target
534 * buffer the corresponding data should be xored.
535 *
536 * Recall the basic equation is
537 *
538 * Q = ( data_1 + 2 * data_2 ... + k * data_k ) mod 256
539 *
540 * so to recover data_j we need
541 *
542 * J data_j = (Q - data_1 - 2 data_2 ....- k* data_k) mod 256
543 *
544 * So the coefficient for each buffer is (255 - data_col), and j should be initialized by
545 * copying Q into it. Then we need to do a table lookup to convert to solve
546 * data_j /= J
547 *
548 *
549 */
550 int
551 rf_RecoveryQFunc(node)
552 RF_DagNode_t *node;
553 {
554 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
555 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
556 RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
557 int i;
558 RF_PhysDiskAddr_t *pda;
559 RF_RaidAddr_t suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
560 char *srcbuf, *destbuf;
561 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
562 RF_Etimer_t timer;
563 unsigned long coeff;
564
565 RF_ETIMER_START(timer);
566 /* start by copying Q into the buffer */
567 bcopy(node->params[node->numParams - 3].p, node->results[0],
568 rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
569 for (i = 0; i < node->numParams - 4; i += 2) {
570 RF_ASSERT(node->params[i + 1].p != node->results[0]);
571 pda = (RF_PhysDiskAddr_t *) node->params[i].p;
572 srcbuf = (char *) node->params[i + 1].p;
573 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
574 destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
575 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
576 /* compute the data unit offset within the column */
577 coeff = (coeff % raidPtr->Layout.numDataCol);
578 rf_IncQ((unsigned long *) destbuf, (unsigned long *) srcbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
579 }
580 /* Do the nasty inversion now */
581 coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), failedPDA->startSector) % raidPtr->Layout.numDataCol);
582 rf_InvertQ(node->results[0], node->results[0], rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
583 RF_ETIMER_STOP(timer);
584 RF_ETIMER_EVAL(timer);
585 tracerec->q_us += RF_ETIMER_VAL_US(timer);
586 rf_GenericWakeupFunc(node, 0);
587 return (0);
588 }
589
590 int
591 rf_RecoveryPQFunc(node)
592 RF_DagNode_t *node;
593 {
594 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
595 printf("raid%d: Recovery from PQ not implemented.\n",raidPtr->raidid);
596 return (1);
597 }
598 /*
599 Degraded write Q subroutine.
600 Used when P is dead.
601 Large-write style Q computation.
602 Parameters
603
604 (pda,buf),(pda,buf),.....,(failedPDA,bufPtr),failedPDA,raidPtr.
605
606 We ignore failedPDA.
607
608 This is a "simple style" recovery func.
609 */
610
611 void
612 rf_PQ_DegradedWriteQFunc(node)
613 RF_DagNode_t *node;
614 {
615 int np = node->numParams;
616 int d;
617 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
618 unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
619 int i;
620 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
621 RF_Etimer_t timer;
622 char *qbuf = node->results[0];
623 char *obuf, *qpbuf;
624 RF_PhysDiskAddr_t *old;
625 unsigned long coeff;
626 int fail_start, j;
627
628 old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
629 fail_start = old->startSector % secPerSU;
630
631 RF_ETIMER_START(timer);
632
633 d = (np - 2) / 2;
634 RF_ASSERT(2 * d + 2 == np);
635
636 for (i = 0; i < d; i++) {
637 old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
638 obuf = (char *) node->params[2 * i + 1].p;
639 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
640 /* compute the data unit offset within the column, then add
641 * one */
642 coeff = (coeff % raidPtr->Layout.numDataCol);
643 j = old->startSector % secPerSU;
644 RF_ASSERT(j >= fail_start);
645 qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
646 rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
647 }
648
649 RF_ETIMER_STOP(timer);
650 RF_ETIMER_EVAL(timer);
651 tracerec->q_us += RF_ETIMER_VAL_US(timer);
652 rf_GenericWakeupFunc(node, 0);
653 }
654
655
656
657
658 /* Q computations */
659
660 /*
661 coeff - colummn;
662
663 compute dest ^= qfor[28-coeff][rn[coeff+1] a]
664
665 on 5-bit basis;
666 length in bytes;
667 */
668
669 void
670 rf_IncQ(dest, buf, length, coeff)
671 unsigned long *dest;
672 unsigned long *buf;
673 unsigned length;
674 unsigned coeff;
675 {
676 unsigned long a, d, new;
677 unsigned long a1, a2;
678 unsigned int *q = &(rf_qfor[28 - coeff][0]);
679 unsigned r = rf_rn[coeff + 1];
680
681 #define EXTRACT(a,i) ((a >> (5L*i)) & 0x1f)
682 #define INSERT(a,i) (a << (5L*i))
683
684 length /= 8;
685 /* 13 5 bit quants in a 64 bit word */
686 while (length) {
687 a = *buf++;
688 d = *dest;
689 a1 = EXTRACT(a, 0) ^ r;
690 a2 = EXTRACT(a, 1) ^ r;
691 new = INSERT(a2, 1) | a1;
692 a1 = EXTRACT(a, 2) ^ r;
693 a2 = EXTRACT(a, 3) ^ r;
694 a1 = q[a1];
695 a2 = q[a2];
696 new = new | INSERT(a1, 2) | INSERT(a2, 3);
697 a1 = EXTRACT(a, 4) ^ r;
698 a2 = EXTRACT(a, 5) ^ r;
699 a1 = q[a1];
700 a2 = q[a2];
701 new = new | INSERT(a1, 4) | INSERT(a2, 5);
702 a1 = EXTRACT(a, 5) ^ r;
703 a2 = EXTRACT(a, 6) ^ r;
704 a1 = q[a1];
705 a2 = q[a2];
706 new = new | INSERT(a1, 5) | INSERT(a2, 6);
707 #if RF_LONGSHIFT > 2
708 a1 = EXTRACT(a, 7) ^ r;
709 a2 = EXTRACT(a, 8) ^ r;
710 a1 = q[a1];
711 a2 = q[a2];
712 new = new | INSERT(a1, 7) | INSERT(a2, 8);
713 a1 = EXTRACT(a, 9) ^ r;
714 a2 = EXTRACT(a, 10) ^ r;
715 a1 = q[a1];
716 a2 = q[a2];
717 new = new | INSERT(a1, 9) | INSERT(a2, 10);
718 a1 = EXTRACT(a, 11) ^ r;
719 a2 = EXTRACT(a, 12) ^ r;
720 a1 = q[a1];
721 a2 = q[a2];
722 new = new | INSERT(a1, 11) | INSERT(a2, 12);
723 #endif /* RF_LONGSHIFT > 2 */
724 d ^= new;
725 *dest++ = d;
726 length--;
727 }
728 }
729 /*
730 compute
731
732 dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] (old^new) ]
733
734 on a five bit basis.
735 optimization: compute old ^ new on 64 bit basis.
736
737 length in bytes.
738 */
739
740 static void
741 QDelta(
742 char *dest,
743 char *obuf,
744 char *nbuf,
745 unsigned length,
746 unsigned char coeff)
747 {
748 unsigned long a, d, new;
749 unsigned long a1, a2;
750 unsigned int *q = &(rf_qfor[28 - coeff][0]);
751 unsigned int r = rf_rn[coeff + 1];
752
753 r = a1 = a2 = new = d = a = 0; /* XXX for now... */
754 q = NULL; /* XXX for now */
755
756 #ifdef _KERNEL
757 /* PQ in kernel currently not supported because the encoding/decoding
758 * table is not present */
759 memset(dest, 0, length);
760 #else /* KERNEL */
761 /* this code probably doesn't work and should be rewritten -wvcii */
762 /* 13 5 bit quants in a 64 bit word */
763 length /= 8;
764 while (length) {
765 a = *obuf++; /* XXX need to reorg to avoid cache conflicts */
766 a ^= *nbuf++;
767 d = *dest;
768 a1 = EXTRACT(a, 0) ^ r;
769 a2 = EXTRACT(a, 1) ^ r;
770 a1 = q[a1];
771 a2 = q[a2];
772 new = INSERT(a2, 1) | a1;
773 a1 = EXTRACT(a, 2) ^ r;
774 a2 = EXTRACT(a, 3) ^ r;
775 a1 = q[a1];
776 a2 = q[a2];
777 new = new | INSERT(a1, 2) | INSERT(a2, 3);
778 a1 = EXTRACT(a, 4) ^ r;
779 a2 = EXTRACT(a, 5) ^ r;
780 a1 = q[a1];
781 a2 = q[a2];
782 new = new | INSERT(a1, 4) | INSERT(a2, 5);
783 a1 = EXTRACT(a, 5) ^ r;
784 a2 = EXTRACT(a, 6) ^ r;
785 a1 = q[a1];
786 a2 = q[a2];
787 new = new | INSERT(a1, 5) | INSERT(a2, 6);
788 #if RF_LONGSHIFT > 2
789 a1 = EXTRACT(a, 7) ^ r;
790 a2 = EXTRACT(a, 8) ^ r;
791 a1 = q[a1];
792 a2 = q[a2];
793 new = new | INSERT(a1, 7) | INSERT(a2, 8);
794 a1 = EXTRACT(a, 9) ^ r;
795 a2 = EXTRACT(a, 10) ^ r;
796 a1 = q[a1];
797 a2 = q[a2];
798 new = new | INSERT(a1, 9) | INSERT(a2, 10);
799 a1 = EXTRACT(a, 11) ^ r;
800 a2 = EXTRACT(a, 12) ^ r;
801 a1 = q[a1];
802 a2 = q[a2];
803 new = new | INSERT(a1, 11) | INSERT(a2, 12);
804 #endif /* RF_LONGSHIFT > 2 */
805 d ^= new;
806 *dest++ = d;
807 length--;
808 }
809 #endif /* _KERNEL */
810 }
811 /*
812 recover columns a and b from the given p and q into
813 bufs abuf and bbuf. All bufs are word aligned.
814 Length is in bytes.
815 */
816
817
818 /*
819 * XXX
820 *
821 * Everything about this seems wrong.
822 */
823 void
824 rf_PQ_recover(pbuf, qbuf, abuf, bbuf, length, coeff_a, coeff_b)
825 unsigned long *pbuf;
826 unsigned long *qbuf;
827 unsigned long *abuf;
828 unsigned long *bbuf;
829 unsigned length;
830 unsigned coeff_a;
831 unsigned coeff_b;
832 {
833 unsigned long p, q, a, a0, a1;
834 int col = (29 * coeff_a) + coeff_b;
835 unsigned char *q0 = &(rf_qinv[col][0]);
836
837 length /= 8;
838 while (length) {
839 p = *pbuf++;
840 q = *qbuf++;
841 a0 = EXTRACT(p, 0);
842 a1 = EXTRACT(q, 0);
843 a = q0[a0 << 5 | a1];
844 #define MF(i) \
845 a0 = EXTRACT(p,i); \
846 a1 = EXTRACT(q,i); \
847 a = a | INSERT(q0[a0<<5 | a1],i)
848
849 MF(1);
850 MF(2);
851 MF(3);
852 MF(4);
853 MF(5);
854 MF(6);
855 #if 0
856 MF(7);
857 MF(8);
858 MF(9);
859 MF(10);
860 MF(11);
861 MF(12);
862 #endif /* 0 */
863 *abuf++ = a;
864 *bbuf++ = a ^ p;
865 length--;
866 }
867 }
868 /*
869 Lost parity and a data column. Recover that data column.
870 Assume col coeff is lost. Let q the contents of Q after
871 all surviving data columns have been q-xored out of it.
872 Then we have the equation
873
874 q[28-coeff][a_i ^ r_i+1] = q
875
876 but q is cyclic with period 31.
877 So q[3+coeff][q[28-coeff][a_i ^ r_{i+1}]] =
878 q[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} .
879
880 so a_i = r_{coeff+1} ^ q[3+coeff][q]
881
882 The routine is passed q buffer and the buffer
883 the data is to be recoverd into. They can be the same.
884 */
885
886
887
888 static void
889 rf_InvertQ(
890 unsigned long *qbuf,
891 unsigned long *abuf,
892 unsigned length,
893 unsigned coeff)
894 {
895 unsigned long a, new;
896 unsigned long a1, a2;
897 unsigned int *q = &(rf_qfor[3 + coeff][0]);
898 unsigned r = rf_rn[coeff + 1];
899
900 /* 13 5 bit quants in a 64 bit word */
901 length /= 8;
902 while (length) {
903 a = *qbuf++;
904 a1 = EXTRACT(a, 0);
905 a2 = EXTRACT(a, 1);
906 a1 = r ^ q[a1];
907 a2 = r ^ q[a2];
908 new = INSERT(a2, 1) | a1;
909 #define M(i,j) \
910 a1 = EXTRACT(a,i); \
911 a2 = EXTRACT(a,j); \
912 a1 = r ^ q[a1]; \
913 a2 = r ^ q[a2]; \
914 new = new | INSERT(a1,i) | INSERT(a2,j)
915
916 M(2, 3);
917 M(4, 5);
918 M(5, 6);
919 #if RF_LONGSHIFT > 2
920 M(7, 8);
921 M(9, 10);
922 M(11, 12);
923 #endif /* RF_LONGSHIFT > 2 */
924 *abuf++ = new;
925 length--;
926 }
927 }
928 #endif /* (RF_INCLUDE_DECL_PQ > 0) ||
929 * (RF_INCLUDE_RAID6 > 0) */
930