rf_pq.c revision 1.1 1 /* $NetBSD: rf_pq.c,v 1.1 1998/11/13 04:20:32 oster Exp $ */
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Daniel Stodolsky
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29 /*
30 * Code for RAID level 6 (P + Q) disk array architecture.
31 *
32 * :
33 * Log: rf_pq.c,v
34 * Revision 1.33 1996/11/05 21:10:40 jimz
35 * failed pda generalization
36 *
37 * Revision 1.32 1996/07/31 16:29:50 jimz
38 * "fix" math on 32-bit machines using RF_LONGSHIFT
39 * (may be incorrect)
40 *
41 * Revision 1.31 1996/07/31 15:35:01 jimz
42 * evenodd changes; bugfixes for double-degraded archs, generalize
43 * some formerly PQ-only functions
44 *
45 * Revision 1.30 1996/07/27 23:36:08 jimz
46 * Solaris port of simulator
47 *
48 * Revision 1.29 1996/07/22 19:52:16 jimz
49 * switched node params to RF_DagParam_t, a union of
50 * a 64-bit int and a void *, for better portability
51 * attempted hpux port, but failed partway through for
52 * lack of a single C compiler capable of compiling all
53 * source files
54 *
55 * Revision 1.28 1996/06/09 02:36:46 jimz
56 * lots of little crufty cleanup- fixup whitespace
57 * issues, comment #ifdefs, improve typing in some
58 * places (esp size-related)
59 *
60 * Revision 1.27 1996/06/07 21:33:04 jimz
61 * begin using consistent types for sector numbers,
62 * stripe numbers, row+col numbers, recon unit numbers
63 *
64 * Revision 1.26 1996/06/02 17:31:48 jimz
65 * Moved a lot of global stuff into array structure, where it belongs.
66 * Fixed up paritylogging, pss modules in this manner. Some general
67 * code cleanup. Removed lots of dead code, some dead files.
68 *
69 * Revision 1.25 1996/05/31 22:26:54 jimz
70 * fix a lot of mapping problems, memory allocation problems
71 * found some weird lock issues, fixed 'em
72 * more code cleanup
73 *
74 * Revision 1.24 1996/05/30 23:22:16 jimz
75 * bugfixes of serialization, timing problems
76 * more cleanup
77 *
78 * Revision 1.23 1996/05/30 12:59:18 jimz
79 * make etimer happier, more portable
80 *
81 * Revision 1.22 1996/05/27 18:56:37 jimz
82 * more code cleanup
83 * better typing
84 * compiles in all 3 environments
85 *
86 * Revision 1.21 1996/05/24 22:17:04 jimz
87 * continue code + namespace cleanup
88 * typed a bunch of flags
89 *
90 * Revision 1.20 1996/05/24 04:28:55 jimz
91 * release cleanup ckpt
92 *
93 * Revision 1.19 1996/05/23 21:46:35 jimz
94 * checkpoint in code cleanup (release prep)
95 * lots of types, function names have been fixed
96 *
97 * Revision 1.18 1996/05/23 00:33:23 jimz
98 * code cleanup: move all debug decls to rf_options.c, all extern
99 * debug decls to rf_options.h, all debug vars preceded by rf_
100 *
101 * Revision 1.17 1996/05/18 19:51:34 jimz
102 * major code cleanup- fix syntax, make some types consistent,
103 * add prototypes, clean out dead code, et cetera
104 *
105 * Revision 1.16 1996/05/17 14:52:04 wvcii
106 * added prototyping to QDelta()
107 * - changed buf params from volatile unsigned long * to char *
108 * changed QDelta for kernel
109 * - just bzero the buf since kernel doesn't include pq decode table
110 *
111 * Revision 1.15 1996/05/03 19:40:20 wvcii
112 * added includes for dag library
113 *
114 * Revision 1.14 1995/12/12 18:10:06 jimz
115 * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
116 * fix 80-column brain damage in comments
117 *
118 * Revision 1.13 1995/11/30 16:19:55 wvcii
119 * added copyright info
120 *
121 * Revision 1.12 1995/11/07 16:13:47 wvcii
122 * changed PQDagSelect prototype
123 * function no longer returns numHdrSucc, numTermAnt
124 * note: this file contains node functions which should be
125 * moved to rf_dagfuncs.c so that all node funcs are bundled together
126 *
127 * Revision 1.11 1995/10/04 03:50:33 wvcii
128 * removed panics, minor code cleanup in dag selection
129 *
130 *
131 */
132
133 #include "rf_archs.h"
134 #include "rf_types.h"
135 #include "rf_raid.h"
136 #include "rf_dag.h"
137 #include "rf_dagffrd.h"
138 #include "rf_dagffwr.h"
139 #include "rf_dagdegrd.h"
140 #include "rf_dagdegwr.h"
141 #include "rf_dagutils.h"
142 #include "rf_dagfuncs.h"
143 #include "rf_threadid.h"
144 #include "rf_etimer.h"
145 #include "rf_pqdeg.h"
146 #include "rf_general.h"
147 #include "rf_map.h"
148 #include "rf_pq.h"
149 #include "rf_sys.h"
150
151 RF_RedFuncs_t rf_pFuncs = { rf_RegularONPFunc, "Regular Old-New P", rf_SimpleONPFunc, "Simple Old-New P" };
152 RF_RedFuncs_t rf_pRecoveryFuncs = { rf_RecoveryPFunc, "Recovery P Func", rf_RecoveryPFunc, "Recovery P Func" };
153
154 int rf_RegularONPFunc(node)
155 RF_DagNode_t *node;
156 {
157 return(rf_RegularXorFunc(node));
158 }
159
160 /*
161 same as simpleONQ func, but the coefficient is always 1
162 */
163
164 int rf_SimpleONPFunc(node)
165 RF_DagNode_t *node;
166 {
167 return(rf_SimpleXorFunc(node));
168 }
169
170 int rf_RecoveryPFunc(node)
171 RF_DagNode_t *node;
172 {
173 return(rf_RecoveryXorFunc(node));
174 }
175
176 int rf_RegularPFunc(node)
177 RF_DagNode_t *node;
178 {
179 return(rf_RegularXorFunc(node));
180 }
181
182 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
183
184 static void QDelta(char *dest, char *obuf, char *nbuf, unsigned length,
185 unsigned char coeff);
186 static void rf_InvertQ(unsigned long *qbuf, unsigned long *abuf,
187 unsigned length, unsigned coeff);
188
189 RF_RedFuncs_t rf_qFuncs = { rf_RegularONQFunc, "Regular Old-New Q", rf_SimpleONQFunc, "Simple Old-New Q" };
190 RF_RedFuncs_t rf_qRecoveryFuncs = { rf_RecoveryQFunc, "Recovery Q Func", rf_RecoveryQFunc, "Recovery Q Func" };
191 RF_RedFuncs_t rf_pqRecoveryFuncs = { rf_RecoveryPQFunc, "Recovery PQ Func", rf_RecoveryPQFunc, "Recovery PQ Func" };
192
193 void rf_PQDagSelect(
194 RF_Raid_t *raidPtr,
195 RF_IoType_t type,
196 RF_AccessStripeMap_t *asmap,
197 RF_VoidFuncPtr *createFunc)
198 {
199 RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
200 unsigned ndfail = asmap->numDataFailed;
201 unsigned npfail = asmap->numParityFailed;
202 unsigned ntfail = npfail + ndfail;
203
204 RF_ASSERT(RF_IO_IS_R_OR_W(type));
205 if (ntfail > 2)
206 {
207 RF_ERRORMSG("more than two disks failed in a single group! Aborting I/O operation.\n");
208 /* *infoFunc = */ *createFunc = NULL;
209 return;
210 }
211
212 /* ok, we can do this I/O */
213 if (type == RF_IO_TYPE_READ)
214 {
215 switch (ndfail)
216 {
217 case 0:
218 /* fault free read */
219 *createFunc = rf_CreateFaultFreeReadDAG; /* same as raid 5 */
220 break;
221 case 1:
222 /* lost a single data unit */
223 /* two cases:
224 (1) parity is not lost.
225 do a normal raid 5 reconstruct read.
226 (2) parity is lost.
227 do a reconstruct read using "q".
228 */
229 if (ntfail == 2) /* also lost redundancy */
230 {
231 if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY)
232 *createFunc = rf_PQ_110_CreateReadDAG;
233 else
234 *createFunc = rf_PQ_101_CreateReadDAG;
235 }
236 else
237 {
238 /* P and Q are ok. But is there a failure
239 in some unaccessed data unit?
240 */
241 if (rf_NumFailedDataUnitsInStripe(raidPtr,asmap)==2)
242 *createFunc = rf_PQ_200_CreateReadDAG;
243 else
244 *createFunc = rf_PQ_100_CreateReadDAG;
245 }
246 break;
247 case 2:
248 /* lost two data units */
249 /* *infoFunc = PQOneTwo; */
250 *createFunc = rf_PQ_200_CreateReadDAG;
251 break;
252 }
253 return;
254 }
255
256 /* a write */
257 switch (ntfail)
258 {
259 case 0: /* fault free */
260 if (rf_suppressLocksAndLargeWrites ||
261 (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) ||
262 (asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
263
264 *createFunc = rf_PQCreateSmallWriteDAG;
265 }
266 else {
267 *createFunc = rf_PQCreateLargeWriteDAG;
268 }
269 break;
270
271 case 1: /* single disk fault */
272 if (npfail==1)
273 {
274 RF_ASSERT ((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q));
275 if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q)
276 { /* q died, treat like normal mode raid5 write.*/
277 if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
278 || rf_NumFailedDataUnitsInStripe(raidPtr,asmap))
279 *createFunc = rf_PQ_001_CreateSmallWriteDAG;
280 else
281 *createFunc = rf_PQ_001_CreateLargeWriteDAG;
282 }
283 else
284 { /* parity died, small write only updating Q */
285 if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
286 || rf_NumFailedDataUnitsInStripe(raidPtr,asmap))
287 *createFunc = rf_PQ_010_CreateSmallWriteDAG;
288 else
289 *createFunc = rf_PQ_010_CreateLargeWriteDAG;
290 }
291 }
292 else
293 { /* data missing.
294 Do a P reconstruct write if only a single data unit
295 is lost in the stripe, otherwise a PQ reconstruct
296 write. */
297 if (rf_NumFailedDataUnitsInStripe(raidPtr,asmap)==2)
298 *createFunc = rf_PQ_200_CreateWriteDAG;
299 else
300 *createFunc = rf_PQ_100_CreateWriteDAG;
301 }
302 break;
303
304 case 2: /* two disk faults */
305 switch (npfail)
306 {
307 case 2: /* both p and q dead */
308 *createFunc = rf_PQ_011_CreateWriteDAG;
309 break;
310 case 1: /* either p or q and dead data */
311 RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA);
312 RF_ASSERT ((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q));
313 if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)
314 *createFunc = rf_PQ_101_CreateWriteDAG;
315 else
316 *createFunc = rf_PQ_110_CreateWriteDAG;
317 break;
318 case 0: /* double data loss */
319 *createFunc = rf_PQ_200_CreateWriteDAG;
320 break;
321 }
322 break;
323
324 default: /* more than 2 disk faults */
325 *createFunc = NULL;
326 RF_PANIC();
327 }
328 return;
329 }
330
331 /*
332 Used as a stop gap info function
333 */
334 static void PQOne(raidPtr, nSucc, nAnte, asmap)
335 RF_Raid_t *raidPtr;
336 int *nSucc;
337 int *nAnte;
338 RF_AccessStripeMap_t *asmap;
339 {
340 *nSucc = *nAnte = 1;
341 }
342
343 static void PQOneTwo(raidPtr, nSucc, nAnte, asmap)
344 RF_Raid_t *raidPtr;
345 int *nSucc;
346 int *nAnte;
347 RF_AccessStripeMap_t *asmap;
348 {
349 *nSucc = 1;
350 *nAnte = 2;
351 }
352
353 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG)
354 {
355 rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2,
356 rf_RegularPQFunc, RF_FALSE);
357 }
358
359 int rf_RegularONQFunc(node)
360 RF_DagNode_t *node;
361 {
362 int np = node->numParams;
363 int d;
364 RF_Raid_t *raidPtr = (RF_Raid_t *)node->params[np-1].p;
365 int i;
366 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
367 RF_Etimer_t timer;
368 char *qbuf, *qpbuf;
369 char *obuf, *nbuf;
370 RF_PhysDiskAddr_t *old, *new;
371 unsigned long coeff;
372 unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
373
374 RF_ETIMER_START(timer);
375
376 d = (np-3)/4;
377 RF_ASSERT (4*d+3 == np);
378 qbuf = (char *) node->params[2*d+1].p; /* q buffer*/
379 for (i=0; i < d; i++)
380 {
381 old = (RF_PhysDiskAddr_t *) node->params[2*i].p;
382 obuf = (char *) node->params[2*i+1].p;
383 new = (RF_PhysDiskAddr_t *) node->params[2*(d+1+i)].p;
384 nbuf = (char *) node->params[2*(d+1+i)+1].p;
385 RF_ASSERT (new->numSector == old->numSector);
386 RF_ASSERT (new->raidAddress == old->raidAddress);
387 /* the stripe unit within the stripe tells us the coefficient to use
388 for the multiply. */
389 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),new->raidAddress);
390 /* compute the data unit offset within the column, then add one */
391 coeff = (coeff % raidPtr->Layout.numDataCol);
392 qpbuf = qbuf + rf_RaidAddressToByte(raidPtr,old->startSector % secPerSU);
393 QDelta(qpbuf,obuf,nbuf, rf_RaidAddressToByte(raidPtr, old->numSector),coeff);
394 }
395
396 RF_ETIMER_STOP(timer);
397 RF_ETIMER_EVAL(timer);
398 tracerec->q_us += RF_ETIMER_VAL_US(timer);
399 rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no I/O in this node */
400 return(0);
401 }
402
403 /*
404 See the SimpleXORFunc for the difference between a simple and regular func.
405 These Q functions should be used for
406
407 new q = Q(data,old data,old q)
408
409 style updates and not for
410
411 q = ( new data, new data, .... )
412
413 computations.
414
The simple q takes 2(2d+1)+1 = 4d+3 params, where d is the number
of old/new data buffer pairs. The order of params is
[0]    old data pda_0, old data buffer_0, old data pda_1, old data buffer_1, ... old data pda_{d-1}, old data buffer_{d-1}
[2d]   old q pda_0, old q buffer
[2d+2] new data pda_0, new data buffer_0, ... new data pda_{d-1}, new data buffer_{d-1}
[4d+2] raidPtr
421 */
422
423 int rf_SimpleONQFunc(node)
424 RF_DagNode_t *node;
425 {
426 int np = node->numParams;
427 int d;
428 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-1].p;
429 int i;
430 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
431 RF_Etimer_t timer;
432 char *qbuf;
433 char *obuf, *nbuf;
434 RF_PhysDiskAddr_t *old, *new;
435 unsigned long coeff;
436
437 RF_ETIMER_START(timer);
438
439 d = (np-3)/4;
440 RF_ASSERT (4*d+3 == np);
441 qbuf = (char *) node->params[2*d+1].p; /* q buffer*/
442 for (i=0; i < d; i++)
443 {
444 old = (RF_PhysDiskAddr_t *) node->params[2*i].p;
445 obuf = (char *) node->params[2*i+1].p;
446 new = (RF_PhysDiskAddr_t *) node->params[2*(d+1+i)].p;
447 nbuf = (char *) node->params[2*(d+1+i)+1].p;
448 RF_ASSERT (new->numSector == old->numSector);
449 RF_ASSERT (new->raidAddress == old->raidAddress);
450 /* the stripe unit within the stripe tells us the coefficient to use
451 for the multiply. */
452 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),new->raidAddress);
453 /* compute the data unit offset within the column, then add one */
454 coeff = (coeff % raidPtr->Layout.numDataCol);
455 QDelta(qbuf,obuf,nbuf, rf_RaidAddressToByte(raidPtr, old->numSector),coeff);
456 }
457
458 RF_ETIMER_STOP(timer);
459 RF_ETIMER_EVAL(timer);
460 tracerec->q_us += RF_ETIMER_VAL_US(timer);
461 rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no I/O in this node */
462 return(0);
463 }
464
465 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG)
466 {
467 rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_qFuncs);
468 }
469
470 static void RegularQSubr(node,qbuf)
471 RF_DagNode_t *node;
472 char *qbuf;
473 {
474 int np = node->numParams;
475 int d;
476 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-1].p;
477 unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
478 int i;
479 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
480 RF_Etimer_t timer;
481 char *obuf, *qpbuf;
482 RF_PhysDiskAddr_t *old;
483 unsigned long coeff;
484
485 RF_ETIMER_START(timer);
486
487 d = (np-1)/2;
488 RF_ASSERT (2*d+1 == np);
489 for (i=0; i < d; i++)
490 {
491 old = (RF_PhysDiskAddr_t *) node->params[2*i].p;
492 obuf = (char *) node->params[2*i+1].p;
493 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),old->raidAddress);
494 /* compute the data unit offset within the column, then add one */
495 coeff = (coeff % raidPtr->Layout.numDataCol);
496 /* the input buffers may not all be aligned with the start of the
497 stripe. so shift by their sector offset within the stripe unit */
498 qpbuf = qbuf + rf_RaidAddressToByte(raidPtr,old->startSector % secPerSU);
499 rf_IncQ((unsigned long *)qpbuf,(unsigned long *)obuf,rf_RaidAddressToByte(raidPtr, old->numSector),coeff);
500 }
501
502 RF_ETIMER_STOP(timer);
503 RF_ETIMER_EVAL(timer);
504 tracerec->q_us += RF_ETIMER_VAL_US(timer);
505 }
506
507 /*
508 used in degraded writes.
509 */
510
511 static void DegrQSubr(node)
512 RF_DagNode_t *node;
513 {
514 int np = node->numParams;
515 int d;
516 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-1].p;
517 unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
518 int i;
519 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
520 RF_Etimer_t timer;
521 char *qbuf = node->results[1];
522 char *obuf, *qpbuf;
523 RF_PhysDiskAddr_t *old;
524 unsigned long coeff;
525 unsigned fail_start;
526 int j;
527
528 old = (RF_PhysDiskAddr_t *)node->params[np-2].p;
529 fail_start = old->startSector % secPerSU;
530
531 RF_ETIMER_START(timer);
532
533 d = (np-2)/2;
534 RF_ASSERT (2*d+2 == np);
535 for (i=0; i < d; i++)
536 {
537 old = (RF_PhysDiskAddr_t *) node->params[2*i].p;
538 obuf = (char *) node->params[2*i+1].p;
539 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),old->raidAddress);
540 /* compute the data unit offset within the column, then add one */
541 coeff = (coeff % raidPtr->Layout.numDataCol);
542 /* the input buffers may not all be aligned with the start of the
543 stripe. so shift by their sector offset within the stripe unit */
544 j = old->startSector % secPerSU;
545 RF_ASSERT(j >= fail_start);
546 qpbuf = qbuf + rf_RaidAddressToByte(raidPtr,j - fail_start);
547 rf_IncQ((unsigned long *)qpbuf,(unsigned long *)obuf,rf_RaidAddressToByte(raidPtr, old->numSector),coeff);
548 }
549
550 RF_ETIMER_STOP(timer);
551 RF_ETIMER_EVAL(timer);
552 tracerec->q_us += RF_ETIMER_VAL_US(timer);
553 }
554
555 /*
556 Called by large write code to compute the new parity and the new q.
557
558 structure of the params:
559
pda_0, buffer_0, pda_1, buffer_1, ... , pda_{d-1}, buffer_{d-1} (d = numDataCol)
561 raidPtr
562
563 for a total of 2d+1 arguments.
564 The result buffers results[0], results[1] are the buffers for the p and q,
565 respectively.
566
567 We compute Q first, then compute P. The P calculation may try to reuse
568 one of the input buffers for its output, so if we computed P first, we would
569 corrupt the input for the q calculation.
570 */
571
572 int rf_RegularPQFunc(node)
573 RF_DagNode_t *node;
574 {
575 RegularQSubr(node,node->results[1]);
576 return(rf_RegularXorFunc(node)); /* does the wakeup */
577 }
578
579 int rf_RegularQFunc(node)
580 RF_DagNode_t *node;
581 {
582 /* Almost ... adjust Qsubr args */
583 RegularQSubr(node, node->results[0]);
584 rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no I/O in this node */
585 return(0);
586 }
587
588 /*
589 Called by singly degraded write code to compute the new parity and the new q.
590
591 structure of the params:
592
593 pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d
594 failedPDA raidPtr
595
596 for a total of 2d+2 arguments.
597 The result buffers results[0], results[1] are the buffers for the parity and q,
598 respectively.
599
600 We compute Q first, then compute parity. The parity calculation may try to reuse
601 one of the input buffers for its output, so if we computed parity first, we would
602 corrupt the input for the q calculation.
603
We treat this like the regularPQ case, except that the failedPDA argument supplies the sector offset at which the Q result buffer starts.
605 */
606
607 void rf_Degraded_100_PQFunc(node)
608 RF_DagNode_t *node;
609 {
610 int np = node->numParams;
611
612 RF_ASSERT (np >= 2);
613 DegrQSubr(node);
614 rf_RecoveryXorFunc(node);
615 }
616
617
618 /*
619 The two below are used when reading a stripe with a single lost data unit.
620 The parameters are
621
622 pda_0, buffer_0, .... pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr
623
624 and results[0] contains the data buffer. Which is originally zero-filled.
625
626 */
627
628 /* this Q func is used by the degraded-mode dag functions to recover lost data.
629 * the second-to-last parameter is the PDA for the failed portion of the access.
630 * the code here looks at this PDA and assumes that the xor target buffer is
631 * equal in size to the number of sectors in the failed PDA. It then uses
632 * the other PDAs in the parameter list to determine where within the target
633 * buffer the corresponding data should be xored.
634 *
635 * Recall the basic equation is
636 *
637 * Q = ( data_1 + 2 * data_2 ... + k * data_k ) mod 256
638 *
639 * so to recover data_j we need
640 *
641 * J data_j = (Q - data_1 - 2 data_2 ....- k* data_k) mod 256
642 *
643 * So the coefficient for each buffer is (255 - data_col), and j should be initialized by
644 * copying Q into it. Then we need to do a table lookup to convert to solve
645 * data_j /= J
646 *
647 *
648 */
649 int rf_RecoveryQFunc(node)
650 RF_DagNode_t *node;
651 {
652 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams-1].p;
653 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout;
654 RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams-2].p;
655 int i;
656 RF_PhysDiskAddr_t *pda;
657 RF_RaidAddr_t suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr,failedPDA->startSector);
658 char *srcbuf, *destbuf;
659 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
660 RF_Etimer_t timer;
661 unsigned long coeff;
662
663 RF_ETIMER_START(timer);
664 /* start by copying Q into the buffer */
665 bcopy(node->params[node->numParams-3].p,node->results[0],
666 rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
667 for (i=0; i<node->numParams-4; i+=2)
668 {
669 RF_ASSERT (node->params[i+1].p != node->results[0]);
670 pda = (RF_PhysDiskAddr_t *) node->params[i].p;
671 srcbuf = (char *) node->params[i+1].p;
672 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
673 destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr,suoffset-failedSUOffset);
674 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),pda->raidAddress);
675 /* compute the data unit offset within the column */
676 coeff = (coeff % raidPtr->Layout.numDataCol);
677 rf_IncQ((unsigned long *)destbuf, (unsigned long *)srcbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
678 }
679 /* Do the nasty inversion now */
680 coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),failedPDA->startSector) % raidPtr->Layout.numDataCol);
681 rf_InvertQ(node->results[0],node->results[0],rf_RaidAddressToByte(raidPtr,pda->numSector),coeff);
682 RF_ETIMER_STOP(timer);
683 RF_ETIMER_EVAL(timer);
684 tracerec->q_us += RF_ETIMER_VAL_US(timer);
685 rf_GenericWakeupFunc(node, 0);
686 return(0);
687 }
688
689 int rf_RecoveryPQFunc(node)
690 RF_DagNode_t *node;
691 {
692 RF_PANIC();
693 return(1);
694 }
695
696 /*
697 Degraded write Q subroutine.
698 Used when P is dead.
699 Large-write style Q computation.
700 Parameters
701
702 (pda,buf),(pda,buf),.....,(failedPDA,bufPtr),failedPDA,raidPtr.
703
704 We ignore failedPDA.
705
706 This is a "simple style" recovery func.
707 */
708
709 void rf_PQ_DegradedWriteQFunc(node)
710 RF_DagNode_t *node;
711 {
712 int np = node->numParams;
713 int d;
714 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-1].p;
715 unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
716 int i;
717 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
718 RF_Etimer_t timer;
719 char *qbuf = node->results[0];
720 char *obuf, *qpbuf;
721 RF_PhysDiskAddr_t *old;
722 unsigned long coeff;
723 int fail_start,j;
724
725 old = (RF_PhysDiskAddr_t *) node->params[np-2].p;
726 fail_start = old->startSector % secPerSU;
727
728 RF_ETIMER_START(timer);
729
730 d = (np-2)/2;
731 RF_ASSERT (2*d+2 == np);
732
733 for (i=0; i < d; i++)
734 {
735 old = (RF_PhysDiskAddr_t *) node->params[2*i].p;
736 obuf = (char *) node->params[2*i+1].p;
737 coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),old->raidAddress);
738 /* compute the data unit offset within the column, then add one */
739 coeff = (coeff % raidPtr->Layout.numDataCol);
740 j = old->startSector % secPerSU;
741 RF_ASSERT(j >= fail_start);
742 qpbuf = qbuf + rf_RaidAddressToByte(raidPtr,j - fail_start);
743 rf_IncQ((unsigned long *)qpbuf,(unsigned long *)obuf,rf_RaidAddressToByte(raidPtr, old->numSector),coeff);
744 }
745
746 RF_ETIMER_STOP(timer);
747 RF_ETIMER_EVAL(timer);
748 tracerec->q_us += RF_ETIMER_VAL_US(timer);
749 rf_GenericWakeupFunc(node, 0);
750 }
751
752
753
754
755 /* Q computations */
756
757 /*
coeff - column;
759
760 compute dest ^= qfor[28-coeff][rn[coeff+1] a]
761
762 on 5-bit basis;
763 length in bytes;
764 */
765
766 void rf_IncQ(dest,buf,length,coeff)
767 unsigned long *dest;
768 unsigned long *buf;
769 unsigned length;
770 unsigned coeff;
771 {
772 unsigned long a, d, new;
773 unsigned long a1, a2;
774 unsigned int *q = &(rf_qfor[28-coeff][0]);
775 unsigned r = rf_rn[coeff+1];
776
777 #define EXTRACT(a,i) ((a >> (5L*i)) & 0x1f)
778 #define INSERT(a,i) (a << (5L*i))
779
780 length /= 8;
781 /* 13 5 bit quants in a 64 bit word */
782 while (length)
783 {
784 a = *buf++;
785 d = *dest;
786 a1 = EXTRACT(a,0) ^ r;
787 a2 = EXTRACT(a,1) ^ r;
788 new = INSERT(a2,1) | a1 ;
789 a1 = EXTRACT(a,2) ^ r;
790 a2 = EXTRACT(a,3) ^ r;
791 a1 = q[a1];
792 a2 = q[a2];
793 new = new | INSERT(a1,2) | INSERT (a2,3);
794 a1 = EXTRACT(a,4) ^ r;
795 a2 = EXTRACT(a,5) ^ r;
796 a1 = q[a1];
797 a2 = q[a2];
798 new = new | INSERT(a1,4) | INSERT (a2,5);
799 a1 = EXTRACT(a,5) ^ r;
800 a2 = EXTRACT(a,6) ^ r;
801 a1 = q[a1];
802 a2 = q[a2];
803 new = new | INSERT(a1,5) | INSERT (a2,6);
804 #if RF_LONGSHIFT > 2
805 a1 = EXTRACT(a,7) ^ r;
806 a2 = EXTRACT(a,8) ^ r;
807 a1 = q[a1];
808 a2 = q[a2];
809 new = new | INSERT(a1,7) | INSERT (a2,8);
810 a1 = EXTRACT(a,9) ^ r;
811 a2 = EXTRACT(a,10) ^ r;
812 a1 = q[a1];
813 a2 = q[a2];
814 new = new | INSERT(a1,9) | INSERT (a2,10);
815 a1 = EXTRACT(a,11) ^ r;
816 a2 = EXTRACT(a,12) ^ r;
817 a1 = q[a1];
818 a2 = q[a2];
819 new = new | INSERT(a1,11) | INSERT (a2,12);
820 #endif /* RF_LONGSHIFT > 2 */
821 d ^= new;
822 *dest++ = d;
823 length--;
824 }
825 }
826
827 /*
828 compute
829
830 dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] (old^new) ]
831
832 on a five bit basis.
833 optimization: compute old ^ new on 64 bit basis.
834
835 length in bytes.
836 */
837
838 static void QDelta(
839 char *dest,
840 char *obuf,
841 char *nbuf,
842 unsigned length,
843 unsigned char coeff)
844 {
845 unsigned long a, d, new;
846 unsigned long a1, a2;
847 unsigned int *q = &(rf_qfor[28-coeff][0]);
848 unsigned r = rf_rn[coeff+1];
849
850 #ifdef KERNEL
851 /* PQ in kernel currently not supported because the encoding/decoding table is not present */
852 bzero(dest, length);
853 #else /* KERNEL */
854 /* this code probably doesn't work and should be rewritten -wvcii */
855 /* 13 5 bit quants in a 64 bit word */
856 length /= 8;
857 while (length)
858 {
859 a = *obuf++; /* XXX need to reorg to avoid cache conflicts */
860 a ^= *nbuf++;
861 d = *dest;
862 a1 = EXTRACT(a,0) ^ r;
863 a2 = EXTRACT(a,1) ^ r;
864 a1 = q[a1];
865 a2 = q[a2];
866 new = INSERT(a2,1) | a1 ;
867 a1 = EXTRACT(a,2) ^ r;
868 a2 = EXTRACT(a,3) ^ r;
869 a1 = q[a1];
870 a2 = q[a2];
871 new = new | INSERT(a1,2) | INSERT (a2,3);
872 a1 = EXTRACT(a,4) ^ r;
873 a2 = EXTRACT(a,5) ^ r;
874 a1 = q[a1];
875 a2 = q[a2];
876 new = new | INSERT(a1,4) | INSERT (a2,5);
877 a1 = EXTRACT(a,5) ^ r;
878 a2 = EXTRACT(a,6) ^ r;
879 a1 = q[a1];
880 a2 = q[a2];
881 new = new | INSERT(a1,5) | INSERT (a2,6);
882 #if RF_LONGSHIFT > 2
883 a1 = EXTRACT(a,7) ^ r;
884 a2 = EXTRACT(a,8) ^ r;
885 a1 = q[a1];
886 a2 = q[a2];
887 new = new | INSERT(a1,7) | INSERT (a2,8);
888 a1 = EXTRACT(a,9) ^ r;
889 a2 = EXTRACT(a,10) ^ r;
890 a1 = q[a1];
891 a2 = q[a2];
892 new = new | INSERT(a1,9) | INSERT (a2,10);
893 a1 = EXTRACT(a,11) ^ r;
894 a2 = EXTRACT(a,12) ^ r;
895 a1 = q[a1];
896 a2 = q[a2];
897 new = new | INSERT(a1,11) | INSERT (a2,12);
898 #endif /* RF_LONGSHIFT > 2 */
899 d ^= new;
900 *dest++ = d;
901 length--;
902 }
903 #endif /* KERNEL */
904 }
905
906 /*
907 recover columns a and b from the given p and q into
908 bufs abuf and bbuf. All bufs are word aligned.
909 Length is in bytes.
910 */
911
912
913 /*
914 * XXX
915 *
916 * Everything about this seems wrong.
917 */
918 void rf_PQ_recover(pbuf,qbuf,abuf,bbuf,length,coeff_a,coeff_b)
919 unsigned long *pbuf;
920 unsigned long *qbuf;
921 unsigned long *abuf;
922 unsigned long *bbuf;
923 unsigned length;
924 unsigned coeff_a;
925 unsigned coeff_b;
926 {
927 unsigned long p, q, a, a0, a1;
928 int col = (29 * coeff_a) + coeff_b;
929 unsigned char *q0 = & (rf_qinv[col][0]);
930
931 length /= 8;
932 while (length)
933 {
934 p = *pbuf++;
935 q = *qbuf++;
936 a0 = EXTRACT(p,0);
937 a1 = EXTRACT(q,0);
938 a = q0[a0<<5 | a1];
939 #define MF(i) \
940 a0 = EXTRACT(p,i); \
941 a1 = EXTRACT(q,i); \
942 a = a | INSERT(q0[a0<<5 | a1],i)
943
944 MF(1);
945 MF(2);
946 MF(3);
947 MF(4);
948 MF(5);
949 MF(6);
950 #if 0
951 MF(7);
952 MF(8);
953 MF(9);
954 MF(10);
955 MF(11);
956 MF(12);
957 #endif /* 0 */
958 *abuf++ = a;
959 *bbuf++ = a ^ p;
960 length--;
961 }
962 }
963
964 /*
965 Lost parity and a data column. Recover that data column.
966 Assume col coeff is lost. Let q the contents of Q after
967 all surviving data columns have been q-xored out of it.
968 Then we have the equation
969
970 q[28-coeff][a_i ^ r_i+1] = q
971
972 but q is cyclic with period 31.
973 So q[3+coeff][q[28-coeff][a_i ^ r_{i+1}]] =
974 q[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} .
975
976 so a_i = r_{coeff+1} ^ q[3+coeff][q]
977
The routine is passed the q buffer and the buffer
the data is to be recovered into. They can be the same.
980 */
981
982
983
984 static void rf_InvertQ(
985 unsigned long *qbuf,
986 unsigned long *abuf,
987 unsigned length,
988 unsigned coeff)
989 {
990 unsigned long a, new;
991 unsigned long a1, a2;
992 unsigned int *q = &(rf_qfor[3+coeff][0]);
993 unsigned r = rf_rn[coeff+1];
994
995 /* 13 5 bit quants in a 64 bit word */
996 length /= 8;
997 while (length)
998 {
999 a = *qbuf++;
1000 a1 = EXTRACT(a,0);
1001 a2 = EXTRACT(a,1);
1002 a1 = r ^ q[a1];
1003 a2 = r ^ q[a2];
1004 new = INSERT(a2,1) | a1;
1005 #define M(i,j) \
1006 a1 = EXTRACT(a,i); \
1007 a2 = EXTRACT(a,j); \
1008 a1 = r ^ q[a1]; \
1009 a2 = r ^ q[a2]; \
1010 new = new | INSERT(a1,i) | INSERT(a2,j)
1011
1012 M(2,3);
1013 M(4,5);
1014 M(5,6);
1015 #if RF_LONGSHIFT > 2
1016 M(7,8);
1017 M(9,10);
1018 M(11,12);
1019 #endif /* RF_LONGSHIFT > 2 */
1020 *abuf++ = new;
1021 length--;
1022 }
1023 }
1024
1025 #endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */
1026