rf_evenodd_dagfuncs.c revision 1.23 1 /* $NetBSD: rf_evenodd_dagfuncs.c,v 1.23 2019/02/09 03:34:00 christos Exp $ */
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: ChangMing Wu
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29 /*
30 * Code for RAID-EVENODD architecture.
31 */
32
33 #include <sys/cdefs.h>
34 __KERNEL_RCSID(0, "$NetBSD: rf_evenodd_dagfuncs.c,v 1.23 2019/02/09 03:34:00 christos Exp $");
35
36 #include "rf_archs.h"
37
38 #ifdef _KERNEL_OPT
39 #include "opt_raid_diagnostic.h"
40 #endif
41
42 #if RF_INCLUDE_EVENODD > 0
43
44 #include <dev/raidframe/raidframevar.h>
45
46 #include "rf_raid.h"
47 #include "rf_dag.h"
48 #include "rf_dagffrd.h"
49 #include "rf_dagffwr.h"
50 #include "rf_dagdegrd.h"
51 #include "rf_dagdegwr.h"
52 #include "rf_dagutils.h"
53 #include "rf_dagfuncs.h"
54 #include "rf_etimer.h"
55 #include "rf_general.h"
56 #include "rf_parityscan.h"
57 #include "rf_evenodd.h"
58 #include "rf_evenodd_dagfuncs.h"
59
60 /* These redundant functions are for small write */
61 RF_RedFuncs_t rf_EOSmallWritePFuncs = {rf_RegularXorFunc, "Regular Old-New P", rf_SimpleXorFunc, "Simple Old-New P"};
62 RF_RedFuncs_t rf_EOSmallWriteEFuncs = {rf_RegularONEFunc, "Regular Old-New E", rf_SimpleONEFunc, "Regular Old-New E"};
63 /* These redundant functions are for degraded read */
64 RF_RedFuncs_t rf_eoPRecoveryFuncs = {rf_RecoveryXorFunc, "Recovery Xr", rf_RecoveryXorFunc, "Recovery Xr"};
65 RF_RedFuncs_t rf_eoERecoveryFuncs = {rf_RecoveryEFunc, "Recovery E Func", rf_RecoveryEFunc, "Recovery E Func"};
/**********************************************************************************************
 * The following encoding node function is used in EO_000_CreateLargeWriteDAG.
 **********************************************************************************************/
69 int
70 rf_RegularPEFunc(RF_DagNode_t *node)
71 {
72 rf_RegularESubroutine(node, node->results[1]);
73 rf_RegularXorFunc(node);/* does the wakeup here! */
74 #if 1
75 return (0); /* XXX This was missing... GO */
76 #endif
77 }
78
79
/************************************************************************************************
 * For EO_001_CreateSmallWriteDAG, there are (i) RegularONEFunc() and (ii) SimpleONEFunc() to
 * be used. The former is used when the write accesses at least the sectors of a full stripe unit.
 * The latter is used when the write accesses two stripe units but with total sectors
 * fewer than the sectors per SU. In this case, the accesses of parity and 'E' are shown as
 * disconnected areas in their stripe unit, and the parity write and 'E' write are both divided
 * into two distinct writes (four in total). This simple old-new write and regular old-new write
 * happen as in RAID-5.
 ************************************************************************************************/
88
/* Algorithm:
   1. Store the difference of old data and new data in the Rod buffer.
   2. Then encode this buffer into the buffer which already has the old 'E' information in it;
      the result can be shown to be the new 'E' information.
   3. Xor the Wnd buffer into the difference buffer to recover the original old data.
   An alternative is to allocate a temporary buffer for storing the difference of
   old data and new data, then encode the temp buf into the old 'E' buf to form the new 'E', but that
   approach runs at the same speed as the one above and needs more memory.
*/
98 int
99 rf_RegularONEFunc(RF_DagNode_t *node)
100 {
101 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
102 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
103 int EpdaIndex = (node->numParams - 1) / 2 - 1; /* the parameter of node
104 * where you can find
105 * e-pda */
106 int i, k;
107 int suoffset, length;
108 RF_RowCol_t scol;
109 char *srcbuf, *destbuf;
110 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
111 RF_Etimer_t timer;
112 RF_PhysDiskAddr_t *pda;
113 #ifdef RAID_DIAGNOSTIC
114 RF_PhysDiskAddr_t *EPDA =
115 (RF_PhysDiskAddr_t *) node->params[EpdaIndex].p;
116 int ESUOffset = rf_StripeUnitOffset(layoutPtr, EPDA->startSector);
117
118 RF_ASSERT(EPDA->type == RF_PDA_TYPE_Q);
119 RF_ASSERT(ESUOffset == 0);
120 #endif /* RAID_DIAGNOSTIC */
121
122 RF_ETIMER_START(timer);
123
124 /* Xor the Wnd buffer into Rod buffer, the difference of old data and
125 * new data is stored in Rod buffer */
126 for (k = 0; k < EpdaIndex; k += 2) {
127 length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[k].p)->numSector);
128 rf_bxor(node->params[k + EpdaIndex + 3].p, node->params[k + 1].p, length);
129 }
130 /* Start to encoding the buffer storing the difference of old data and
131 * new data into 'E' buffer */
132 for (i = 0; i < EpdaIndex; i += 2)
133 if (node->params[i + 1].p != node->results[0]) { /* results[0] is buf ptr
134 * of E */
135 pda = (RF_PhysDiskAddr_t *) node->params[i].p;
136 srcbuf = (char *) node->params[i + 1].p;
137 scol = rf_EUCol(layoutPtr, pda->raidAddress);
138 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
139 destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset);
140 rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
141 }
142 /* Recover the original old data to be used by parity encoding
143 * function in XorNode */
144 for (k = 0; k < EpdaIndex; k += 2) {
145 length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[k].p)->numSector);
146 rf_bxor(node->params[k + EpdaIndex + 3].p, node->params[k + 1].p, length);
147 }
148 RF_ETIMER_STOP(timer);
149 RF_ETIMER_EVAL(timer);
150 tracerec->q_us += RF_ETIMER_VAL_US(timer);
151 rf_GenericWakeupFunc(node, 0);
152 #if 1
153 return (0); /* XXX this was missing.. GO */
154 #endif
155 }
156
157 int
158 rf_SimpleONEFunc(RF_DagNode_t *node)
159 {
160 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
161 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
162 RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
163 int retcode = 0;
164 char *srcbuf, *destbuf;
165 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
166 int length;
167 RF_RowCol_t scol;
168 RF_Etimer_t timer;
169
170 RF_ASSERT(((RF_PhysDiskAddr_t *) node->params[2].p)->type == RF_PDA_TYPE_Q);
171 if (node->dagHdr->status == rf_enable) {
172 RF_ETIMER_START(timer);
173 length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *) node->params[4].p)->numSector); /* this is a pda of
174 * writeDataNodes */
175 /* bxor to buffer of readDataNodes */
176 retcode = rf_bxor(node->params[5].p, node->params[1].p, length);
177 /* find out the corresponding colume in encoding matrix for
178 * write colume to be encoded into redundant disk 'E' */
179 scol = rf_EUCol(layoutPtr, pda->raidAddress);
180 srcbuf = node->params[1].p;
181 destbuf = node->params[3].p;
182 /* Start encoding process */
183 rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
184 rf_bxor(node->params[5].p, node->params[1].p, length);
185 RF_ETIMER_STOP(timer);
186 RF_ETIMER_EVAL(timer);
187 tracerec->q_us += RF_ETIMER_VAL_US(timer);
188
189 }
190 return (rf_GenericWakeupFunc(node, retcode)); /* call wake func
191 * explicitly since no
192 * I/O in this node */
193 }
194
195
196 /****** called by rf_RegularPEFunc(node) and rf_RegularEFunc(node) in f.f. large write ********/
197 void
198 rf_RegularESubroutine(RF_DagNode_t *node, char *ebuf)
199 {
200 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
201 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
202 RF_PhysDiskAddr_t *pda;
203 int i, suoffset;
204 RF_RowCol_t scol;
205 char *srcbuf, *destbuf;
206 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
207 RF_Etimer_t timer;
208
209 RF_ETIMER_START(timer);
210 for (i = 0; i < node->numParams - 2; i += 2) {
211 RF_ASSERT(node->params[i + 1].p != ebuf);
212 pda = (RF_PhysDiskAddr_t *) node->params[i].p;
213 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
214 scol = rf_EUCol(layoutPtr, pda->raidAddress);
215 srcbuf = (char *) node->params[i + 1].p;
216 destbuf = ebuf + rf_RaidAddressToByte(raidPtr, suoffset);
217 rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
218 }
219 RF_ETIMER_STOP(timer);
220 RF_ETIMER_EVAL(timer);
221 tracerec->xor_us += RF_ETIMER_VAL_US(timer);
222 }
223
224
225 /*******************************************************************************************
226 * Used in EO_001_CreateLargeWriteDAG
227 ******************************************************************************************/
228 int
229 rf_RegularEFunc(RF_DagNode_t *node)
230 {
231 rf_RegularESubroutine(node, node->results[0]);
232 rf_GenericWakeupFunc(node, 0);
233 #if 1
234 return (0); /* XXX this was missing?.. GO */
235 #endif
236 }
/*******************************************************************************************
 * This degraded function allows only two cases:
 *  1. when the write accesses the full failed stripe unit, the access can span more than
 *     one stripe unit.
 *  2. when the write accesses only part of the failed SU, we assume accesses of more than
 *     one stripe unit are not allowed, so that the write can be dealt with like a
 *     large write.
 *  The following function is based on these assumptions. So except in the second case,
 *  it looks the same as a large write encoding function. But this is not exactly the
 *  normal way of doing a degraded write, since raidframe has to break accesses
 *  other than the above two into smaller accesses. We may have to change
 *  DegrESubroutine in the future.
 *******************************************************************************************/
250 void
251 rf_DegrESubroutine(RF_DagNode_t *node, char *ebuf)
252 {
253 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
254 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
255 RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
256 RF_PhysDiskAddr_t *pda;
257 int i, suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
258 RF_RowCol_t scol;
259 char *srcbuf, *destbuf;
260 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
261 RF_Etimer_t timer;
262
263 RF_ETIMER_START(timer);
264 for (i = 0; i < node->numParams - 2; i += 2) {
265 RF_ASSERT(node->params[i + 1].p != ebuf);
266 pda = (RF_PhysDiskAddr_t *) node->params[i].p;
267 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
268 scol = rf_EUCol(layoutPtr, pda->raidAddress);
269 srcbuf = (char *) node->params[i + 1].p;
270 destbuf = ebuf + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
271 rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
272 }
273
274 RF_ETIMER_STOP(timer);
275 RF_ETIMER_EVAL(timer);
276 tracerec->q_us += RF_ETIMER_VAL_US(timer);
277 }
278
279
280 /**************************************************************************************
281 * This function is used in case where one data disk failed and both redundant disks
282 * alive. It is used in the EO_100_CreateWriteDAG. Note: if there is another disk
283 * failed in the stripe but not accessed at this time, then we should, instead, use
284 * the rf_EOWriteDoubleRecoveryFunc().
285 **************************************************************************************/
286 int
287 rf_Degraded_100_EOFunc(RF_DagNode_t *node)
288 {
289 rf_DegrESubroutine(node, node->results[1]);
290 rf_RecoveryXorFunc(node); /* does the wakeup here! */
291 #if 1
292 return (0); /* XXX this was missing... SHould these be
293 * void functions??? GO */
294 #endif
295 }
296 /**************************************************************************************
297 * This function is to encode one sector in one of the data disks to the E disk.
298 * However, in evenodd this function can also be used as decoding function to recover
299 * data from dead disk in the case of parity failure and a single data failure.
300 **************************************************************************************/
301 void
302 rf_e_EncOneSect(
303 RF_RowCol_t srcLogicCol,
304 char *srcSecbuf,
305 RF_RowCol_t destLogicCol,
306 char *destSecbuf,
307 int bytesPerSector)
308 {
309 int S_index; /* index of the EU in the src col which need
310 * be Xored into all EUs in a dest sector */
311 int numRowInEncMatix = (RF_EO_MATRIX_DIM) - 1;
312 RF_RowCol_t j, indexInDest, /* row index of an encoding unit in
313 * the destination colume of encoding
314 * matrix */
315 indexInSrc; /* row index of an encoding unit in the source
316 * colume used for recovery */
317 int bytesPerEU = bytesPerSector / numRowInEncMatix;
318
319 #if RF_EO_MATRIX_DIM > 17
320 int shortsPerEU = bytesPerEU / sizeof(short);
321 short *destShortBuf, *srcShortBuf1, *srcShortBuf2;
322 short temp1;
323 #elif RF_EO_MATRIX_DIM == 17
324 int longsPerEU = bytesPerEU / sizeof(long);
325 long *destLongBuf, *srcLongBuf1, *srcLongBuf2;
326 long temp1;
327 #endif
328
329 #if RF_EO_MATRIX_DIM > 17
330 RF_ASSERT(sizeof(short) == 2 || sizeof(short) == 1);
331 RF_ASSERT(bytesPerEU % sizeof(short) == 0);
332 #elif RF_EO_MATRIX_DIM == 17
333 RF_ASSERT(sizeof(long) == 8 || sizeof(long) == 4);
334 RF_ASSERT(bytesPerEU % sizeof(long) == 0);
335 #endif
336
337 S_index = rf_EO_Mod((RF_EO_MATRIX_DIM - 1 + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM);
338 #if RF_EO_MATRIX_DIM > 17
339 srcShortBuf1 = (short *) (srcSecbuf + S_index * bytesPerEU);
340 #elif RF_EO_MATRIX_DIM == 17
341 srcLongBuf1 = (long *) (srcSecbuf + S_index * bytesPerEU);
342 #endif
343
344 for (indexInDest = 0; indexInDest < numRowInEncMatix; indexInDest++) {
345 indexInSrc = rf_EO_Mod((indexInDest + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM);
346
347 #if RF_EO_MATRIX_DIM > 17
348 destShortBuf = (short *) (destSecbuf + indexInDest * bytesPerEU);
349 srcShortBuf2 = (short *) (srcSecbuf + indexInSrc * bytesPerEU);
350 for (j = 0; j < shortsPerEU; j++) {
351 temp1 = destShortBuf[j] ^ srcShortBuf1[j];
352 /* note: S_index won't be at the end row for any src
353 * col! */
354 if (indexInSrc != RF_EO_MATRIX_DIM - 1)
355 destShortBuf[j] = (srcShortBuf2[j]) ^ temp1;
356 /* if indexInSrc is at the end row, ie.
357 * RF_EO_MATRIX_DIM -1, then all elements are zero! */
358 else
359 destShortBuf[j] = temp1;
360 }
361
362 #elif RF_EO_MATRIX_DIM == 17
363 destLongBuf = (long *) (destSecbuf + indexInDest * bytesPerEU);
364 srcLongBuf2 = (long *) (srcSecbuf + indexInSrc * bytesPerEU);
365 for (j = 0; j < longsPerEU; j++) {
366 temp1 = destLongBuf[j] ^ srcLongBuf1[j];
367 if (indexInSrc != RF_EO_MATRIX_DIM - 1)
368 destLongBuf[j] = (srcLongBuf2[j]) ^ temp1;
369 else
370 destLongBuf[j] = temp1;
371 }
372 #endif
373 }
374 }
375
376 void
377 rf_e_encToBuf(
378 RF_Raid_t * raidPtr,
379 RF_RowCol_t srcLogicCol,
380 char *srcbuf,
381 RF_RowCol_t destLogicCol,
382 char *destbuf,
383 int numSector)
384 {
385 int i, bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
386
387 for (i = 0; i < numSector; i++) {
388 rf_e_EncOneSect(srcLogicCol, srcbuf, destLogicCol, destbuf, bytesPerSector);
389 srcbuf += bytesPerSector;
390 destbuf += bytesPerSector;
391 }
392 }
/**************************************************************************************
 * When parity and one data disk have died, we use the second redundant information, 'E',
 * to recover the data in the dead disk. This function is used in the recovery node
 * for EO_110_CreateReadDAG.
 **************************************************************************************/
398 int
399 rf_RecoveryEFunc(RF_DagNode_t *node)
400 {
401 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
402 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
403 RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
404 RF_RowCol_t scol, /* source logical column */
405 fcol = rf_EUCol(layoutPtr, failedPDA->raidAddress); /* logical column of
406 * failed SU */
407 int i;
408 RF_PhysDiskAddr_t *pda;
409 int suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
410 char *srcbuf, *destbuf;
411 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
412 RF_Etimer_t timer;
413
414 memset(node->results[0], 0,
415 rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
416 if (node->dagHdr->status == rf_enable) {
417 RF_ETIMER_START(timer);
418 for (i = 0; i < node->numParams - 2; i += 2)
419 if (node->params[i + 1].p != node->results[0]) {
420 pda = (RF_PhysDiskAddr_t *) node->params[i].p;
421 if (i == node->numParams - 4)
422 scol = RF_EO_MATRIX_DIM - 2; /* the colume of
423 * redundant E */
424 else
425 scol = rf_EUCol(layoutPtr, pda->raidAddress);
426 srcbuf = (char *) node->params[i + 1].p;
427 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
428 destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
429 rf_e_encToBuf(raidPtr, scol, srcbuf, fcol, destbuf, pda->numSector);
430 }
431 RF_ETIMER_STOP(timer);
432 RF_ETIMER_EVAL(timer);
433 tracerec->xor_us += RF_ETIMER_VAL_US(timer);
434 }
435 return (rf_GenericWakeupFunc(node, 0)); /* node execute successfully */
436 }
/**************************************************************************************
 * This function is used in the case where one data disk and the parity have failed
 * (in EO_110_CreateWriteDAG).
 **************************************************************************************/
441 int
442 rf_EO_DegradedWriteEFunc(RF_DagNode_t * node)
443 {
444 rf_DegrESubroutine(node, node->results[0]);
445 rf_GenericWakeupFunc(node, 0);
446 #if 1
447 return (0); /* XXX Yet another one!! GO */
448 #endif
449 }
450
451
452
453 /**************************************************************************************
454 * THE FUNCTION IS FOR DOUBLE DEGRADED READ AND WRITE CASES
455 **************************************************************************************/
456
457 void
458 rf_doubleEOdecode(
459 RF_Raid_t * raidPtr,
460 char **rrdbuf,
461 char **dest,
462 RF_RowCol_t * fcol,
463 char *pbuf,
464 char *ebuf)
465 {
466 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
467 int i, j, k, f1, f2, row;
468 int rrdrow, erow, count = 0;
469 int bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
470 int numRowInEncMatix = (RF_EO_MATRIX_DIM) - 1;
471 #if 0
472 int pcol = (RF_EO_MATRIX_DIM) - 1;
473 #endif
474 int ecol = (RF_EO_MATRIX_DIM) - 2;
475 int bytesPerEU = bytesPerSector / numRowInEncMatix;
476 int numDataCol = layoutPtr->numDataCol;
477 #if RF_EO_MATRIX_DIM > 17
478 int shortsPerEU = bytesPerEU / sizeof(short);
479 short *rrdbuf_current, *pbuf_current, *ebuf_current;
480 short *dest_smaller, *dest_smaller_current, *dest_larger, *dest_larger_current;
481 short *temp;
482 short *P;
483
484 RF_ASSERT(bytesPerEU % sizeof(short) == 0);
485 #elif RF_EO_MATRIX_DIM == 17
486 int longsPerEU = bytesPerEU / sizeof(long);
487 long *rrdbuf_current, *pbuf_current, *ebuf_current;
488 long *dest_smaller, *dest_smaller_current, *dest_larger, *dest_larger_current;
489 long *temp;
490 long *P;
491
492 RF_ASSERT(bytesPerEU % sizeof(long) == 0);
493 #endif
494 P = RF_Malloc(bytesPerEU);
495 temp = RF_Malloc(bytesPerEU);
496 RF_ASSERT(*((long *) dest[0]) == 0);
497 RF_ASSERT(*((long *) dest[1]) == 0);
498 RF_ASSERT(*P == 0);
499 /* calculate the 'P' parameter, which, not parity, is the Xor of all
500 * elements in the last two column, ie. 'E' and 'parity' colume, see
501 * the Ref. paper by Blaum, et al 1993 */
502 for (i = 0; i < numRowInEncMatix; i++)
503 for (k = 0; k < longsPerEU; k++) {
504 #if RF_EO_MATRIX_DIM > 17
505 ebuf_current = ((short *) ebuf) + i * shortsPerEU + k;
506 pbuf_current = ((short *) pbuf) + i * shortsPerEU + k;
507 #elif RF_EO_MATRIX_DIM == 17
508 ebuf_current = ((long *) ebuf) + i * longsPerEU + k;
509 pbuf_current = ((long *) pbuf) + i * longsPerEU + k;
510 #endif
511 P[k] ^= *ebuf_current;
512 P[k] ^= *pbuf_current;
513 }
514 RF_ASSERT(fcol[0] != fcol[1]);
515 if (fcol[0] < fcol[1]) {
516 #if RF_EO_MATRIX_DIM > 17
517 dest_smaller = (short *) (dest[0]);
518 dest_larger = (short *) (dest[1]);
519 #elif RF_EO_MATRIX_DIM == 17
520 dest_smaller = (long *) (dest[0]);
521 dest_larger = (long *) (dest[1]);
522 #endif
523 f1 = fcol[0];
524 f2 = fcol[1];
525 } else {
526 #if RF_EO_MATRIX_DIM > 17
527 dest_smaller = (short *) (dest[1]);
528 dest_larger = (short *) (dest[0]);
529 #elif RF_EO_MATRIX_DIM == 17
530 dest_smaller = (long *) (dest[1]);
531 dest_larger = (long *) (dest[0]);
532 #endif
533 f1 = fcol[1];
534 f2 = fcol[0];
535 }
536 row = (RF_EO_MATRIX_DIM) - 1;
537 while ((row = rf_EO_Mod((row + f1 - f2), RF_EO_MATRIX_DIM)) != ((RF_EO_MATRIX_DIM) - 1)) {
538 #if RF_EO_MATRIX_DIM > 17
539 dest_larger_current = dest_larger + row * shortsPerEU;
540 dest_smaller_current = dest_smaller + row * shortsPerEU;
541 #elif RF_EO_MATRIX_DIM == 17
542 dest_larger_current = dest_larger + row * longsPerEU;
543 dest_smaller_current = dest_smaller + row * longsPerEU;
544 #endif
545 /** Do the diagonal recovery. Initially, temp[k] = (failed 1),
546 which is the failed data in the colume which has smaller col index. **/
547 /* step 1: ^(SUM of nonfailed in-diagonal A(rrdrow,0..m-3)) */
548 for (j = 0; j < numDataCol; j++) {
549 if (j == f1 || j == f2)
550 continue;
551 rrdrow = rf_EO_Mod((row + f2 - j), RF_EO_MATRIX_DIM);
552 if (rrdrow != (RF_EO_MATRIX_DIM) - 1) {
553 #if RF_EO_MATRIX_DIM > 17
554 rrdbuf_current = (short *) (rrdbuf[j]) + rrdrow * shortsPerEU;
555 for (k = 0; k < shortsPerEU; k++)
556 temp[k] ^= *(rrdbuf_current + k);
557 #elif RF_EO_MATRIX_DIM == 17
558 rrdbuf_current = (long *) (rrdbuf[j]) + rrdrow * longsPerEU;
559 for (k = 0; k < longsPerEU; k++)
560 temp[k] ^= *(rrdbuf_current + k);
561 #endif
562 }
563 }
564 /* step 2: ^E(erow,m-2), If erow is at the buttom row, don't
565 * Xor into it E(erow,m-2) = (principle diagonal) ^ (failed
566 * 1) ^ (failed 2) ^ ( SUM of nonfailed in-diagonal
567 * A(rrdrow,0..m-3) ) After this step, temp[k] = (principle
568 * diagonal) ^ (failed 2) */
569
570 erow = rf_EO_Mod((row + f2 - ecol), (RF_EO_MATRIX_DIM));
571 if (erow != (RF_EO_MATRIX_DIM) - 1) {
572 #if RF_EO_MATRIX_DIM > 17
573 ebuf_current = (short *) ebuf + shortsPerEU * erow;
574 for (k = 0; k < shortsPerEU; k++)
575 temp[k] ^= *(ebuf_current + k);
576 #elif RF_EO_MATRIX_DIM == 17
577 ebuf_current = (long *) ebuf + longsPerEU * erow;
578 for (k = 0; k < longsPerEU; k++)
579 temp[k] ^= *(ebuf_current + k);
580 #endif
581 }
582 /* step 3: ^P to obtain the failed data (failed 2). P can be
583 * proved to be actually (principle diagonal) After this
584 * step, temp[k] = (failed 2), the failed data to be recovered */
585 #if RF_EO_MATRIX_DIM > 17
586 for (k = 0; k < shortsPerEU; k++)
587 temp[k] ^= P[k];
588 /* Put the data to the destination buffer */
589 for (k = 0; k < shortsPerEU; k++)
590 dest_larger_current[k] = temp[k];
591 #elif RF_EO_MATRIX_DIM == 17
592 for (k = 0; k < longsPerEU; k++)
593 temp[k] ^= P[k];
594 /* Put the data to the destination buffer */
595 for (k = 0; k < longsPerEU; k++)
596 dest_larger_current[k] = temp[k];
597 #endif
598
599 /** THE FOLLOWING DO THE HORIZONTAL XOR **/
600 /* step 1: ^(SUM of A(row,0..m-3)), ie. all nonfailed data
601 * columes */
602 for (j = 0; j < numDataCol; j++) {
603 if (j == f1 || j == f2)
604 continue;
605 #if RF_EO_MATRIX_DIM > 17
606 rrdbuf_current = (short *) (rrdbuf[j]) + row * shortsPerEU;
607 for (k = 0; k < shortsPerEU; k++)
608 temp[k] ^= *(rrdbuf_current + k);
609 #elif RF_EO_MATRIX_DIM == 17
610 rrdbuf_current = (long *) (rrdbuf[j]) + row * longsPerEU;
611 for (k = 0; k < longsPerEU; k++)
612 temp[k] ^= *(rrdbuf_current + k);
613 #endif
614 }
615 /* step 2: ^A(row,m-1) */
616 /* step 3: Put the data to the destination buffer */
617 #if RF_EO_MATRIX_DIM > 17
618 pbuf_current = (short *) pbuf + shortsPerEU * row;
619 for (k = 0; k < shortsPerEU; k++)
620 temp[k] ^= *(pbuf_current + k);
621 for (k = 0; k < shortsPerEU; k++)
622 dest_smaller_current[k] = temp[k];
623 #elif RF_EO_MATRIX_DIM == 17
624 pbuf_current = (long *) pbuf + longsPerEU * row;
625 for (k = 0; k < longsPerEU; k++)
626 temp[k] ^= *(pbuf_current + k);
627 for (k = 0; k < longsPerEU; k++)
628 dest_smaller_current[k] = temp[k];
629 #endif
630 count++;
631 }
632 /* Check if all Encoding Unit in the data buffer have been decoded,
633 * according EvenOdd theory, if "RF_EO_MATRIX_DIM" is a prime number,
634 * this algorithm will covered all buffer */
635 RF_ASSERT(count == numRowInEncMatix);
636 RF_Free((char *) P, bytesPerEU);
637 RF_Free((char *) temp, bytesPerEU);
638 }
639
640
/***************************************************************************************
 * This function is called by the double degraded read,
 * EO_200_CreateReadDAG.
 *
 ***************************************************************************************/
646 int
647 rf_EvenOddDoubleRecoveryFunc(RF_DagNode_t *node)
648 {
649 int ndataParam = 0;
650 int np = node->numParams;
651 RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
652 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
653 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
654 int i, prm, sector, nresults = node->numResults;
655 RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
656 unsigned sosAddr;
657 int mallc_one = 0, mallc_two = 0; /* flags to indicate if
658 * memory is allocated */
659 int bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
660 RF_PhysDiskAddr_t *ppda, *ppda2, *epda, *epda2, *pda, *pda0, *pda1,
661 npda;
662 RF_RowCol_t fcol[2], fsuoff[2], fsuend[2], numDataCol = layoutPtr->numDataCol;
663 char **buf, *ebuf, *pbuf, *dest[2];
664 long *suoff = NULL, *suend = NULL, *prmToCol = NULL,
665 psuoff = 0, esuoff = 0;
666 RF_SectorNum_t startSector, endSector;
667 RF_Etimer_t timer;
668 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
669
670 RF_ETIMER_START(timer);
671
672 /* Find out the number of parameters which are pdas for data
673 * information */
674 for (i = 0; i <= np; i++)
675 if (((RF_PhysDiskAddr_t *) node->params[i].p)->type != RF_PDA_TYPE_DATA) {
676 ndataParam = i;
677 break;
678 }
679 buf = RF_Malloc(numDataCol * sizeof(*buf));
680 if (ndataParam != 0) {
681 suoff = RF_Malloc(ndataParam * sizeof(*suoff));
682 suend = RF_Malloc(ndataParam * sizeof(*suend));
683 prmToCol = RF_Malloc(ndataParam * sizeof(*prmToCol));
684 }
685 if (asmap->failedPDAs[1] &&
686 (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
687 RF_ASSERT(0); /* currently, no support for this situation */
688 ppda = node->params[np - 6].p;
689 ppda2 = node->params[np - 5].p;
690 RF_ASSERT(ppda2->type == RF_PDA_TYPE_PARITY);
691 epda = node->params[np - 4].p;
692 epda2 = node->params[np - 3].p;
693 RF_ASSERT(epda2->type == RF_PDA_TYPE_Q);
694 } else {
695 ppda = node->params[np - 4].p;
696 epda = node->params[np - 3].p;
697 psuoff = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
698 esuoff = rf_StripeUnitOffset(layoutPtr, epda->startSector);
699 RF_ASSERT(psuoff == esuoff);
700 }
701 /*
702 the followings have three goals:
703 1. determine the startSector to begin decoding and endSector to end decoding.
704 2. determine the colume numbers of the two failed disks.
705 3. determine the offset and end offset of the access within each failed stripe unit.
706 */
707 if (nresults == 1) {
708 /* find the startSector to begin decoding */
709 pda = node->results[0];
710 memset(pda->bufPtr, 0, bytesPerSector * pda->numSector);
711 fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda->startSector);
712 fsuend[0] = fsuoff[0] + pda->numSector;
713 fsuoff[1] = 0;
714 fsuend[1] = 0;
715 startSector = fsuoff[0];
716 endSector = fsuend[0];
717
718 /* find out the column of failed disk being accessed */
719 fcol[0] = rf_EUCol(layoutPtr, pda->raidAddress);
720
721 /* find out the other failed colume not accessed */
722 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
723 for (i = 0; i < numDataCol; i++) {
724 npda.raidAddress = sosAddr + (i * secPerSU);
725 (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.col), &(npda.startSector), 0);
726 /* skip over dead disks */
727 if (RF_DEAD_DISK(raidPtr->Disks[npda.col].status))
728 if (i != fcol[0])
729 break;
730 }
731 RF_ASSERT(i < numDataCol);
732 fcol[1] = i;
733 } else {
734 RF_ASSERT(nresults == 2);
735 pda0 = node->results[0];
736 memset(pda0->bufPtr, 0, bytesPerSector * pda0->numSector);
737 pda1 = node->results[1];
738 memset(pda1->bufPtr, 0, bytesPerSector * pda1->numSector);
739 /* determine the failed colume numbers of the two failed
740 * disks. */
741 fcol[0] = rf_EUCol(layoutPtr, pda0->raidAddress);
742 fcol[1] = rf_EUCol(layoutPtr, pda1->raidAddress);
743 /* determine the offset and end offset of the access within
744 * each failed stripe unit. */
745 fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda0->startSector);
746 fsuend[0] = fsuoff[0] + pda0->numSector;
747 fsuoff[1] = rf_StripeUnitOffset(layoutPtr, pda1->startSector);
748 fsuend[1] = fsuoff[1] + pda1->numSector;
749 /* determine the startSector to begin decoding */
750 startSector = RF_MIN(pda0->startSector, pda1->startSector);
751 /* determine the endSector to end decoding */
752 endSector = RF_MAX(fsuend[0], fsuend[1]);
753 }
754 /*
755 assign the beginning sector and the end sector for each parameter
756 find out the corresponding colume # for each parameter
757 */
758 for (prm = 0; prm < ndataParam; prm++) {
759 pda = node->params[prm].p;
760 suoff[prm] = rf_StripeUnitOffset(layoutPtr, pda->startSector);
761 suend[prm] = suoff[prm] + pda->numSector;
762 prmToCol[prm] = rf_EUCol(layoutPtr, pda->raidAddress);
763 }
764 /* 'sector' is the sector for the current decoding algorithm. For each
765 * sector in the failed SU, find out the corresponding parameters that
766 * cover the current sector and that are needed for decoding of this
767 * sector in failed SU. 2. Find out if sector is in the shadow of any
768 * accessed failed SU. If not, malloc a temporary space of a sector in
769 * size. */
770 for (sector = startSector; sector < endSector; sector++) {
771 if (nresults == 2)
772 if (!(fsuoff[0] <= sector && sector < fsuend[0]) && !(fsuoff[1] <= sector && sector < fsuend[1]))
773 continue;
774 for (prm = 0; prm < ndataParam; prm++)
775 if (suoff[prm] <= sector && sector < suend[prm])
776 buf[(prmToCol[prm])] = (char *)((RF_PhysDiskAddr_t *) node->params[prm].p)->bufPtr +
777 rf_RaidAddressToByte(raidPtr, sector - suoff[prm]);
778 /* find out if sector is in the shadow of any accessed failed
779 * SU. If yes, assign dest[0], dest[1] to point at suitable
780 * position of the buffer corresponding to failed SUs. if no,
781 * malloc a temporary space of a sector in size for
782 * destination of decoding. */
783 RF_ASSERT(nresults == 1 || nresults == 2);
784 if (nresults == 1) {
785 dest[0] = (char *)((RF_PhysDiskAddr_t *) node->results[0])->bufPtr + rf_RaidAddressToByte(raidPtr, sector - fsuoff[0]);
786 /* Always malloc temp buffer to dest[1] */
787 dest[1] = RF_Malloc(bytesPerSector);
788 mallc_two = 1;
789 } else {
790 if (fsuoff[0] <= sector && sector < fsuend[0])
791 dest[0] = (char *)((RF_PhysDiskAddr_t *) node->results[0])->bufPtr + rf_RaidAddressToByte(raidPtr, sector - fsuoff[0]);
792 else {
793 dest[0] = RF_Malloc(bytesPerSector);
794 mallc_one = 1;
795 }
796 if (fsuoff[1] <= sector && sector < fsuend[1])
797 dest[1] = (char *)((RF_PhysDiskAddr_t *) node->results[1])->bufPtr + rf_RaidAddressToByte(raidPtr, sector - fsuoff[1]);
798 else {
799 dest[1] = RF_Malloc(bytesPerSector);
800 mallc_two = 1;
801 }
802 RF_ASSERT(mallc_one == 0 || mallc_two == 0);
803 }
804 pbuf = (char *)ppda->bufPtr + rf_RaidAddressToByte(raidPtr, sector - psuoff);
805 ebuf = (char *)epda->bufPtr + rf_RaidAddressToByte(raidPtr, sector - esuoff);
806 /*
807 * After finish finding all needed sectors, call doubleEOdecode function for decoding
808 * one sector to destination.
809 */
810 rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf);
811 /* free all allocated memory, and mark flag to indicate no
812 * memory is being allocated */
813 if (mallc_one == 1)
814 RF_Free(dest[0], bytesPerSector);
815 if (mallc_two == 1)
816 RF_Free(dest[1], bytesPerSector);
817 mallc_one = mallc_two = 0;
818 }
819 RF_Free(buf, numDataCol * sizeof(char *));
820 if (ndataParam != 0) {
821 RF_Free(suoff, ndataParam * sizeof(long));
822 RF_Free(suend, ndataParam * sizeof(long));
823 RF_Free(prmToCol, ndataParam * sizeof(long));
824 }
825 RF_ETIMER_STOP(timer);
826 RF_ETIMER_EVAL(timer);
827 if (tracerec) {
828 tracerec->q_us += RF_ETIMER_VAL_US(timer);
829 }
830 rf_GenericWakeupFunc(node, 0);
831 #if 1
832 return (0); /* XXX is this even close!!?!?!!? GO */
833 #endif
834 }
835
836
837 /* currently, only access of one of the two failed SU is allowed in this function.
838 * also, asmap->numStripeUnitsAccessed is limited to be one, the RaidFrame will break large access into
839 * many accesses of single stripe unit.
840 */
841
842 int
843 rf_EOWriteDoubleRecoveryFunc(RF_DagNode_t *node)
844 {
845 int np = node->numParams;
846 RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p;
847 RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
848 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
849 RF_SectorNum_t sector;
850 RF_RowCol_t col, scol;
851 int prm, i, j;
852 RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
853 unsigned sosAddr;
854 unsigned bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
855 RF_int64 numbytes;
856 RF_SectorNum_t startSector, endSector;
857 RF_PhysDiskAddr_t *ppda, *epda, *pda, *fpda, npda;
858 RF_RowCol_t fcol[2], numDataCol = layoutPtr->numDataCol;
859 char **buf; /* buf[0], buf[1], buf[2], ...etc. point to
860 * buffer storing data read from col0, col1,
861 * col2 */
862 char *ebuf, *pbuf, *dest[2], *olddata[2];
863 RF_Etimer_t timer;
864 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
865
866 RF_ASSERT(asmap->numDataFailed == 1); /* currently only support this
867 * case, the other failed SU
868 * is not being accessed */
869 RF_ETIMER_START(timer);
870 buf = RF_Malloc(numDataCol * sizeof(*buf));
871
872 ppda = node->results[0];/* Instead of being buffers, node->results[0]
873 * and [1] are Ppda and Epda */
874 epda = node->results[1];
875 fpda = asmap->failedPDAs[0];
876
877 /* First, recovery the failed old SU using EvenOdd double decoding */
878 /* determine the startSector and endSector for decoding */
879 startSector = rf_StripeUnitOffset(layoutPtr, fpda->startSector);
880 endSector = startSector + fpda->numSector;
881 /* Assign buf[col] pointers to point to each non-failed colume and
882 * initialize the pbuf and ebuf to point at the beginning of each
883 * source buffers and destination buffers */
884 for (prm = 0; prm < numDataCol - 2; prm++) {
885 pda = (RF_PhysDiskAddr_t *) node->params[prm].p;
886 col = rf_EUCol(layoutPtr, pda->raidAddress);
887 buf[col] = pda->bufPtr;
888 }
889 /* pbuf and ebuf: they will change values as double recovery decoding
890 * goes on */
891 pbuf = ppda->bufPtr;
892 ebuf = epda->bufPtr;
893 /* find out the logical colume numbers in the encoding matrix of the
894 * two failed columes */
895 fcol[0] = rf_EUCol(layoutPtr, fpda->raidAddress);
896
897 /* find out the other failed colume not accessed this time */
898 sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
899 for (i = 0; i < numDataCol; i++) {
900 npda.raidAddress = sosAddr + (i * secPerSU);
901 (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.col), &(npda.startSector), 0);
902 /* skip over dead disks */
903 if (RF_DEAD_DISK(raidPtr->Disks[npda.col].status))
904 if (i != fcol[0])
905 break;
906 }
907 RF_ASSERT(i < numDataCol);
908 fcol[1] = i;
909 /* assign temporary space to put recovered failed SU */
910 numbytes = fpda->numSector * bytesPerSector;
911 olddata[0] = RF_Malloc(numbytes);
912 olddata[1] = RF_Malloc(numbytes);
913 dest[0] = olddata[0];
914 dest[1] = olddata[1];
915 /* Begin the recovery decoding, initially buf[j], ebuf, pbuf, dest[j]
916 * have already pointed at the beginning of each source buffers and
917 * destination buffers */
918 for (sector = startSector, i = 0; sector < endSector; sector++, i++) {
919 rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf);
920 for (j = 0; j < numDataCol; j++)
921 if ((j != fcol[0]) && (j != fcol[1]))
922 buf[j] += bytesPerSector;
923 dest[0] += bytesPerSector;
924 dest[1] += bytesPerSector;
925 ebuf += bytesPerSector;
926 pbuf += bytesPerSector;
927 }
928 /* after recovery, the buffer pointed by olddata[0] is the old failed
929 * data. With new writing data and this old data, use small write to
930 * calculate the new redundant informations */
931 /* node->params[ 0, ... PDAPerDisk * (numDataCol - 2)-1 ] are Pdas of
932 * Rrd; params[ PDAPerDisk*(numDataCol - 2), ... PDAPerDisk*numDataCol
933 * -1 ] are Pdas of Rp, ( Rp2 ), Re, ( Re2 ) ; params[
934 * PDAPerDisk*numDataCol, ... PDAPerDisk*numDataCol
935 * +asmap->numStripeUnitsAccessed -asmap->numDataFailed-1] are Pdas of
936 * wudNodes; For current implementation, we assume the simplest case:
937 * asmap->numStripeUnitsAccessed == 1 and asmap->numDataFailed == 1
938 * ie. PDAPerDisk = 1 then node->params[numDataCol] must be the new
939 * data to be writen to the failed disk. We first bxor the new data
940 * into the old recovered data, then do the same things as small
941 * write. */
942
943 rf_bxor(((RF_PhysDiskAddr_t *) node->params[numDataCol].p)->bufPtr, olddata[0], numbytes);
944 /* do new 'E' calculation */
945 /* find out the corresponding colume in encoding matrix for write
946 * colume to be encoded into redundant disk 'E' */
947 scol = rf_EUCol(layoutPtr, fpda->raidAddress);
948 /* olddata[0] now is source buffer pointer; epda->bufPtr is the dest
949 * buffer pointer */
950 rf_e_encToBuf(raidPtr, scol, olddata[0], RF_EO_MATRIX_DIM - 2, epda->bufPtr, fpda->numSector);
951
952 /* do new 'P' calculation */
953 rf_bxor(olddata[0], ppda->bufPtr, numbytes);
954 /* Free the allocated buffer */
955 RF_Free(olddata[0], numbytes);
956 RF_Free(olddata[1], numbytes);
957 RF_Free(buf, numDataCol * sizeof(char *));
958
959 RF_ETIMER_STOP(timer);
960 RF_ETIMER_EVAL(timer);
961 if (tracerec) {
962 tracerec->q_us += RF_ETIMER_VAL_US(timer);
963 }
964 rf_GenericWakeupFunc(node, 0);
965 return (0);
966 }
967 #endif /* RF_INCLUDE_EVENODD > 0 */
968