rf_pqdegdags.c revision 1.1 1 1.1 oster /* $NetBSD: rf_pqdegdags.c,v 1.1 1998/11/13 04:20:32 oster Exp $ */
2 1.1 oster /*
3 1.1 oster * Copyright (c) 1995 Carnegie-Mellon University.
4 1.1 oster * All rights reserved.
5 1.1 oster *
6 1.1 oster * Author: Daniel Stodolsky
7 1.1 oster *
8 1.1 oster * Permission to use, copy, modify and distribute this software and
9 1.1 oster * its documentation is hereby granted, provided that both the copyright
10 1.1 oster * notice and this permission notice appear in all copies of the
11 1.1 oster * software, derivative works or modified versions, and any portions
12 1.1 oster * thereof, and that both notices appear in supporting documentation.
13 1.1 oster *
14 1.1 oster * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 1.1 oster * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 1.1 oster * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 1.1 oster *
18 1.1 oster * Carnegie Mellon requests users of this software to return to
19 1.1 oster *
20 1.1 oster * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 1.1 oster * School of Computer Science
22 1.1 oster * Carnegie Mellon University
23 1.1 oster * Pittsburgh PA 15213-3890
24 1.1 oster *
25 1.1 oster * any improvements or extensions that they make and grant Carnegie the
26 1.1 oster * rights to redistribute these changes.
27 1.1 oster */
28 1.1 oster
29 1.1 oster /*
30 1.1 oster * rf_pqdegdags.c
31 1.1 oster * Degraded mode dags for double fault cases.
32 1.1 oster */
33 1.1 oster
34 1.1 oster /*
35 1.1 oster * :
36 1.1 oster * Log: rf_pqdegdags.c,v
37 1.1 oster * Revision 1.31 1996/11/05 21:10:40 jimz
38 1.1 oster * failed pda generalization
39 1.1 oster *
40 1.1 oster * Revision 1.30 1996/07/31 16:30:05 jimz
41 1.1 oster * asm/asmap fix
42 1.1 oster *
43 1.1 oster * Revision 1.29 1996/07/31 15:35:15 jimz
44 1.1 oster * evenodd changes; bugfixes for double-degraded archs, generalize
45 1.1 oster * some formerly PQ-only functions
46 1.1 oster *
47 1.1 oster * Revision 1.28 1996/07/28 20:31:39 jimz
48 1.1 oster * i386netbsd port
49 1.1 oster * true/false fixup
50 1.1 oster *
51 1.1 oster * Revision 1.27 1996/07/27 23:36:08 jimz
52 1.1 oster * Solaris port of simulator
53 1.1 oster *
54 1.1 oster * Revision 1.26 1996/07/22 19:52:16 jimz
55 1.1 oster * switched node params to RF_DagParam_t, a union of
56 1.1 oster * a 64-bit int and a void *, for better portability
57 1.1 oster * attempted hpux port, but failed partway through for
58 1.1 oster * lack of a single C compiler capable of compiling all
59 1.1 oster * source files
60 1.1 oster *
61 1.1 oster * Revision 1.25 1996/06/09 02:36:46 jimz
62 1.1 oster * lots of little crufty cleanup- fixup whitespace
63 1.1 oster * issues, comment #ifdefs, improve typing in some
64 1.1 oster * places (esp size-related)
65 1.1 oster *
66 1.1 oster * Revision 1.24 1996/06/07 22:26:27 jimz
67 1.1 oster * type-ify which_ru (RF_ReconUnitNum_t)
68 1.1 oster *
69 1.1 oster * Revision 1.23 1996/06/07 21:33:04 jimz
70 1.1 oster * begin using consistent types for sector numbers,
71 1.1 oster * stripe numbers, row+col numbers, recon unit numbers
72 1.1 oster *
73 1.1 oster * Revision 1.22 1996/06/02 17:31:48 jimz
74 1.1 oster * Moved a lot of global stuff into array structure, where it belongs.
75 1.1 oster * Fixed up paritylogging, pss modules in this manner. Some general
76 1.1 oster * code cleanup. Removed lots of dead code, some dead files.
77 1.1 oster *
78 1.1 oster * Revision 1.21 1996/05/31 22:26:54 jimz
79 1.1 oster * fix a lot of mapping problems, memory allocation problems
80 1.1 oster * found some weird lock issues, fixed 'em
81 1.1 oster * more code cleanup
82 1.1 oster *
83 1.1 oster * Revision 1.20 1996/05/30 12:59:18 jimz
84 1.1 oster * make etimer happier, more portable
85 1.1 oster *
86 1.1 oster * Revision 1.19 1996/05/30 11:29:41 jimz
87 1.1 oster * Numerous bug fixes. Stripe lock release code disagreed with the taking code
88 1.1 oster * about when stripes should be locked (I made it consistent: no parity, no lock)
89 1.1 oster * There was a lot of extra serialization of I/Os which I've removed- a lot of
90 1.1 oster * it was to calculate values for the cache code, which is no longer with us.
91 1.1 oster * More types, function, macro cleanup. Added code to properly quiesce the array
92 1.1 oster * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
93 1.1 oster * before. Fixed memory allocation, freeing bugs.
94 1.1 oster *
95 1.1 oster * Revision 1.18 1996/05/27 18:56:37 jimz
96 1.1 oster * more code cleanup
97 1.1 oster * better typing
98 1.1 oster * compiles in all 3 environments
99 1.1 oster *
100 1.1 oster * Revision 1.17 1996/05/24 22:17:04 jimz
101 1.1 oster * continue code + namespace cleanup
102 1.1 oster * typed a bunch of flags
103 1.1 oster *
104 1.1 oster * Revision 1.16 1996/05/24 04:28:55 jimz
105 1.1 oster * release cleanup ckpt
106 1.1 oster *
107 1.1 oster * Revision 1.15 1996/05/23 21:46:35 jimz
108 1.1 oster * checkpoint in code cleanup (release prep)
109 1.1 oster * lots of types, function names have been fixed
110 1.1 oster *
111 1.1 oster * Revision 1.14 1996/05/23 00:33:23 jimz
112 1.1 oster * code cleanup: move all debug decls to rf_options.c, all extern
113 1.1 oster * debug decls to rf_options.h, all debug vars preceded by rf_
114 1.1 oster *
115 1.1 oster * Revision 1.13 1996/05/18 19:51:34 jimz
116 1.1 oster * major code cleanup- fix syntax, make some types consistent,
117 1.1 oster * add prototypes, clean out dead code, et cetera
118 1.1 oster *
119 1.1 oster * Revision 1.12 1996/05/08 21:01:24 jimz
120 1.1 oster * fixed up enum type names that were conflicting with other
121 1.1 oster * enums and function names (ie, "panic")
122 1.1 oster * future naming trends will be towards RF_ and rf_ for
123 1.1 oster * everything raidframe-related
124 1.1 oster *
125 1.1 oster * Revision 1.11 1996/05/03 19:47:50 wvcii
126 1.1 oster * removed include of rf_redstripe.h
127 1.1 oster *
128 1.1 oster * Revision 1.10 1995/12/12 18:10:06 jimz
129 1.1 oster * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
130 1.1 oster * fix 80-column brain damage in comments
131 1.1 oster *
132 1.1 oster * Revision 1.9 1995/11/30 16:17:57 wvcii
133 1.1 oster * added copyright info
134 1.1 oster *
135 1.1 oster * Revision 1.8 1995/11/07 15:33:25 wvcii
136 1.1 oster * dag creation routines now generate term node
137 1.1 oster * added asserts
138 1.1 oster * encoded commit point nodes, antecedence types into dags
139 1.1 oster * didn't add commit barrier - the code is a mess and needs to
140 1.1 oster * be cleand up first
141 1.1 oster *
142 1.1 oster */
143 1.1 oster
144 1.1 oster #include "rf_archs.h"
145 1.1 oster
146 1.1 oster #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
147 1.1 oster
148 1.1 oster #include "rf_types.h"
149 1.1 oster #include "rf_raid.h"
150 1.1 oster #include "rf_dag.h"
151 1.1 oster #include "rf_dagfuncs.h"
152 1.1 oster #include "rf_dagutils.h"
153 1.1 oster #include "rf_etimer.h"
154 1.1 oster #include "rf_acctrace.h"
155 1.1 oster #include "rf_general.h"
156 1.1 oster #include "rf_pqdegdags.h"
157 1.1 oster #include "rf_pq.h"
158 1.1 oster #include "rf_sys.h"
159 1.1 oster
160 1.1 oster static void applyPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda, RF_PhysDiskAddr_t *ppda,
161 1.1 oster RF_PhysDiskAddr_t *qpda, void *bp);
162 1.1 oster
/*
 * Two data drives have failed, and we are doing a read that covers one of
 * them.  We may also be reading some of the surviving drives.
 *
 *****************************************************************************
 *
 * Creates a DAG to perform a degraded-mode read of data within one stripe.
 * This DAG is as follows:
 *
 *                                      Hdr
 *                                       |
 *                                     Block
 *                       /         /           \         \     \   \
 *                     Rud  ...  Rud         Rrd  ...  Rrd    Rp  Rq
 *                     | \       | \         | \       | \    | \ | \
 *
 *                                 |                 |
 *                              Unblock              X
 *                                  \               /
 *                                   ------ T ------
 *
 * Each R node is a successor of the Block node.
 * One successor arc from each R node goes to Unblock (U), and the other to X.
 * There is one Rud for each chunk of surviving user data requested by the
 * user, and one Rrd for each chunk of surviving user data _not_ being read
 * by the user.
 * R = read, ud = user data, rd = recovery (surviving) data, p = P data,
 * q = Q data, X = pq recovery node, T = terminate
 *
 * The block & unblock nodes are leftovers from a previous version.  They
 * do nothing, but I haven't deleted them because it would be a tremendous
 * effort to put them back in.
 *
 * Note: The target buffer for the XOR node is set to the actual user buffer
 * where the failed data is supposed to end up.  This buffer is zeroed by the
 * code here.  Thus, if you create a degraded read dag, use it, and then
 * re-use it, you have to be sure to zero the target buffer prior to the
 * re-use.
 *
 * Every buffer read is passed to the pq recovery node, whose job it is to
 * sort out what's needed and what's not.
 *****************************************************************************/
/*
 * Helper macros for building disk read nodes.
 *
 * INIT_DISK_NODE(node, name)
 *	Initialize the disk read node pointed to by `node` with 2 successors
 *	and one predecessor, then wire it between the surrounding block,
 *	unblock and recovery nodes.  Relies on `dag_h`, `allocList`,
 *	`unblockNode`, `recoveryNode` and `blockNode` being in scope at the
 *	point of use.
 *
 * DISK_NODE_PARAMS(_node_, _p_)
 *	Fill the four parameters of a disk node from the pda `_p_`.  Note
 *	that `_node_` is the node itself (accessed with `.`), not a pointer.
 *	Relies on `parityStripeID` and `which_ru` being in scope.
 *
 * DISK_NODE_PDA(node)
 *	Fetch parameter 0 (the pda stored by DISK_NODE_PARAMS) from a node
 *	pointer.
 *
 * The multi-statement macros are wrapped in do { } while (0) so that each
 * expands to exactly one statement and can be used safely as the body of an
 * unbraced if/else.  Every macro argument is parenthesized.
 */
#define INIT_DISK_NODE(node,name) \
	do { \
		rf_InitNode((node), rf_wait, RF_FALSE, rf_DiskReadFunc, \
		    rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2, 1, 4, 0, \
		    dag_h, (name), allocList); \
		(node)->succedents[0] = unblockNode; \
		(node)->succedents[1] = recoveryNode; \
		(node)->antecedents[0] = blockNode; \
		(node)->antType[0] = rf_control; \
	} while (0)

#define DISK_NODE_PARAMS(_node_,_p_) \
	do { \
		(_node_).params[0].p = (_p_); \
		(_node_).params[1].p = (_p_)->bufPtr; \
		(_node_).params[2].v = parityStripeID; \
		(_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); \
	} while (0)

#define DISK_NODE_PDA(node) ((node)->params[0].p)
219 1.1 oster
220 1.1 oster RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
221 1.1 oster {
222 1.1 oster rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
223 1.1 oster "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
224 1.1 oster }
225 1.1 oster
226 1.1 oster static void applyPDA(raidPtr,pda,ppda,qpda, bp)
227 1.1 oster RF_Raid_t *raidPtr;
228 1.1 oster RF_PhysDiskAddr_t *pda;
229 1.1 oster RF_PhysDiskAddr_t *ppda;
230 1.1 oster RF_PhysDiskAddr_t *qpda;
231 1.1 oster void *bp;
232 1.1 oster {
233 1.1 oster RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
234 1.1 oster RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
235 1.1 oster RF_SectorCount_t s0len = ppda->numSector, len;
236 1.1 oster RF_SectorNum_t suoffset;
237 1.1 oster unsigned coeff;
238 1.1 oster char *pbuf = ppda->bufPtr;
239 1.1 oster char *qbuf = qpda->bufPtr;
240 1.1 oster char *buf;
241 1.1 oster int delta;
242 1.1 oster
243 1.1 oster suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
244 1.1 oster len = pda->numSector;
245 1.1 oster /* see if pda intersects a recovery pda */
246 1.1 oster if ((suoffset < s0off+s0len) && ( suoffset+len > s0off))
247 1.1 oster {
248 1.1 oster buf = pda->bufPtr;
249 1.1 oster coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),pda->raidAddress);
250 1.1 oster coeff = (coeff % raidPtr->Layout.numDataCol);
251 1.1 oster
252 1.1 oster if (suoffset < s0off)
253 1.1 oster {
254 1.1 oster delta = s0off - suoffset;
255 1.1 oster buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),delta);
256 1.1 oster suoffset = s0off;
257 1.1 oster len -= delta;
258 1.1 oster }
259 1.1 oster if (suoffset > s0off)
260 1.1 oster {
261 1.1 oster delta = suoffset - s0off;
262 1.1 oster pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),delta);
263 1.1 oster qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),delta);
264 1.1 oster }
265 1.1 oster if ((suoffset + len) > (s0len + s0off))
266 1.1 oster len = s0len + s0off - suoffset;
267 1.1 oster
268 1.1 oster /* src, dest, len */
269 1.1 oster rf_bxor(buf,pbuf,rf_RaidAddressToByte(raidPtr,len), bp);
270 1.1 oster
271 1.1 oster /* dest, src, len, coeff */
272 1.1 oster rf_IncQ((unsigned long *)qbuf,(unsigned long *)buf,rf_RaidAddressToByte(raidPtr,len),coeff);
273 1.1 oster }
274 1.1 oster }
/*
 * Recover data in the case of a double failure.  There can be two
 * result buffers, one for each chunk of data trying to be recovered.
 * The params are pda's that have not been range restricted or otherwise
 * politely massaged - this should be done here.  The last params are the
 * pdas of P and Q, followed by the raidPtr and the asmap.  The list can
 * look like
 *
 *	pda, pda, ... , p pda, q pda, raidptr, asm
 *
 * or
 *
 *	pda, pda, ... , p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm
 *
 * depending on whether two chunks of recovery data were required.
 *
 * The second condition only arises if there are two failed buffers
 * whose lengths do not add up to a stripe unit.
 */
293 1.1 oster
294 1.1 oster
295 1.1 oster int rf_PQDoubleRecoveryFunc(node)
296 1.1 oster RF_DagNode_t *node;
297 1.1 oster {
298 1.1 oster int np = node->numParams;
299 1.1 oster RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np-1].p;
300 1.1 oster RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-2].p;
301 1.1 oster RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout);
302 1.1 oster int d, i;
303 1.1 oster unsigned coeff;
304 1.1 oster RF_RaidAddr_t sosAddr, suoffset;
305 1.1 oster RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
306 1.1 oster int two = 0;
307 1.1 oster RF_PhysDiskAddr_t *ppda,*ppda2,*qpda,*qpda2,*pda,npda;
308 1.1 oster char *buf;
309 1.1 oster int numDataCol = layoutPtr->numDataCol;
310 1.1 oster RF_Etimer_t timer;
311 1.1 oster RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
312 1.1 oster
313 1.1 oster RF_ETIMER_START(timer);
314 1.1 oster
315 1.1 oster if (asmap->failedPDAs[1] &&
316 1.1 oster (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU))
317 1.1 oster {
318 1.1 oster RF_ASSERT(0);
319 1.1 oster ppda = node->params[np-6].p;
320 1.1 oster ppda2 = node->params[np-5].p;
321 1.1 oster qpda = node->params[np-4].p;
322 1.1 oster qpda2 = node->params[np-3].p;
323 1.1 oster d = (np-6);
324 1.1 oster two = 1;
325 1.1 oster }
326 1.1 oster else
327 1.1 oster {
328 1.1 oster ppda = node->params[np-4].p;
329 1.1 oster qpda = node->params[np-3].p;
330 1.1 oster d = (np-4);
331 1.1 oster }
332 1.1 oster
333 1.1 oster for (i=0; i < d; i++)
334 1.1 oster {
335 1.1 oster pda = node->params[i].p;
336 1.1 oster buf = pda->bufPtr;
337 1.1 oster suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
338 1.1 oster len = pda->numSector;
339 1.1 oster coeff = rf_RaidAddressToStripeUnitID(layoutPtr,pda->raidAddress);
340 1.1 oster /* compute the data unit offset within the column */
341 1.1 oster coeff = (coeff % raidPtr->Layout.numDataCol);
342 1.1 oster /* see if pda intersects a recovery pda */
343 1.1 oster applyPDA(raidPtr,pda,ppda,qpda,node->dagHdr->bp);
344 1.1 oster if (two)
345 1.1 oster applyPDA(raidPtr,pda,ppda,qpda,node->dagHdr->bp);
346 1.1 oster }
347 1.1 oster
348 1.1 oster /* ok, we got the parity back to the point where we can recover.
349 1.1 oster We now need to determine the coeff of the columns that need to be
350 1.1 oster recovered. We can also only need to recover a single stripe unit.
351 1.1 oster */
352 1.1 oster
353 1.1 oster if (asmap->failedPDAs[1] == NULL)
354 1.1 oster { /* only a single stripe unit to recover. */
355 1.1 oster pda = asmap->failedPDAs[0];
356 1.1 oster sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
357 1.1 oster /* need to determine the column of the other failed disk */
358 1.1 oster coeff = rf_RaidAddressToStripeUnitID(layoutPtr,pda->raidAddress);
359 1.1 oster /* compute the data unit offset within the column */
360 1.1 oster coeff = (coeff % raidPtr->Layout.numDataCol);
361 1.1 oster for (i=0; i < numDataCol; i++)
362 1.1 oster {
363 1.1 oster npda.raidAddress = sosAddr + (i * secPerSU);
364 1.1 oster (raidPtr->Layout.map->MapSector)(raidPtr,npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
365 1.1 oster /* skip over dead disks */
366 1.1 oster if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
367 1.1 oster if (i != coeff) break;
368 1.1 oster }
369 1.1 oster RF_ASSERT (i < numDataCol);
370 1.1 oster RF_ASSERT (two==0);
371 1.1 oster /* recover the data. Since we need only want to recover one column, we overwrite the
372 1.1 oster parity with the other one. */
373 1.1 oster if (coeff < i) /* recovering 'a' */
374 1.1 oster rf_PQ_recover((unsigned long *)ppda->bufPtr,(unsigned long *)qpda->bufPtr,(unsigned long *)pda->bufPtr,(unsigned long *)ppda->bufPtr,rf_RaidAddressToByte(raidPtr,pda->numSector), coeff, i);
375 1.1 oster else /* recovering 'b' */
376 1.1 oster rf_PQ_recover((unsigned long *)ppda->bufPtr,(unsigned long *)qpda->bufPtr,(unsigned long *)ppda->bufPtr,(unsigned long *)pda->bufPtr,rf_RaidAddressToByte(raidPtr,pda->numSector), i, coeff);
377 1.1 oster }
378 1.1 oster else
379 1.1 oster RF_PANIC();
380 1.1 oster
381 1.1 oster RF_ETIMER_STOP(timer);
382 1.1 oster RF_ETIMER_EVAL(timer);
383 1.1 oster if (tracerec)
384 1.1 oster tracerec->q_us += RF_ETIMER_VAL_US(timer);
385 1.1 oster rf_GenericWakeupFunc(node,0);
386 1.1 oster return(0);
387 1.1 oster }
388 1.1 oster
389 1.1 oster int rf_PQWriteDoubleRecoveryFunc(node)
390 1.1 oster RF_DagNode_t *node;
391 1.1 oster {
392 1.1 oster /* The situation:
393 1.1 oster
394 1.1 oster We are doing a write that hits only one
395 1.1 oster failed data unit.
396 1.1 oster The other failed data unit is not being overwritten, so
397 1.1 oster we need to generate it.
398 1.1 oster
399 1.1 oster For the moment, we assume all the nonfailed data being
400 1.1 oster written is in the shadow of the failed data unit.
401 1.1 oster (i.e,, either a single data unit write or the entire
402 1.1 oster failed stripe unit is being overwritten. )
403 1.1 oster
404 1.1 oster Recovery strategy:
405 1.1 oster apply the recovery data to the parity and q.
406 1.1 oster Use P & Q to recover the second failed data unit in P.
407 1.1 oster Zero fill Q, then apply the recovered data to p.
408 1.1 oster Then apply the data being written to the failed drive.
409 1.1 oster Then walk through the surviving drives, applying new data
410 1.1 oster when it exists, othewise the recovery data. Quite a mess.
411 1.1 oster
412 1.1 oster
413 1.1 oster The params
414 1.1 oster
415 1.1 oster read pda0, read pda1, ... read pda (numDataCol-3),
416 1.1 oster write pda0, ... , write pda (numStripeUnitAccess - numDataFailed),
417 1.1 oster failed pda, raidPtr, asmap
418 1.1 oster */
419 1.1 oster
420 1.1 oster int np = node->numParams;
421 1.1 oster RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np-1].p;
422 1.1 oster RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-2].p;
423 1.1 oster RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout);
424 1.1 oster int i;
425 1.1 oster RF_RaidAddr_t sosAddr;
426 1.1 oster unsigned coeff;
427 1.1 oster RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
428 1.1 oster RF_PhysDiskAddr_t *ppda,*qpda,*pda,npda;
429 1.1 oster int numDataCol = layoutPtr->numDataCol;
430 1.1 oster RF_Etimer_t timer;
431 1.1 oster RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
432 1.1 oster
433 1.1 oster RF_ASSERT(node->numResults == 2);
434 1.1 oster RF_ASSERT(asmap->failedPDAs[1] == NULL);
435 1.1 oster RF_ETIMER_START(timer);
436 1.1 oster ppda = node->results[0];
437 1.1 oster qpda = node->results[1];
438 1.1 oster /* apply the recovery data */
439 1.1 oster for (i=0; i < numDataCol-2; i++)
440 1.1 oster applyPDA(raidPtr,node->params[i].p,ppda,qpda, node->dagHdr->bp);
441 1.1 oster
442 1.1 oster /* determine the other failed data unit */
443 1.1 oster pda = asmap->failedPDAs[0];
444 1.1 oster sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
445 1.1 oster /* need to determine the column of the other failed disk */
446 1.1 oster coeff = rf_RaidAddressToStripeUnitID(layoutPtr,pda->raidAddress);
447 1.1 oster /* compute the data unit offset within the column */
448 1.1 oster coeff = (coeff % raidPtr->Layout.numDataCol);
449 1.1 oster for (i=0; i < numDataCol; i++)
450 1.1 oster {
451 1.1 oster npda.raidAddress = sosAddr + (i * secPerSU);
452 1.1 oster (raidPtr->Layout.map->MapSector)(raidPtr,npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
453 1.1 oster /* skip over dead disks */
454 1.1 oster if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
455 1.1 oster if (i != coeff) break;
456 1.1 oster }
457 1.1 oster RF_ASSERT (i < numDataCol);
458 1.1 oster /* recover the data. The column we want to recover we write over the parity.
459 1.1 oster The column we don't care about we dump in q. */
460 1.1 oster if (coeff < i) /* recovering 'a' */
461 1.1 oster rf_PQ_recover((unsigned long *)ppda->bufPtr,(unsigned long *)qpda->bufPtr,(unsigned long *)ppda->bufPtr,(unsigned long *)qpda->bufPtr,rf_RaidAddressToByte(raidPtr,pda->numSector), coeff, i);
462 1.1 oster else /* recovering 'b' */
463 1.1 oster rf_PQ_recover((unsigned long *)ppda->bufPtr,(unsigned long *)qpda->bufPtr,(unsigned long *)qpda->bufPtr,(unsigned long *)ppda->bufPtr,rf_RaidAddressToByte(raidPtr,pda->numSector), i, coeff);
464 1.1 oster
465 1.1 oster /* OK. The valid data is in P. Zero fill Q, then inc it into it. */
466 1.1 oster bzero(qpda->bufPtr,rf_RaidAddressToByte(raidPtr,qpda->numSector));
467 1.1 oster rf_IncQ((unsigned long *)qpda->bufPtr,(unsigned long *)ppda->bufPtr,rf_RaidAddressToByte(raidPtr,qpda->numSector),i);
468 1.1 oster
469 1.1 oster /* now apply all the write data to the buffer */
470 1.1 oster /* single stripe unit write case: the failed data is only thing we are writing. */
471 1.1 oster RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
472 1.1 oster /* dest, src, len, coeff */
473 1.1 oster rf_IncQ((unsigned long *)qpda->bufPtr,(unsigned long *)asmap->failedPDAs[0]->bufPtr,rf_RaidAddressToByte(raidPtr,qpda->numSector),coeff);
474 1.1 oster rf_bxor(asmap->failedPDAs[0]->bufPtr,ppda->bufPtr,rf_RaidAddressToByte(raidPtr,ppda->numSector),node->dagHdr->bp);
475 1.1 oster
476 1.1 oster /* now apply all the recovery data */
477 1.1 oster for (i=0; i < numDataCol-2; i++)
478 1.1 oster applyPDA(raidPtr,node->params[i].p,ppda,qpda, node->dagHdr->bp);
479 1.1 oster
480 1.1 oster RF_ETIMER_STOP(timer);
481 1.1 oster RF_ETIMER_EVAL(timer);
482 1.1 oster if (tracerec)
483 1.1 oster tracerec->q_us += RF_ETIMER_VAL_US(timer);
484 1.1 oster
485 1.1 oster rf_GenericWakeupFunc(node,0);
486 1.1 oster return(0);
487 1.1 oster }
488 1.1 oster
/*
 * rf_PQ_DDLargeWrite
 *
 * DAG creation function for the double-degraded large write case.
 * Not implemented: the body does nothing but call RF_PANIC().
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
{
	/* unimplemented */
	RF_PANIC();
}
493 1.1 oster
/*
 * Two lost data unit write case.
 *
 * There are really two cases here:
 *
 * (1) The write completely covers the two lost data units.
 *     In that case, a reconstruct write that doesn't write the
 *     failed data units will do the correct thing.  So in this case,
 *     the dag looks like
 *
 *         full stripe read of surviving data units (not being overwritten)
 *         write new data (ignoring failed units), compute P&Q
 *         write P&Q
 *
 * (2) The write does not completely cover both failed data units
 *     (but touches at least one of them).  Then we need to do the
 *     equivalent of a reconstruct read to recover the missing data
 *     unit from the other stripe.
 *
 *     For any data we are writing that is not in the "shadow"
 *     of the failed units, we need to do a four cycle update.
 *     PANIC on this case, for now.
 */
519 1.1 oster
520 1.1 oster RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
521 1.1 oster {
522 1.1 oster RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
523 1.1 oster RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
524 1.1 oster int sum;
525 1.1 oster int nf = asmap->numDataFailed;
526 1.1 oster
527 1.1 oster sum = asmap->failedPDAs[0]->numSector;
528 1.1 oster if (nf == 2)
529 1.1 oster sum += asmap->failedPDAs[1]->numSector;
530 1.1 oster
531 1.1 oster if ((nf == 2) && ( sum == (2*sectorsPerSU)))
532 1.1 oster {
533 1.1 oster /* large write case */
534 1.1 oster rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
535 1.1 oster return;
536 1.1 oster }
537 1.1 oster
538 1.1 oster
539 1.1 oster if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU))
540 1.1 oster {
541 1.1 oster /* small write case, no user data not in shadow */
542 1.1 oster rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
543 1.1 oster return;
544 1.1 oster }
545 1.1 oster RF_PANIC();
546 1.1 oster }
547 1.1 oster
548 1.1 oster RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
549 1.1 oster {
550 1.1 oster rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
551 1.1 oster }
552 1.1 oster
553 1.1 oster #endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */
554