/* $NetBSD: rf_dagfuncs.c,v 1.2 1999/01/26 02:33:53 oster Exp $ */
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Mark Holland, William V. Courtright II
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
/*
 * dagfuncs.c -- DAG node execution routines
 *
 * Rules:
 * 1. Every DAG execution function must eventually cause node->status to
 *    get set to "good" or "bad", and "FinishNode" to be called.  In the
 *    case of nodes that complete immediately (xor, NullNodeFunc, etc),
 *    the node execution function can do these two things directly.  In
 *    the case of nodes that have to wait for some event (a disk read to
 *    complete, a lock to be released, etc) to occur before they can
 *    complete, this is typically achieved by having whatever module
 *    is doing the operation call GenericWakeupFunc upon completion.
 * 2. DAG execution functions should check the status in the DAG header
 *    and NOP out their operations if the status is not "enable".  However,
 *    execution functions that release resources must be sure to release
 *    them even when they NOP out the function that would use them.
 *    Functions that acquire resources should go ahead and acquire them
 *    even when they NOP, so that a downstream release node will not have
 *    to check to find out whether or not the acquire was suppressed.
 */
49
50 #include <sys/ioctl.h>
51 #include <sys/param.h>
52
53 #include "rf_archs.h"
54 #include "rf_raid.h"
55 #include "rf_dag.h"
56 #include "rf_layout.h"
57 #include "rf_etimer.h"
58 #include "rf_acctrace.h"
59 #include "rf_diskqueue.h"
60 #include "rf_dagfuncs.h"
61 #include "rf_general.h"
62 #include "rf_engine.h"
63 #include "rf_dagutils.h"
64
65 #include "rf_kintf.h"
66
67 #if RF_INCLUDE_PARITYLOGGING > 0
68 #include "rf_paritylog.h"
69 #endif /* RF_INCLUDE_PARITYLOGGING > 0 */
70
71 int (*rf_DiskReadFunc)(RF_DagNode_t *);
72 int (*rf_DiskWriteFunc)(RF_DagNode_t *);
73 int (*rf_DiskReadUndoFunc)(RF_DagNode_t *);
74 int (*rf_DiskWriteUndoFunc)(RF_DagNode_t *);
75 int (*rf_DiskUnlockFunc)(RF_DagNode_t *);
76 int (*rf_DiskUnlockUndoFunc)(RF_DagNode_t *);
77 int (*rf_RegularXorUndoFunc)(RF_DagNode_t *);
78 int (*rf_SimpleXorUndoFunc)(RF_DagNode_t *);
79 int (*rf_RecoveryXorUndoFunc)(RF_DagNode_t *);
80
81 /*****************************************************************************************
82 * main (only) configuration routine for this module
83 ****************************************************************************************/
84 int rf_ConfigureDAGFuncs(listp)
85 RF_ShutdownList_t **listp;
86 {
87 RF_ASSERT( ((sizeof(long)==8) && RF_LONGSHIFT==3) || ((sizeof(long)==4) && RF_LONGSHIFT==2) );
88 rf_DiskReadFunc = rf_DiskReadFuncForThreads;
89 rf_DiskReadUndoFunc = rf_DiskUndoFunc;
90 rf_DiskWriteFunc = rf_DiskWriteFuncForThreads;
91 rf_DiskWriteUndoFunc = rf_DiskUndoFunc;
92 rf_DiskUnlockFunc = rf_DiskUnlockFuncForThreads;
93 rf_DiskUnlockUndoFunc = rf_NullNodeUndoFunc;
94 rf_RegularXorUndoFunc = rf_NullNodeUndoFunc;
95 rf_SimpleXorUndoFunc = rf_NullNodeUndoFunc;
96 rf_RecoveryXorUndoFunc = rf_NullNodeUndoFunc;
97 return(0);
98 }
99
100
101
102 /*****************************************************************************************
103 * the execution function associated with a terminate node
104 ****************************************************************************************/
105 int rf_TerminateFunc(node)
106 RF_DagNode_t *node;
107 {
108 RF_ASSERT(node->dagHdr->numCommits == node->dagHdr->numCommitNodes);
109 node->status = rf_good;
110 return(rf_FinishNode(node, RF_THREAD_CONTEXT));
111 }
112
113 int rf_TerminateUndoFunc(node)
114 RF_DagNode_t *node;
115 {
116 return(0);
117 }
118
119
120 /*****************************************************************************************
121 * execution functions associated with a mirror node
122 *
123 * parameters:
124 *
125 * 0 - physical disk addres of data
126 * 1 - buffer for holding read data
127 * 2 - parity stripe ID
128 * 3 - flags
129 * 4 - physical disk address of mirror (parity)
130 *
131 ****************************************************************************************/
132
133 int rf_DiskReadMirrorIdleFunc(node)
134 RF_DagNode_t *node;
135 {
136 /* select the mirror copy with the shortest queue and fill in node parameters
137 with physical disk address */
138
139 rf_SelectMirrorDiskIdle(node);
140 return(rf_DiskReadFunc(node));
141 }
142
143 int rf_DiskReadMirrorPartitionFunc(node)
144 RF_DagNode_t *node;
145 {
146 /* select the mirror copy with the shortest queue and fill in node parameters
147 with physical disk address */
148
149 rf_SelectMirrorDiskPartition(node);
150 return(rf_DiskReadFunc(node));
151 }
152
153 int rf_DiskReadMirrorUndoFunc(node)
154 RF_DagNode_t *node;
155 {
156 return(0);
157 }
158
159
160
161 #if RF_INCLUDE_PARITYLOGGING > 0
162 /*****************************************************************************************
163 * the execution function associated with a parity log update node
164 ****************************************************************************************/
165 int rf_ParityLogUpdateFunc(node)
166 RF_DagNode_t *node;
167 {
168 RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
169 caddr_t buf = (caddr_t) node->params[1].p;
170 RF_ParityLogData_t *logData;
171 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
172 RF_Etimer_t timer;
173
174 if (node->dagHdr->status == rf_enable)
175 {
176 RF_ETIMER_START(timer);
177 logData = rf_CreateParityLogData(RF_UPDATE, pda, buf,
178 (RF_Raid_t *) (node->dagHdr->raidPtr),
179 node->wakeFunc, (void *) node,
180 node->dagHdr->tracerec, timer);
181 if (logData)
182 rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE);
183 else
184 {
185 RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->plog_us += RF_ETIMER_VAL_US(timer);
186 (node->wakeFunc)(node, ENOMEM);
187 }
188 }
189 return(0);
190 }
191
192
193 /*****************************************************************************************
194 * the execution function associated with a parity log overwrite node
195 ****************************************************************************************/
196 int rf_ParityLogOverwriteFunc(node)
197 RF_DagNode_t *node;
198 {
199 RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
200 caddr_t buf = (caddr_t) node->params[1].p;
201 RF_ParityLogData_t *logData;
202 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
203 RF_Etimer_t timer;
204
205 if (node->dagHdr->status == rf_enable)
206 {
207 RF_ETIMER_START(timer);
208 logData = rf_CreateParityLogData(RF_OVERWRITE, pda, buf, (RF_Raid_t *) (node->dagHdr->raidPtr),
209 node->wakeFunc, (void *) node, node->dagHdr->tracerec, timer);
210 if (logData)
211 rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE);
212 else
213 {
214 RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->plog_us += RF_ETIMER_VAL_US(timer);
215 (node->wakeFunc)(node, ENOMEM);
216 }
217 }
218 return(0);
219 }
220
221 #else /* RF_INCLUDE_PARITYLOGGING > 0 */
222
223 int rf_ParityLogUpdateFunc(node)
224 RF_DagNode_t *node;
225 {
226 return(0);
227 }
228 int rf_ParityLogOverwriteFunc(node)
229 RF_DagNode_t *node;
230 {
231 return(0);
232 }
233
234 #endif /* RF_INCLUDE_PARITYLOGGING > 0 */
235
236 int rf_ParityLogUpdateUndoFunc(node)
237 RF_DagNode_t *node;
238 {
239 return(0);
240 }
241
242 int rf_ParityLogOverwriteUndoFunc(node)
243 RF_DagNode_t *node;
244 {
245 return(0);
246 }
247
248 /*****************************************************************************************
249 * the execution function associated with a NOP node
250 ****************************************************************************************/
251 int rf_NullNodeFunc(node)
252 RF_DagNode_t *node;
253 {
254 node->status = rf_good;
255 return(rf_FinishNode(node, RF_THREAD_CONTEXT));
256 }
257
258 int rf_NullNodeUndoFunc(node)
259 RF_DagNode_t *node;
260 {
261 node->status = rf_undone;
262 return(rf_FinishNode(node, RF_THREAD_CONTEXT));
263 }
264
265
266 /*****************************************************************************************
267 * the execution function associated with a disk-read node
268 ****************************************************************************************/
269 int rf_DiskReadFuncForThreads(node)
270 RF_DagNode_t *node;
271 {
272 RF_DiskQueueData_t *req;
273 RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *)node->params[0].p;
274 caddr_t buf = (caddr_t)node->params[1].p;
275 RF_StripeNum_t parityStripeID = (RF_StripeNum_t)node->params[2].v;
276 unsigned priority = RF_EXTRACT_PRIORITY(node->params[3].v);
277 unsigned lock = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
278 unsigned unlock = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
279 unsigned which_ru = RF_EXTRACT_RU(node->params[3].v);
280 RF_DiskQueueDataFlags_t flags = 0;
281 RF_IoType_t iotype = (node->dagHdr->status == rf_enable) ? RF_IO_TYPE_READ : RF_IO_TYPE_NOP;
282 RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
283 void *b_proc = NULL;
284 #if RF_BACKWARD > 0
285 caddr_t undoBuf;
286 #endif
287
288 if (node->dagHdr->bp) b_proc = (void *) ((struct buf *) node->dagHdr->bp)->b_proc;
289
290 RF_ASSERT( !(lock && unlock) );
291 flags |= (lock) ? RF_LOCK_DISK_QUEUE : 0;
292 flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0;
293 #if RF_BACKWARD > 0
294 /* allocate and zero the undo buffer.
295 * this is equivalent to copying the original buffer's contents to the undo buffer
296 * prior to performing the disk read.
297 * XXX hardcoded 512 bytes per sector!
298 */
299 if (node->dagHdr->allocList == NULL)
300 rf_MakeAllocList(node->dagHdr->allocList);
301 RF_CallocAndAdd(undoBuf, 1, 512 * pda->numSector, (caddr_t), node->dagHdr->allocList);
302 #endif /* RF_BACKWARD > 0 */
303 req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector,
304 buf, parityStripeID, which_ru,
305 (int (*)(void *,int)) node->wakeFunc,
306 node, NULL, node->dagHdr->tracerec,
307 (void *)(node->dagHdr->raidPtr), flags, b_proc);
308 if (!req) {
309 (node->wakeFunc)(node, ENOMEM);
310 } else {
311 node->dagFuncData = (void *) req;
312 rf_DiskIOEnqueue( &(dqs[pda->row][pda->col]), req, priority );
313 }
314 return(0);
315 }
316
317
318 /*****************************************************************************************
319 * the execution function associated with a disk-write node
320 ****************************************************************************************/
321 int rf_DiskWriteFuncForThreads(node)
322 RF_DagNode_t *node;
323 {
324 RF_DiskQueueData_t *req;
325 RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *)node->params[0].p;
326 caddr_t buf = (caddr_t)node->params[1].p;
327 RF_StripeNum_t parityStripeID = (RF_StripeNum_t)node->params[2].v;
328 unsigned priority = RF_EXTRACT_PRIORITY(node->params[3].v);
329 unsigned lock = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
330 unsigned unlock = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
331 unsigned which_ru = RF_EXTRACT_RU(node->params[3].v);
332 RF_DiskQueueDataFlags_t flags = 0;
333 RF_IoType_t iotype = (node->dagHdr->status == rf_enable) ? RF_IO_TYPE_WRITE : RF_IO_TYPE_NOP;
334 RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
335 void *b_proc = NULL;
336 #if RF_BACKWARD > 0
337 caddr_t undoBuf;
338 #endif
339
340 if (node->dagHdr->bp) b_proc = (void *) ((struct buf *) node->dagHdr->bp)->b_proc;
341
342 #if RF_BACKWARD > 0
343 /* This area is used only for backward error recovery experiments
344 * First, schedule allocate a buffer and schedule a pre-read of the disk
345 * After the pre-read, proceed with the normal disk write
346 */
347 if (node->status == rf_bwd2) {
348 /* just finished undo logging, now perform real function */
349 node->status = rf_fired;
350 RF_ASSERT( !(lock && unlock) );
351 flags |= (lock) ? RF_LOCK_DISK_QUEUE : 0;
352 flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0;
353 req = rf_CreateDiskQueueData(iotype,
354 pda->startSector, pda->numSector, buf, parityStripeID, which_ru,
355 node->wakeFunc, (void *) node, NULL, node->dagHdr->tracerec,
356 (void *) (node->dagHdr->raidPtr), flags, b_proc);
357
358 if (!req) {
359 (node->wakeFunc)(node, ENOMEM);
360 } else {
361 node->dagFuncData = (void *) req;
362 rf_DiskIOEnqueue( &(dqs[pda->row][pda->col]), req, priority );
363 }
364 }
365
366 else {
367 /* node status should be rf_fired */
368 /* schedule a disk pre-read */
369 node->status = rf_bwd1;
370 RF_ASSERT( !(lock && unlock) );
371 flags |= (lock) ? RF_LOCK_DISK_QUEUE : 0;
372 flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0;
373 if (node->dagHdr->allocList == NULL)
374 rf_MakeAllocList(node->dagHdr->allocList);
375 RF_CallocAndAdd(undoBuf, 1, 512 * pda->numSector, (caddr_t), node->dagHdr->allocList);
376 req = rf_CreateDiskQueueData(RF_IO_TYPE_READ,
377 pda->startSector, pda->numSector, undoBuf, parityStripeID, which_ru,
378 node->wakeFunc, (void *) node, NULL, node->dagHdr->tracerec,
379 (void *) (node->dagHdr->raidPtr), flags, b_proc);
380
381 if (!req) {
382 (node->wakeFunc)(node, ENOMEM);
383 } else {
384 node->dagFuncData = (void *) req;
385 rf_DiskIOEnqueue( &(dqs[pda->row][pda->col]), req, priority );
386 }
387 }
388 return(0);
389 #endif /* RF_BACKWARD > 0 */
390
391 /* normal processing (rollaway or forward recovery) begins here */
392 RF_ASSERT( !(lock && unlock) );
393 flags |= (lock) ? RF_LOCK_DISK_QUEUE : 0;
394 flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0;
395 req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector,
396 buf, parityStripeID, which_ru,
397 (int (*)(void *,int)) node->wakeFunc,
398 (void *) node, NULL,
399 node->dagHdr->tracerec,
400 (void *) (node->dagHdr->raidPtr),
401 flags, b_proc);
402
403 if (!req) {
404 (node->wakeFunc)(node, ENOMEM);
405 } else {
406 node->dagFuncData = (void *) req;
407 rf_DiskIOEnqueue( &(dqs[pda->row][pda->col]), req, priority );
408 }
409
410 return(0);
411 }
412
413 /*****************************************************************************************
414 * the undo function for disk nodes
415 * Note: this is not a proper undo of a write node, only locks are released.
416 * old data is not restored to disk!
417 ****************************************************************************************/
418 int rf_DiskUndoFunc(node)
419 RF_DagNode_t *node;
420 {
421 RF_DiskQueueData_t *req;
422 RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *)node->params[0].p;
423 RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
424
425 req = rf_CreateDiskQueueData(RF_IO_TYPE_NOP,
426 0L, 0, NULL, 0L, 0,
427 (int (*)(void *,int)) node->wakeFunc,
428 (void *) node,
429 NULL, node->dagHdr->tracerec,
430 (void *) (node->dagHdr->raidPtr),
431 RF_UNLOCK_DISK_QUEUE, NULL);
432 if (!req)
433 (node->wakeFunc)(node, ENOMEM);
434 else {
435 node->dagFuncData = (void *) req;
436 rf_DiskIOEnqueue( &(dqs[pda->row][pda->col]), req, RF_IO_NORMAL_PRIORITY );
437 }
438
439 return(0);
440 }
441
442 /*****************************************************************************************
443 * the execution function associated with an "unlock disk queue" node
444 ****************************************************************************************/
445 int rf_DiskUnlockFuncForThreads(node)
446 RF_DagNode_t *node;
447 {
448 RF_DiskQueueData_t *req;
449 RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *)node->params[0].p;
450 RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
451
452 req = rf_CreateDiskQueueData(RF_IO_TYPE_NOP,
453 0L, 0, NULL, 0L, 0,
454 (int (*)(void *,int)) node->wakeFunc,
455 (void *) node,
456 NULL, node->dagHdr->tracerec,
457 (void *) (node->dagHdr->raidPtr),
458 RF_UNLOCK_DISK_QUEUE, NULL);
459 if (!req)
460 (node->wakeFunc)(node, ENOMEM);
461 else {
462 node->dagFuncData = (void *) req;
463 rf_DiskIOEnqueue( &(dqs[pda->row][pda->col]), req, RF_IO_NORMAL_PRIORITY );
464 }
465
466 return(0);
467 }
468
469 /*****************************************************************************************
470 * Callback routine for DiskRead and DiskWrite nodes. When the disk op completes,
471 * the routine is called to set the node status and inform the execution engine that
472 * the node has fired.
473 ****************************************************************************************/
474 int rf_GenericWakeupFunc(node, status)
475 RF_DagNode_t *node;
476 int status;
477 {
478 switch (node->status) {
479 case rf_bwd1 :
480 node->status = rf_bwd2;
481 if (node->dagFuncData)
482 rf_FreeDiskQueueData((RF_DiskQueueData_t *) node->dagFuncData);
483 return(rf_DiskWriteFuncForThreads(node));
484 break;
485 case rf_fired :
486 if (status) node->status = rf_bad;
487 else node->status = rf_good;
488 break;
489 case rf_recover :
490 /* probably should never reach this case */
491 if (status) node->status = rf_panic;
492 else node->status = rf_undone;
493 break;
494 default :
495 RF_PANIC();
496 break;
497 }
498 if (node->dagFuncData)
499 rf_FreeDiskQueueData((RF_DiskQueueData_t *) node->dagFuncData);
500 return(rf_FinishNode(node, RF_INTR_CONTEXT));
501 }
502
503
504 /*****************************************************************************************
505 * there are three distinct types of xor nodes
506 * A "regular xor" is used in the fault-free case where the access spans a complete
507 * stripe unit. It assumes that the result buffer is one full stripe unit in size,
508 * and uses the stripe-unit-offset values that it computes from the PDAs to determine
509 * where within the stripe unit to XOR each argument buffer.
510 *
511 * A "simple xor" is used in the fault-free case where the access touches only a portion
512 * of one (or two, in some cases) stripe unit(s). It assumes that all the argument
513 * buffers are of the same size and have the same stripe unit offset.
514 *
515 * A "recovery xor" is used in the degraded-mode case. It's similar to the regular
516 * xor function except that it takes the failed PDA as an additional parameter, and
517 * uses it to determine what portions of the argument buffers need to be xor'd into
518 * the result buffer, and where in the result buffer they should go.
519 ****************************************************************************************/
520
521 /* xor the params together and store the result in the result field.
522 * assume the result field points to a buffer that is the size of one SU,
523 * and use the pda params to determine where within the buffer to XOR
524 * the input buffers.
525 */
526 int rf_RegularXorFunc(node)
527 RF_DagNode_t *node;
528 {
529 RF_Raid_t *raidPtr = (RF_Raid_t *)node->params[node->numParams-1].p;
530 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
531 RF_Etimer_t timer;
532 int i, retcode;
533 #if RF_BACKWARD > 0
534 RF_PhysDiskAddr_t *pda;
535 caddr_t undoBuf;
536 #endif
537
538 retcode = 0;
539 if (node->dagHdr->status == rf_enable) {
540 /* don't do the XOR if the input is the same as the output */
541 RF_ETIMER_START(timer);
542 for (i=0; i<node->numParams-1; i+=2) if (node->params[i+1].p != node->results[0]) {
543 #if RF_BACKWARD > 0
544 /* This section mimics undo logging for backward error recovery experiments b
545 * allocating and initializing a buffer
546 * XXX 512 byte sector size is hard coded!
547 */
548 pda = node->params[i].p;
549 if (node->dagHdr->allocList == NULL)
550 rf_MakeAllocList(node->dagHdr->allocList);
551 RF_CallocAndAdd(undoBuf, 1, 512 * pda->numSector, (caddr_t), node->dagHdr->allocList);
552 #endif /* RF_BACKWARD > 0 */
553 retcode = rf_XorIntoBuffer(raidPtr, (RF_PhysDiskAddr_t *) node->params[i].p,
554 (char *)node->params[i+1].p, (char *) node->results[0], node->dagHdr->bp);
555 }
556 RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->xor_us += RF_ETIMER_VAL_US(timer);
557 }
558 return(rf_GenericWakeupFunc(node, retcode)); /* call wake func explicitly since no I/O in this node */
559 }
560
561 /* xor the inputs into the result buffer, ignoring placement issues */
562 int rf_SimpleXorFunc(node)
563 RF_DagNode_t *node;
564 {
565 RF_Raid_t *raidPtr = (RF_Raid_t *)node->params[node->numParams-1].p;
566 int i, retcode = 0;
567 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
568 RF_Etimer_t timer;
569 #if RF_BACKWARD > 0
570 RF_PhysDiskAddr_t *pda;
571 caddr_t undoBuf;
572 #endif
573
574 if (node->dagHdr->status == rf_enable) {
575 RF_ETIMER_START(timer);
576 /* don't do the XOR if the input is the same as the output */
577 for (i=0; i<node->numParams-1; i+=2) if (node->params[i+1].p != node->results[0]) {
578 #if RF_BACKWARD > 0
579 /* This section mimics undo logging for backward error recovery experiments b
580 * allocating and initializing a buffer
581 * XXX 512 byte sector size is hard coded!
582 */
583 pda = node->params[i].p;
584 if (node->dagHdr->allocList == NULL)
585 rf_MakeAllocList(node->dagHdr->allocList);
586 RF_CallocAndAdd(undoBuf, 1, 512 * pda->numSector, (caddr_t), node->dagHdr->allocList);
587 #endif /* RF_BACKWARD > 0 */
588 retcode = rf_bxor((char *)node->params[i+1].p, (char *) node->results[0],
589 rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *)node->params[i].p)->numSector),
590 (struct buf *) node->dagHdr->bp);
591 }
592 RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->xor_us += RF_ETIMER_VAL_US(timer);
593 }
594
595 return(rf_GenericWakeupFunc(node, retcode)); /* call wake func explicitly since no I/O in this node */
596 }
597
598 /* this xor is used by the degraded-mode dag functions to recover lost data.
599 * the second-to-last parameter is the PDA for the failed portion of the access.
600 * the code here looks at this PDA and assumes that the xor target buffer is
601 * equal in size to the number of sectors in the failed PDA. It then uses
602 * the other PDAs in the parameter list to determine where within the target
603 * buffer the corresponding data should be xored.
604 */
605 int rf_RecoveryXorFunc(node)
606 RF_DagNode_t *node;
607 {
608 RF_Raid_t *raidPtr = (RF_Raid_t *)node->params[node->numParams-1].p;
609 RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout;
610 RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *)node->params[node->numParams-2].p;
611 int i, retcode = 0;
612 RF_PhysDiskAddr_t *pda;
613 int suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr,failedPDA->startSector);
614 char *srcbuf, *destbuf;
615 RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
616 RF_Etimer_t timer;
617 #if RF_BACKWARD > 0
618 caddr_t undoBuf;
619 #endif
620
621 if (node->dagHdr->status == rf_enable) {
622 RF_ETIMER_START(timer);
623 for (i=0; i<node->numParams-2; i+=2) if (node->params[i+1].p != node->results[0]) {
624 pda = (RF_PhysDiskAddr_t *)node->params[i].p;
625 #if RF_BACKWARD > 0
626 /* This section mimics undo logging for backward error recovery experiments b
627 * allocating and initializing a buffer
628 * XXX 512 byte sector size is hard coded!
629 */
630 if (node->dagHdr->allocList == NULL)
631 rf_MakeAllocList(node->dagHdr->allocList);
632 RF_CallocAndAdd(undoBuf, 1, 512 * pda->numSector, (caddr_t), node->dagHdr->allocList);
633 #endif /* RF_BACKWARD > 0 */
634 srcbuf = (char *)node->params[i+1].p;
635 suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
636 destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr,suoffset-failedSUOffset);
637 retcode = rf_bxor(srcbuf, destbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), node->dagHdr->bp);
638 }
639 RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->xor_us += RF_ETIMER_VAL_US(timer);
640 }
641 return (rf_GenericWakeupFunc(node, retcode));
642 }
643
644 /*****************************************************************************************
645 * The next three functions are utilities used by the above xor-execution functions.
646 ****************************************************************************************/
647
648
649 /*
650 * this is just a glorified buffer xor. targbuf points to a buffer that is one full stripe unit
651 * in size. srcbuf points to a buffer that may be less than 1 SU, but never more. When the
652 * access described by pda is one SU in size (which by implication means it's SU-aligned),
653 * all that happens is (targbuf) <- (srcbuf ^ targbuf). When the access is less than one
654 * SU in size the XOR occurs on only the portion of targbuf identified in the pda.
655 */
656
657 int rf_XorIntoBuffer(raidPtr, pda, srcbuf, targbuf, bp)
658 RF_Raid_t *raidPtr;
659 RF_PhysDiskAddr_t *pda;
660 char *srcbuf;
661 char *targbuf;
662 void *bp;
663 {
664 char *targptr;
665 int sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
666 int SUOffset = pda->startSector % sectPerSU;
667 int length, retcode = 0;
668
669 RF_ASSERT(pda->numSector <= sectPerSU);
670
671 targptr = targbuf + rf_RaidAddressToByte(raidPtr, SUOffset);
672 length = rf_RaidAddressToByte(raidPtr, pda->numSector);
673 retcode = rf_bxor(srcbuf, targptr, length, bp);
674 return(retcode);
675 }
676
677 /* it really should be the case that the buffer pointers (returned by malloc)
678 * are aligned to the natural word size of the machine, so this is the only
679 * case we optimize for. The length should always be a multiple of the sector
680 * size, so there should be no problem with leftover bytes at the end.
681 */
682 int rf_bxor(src, dest, len, bp)
683 char *src;
684 char *dest;
685 int len;
686 void *bp;
687 {
688 unsigned mask = sizeof(long) -1, retcode = 0;
689
690 if ( !(((unsigned long) src) & mask) && !(((unsigned long) dest) & mask) && !(len&mask) ) {
691 retcode = rf_longword_bxor((unsigned long *) src, (unsigned long *) dest, len>>RF_LONGSHIFT, bp);
692 } else {
693 RF_ASSERT(0);
694 }
695 return(retcode);
696 }
697
698 /* map a user buffer into kernel space, if necessary */
699 #define REMAP_VA(_bp,x,y) (y) = (x)
700
701 /* When XORing in kernel mode, we need to map each user page to kernel space before we can access it.
702 * We don't want to assume anything about which input buffers are in kernel/user
703 * space, nor about their alignment, so in each loop we compute the maximum number
704 * of bytes that we can xor without crossing any page boundaries, and do only this many
705 * bytes before the next remap.
706 */
707 int rf_longword_bxor(src, dest, len, bp)
708 register unsigned long *src;
709 register unsigned long *dest;
710 int len; /* longwords */
711 void *bp;
712 {
713 register unsigned long *end = src+len;
714 register unsigned long d0, d1, d2, d3, s0, s1, s2, s3; /* temps */
715 register unsigned long *pg_src, *pg_dest; /* per-page source/dest pointers */
716 int longs_this_time; /* # longwords to xor in the current iteration */
717
718 REMAP_VA(bp, src, pg_src);
719 REMAP_VA(bp, dest, pg_dest);
720 if (!pg_src || !pg_dest) return(EFAULT);
721
722 while (len >= 4 ) {
723 longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(pg_src), RF_BLIP(pg_dest)) >> RF_LONGSHIFT); /* note len in longwords */
724 src += longs_this_time; dest+= longs_this_time; len -= longs_this_time;
725 while (longs_this_time >= 4) {
726 d0 = pg_dest[0];
727 d1 = pg_dest[1];
728 d2 = pg_dest[2];
729 d3 = pg_dest[3];
730 s0 = pg_src[0];
731 s1 = pg_src[1];
732 s2 = pg_src[2];
733 s3 = pg_src[3];
734 pg_dest[0] = d0 ^ s0;
735 pg_dest[1] = d1 ^ s1;
736 pg_dest[2] = d2 ^ s2;
737 pg_dest[3] = d3 ^ s3;
738 pg_src += 4;
739 pg_dest += 4;
740 longs_this_time -= 4;
741 }
742 while (longs_this_time > 0) { /* cannot cross any page boundaries here */
743 *pg_dest++ ^= *pg_src++;
744 longs_this_time--;
745 }
746
747 /* either we're done, or we've reached a page boundary on one (or possibly both) of the pointers */
748 if (len) {
749 if (RF_PAGE_ALIGNED(src)) REMAP_VA(bp, src, pg_src);
750 if (RF_PAGE_ALIGNED(dest)) REMAP_VA(bp, dest, pg_dest);
751 if (!pg_src || !pg_dest) return(EFAULT);
752 }
753 }
754 while (src < end) {
755 *pg_dest++ ^= *pg_src++;
756 src++; dest++; len--;
757 if (RF_PAGE_ALIGNED(src)) REMAP_VA(bp, src, pg_src);
758 if (RF_PAGE_ALIGNED(dest)) REMAP_VA(bp, dest, pg_dest);
759 }
760 RF_ASSERT(len == 0);
761 return(0);
762 }
763
764
765 /*
766 dst = a ^ b ^ c;
767 a may equal dst
768 see comment above longword_bxor
769 */
770 int rf_longword_bxor3(dst,a,b,c,len, bp)
771 register unsigned long *dst;
772 register unsigned long *a;
773 register unsigned long *b;
774 register unsigned long *c;
775 int len; /* length in longwords */
776 void *bp;
777 {
778 unsigned long a0,a1,a2,a3, b0,b1,b2,b3;
779 register unsigned long *pg_a, *pg_b, *pg_c, *pg_dst; /* per-page source/dest pointers */
780 int longs_this_time; /* # longs to xor in the current iteration */
781 char dst_is_a = 0;
782
783 REMAP_VA(bp, a, pg_a);
784 REMAP_VA(bp, b, pg_b);
785 REMAP_VA(bp, c, pg_c);
786 if (a == dst) {pg_dst = pg_a; dst_is_a = 1;} else { REMAP_VA(bp, dst, pg_dst); }
787
788 /* align dest to cache line. Can't cross a pg boundary on dst here. */
789 while ((((unsigned long) pg_dst) & 0x1f)) {
790 *pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
791 dst++; a++; b++; c++;
792 if (RF_PAGE_ALIGNED(a)) {REMAP_VA(bp, a, pg_a); if (!pg_a) return(EFAULT);}
793 if (RF_PAGE_ALIGNED(b)) {REMAP_VA(bp, a, pg_b); if (!pg_b) return(EFAULT);}
794 if (RF_PAGE_ALIGNED(c)) {REMAP_VA(bp, a, pg_c); if (!pg_c) return(EFAULT);}
795 len--;
796 }
797
798 while (len > 4 ) {
799 longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(a), RF_MIN(RF_BLIP(b), RF_MIN(RF_BLIP(c), RF_BLIP(dst)))) >> RF_LONGSHIFT);
800 a+= longs_this_time; b+= longs_this_time; c+= longs_this_time; dst+=longs_this_time; len-=longs_this_time;
801 while (longs_this_time >= 4) {
802 a0 = pg_a[0]; longs_this_time -= 4;
803
804 a1 = pg_a[1];
805 a2 = pg_a[2];
806
807 a3 = pg_a[3]; pg_a += 4;
808
809 b0 = pg_b[0];
810 b1 = pg_b[1];
811
812 b2 = pg_b[2];
813 b3 = pg_b[3];
814 /* start dual issue */
815 a0 ^= b0; b0 = pg_c[0];
816
817 pg_b += 4; a1 ^= b1;
818
819 a2 ^= b2; a3 ^= b3;
820
821 b1 = pg_c[1]; a0 ^= b0;
822
823 b2 = pg_c[2]; a1 ^= b1;
824
825 b3 = pg_c[3]; a2 ^= b2;
826
827 pg_dst[0] = a0; a3 ^= b3;
828 pg_dst[1] = a1; pg_c += 4;
829 pg_dst[2] = a2;
830 pg_dst[3] = a3; pg_dst += 4;
831 }
832 while (longs_this_time > 0) { /* cannot cross any page boundaries here */
833 *pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
834 longs_this_time--;
835 }
836
837 if (len) {
838 if (RF_PAGE_ALIGNED(a)) {REMAP_VA(bp, a, pg_a); if (!pg_a) return(EFAULT); if (dst_is_a) pg_dst = pg_a;}
839 if (RF_PAGE_ALIGNED(b)) {REMAP_VA(bp, b, pg_b); if (!pg_b) return(EFAULT);}
840 if (RF_PAGE_ALIGNED(c)) {REMAP_VA(bp, c, pg_c); if (!pg_c) return(EFAULT);}
841 if (!dst_is_a) if (RF_PAGE_ALIGNED(dst)) {REMAP_VA(bp, dst, pg_dst); if (!pg_dst) return(EFAULT);}
842 }
843 }
844 while (len) {
845 *pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
846 dst++; a++; b++; c++;
847 if (RF_PAGE_ALIGNED(a)) {REMAP_VA(bp, a, pg_a); if (!pg_a) return(EFAULT); if (dst_is_a) pg_dst = pg_a;}
848 if (RF_PAGE_ALIGNED(b)) {REMAP_VA(bp, b, pg_b); if (!pg_b) return(EFAULT);}
849 if (RF_PAGE_ALIGNED(c)) {REMAP_VA(bp, c, pg_c); if (!pg_c) return(EFAULT);}
850 if (!dst_is_a) if (RF_PAGE_ALIGNED(dst)) {REMAP_VA(bp, dst, pg_dst); if (!pg_dst) return(EFAULT);}
851 len--;
852 }
853 return(0);
854 }
855
856 int rf_bxor3(dst,a,b,c,len, bp)
857 register unsigned char *dst;
858 register unsigned char *a;
859 register unsigned char *b;
860 register unsigned char *c;
861 unsigned long len;
862 void *bp;
863 {
864 RF_ASSERT(((RF_UL(dst)|RF_UL(a)|RF_UL(b)|RF_UL(c)|len) & 0x7) == 0);
865
866 return(rf_longword_bxor3((unsigned long *)dst, (unsigned long *)a,
867 (unsigned long *)b, (unsigned long *)c, len>>RF_LONGSHIFT, bp));
868 }
869