/* $NetBSD: rf_dagutils.c,v 1.2 1999/01/26 02:33:54 oster Exp $ */
2 /*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Authors: Mark Holland, William V. Courtright II, Jim Zelenka
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
21 * School of Computer Science
22 * Carnegie Mellon University
23 * Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29 /******************************************************************************
30 *
31 * rf_dagutils.c -- utility routines for manipulating dags
32 *
33 *****************************************************************************/
34
35 #include "rf_archs.h"
36 #include "rf_types.h"
37 #include "rf_threadstuff.h"
38 #include "rf_raid.h"
39 #include "rf_dag.h"
40 #include "rf_dagutils.h"
41 #include "rf_dagfuncs.h"
42 #include "rf_general.h"
43 #include "rf_freelist.h"
44 #include "rf_map.h"
45 #include "rf_shutdown.h"
46 #include "rf_sys.h"
47
48 #define SNUM_DIFF(_a_,_b_) (((_a_)>(_b_))?((_a_)-(_b_)):((_b_)-(_a_)))
49
/* xor function/name pairs: a "regular" xor for the general case and a
 * "simple" xor for the stripe-unit-aligned case */
RF_RedFuncs_t rf_xorFuncs = {
	rf_RegularXorFunc, "Reg Xr",
	rf_SimpleXorFunc, "Simple Xr"};

/* recovery uses the same xor function for both slots */
RF_RedFuncs_t rf_xorRecoveryFuncs = {
	rf_RecoveryXorFunc, "Recovery Xr",
	rf_RecoveryXorFunc, "Recovery Xr"};

/* forward declarations for the static helpers defined later in this file */
static void rf_RecurPrintDAG(RF_DagNode_t *, int, int);
static void rf_PrintDAG(RF_DagHeader_t *);
static int rf_ValidateBranch(RF_DagNode_t *, int *, int *,
	RF_DagNode_t **, int );
static void rf_ValidateBranchVisitedBits(RF_DagNode_t *, int, int);
static void rf_ValidateVisitedBits(RF_DagHeader_t *);
64
65 /******************************************************************************
66 *
67 * InitNode - initialize a dag node
68 *
69 * the size of the propList array is always the same as that of the
70 * successors array.
71 *
72 *****************************************************************************/
73 void rf_InitNode(
74 RF_DagNode_t *node,
75 RF_NodeStatus_t initstatus,
76 int commit,
77 int (*doFunc)(RF_DagNode_t *node),
78 int (*undoFunc)(RF_DagNode_t *node),
79 int (*wakeFunc)(RF_DagNode_t *node,int status),
80 int nSucc,
81 int nAnte,
82 int nParam,
83 int nResult,
84 RF_DagHeader_t *hdr,
85 char *name,
86 RF_AllocListElem_t *alist)
87 {
88 void **ptrs;
89 int nptrs;
90
91 if (nAnte > RF_MAX_ANTECEDENTS)
92 RF_PANIC();
93 node->status = initstatus;
94 node->commitNode = commit;
95 node->doFunc = doFunc;
96 node->undoFunc = undoFunc;
97 node->wakeFunc = wakeFunc;
98 node->numParams = nParam;
99 node->numResults = nResult;
100 node->numAntecedents = nAnte;
101 node->numAntDone = 0;
102 node->next = NULL;
103 node->numSuccedents = nSucc;
104 node->name = name;
105 node->dagHdr = hdr;
106 node->visited = 0;
107
108 /* allocate all the pointers with one call to malloc */
109 nptrs = nSucc+nAnte+nResult+nSucc;
110
111 if (nptrs <= RF_DAG_PTRCACHESIZE) {
112 /*
113 * The dag_ptrs field of the node is basically some scribble
114 * space to be used here. We could get rid of it, and always
115 * allocate the range of pointers, but that's expensive. So,
116 * we pick a "common case" size for the pointer cache. Hopefully,
117 * we'll find that:
118 * (1) Generally, nptrs doesn't exceed RF_DAG_PTRCACHESIZE by
119 * only a little bit (least efficient case)
120 * (2) Generally, ntprs isn't a lot less than RF_DAG_PTRCACHESIZE
121 * (wasted memory)
122 */
123 ptrs = (void **)node->dag_ptrs;
124 }
125 else {
126 RF_CallocAndAdd(ptrs, nptrs, sizeof(void *), (void **), alist);
127 }
128 node->succedents = (nSucc) ? (RF_DagNode_t **) ptrs : NULL;
129 node->antecedents = (nAnte) ? (RF_DagNode_t **) (ptrs+nSucc) : NULL;
130 node->results = (nResult) ? (void **) (ptrs+nSucc+nAnte) : NULL;
131 node->propList = (nSucc) ? (RF_PropHeader_t **) (ptrs+nSucc+nAnte+nResult) : NULL;
132
133 if (nParam) {
134 if (nParam <= RF_DAG_PARAMCACHESIZE) {
135 node->params = (RF_DagParam_t *)node->dag_params;
136 }
137 else {
138 RF_CallocAndAdd(node->params, nParam, sizeof(RF_DagParam_t), (RF_DagParam_t *), alist);
139 }
140 }
141 else {
142 node->params = NULL;
143 }
144 }
145
146
147
148 /******************************************************************************
149 *
150 * allocation and deallocation routines
151 *
152 *****************************************************************************/
153
154 void rf_FreeDAG(dag_h)
155 RF_DagHeader_t *dag_h;
156 {
157 RF_AccessStripeMapHeader_t *asmap, *t_asmap;
158 RF_DagHeader_t *nextDag;
159 int i;
160
161 while (dag_h) {
162 nextDag = dag_h->next;
163 for (i=0; dag_h->memChunk[i] && i < RF_MAXCHUNKS; i++) {
164 /* release mem chunks */
165 rf_ReleaseMemChunk(dag_h->memChunk[i]);
166 dag_h->memChunk[i] = NULL;
167 }
168
169 RF_ASSERT(i == dag_h->chunkIndex);
170 if (dag_h->xtraChunkCnt > 0) {
171 /* free xtraMemChunks */
172 for (i=0; dag_h->xtraMemChunk[i] && i < dag_h->xtraChunkIndex; i++) {
173 rf_ReleaseMemChunk(dag_h->xtraMemChunk[i]);
174 dag_h->xtraMemChunk[i] = NULL;
175 }
176 RF_ASSERT(i == dag_h->xtraChunkIndex);
177 /* free ptrs to xtraMemChunks */
178 RF_Free(dag_h->xtraMemChunk, dag_h->xtraChunkCnt * sizeof(RF_ChunkDesc_t *));
179 }
180 rf_FreeAllocList(dag_h->allocList);
181 for (asmap = dag_h->asmList; asmap;) {
182 t_asmap = asmap;
183 asmap = asmap->next;
184 rf_FreeAccessStripeMap(t_asmap);
185 }
186 rf_FreeDAGHeader(dag_h);
187 dag_h = nextDag;
188 }
189 }
190
191 RF_PropHeader_t *rf_MakePropListEntry(
192 RF_DagHeader_t *dag_h,
193 int resultNum,
194 int paramNum,
195 RF_PropHeader_t *next,
196 RF_AllocListElem_t *allocList)
197 {
198 RF_PropHeader_t *p;
199
200 RF_CallocAndAdd(p, 1, sizeof(RF_PropHeader_t),
201 (RF_PropHeader_t *), allocList);
202 p->resultNum = resultNum;
203 p->paramNum = paramNum;
204 p->next = next;
205 return(p);
206 }
207
/* freelist of DAG headers, shared by all DAG allocation in this module */
static RF_FreeList_t *rf_dagh_freelist;

#define RF_MAX_FREE_DAGH 128	/* max headers retained on the freelist */
#define RF_DAGH_INC 16	/* growth increment */
#define RF_DAGH_INITIAL 32	/* headers primed at configuration time */
213
static void rf_ShutdownDAGs(void *);
/* shutdown hook: destroy the DAG header freelist (argument is unused) */
static void rf_ShutdownDAGs(ignored)
	void *ignored;
{
	RF_FREELIST_DESTROY(rf_dagh_freelist,next,(RF_DagHeader_t *));
}
220
221 int rf_ConfigureDAGs(listp)
222 RF_ShutdownList_t **listp;
223 {
224 int rc;
225
226 RF_FREELIST_CREATE(rf_dagh_freelist, RF_MAX_FREE_DAGH,
227 RF_DAGH_INC, sizeof(RF_DagHeader_t));
228 if (rf_dagh_freelist == NULL)
229 return(ENOMEM);
230 rc = rf_ShutdownCreate(listp, rf_ShutdownDAGs, NULL);
231 if (rc) {
232 RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
233 __FILE__, __LINE__, rc);
234 rf_ShutdownDAGs(NULL);
235 return(rc);
236 }
237 RF_FREELIST_PRIME(rf_dagh_freelist, RF_DAGH_INITIAL,next,
238 (RF_DagHeader_t *));
239 return(0);
240 }
241
242 RF_DagHeader_t *rf_AllocDAGHeader()
243 {
244 RF_DagHeader_t *dh;
245
246 RF_FREELIST_GET(rf_dagh_freelist,dh,next,(RF_DagHeader_t *));
247 if (dh) {
248 bzero((char *)dh, sizeof(RF_DagHeader_t));
249 }
250 return(dh);
251 }
252
/* return a DAG header to the freelist */
void rf_FreeDAGHeader(RF_DagNode_t *dh)
{
	RF_FREELIST_FREE(rf_dagh_freelist,dh,next);
}
257
258 /* allocates a buffer big enough to hold the data described by pda */
259 void *rf_AllocBuffer(
260 RF_Raid_t *raidPtr,
261 RF_DagHeader_t *dag_h,
262 RF_PhysDiskAddr_t *pda,
263 RF_AllocListElem_t *allocList)
264 {
265 char *p;
266
267 RF_MallocAndAdd(p, pda->numSector << raidPtr->logBytesPerSector,
268 (char *), allocList);
269 return((void *)p);
270 }
271
272 /******************************************************************************
273 *
274 * debug routines
275 *
276 *****************************************************************************/
277
278 char *rf_NodeStatusString(RF_DagNode_t *node)
279 {
280 switch (node->status) {
281 case rf_wait: return("wait");
282 case rf_fired: return("fired");
283 case rf_good: return("good");
284 case rf_bad: return("bad");
285 default: return("?");
286 }
287 }
288
/* print a one-line description of a node, dispatching on its doFunc.
 * Falls through to "?" for node types it does not recognize. */
void rf_PrintNodeInfoString(RF_DagNode_t *node)
{
	RF_PhysDiskAddr_t *pda;
	int (*df)(RF_DagNode_t *) = node->doFunc;
	int i, lk, unlk;
	void *bufPtr;

	/* disk read/write nodes: params are [0]=pda, [1]=buffer,
	 * [3]=packed lock/unlock flags */
	if ((df==rf_DiskReadFunc) || (df==rf_DiskWriteFunc)
	    || (df==rf_DiskReadMirrorIdleFunc)
	    || (df == rf_DiskReadMirrorPartitionFunc))
	{
		pda = (RF_PhysDiskAddr_t *)node->params[0].p;
		bufPtr = (void *)node->params[1].p;
		lk = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
		unlk = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
		/* a node may lock or unlock, but never both */
		RF_ASSERT( !(lk && unlk) );
		printf("r %d c %d offs %ld nsect %d buf 0x%lx %s\n", pda->row, pda->col,
		    (long)pda->startSector, (int) pda->numSector, (long)bufPtr,
		    (lk) ? "LOCK" : ((unlk) ? "UNLK" : " "));
		return;
	}

	/* unlock nodes: like above but without a buffer */
	if (df == rf_DiskUnlockFunc) {
		pda = (RF_PhysDiskAddr_t *)node->params[0].p;
		lk = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
		unlk = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
		RF_ASSERT( !(lk && unlk) );
		printf("r %d c %d %s\n", pda->row, pda->col,
		    (lk) ? "LOCK" : ((unlk) ? "UNLK" : "nop"));
		return;
	}

	/* xor nodes: params are (pda, buffer) pairs; results[0] holds the
	 * destination buffer */
	if ((df==rf_SimpleXorFunc) || (df==rf_RegularXorFunc)
	    || (df==rf_RecoveryXorFunc))
	{
		printf("result buf 0x%lx\n",(long) node->results[0]);
		for (i=0; i<node->numParams-1; i+=2) {
			pda = (RF_PhysDiskAddr_t *)node->params[i].p;
			bufPtr = (RF_PhysDiskAddr_t *)node->params[i+1].p;
			printf("    buf 0x%lx r%d c%d offs %ld nsect %d\n",
			    (long)bufPtr, pda->row, pda->col,
			    (long)pda->startSector, (int)pda->numSector);
		}
		return;
	}

#if RF_INCLUDE_PARITYLOGGING > 0
	/* parity logging nodes: (pda, buffer) pairs as well */
	if (df==rf_ParityLogOverwriteFunc || df==rf_ParityLogUpdateFunc) {
		for (i=0; i<node->numParams-1; i+=2) {
			pda = (RF_PhysDiskAddr_t *)node->params[i].p;
			bufPtr = (RF_PhysDiskAddr_t *)node->params[i+1].p;
			printf(" r%d c%d offs %ld nsect %d buf 0x%lx\n",
			    pda->row, pda->col, (long) pda->startSector,
			    (int) pda->numSector, (long) bufPtr);
		}
		return;
	}
#endif /* RF_INCLUDE_PARITYLOGGING > 0 */

	/* terminators and no-ops carry no interesting parameters */
	if ((df==rf_TerminateFunc) || (df==rf_NullNodeFunc)) {
		printf("\n");
		return;
	}

	printf("?\n");
}
355
/* depth-first print of the subgraph rooted at "node".  "unvisited" is
 * the current value of the visited bit on unvisited nodes; each node's
 * bit is flipped as it is printed so every node is printed once. */
static void rf_RecurPrintDAG(node, depth, unvisited)
	RF_DagNode_t *node;
	int depth;
	int unvisited;
{
	char *anttype;
	int i;

	/* mark this node as visited by flipping its bit */
	node->visited = (unvisited) ? 0 : 1;
	printf("(%d) %d C%d %s: %s,s%d %d/%d,a%d/%d,p%d,r%d S{", depth,
	    node->nodeNum, node->commitNode, node->name, rf_NodeStatusString(node),
	    node->numSuccedents, node->numSuccFired, node->numSuccDone,
	    node->numAntecedents, node->numAntDone, node->numParams,node->numResults);
	for (i=0; i<node->numSuccedents; i++) {
		/* "\0" prints as an empty separator after the last entry */
		printf("%d%s", node->succedents[i]->nodeNum,
		    ((i==node->numSuccedents-1) ? "\0" : " "));
	}
	printf("} A{");
	for (i=0; i<node->numAntecedents; i++) {
		/* single-letter tag for the antecedent dependence type */
		switch (node->antType[i]) {
		case rf_trueData :
			anttype = "T";
			break;
		case rf_antiData :
			anttype = "A";
			break;
		case rf_outputData :
			anttype = "O";
			break;
		case rf_control :
			anttype = "C";
			break;
		default :
			anttype = "?";
			break;
		}
		printf("%d(%s)%s", node->antecedents[i]->nodeNum, anttype, (i==node->numAntecedents-1) ? "\0" : " ");
	}
	printf("};  ");
	rf_PrintNodeInfoString(node);
	/* recurse into succedents not yet printed */
	for (i=0; i<node->numSuccedents; i++) {
		if (node->succedents[i]->visited == unvisited)
			rf_RecurPrintDAG(node->succedents[i], depth+1, unvisited);
	}
}
401
/* print an entire dag: header line first, then a depth-first dump of
 * every node via rf_RecurPrintDAG.  Flips all visited bits as a side
 * effect. */
static void rf_PrintDAG(dag_h)
	RF_DagHeader_t *dag_h;
{
	int unvisited, i;
	char *status;

	/* set dag status */
	switch (dag_h->status) {
	case rf_enable :
		status = "enable";
		break;
	case rf_rollForward :
		status = "rollForward";
		break;
	case rf_rollBackward :
		status = "rollBackward";
		break;
	default :
		status = "illegal!";
		break;
	}
	/* find out if visited bits are currently set or clear */
	unvisited = dag_h->succedents[0]->visited;

	printf("DAG type:  %s\n", dag_h->creator);
	printf("format is (depth) num commit type: status,nSucc nSuccFired/nSuccDone,nAnte/nAnteDone,nParam,nResult S{x} A{x(type)}; info\n");
	printf("(0) %d Hdr: %s, s%d, (commit %d/%d) S{", dag_h->nodeNum,
	    status, dag_h->numSuccedents, dag_h->numCommitNodes, dag_h->numCommits);
	for (i=0; i<dag_h->numSuccedents; i++) {
		/* "\0" prints as an empty separator after the last entry */
		printf("%d%s", dag_h->succedents[i]->nodeNum,
		    ((i==dag_h->numSuccedents-1) ? "\0" : " "));
	}
	printf("};\n");
	/* recurse into each not-yet-visited root */
	for (i=0; i<dag_h->numSuccedents; i++) {
		if (dag_h->succedents[i]->visited == unvisited)
			rf_RecurPrintDAG(dag_h->succedents[i], 1, unvisited);
	}
}
440
441 /* assigns node numbers */
442 int rf_AssignNodeNums(RF_DagHeader_t *dag_h)
443 {
444 int unvisited, i, nnum;
445 RF_DagNode_t *node;
446
447 nnum = 0;
448 unvisited = dag_h->succedents[0]->visited;
449
450 dag_h->nodeNum = nnum++;
451 for (i=0; i<dag_h->numSuccedents; i++) {
452 node = dag_h->succedents[i];
453 if (node->visited == unvisited) {
454 nnum = rf_RecurAssignNodeNums(dag_h->succedents[i], nnum, unvisited);
455 }
456 }
457 return(nnum);
458 }
459
460 int rf_RecurAssignNodeNums(node, num, unvisited)
461 RF_DagNode_t *node;
462 int num;
463 int unvisited;
464 {
465 int i;
466
467 node->visited = (unvisited) ? 0 : 1;
468
469 node->nodeNum = num++;
470 for (i=0; i<node->numSuccedents; i++) {
471 if (node->succedents[i]->visited == unvisited) {
472 num = rf_RecurAssignNodeNums(node->succedents[i], num, unvisited);
473 }
474 }
475 return(num);
476 }
477
478 /* set the header pointers in each node to "newptr" */
479 void rf_ResetDAGHeaderPointers(dag_h, newptr)
480 RF_DagHeader_t *dag_h;
481 RF_DagHeader_t *newptr;
482 {
483 int i;
484 for (i=0; i<dag_h->numSuccedents; i++)
485 if (dag_h->succedents[i]->dagHdr != newptr)
486 rf_RecurResetDAGHeaderPointers(dag_h->succedents[i], newptr);
487 }
488
489 void rf_RecurResetDAGHeaderPointers(node, newptr)
490 RF_DagNode_t *node;
491 RF_DagHeader_t *newptr;
492 {
493 int i;
494 node->dagHdr = newptr;
495 for (i=0; i<node->numSuccedents; i++)
496 if (node->succedents[i]->dagHdr != newptr)
497 rf_RecurResetDAGHeaderPointers(node->succedents[i], newptr);
498 }
499
500
501 void rf_PrintDAGList(RF_DagHeader_t *dag_h)
502 {
503 int i=0;
504
505 for (; dag_h; dag_h=dag_h->next) {
506 rf_AssignNodeNums(dag_h);
507 printf("\n\nDAG %d IN LIST:\n",i++);
508 rf_PrintDAG(dag_h);
509 }
510 }
511
512 static int rf_ValidateBranch(node, scount, acount, nodes, unvisited)
513 RF_DagNode_t *node;
514 int *scount;
515 int *acount;
516 RF_DagNode_t **nodes;
517 int unvisited;
518 {
519 int i, retcode = 0;
520
521 /* construct an array of node pointers indexed by node num */
522 node->visited = (unvisited) ? 0 : 1;
523 nodes[ node->nodeNum ] = node;
524
525 if (node->next != NULL) {
526 printf("INVALID DAG: next pointer in node is not NULL\n");
527 retcode = 1;
528 }
529 if (node->status != rf_wait) {
530 printf("INVALID DAG: Node status is not wait\n");
531 retcode = 1;
532 }
533 if (node->numAntDone != 0) {
534 printf("INVALID DAG: numAntDone is not zero\n");
535 retcode = 1;
536 }
537 if (node->doFunc == rf_TerminateFunc) {
538 if (node->numSuccedents != 0) {
539 printf("INVALID DAG: Terminator node has succedents\n");
540 retcode = 1;
541 }
542 } else {
543 if (node->numSuccedents == 0) {
544 printf("INVALID DAG: Non-terminator node has no succedents\n");
545 retcode = 1;
546 }
547 }
548 for (i=0; i<node->numSuccedents; i++) {
549 if (!node->succedents[i]) {
550 printf("INVALID DAG: succedent %d of node %s is NULL\n",i,node->name);
551 retcode = 1;
552 }
553 scount[ node->succedents[i]->nodeNum ]++;
554 }
555 for (i=0; i<node->numAntecedents; i++) {
556 if (!node->antecedents[i]) {
557 printf("INVALID DAG: antecedent %d of node %s is NULL\n",i,node->name);
558 retcode = 1;
559 }
560 acount[ node->antecedents[i]->nodeNum ]++;
561 }
562 for (i=0; i<node->numSuccedents; i++) {
563 if (node->succedents[i]->visited == unvisited) {
564 if (rf_ValidateBranch(node->succedents[i], scount,
565 acount, nodes, unvisited))
566 {
567 retcode = 1;
568 }
569 }
570 }
571 return(retcode);
572 }
573
574 static void rf_ValidateBranchVisitedBits(node, unvisited, rl)
575 RF_DagNode_t *node;
576 int unvisited;
577 int rl;
578 {
579 int i;
580
581 RF_ASSERT(node->visited == unvisited);
582 for (i=0; i<node->numSuccedents; i++) {
583 if (node->succedents[i] == NULL) {
584 printf("node=%lx node->succedents[%d] is NULL\n", (long)node, i);
585 RF_ASSERT(0);
586 }
587 rf_ValidateBranchVisitedBits(node->succedents[i],unvisited, rl+1);
588 }
589 }
590
591 /* NOTE: never call this on a big dag, because it is exponential
592 * in execution time
593 */
594 static void rf_ValidateVisitedBits(dag)
595 RF_DagHeader_t *dag;
596 {
597 int i, unvisited;
598
599 unvisited = dag->succedents[0]->visited;
600
601 for (i=0; i<dag->numSuccedents; i++) {
602 if (dag->succedents[i] == NULL) {
603 printf("dag=%lx dag->succedents[%d] is NULL\n", (long) dag, i);
604 RF_ASSERT(0);
605 }
606 rf_ValidateBranchVisitedBits(dag->succedents[i],unvisited,0);
607 }
608 }
609
610 /* validate a DAG. _at entry_ verify that:
611 * -- numNodesCompleted is zero
612 * -- node queue is null
613 * -- dag status is rf_enable
614 * -- next pointer is null on every node
615 * -- all nodes have status wait
616 * -- numAntDone is zero in all nodes
617 * -- terminator node has zero successors
618 * -- no other node besides terminator has zero successors
619 * -- no successor or antecedent pointer in a node is NULL
620 * -- number of times that each node appears as a successor of another node
621 * is equal to the antecedent count on that node
622 * -- number of times that each node appears as an antecedent of another node
623 * is equal to the succedent count on that node
624 * -- what else?
625 */
626 int rf_ValidateDAG(dag_h)
627 RF_DagHeader_t *dag_h;
628 {
629 int i, nodecount;
630 int *scount, *acount; /* per-node successor and antecedent counts */
631 RF_DagNode_t **nodes; /* array of ptrs to nodes in dag */
632 int retcode = 0;
633 int unvisited;
634 int commitNodeCount = 0;
635
636 if (rf_validateVisitedDebug)
637 rf_ValidateVisitedBits(dag_h);
638
639 if (dag_h->numNodesCompleted != 0) {
640 printf("INVALID DAG: num nodes completed is %d, should be 0\n",dag_h->numNodesCompleted);
641 retcode = 1; goto validate_dag_bad;
642 }
643 if (dag_h->status != rf_enable) {
644 printf("INVALID DAG: not enabled\n");
645 retcode = 1; goto validate_dag_bad;
646 }
647 if (dag_h->numCommits != 0) {
648 printf("INVALID DAG: numCommits != 0 (%d)\n",dag_h->numCommits);
649 retcode = 1; goto validate_dag_bad;
650 }
651 if (dag_h->numSuccedents != 1) {
652 /* currently, all dags must have only one succedent */
653 printf("INVALID DAG: numSuccedents !1 (%d)\n",dag_h->numSuccedents);
654 retcode = 1; goto validate_dag_bad;
655 }
656 nodecount = rf_AssignNodeNums(dag_h);
657
658 unvisited = dag_h->succedents[0]->visited;
659
660 RF_Calloc(scount, nodecount, sizeof(int), (int *));
661 RF_Calloc(acount, nodecount, sizeof(int), (int *));
662 RF_Calloc(nodes, nodecount, sizeof(RF_DagNode_t *), (RF_DagNode_t **));
663 for (i=0; i<dag_h->numSuccedents; i++) {
664 if ((dag_h->succedents[i]->visited == unvisited)
665 && rf_ValidateBranch(dag_h->succedents[i], scount,
666 acount, nodes, unvisited))
667 {
668 retcode = 1;
669 }
670 }
671 /* start at 1 to skip the header node */
672 for (i=1; i<nodecount; i++) {
673 if ( nodes[i]->commitNode )
674 commitNodeCount++;
675 if ( nodes[i]->doFunc == NULL ) {
676 printf("INVALID DAG: node %s has an undefined doFunc\n", nodes[i]->name);
677 retcode = 1;
678 goto validate_dag_out;
679 }
680 if ( nodes[i]->undoFunc == NULL ) {
681 printf("INVALID DAG: node %s has an undefined doFunc\n", nodes[i]->name);
682 retcode = 1;
683 goto validate_dag_out;
684 }
685 if ( nodes[i]->numAntecedents != scount[ nodes[i]->nodeNum ] ) {
686 printf("INVALID DAG: node %s has %d antecedents but appears as a succedent %d times\n",
687 nodes[i]->name, nodes[i]->numAntecedents, scount[nodes[i]->nodeNum]);
688 retcode = 1;
689 goto validate_dag_out;
690 }
691 if ( nodes[i]->numSuccedents != acount[ nodes[i]->nodeNum ] ) {
692 printf("INVALID DAG: node %s has %d succedents but appears as an antecedent %d times\n",
693 nodes[i]->name, nodes[i]->numSuccedents, acount[nodes[i]->nodeNum]);
694 retcode = 1;
695 goto validate_dag_out;
696 }
697 }
698
699 if ( dag_h->numCommitNodes != commitNodeCount ) {
700 printf("INVALID DAG: incorrect commit node count. hdr->numCommitNodes (%d) found (%d) commit nodes in graph\n",
701 dag_h->numCommitNodes, commitNodeCount);
702 retcode = 1;
703 goto validate_dag_out;
704 }
705
706 validate_dag_out:
707 RF_Free(scount, nodecount*sizeof(int));
708 RF_Free(acount, nodecount*sizeof(int));
709 RF_Free(nodes, nodecount*sizeof(RF_DagNode_t *));
710 if (retcode)
711 rf_PrintDAGList(dag_h);
712
713 if (rf_validateVisitedDebug)
714 rf_ValidateVisitedBits(dag_h);
715
716 return(retcode);
717
718 validate_dag_bad:
719 rf_PrintDAGList(dag_h);
720 return(retcode);
721 }
722
723
724 /******************************************************************************
725 *
726 * misc construction routines
727 *
728 *****************************************************************************/
729
/* redirect the physical disk addresses in an access stripe map away
 * from the failed disk: with distributed sparing the layout's MapSector/
 * MapParity routines are reinvoked with RF_REMAP; otherwise the pda is
 * pointed at the dedicated spare (srow/scol).  Requires that the row is
 * currently reconstructing. */
void rf_redirect_asm(
	RF_Raid_t *raidPtr,
	RF_AccessStripeMap_t *asmap)
{
	int ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) ? 1 : 0;
	int row = asmap->physInfo->row;
	int fcol = raidPtr->reconControl[row]->fcol;
	int srow = raidPtr->reconControl[row]->spareRow;
	int scol = raidPtr->reconControl[row]->spareCol;
	RF_PhysDiskAddr_t *pda;

	RF_ASSERT( raidPtr->status[row] == rf_rs_reconstructing );
	/* data pdas: only those on the failed column are remapped */
	for (pda = asmap->physInfo; pda; pda=pda->next) {
		if (pda->col == fcol) {
			if (rf_dagDebug) {
				/* the RU must already have been reconstructed
				 * before we redirect an access to it */
				if (!rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap,
				    pda->startSector))
				{
					RF_PANIC();
				}
			}
			/*printf("Remapped data for large write\n");*/
			if (ds) {
				raidPtr->Layout.map->MapSector(raidPtr, pda->raidAddress,
				    &pda->row, &pda->col, &pda->startSector, RF_REMAP);
			}
			else {
				pda->row = srow; pda->col = scol;
			}
		}
	}
	/* parity pdas.  NOTE(review): unlike the data loop above, the
	 * remap below sits OUTSIDE the (pda->col == fcol) test, so every
	 * parity pda is redirected, not just the failed column's -- looks
	 * asymmetric; confirm this is intentional before changing it. */
	for (pda = asmap->parityInfo; pda; pda=pda->next) {
		if (pda->col == fcol) {
			if (rf_dagDebug) {
				if (!rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap, pda->startSector)) {
					RF_PANIC();
				}
			}
		}
		if (ds) {
			(raidPtr->Layout.map->MapParity)(raidPtr, pda->raidAddress, &pda->row, &pda->col, &pda->startSector, RF_REMAP);
		}
		else {
			pda->row = srow; pda->col = scol;
		}
	}
}
777
778
/* this routine allocates read buffers and generates stripe maps for the
 * regions of the array from the start of the stripe to the start of the
 * access, and from the end of the access to the end of the stripe.  It also
 * computes and returns the number of DAG nodes needed to read all this data.
 * Note that this routine does the wrong thing if the access is fully
 * contained within one stripe unit, so we RF_ASSERT against this case at the
 * start.
 */
void rf_MapUnaccessedPortionOfStripe(
	RF_Raid_t *raidPtr,
	RF_RaidLayout_t *layoutPtr,/* in: layout information */
	RF_AccessStripeMap_t *asmap,     /* in: access stripe map */
	RF_DagHeader_t *dag_h,     /* in: header of the dag to create */
	RF_AccessStripeMapHeader_t **new_asm_h, /* in: ptr to array of 2 headers, to be filled in */
	int *nRodNodes,                 /* out: num nodes to be generated to read unaccessed data */
	char **sosBuffer, /* out: pointers to newly allocated buffer */
	char **eosBuffer,
	RF_AllocListElem_t *allocList)
{
	RF_RaidAddr_t sosRaidAddress, eosRaidAddress;
	RF_SectorNum_t sosNumSector, eosNumSector;

	RF_ASSERT( asmap->numStripeUnitsAccessed > (layoutPtr->numDataCol/2) );
	/* generate an access map for the region of the array from start of stripe
	 * to start of access */
	new_asm_h[0] = new_asm_h[1] = NULL;  *nRodNodes = 0;
	if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->raidAddress)) {
		/* [start-of-stripe, start-of-access): allocate a buffer for
		 * it, map it, and chain the map onto the dag's asmList so it
		 * is freed with the dag */
		sosRaidAddress = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
		sosNumSector = asmap->raidAddress - sosRaidAddress;
		RF_MallocAndAdd(*sosBuffer, rf_RaidAddressToByte(raidPtr, sosNumSector), (char *), allocList);
		new_asm_h[0] = rf_MapAccess(raidPtr, sosRaidAddress, sosNumSector, *sosBuffer, RF_DONT_REMAP);
		new_asm_h[0]->next = dag_h->asmList;
		dag_h->asmList = new_asm_h[0];
		*nRodNodes += new_asm_h[0]->stripeMap->numStripeUnitsAccessed;

		RF_ASSERT(new_asm_h[0]->stripeMap->next == NULL);
		/* we're totally within one stripe here */
		if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE)
			rf_redirect_asm(raidPtr, new_asm_h[0]->stripeMap);
	}
	/* generate an access map for the region of the array from end of access
	 * to end of stripe */
	if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->endRaidAddress)) {
		/* [end-of-access, end-of-stripe): same treatment as above */
		eosRaidAddress = asmap->endRaidAddress;
		eosNumSector = rf_RaidAddressOfNextStripeBoundary(layoutPtr, eosRaidAddress) - eosRaidAddress;
		RF_MallocAndAdd(*eosBuffer, rf_RaidAddressToByte(raidPtr, eosNumSector), (char *), allocList);
		new_asm_h[1] = rf_MapAccess(raidPtr, eosRaidAddress, eosNumSector, *eosBuffer, RF_DONT_REMAP);
		new_asm_h[1]->next = dag_h->asmList;
		dag_h->asmList = new_asm_h[1];
		*nRodNodes += new_asm_h[1]->stripeMap->numStripeUnitsAccessed;

		RF_ASSERT(new_asm_h[1]->stripeMap->next == NULL);
		/* we're totally within one stripe here */
		if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE)
			rf_redirect_asm(raidPtr, new_asm_h[1]->stripeMap);
	}
}
836
837
838
839 /* returns non-zero if the indicated ranges of stripe unit offsets overlap */
840 int rf_PDAOverlap(
841 RF_RaidLayout_t *layoutPtr,
842 RF_PhysDiskAddr_t *src,
843 RF_PhysDiskAddr_t *dest)
844 {
845 RF_SectorNum_t soffs = rf_StripeUnitOffset(layoutPtr, src->startSector);
846 RF_SectorNum_t doffs = rf_StripeUnitOffset(layoutPtr, dest->startSector);
847 /* use -1 to be sure we stay within SU */
848 RF_SectorNum_t send = rf_StripeUnitOffset(layoutPtr, src->startSector + src->numSector-1);
849 RF_SectorNum_t dend = rf_StripeUnitOffset(layoutPtr, dest->startSector + dest->numSector-1);
850 return( (RF_MAX(soffs,doffs) <= RF_MIN(send,dend)) ? 1 : 0 );
851 }
852
853
/* GenerateFailedAccessASMs
 *
 * this routine figures out what portion of the stripe needs to be read
 * to effect the degraded read or write operation.  It's primary function
 * is to identify everything required to recover the data, and then
 * eliminate anything that is already being accessed by the user.
 *
 * The main result is two new ASMs, one for the region from the start of the
 * stripe to the start of the access, and one for the region from the end of
 * the access to the end of the stripe.  These ASMs describe everything that
 * needs to be read to effect the degraded access.  Other results are:
 *    nXorBufs -- the total number of buffers that need to be XORed together to
 *                recover the lost data,
 *    rpBufPtr -- ptr to a newly-allocated buffer to hold the parity.  If NULL
 *                at entry, not allocated.
 *    overlappingPDAs --
 *                describes which of the non-failed PDAs in the user access
 *                overlap data that needs to be read to effect recovery.
 *                overlappingPDAs[i]==1 if and only if, neglecting the failed
 *                PDA, the ith pda in the input asm overlaps data that needs
 *                to be read for recovery.
 */
/* in: asm - ASM for the actual access, one stripe only */
/* in: faildPDA - which component of the access has failed */
/* in: dag_h - header of the DAG we're going to create */
/* out: new_asm_h - the two new ASMs */
/* out: nXorBufs - the total number of xor bufs required */
/* out: rpBufPtr - a buffer for the parity read */
void rf_GenerateFailedAccessASMs(
	RF_Raid_t *raidPtr,
	RF_AccessStripeMap_t *asmap,
	RF_PhysDiskAddr_t *failedPDA,
	RF_DagHeader_t *dag_h,
	RF_AccessStripeMapHeader_t **new_asm_h,
	int *nXorBufs,
	char **rpBufPtr,
	char *overlappingPDAs,
	RF_AllocListElem_t *allocList)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);

	/* s=start, e=end, s=stripe, a=access, f=failed, su=stripe unit */
	RF_RaidAddr_t sosAddr, sosEndAddr, eosStartAddr, eosAddr;

	RF_SectorCount_t numSect[2], numParitySect;
	RF_PhysDiskAddr_t *pda;
	char *rdBuf, *bufP;
	int foundit, i;

	bufP = NULL;
	foundit = 0;
	/* first compute the following raid addresses:
	     start of stripe, (sosAddr)
	     MIN(start of access, start of failed SU),   (sosEndAddr)
	     MAX(end of access, end of failed SU),       (eosStartAddr)
	     end of stripe (i.e. start of next stripe)   (eosAddr)
	 */
	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
	sosEndAddr = RF_MIN(asmap->raidAddress, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,failedPDA->raidAddress));
	eosStartAddr = RF_MAX(asmap->endRaidAddress, rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, failedPDA->raidAddress));
	eosAddr = rf_RaidAddressOfNextStripeBoundary(layoutPtr, asmap->raidAddress);

	/* now generate access stripe maps for each of the above regions of the
	 * stripe.  Use a dummy (NULL) buf ptr for now */

	new_asm_h[0] = (sosAddr != sosEndAddr) ? rf_MapAccess(raidPtr, sosAddr, sosEndAddr-sosAddr, NULL, RF_DONT_REMAP) : NULL;
	new_asm_h[1] = (eosStartAddr != eosAddr) ? rf_MapAccess(raidPtr, eosStartAddr, eosAddr-eosStartAddr, NULL, RF_DONT_REMAP) : NULL;

	/* walk through the PDAs and range-restrict each SU to the region of the
	 * SU touched on the failed PDA.  also compute total data buffer space
	 * requirements in this step.  Ignore the parity for now. */

	numSect[0] = numSect[1] = 0;
	if (new_asm_h[0]) {
		/* chain onto the dag's asmList so the map is freed with the dag */
		new_asm_h[0]->next = dag_h->asmList; dag_h->asmList = new_asm_h[0];
		for (pda = new_asm_h[0]->stripeMap->physInfo; pda; pda = pda->next) {
			rf_RangeRestrictPDA(raidPtr,failedPDA, pda, RF_RESTRICT_NOBUFFER, 0); numSect[0] += pda->numSector;
		}
	}
	if (new_asm_h[1]) {
		new_asm_h[1]->next = dag_h->asmList; dag_h->asmList = new_asm_h[1];
		for (pda = new_asm_h[1]->stripeMap->physInfo; pda; pda = pda->next) {
			rf_RangeRestrictPDA(raidPtr,failedPDA, pda, RF_RESTRICT_NOBUFFER, 0); numSect[1] += pda->numSector;
		}
	}
	numParitySect = failedPDA->numSector;

	/* allocate buffer space for the data & parity we have to read to recover
	 * from the failure */

	if (numSect[0]+numSect[1]+ ((rpBufPtr) ? numParitySect : 0)) { /* don't allocate parity buf if not needed */
		/* one contiguous region: sos data, then eos data, then parity */
		RF_MallocAndAdd(rdBuf, rf_RaidAddressToByte(raidPtr,numSect[0]+numSect[1]+numParitySect), (char *), allocList);
		bufP = rdBuf;
		if (rf_degDagDebug) printf("Newly allocated buffer (%d bytes) is 0x%lx\n",
		    (int)rf_RaidAddressToByte(raidPtr,numSect[0]+numSect[1]+numParitySect), (unsigned long) bufP);
	}

	/* now walk through the pdas one last time and assign buffer pointers
	 * (ugh!).  Again, ignore the parity.  also, count nodes to find out how
	 * many bufs need to be xored together */
	(*nXorBufs) = 1;   /* in read case, 1 is for parity.  In write case, 1 is for failed data */
	if (new_asm_h[0]) {
		for (pda=new_asm_h[0]->stripeMap->physInfo; pda; pda=pda->next) {pda->bufPtr = bufP; bufP += rf_RaidAddressToByte(raidPtr,pda->numSector);}
		*nXorBufs += new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
	}
	if (new_asm_h[1]) {
		for (pda=new_asm_h[1]->stripeMap->physInfo; pda; pda=pda->next) {pda->bufPtr = bufP; bufP += rf_RaidAddressToByte(raidPtr,pda->numSector);}
		(*nXorBufs) += new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
	}
	if (rpBufPtr) *rpBufPtr = bufP;     /* the rest of the buffer is for parity */

	/* the last step is to figure out how many more distinct buffers need to
	 * get xor'd to produce the missing unit.  there's one for each user-data
	 * read node that overlaps the portion of the failed unit being accessed */

	for (foundit=i=0,pda=asmap->physInfo; pda; i++,pda=pda->next) {
		/* the failed pda itself does not consume an overlappingPDAs slot */
		if (pda == failedPDA) {i--; foundit=1; continue;}
		if (rf_PDAOverlap(layoutPtr, pda, failedPDA)) {
			overlappingPDAs[i] = 1;
			(*nXorBufs)++;
		}
	}
	if (!foundit) {RF_ERRORMSG("GenerateFailedAccessASMs: did not find failedPDA in asm list\n"); RF_ASSERT(0);}

	if (rf_degDagDebug) {
		if (new_asm_h[0]) {
			printf("First asm:\n"); rf_PrintFullAccessStripeMap(new_asm_h[0], 1);
		}
		if (new_asm_h[1]) {
			printf("Second asm:\n"); rf_PrintFullAccessStripeMap(new_asm_h[1], 1);
		}
	}
}
987
988
989 /* adjusts the offset and number of sectors in the destination pda so that
990 * it covers at most the region of the SU covered by the source PDA. This
991 * is exclusively a restriction: the number of sectors indicated by the
992 * target PDA can only shrink.
993 *
994 * For example: s = sectors within SU indicated by source PDA
995 * d = sectors within SU indicated by dest PDA
996 * r = results, stored in dest PDA
997 *
998 * |--------------- one stripe unit ---------------------|
999 * | sssssssssssssssssssssssssssssssss |
1000 * | ddddddddddddddddddddddddddddddddddddddddddddd |
1001 * | rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr |
1002 *
1003 * Another example:
1004 *
1005 * |--------------- one stripe unit ---------------------|
1006 * | sssssssssssssssssssssssssssssssss |
1007 * | ddddddddddddddddddddddd |
1008 * | rrrrrrrrrrrrrrrr |
1009 *
1010 */
1011 void rf_RangeRestrictPDA(
1012 RF_Raid_t *raidPtr,
1013 RF_PhysDiskAddr_t *src,
1014 RF_PhysDiskAddr_t *dest,
1015 int dobuffer,
1016 int doraidaddr)
1017 {
1018 RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1019 RF_SectorNum_t soffs = rf_StripeUnitOffset(layoutPtr, src->startSector);
1020 RF_SectorNum_t doffs = rf_StripeUnitOffset(layoutPtr, dest->startSector);
1021 RF_SectorNum_t send = rf_StripeUnitOffset(layoutPtr, src->startSector + src->numSector-1); /* use -1 to be sure we stay within SU */
1022 RF_SectorNum_t dend = rf_StripeUnitOffset(layoutPtr, dest->startSector + dest->numSector-1);
1023 RF_SectorNum_t subAddr = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, dest->startSector); /* stripe unit boundary */
1024
1025 dest->startSector = subAddr + RF_MAX(soffs,doffs);
1026 dest->numSector = subAddr + RF_MIN(send,dend) + 1 - dest->startSector;
1027
1028 if (dobuffer)
1029 dest->bufPtr += (soffs > doffs) ? rf_RaidAddressToByte(raidPtr,soffs-doffs) : 0;
1030 if (doraidaddr) {
1031 dest->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, dest->raidAddress) +
1032 rf_StripeUnitOffset(layoutPtr, dest->startSector);
1033 }
1034 }
1035
1036 /*
1037 * Want the highest of these primes to be the largest one
1038 * less than the max expected number of columns (won't hurt
1039 * to be too small or too large, but won't be optimal, either)
1040 * --jimz
1041 */
#define NLOWPRIMES 8
static int lowprimes[NLOWPRIMES] = {2,3,5,7,11,13,17,19};  /* trial divisors used by rf_compute_workload_shift() to reduce the v/k redirection ratio */
1044
1045 /*****************************************************************************
1046 * compute the workload shift factor. (chained declustering)
1047 *
1048 * return nonzero if access should shift to secondary, otherwise,
1049 * access is to primary
1050 *****************************************************************************/
1051 int rf_compute_workload_shift(
1052 RF_Raid_t *raidPtr,
1053 RF_PhysDiskAddr_t *pda)
1054 {
1055 /*
1056 * variables:
1057 * d = column of disk containing primary
1058 * f = column of failed disk
1059 * n = number of disks in array
1060 * sd = "shift distance" (number of columns that d is to the right of f)
1061 * row = row of array the access is in
1062 * v = numerator of redirection ratio
1063 * k = denominator of redirection ratio
1064 */
1065 RF_RowCol_t d, f, sd, row, n;
1066 int k, v, ret, i;
1067
1068 row = pda->row;
1069 n = raidPtr->numCol;
1070
1071 /* assign column of primary copy to d */
1072 d = pda->col;
1073
1074 /* assign column of dead disk to f */
1075 for(f=0;((!RF_DEAD_DISK(raidPtr->Disks[row][f].status))&&(f<n));f++);
1076
1077 RF_ASSERT(f < n);
1078 RF_ASSERT(f != d);
1079
1080 sd = (f > d) ? (n + d - f) : (d - f);
1081 RF_ASSERT(sd < n);
1082
1083 /*
1084 * v of every k accesses should be redirected
1085 *
1086 * v/k := (n-1-sd)/(n-1)
1087 */
1088 v = (n-1-sd);
1089 k = (n-1);
1090
1091 #if 1
1092 /*
1093 * XXX
1094 * Is this worth it?
1095 *
1096 * Now reduce the fraction, by repeatedly factoring
1097 * out primes (just like they teach in elementary school!)
1098 */
1099 for(i=0;i<NLOWPRIMES;i++) {
1100 if (lowprimes[i] > v)
1101 break;
1102 while (((v%lowprimes[i])==0) && ((k%lowprimes[i])==0)) {
1103 v /= lowprimes[i];
1104 k /= lowprimes[i];
1105 }
1106 }
1107 #endif
1108
1109 raidPtr->hist_diskreq[row][d]++;
1110 if (raidPtr->hist_diskreq[row][d] > v) {
1111 ret = 0; /* do not redirect */
1112 }
1113 else {
1114 ret = 1; /* redirect */
1115 }
1116
1117 #if 0
1118 printf("d=%d f=%d sd=%d v=%d k=%d ret=%d h=%d\n", d, f, sd, v, k, ret,
1119 raidPtr->hist_diskreq[row][d]);
1120 #endif
1121
1122 if (raidPtr->hist_diskreq[row][d] >= k) {
1123 /* reset counter */
1124 raidPtr->hist_diskreq[row][d] = 0;
1125 }
1126
1127 return(ret);
1128 }
1129
1130 /*
1131 * Disk selection routines
1132 */
1133
1134 /*
1135 * Selects the disk with the shortest queue from a mirror pair.
1136 * Both the disk I/Os queued in RAIDframe as well as those at the physical
1137 * disk are counted as members of the "queue"
1138 */
1139 void rf_SelectMirrorDiskIdle(RF_DagNode_t *node)
1140 {
1141 RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr;
1142 RF_RowCol_t rowData, colData, rowMirror, colMirror;
1143 int dataQueueLength, mirrorQueueLength, usemirror;
1144 RF_PhysDiskAddr_t *data_pda = (RF_PhysDiskAddr_t *)node->params[0].p;
1145 RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *)node->params[4].p;
1146 RF_PhysDiskAddr_t *tmp_pda;
1147 RF_RaidDisk_t **disks = raidPtr->Disks;
1148 RF_DiskQueue_t **dqs = raidPtr->Queues, *dataQueue, *mirrorQueue;
1149
1150 /* return the [row col] of the disk with the shortest queue */
1151 rowData = data_pda->row;
1152 colData = data_pda->col;
1153 rowMirror = mirror_pda->row;
1154 colMirror = mirror_pda->col;
1155 dataQueue = &(dqs[rowData][colData]);
1156 mirrorQueue = &(dqs[rowMirror][colMirror]);
1157
1158 #ifdef RF_LOCK_QUEUES_TO_READ_LEN
1159 RF_LOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
1160 #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
1161 dataQueueLength = dataQueue->queueLength + dataQueue->numOutstanding;
1162 #ifdef RF_LOCK_QUEUES_TO_READ_LEN
1163 RF_UNLOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
1164 RF_LOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
1165 #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
1166 mirrorQueueLength = mirrorQueue->queueLength + mirrorQueue->numOutstanding;
1167 #ifdef RF_LOCK_QUEUES_TO_READ_LEN
1168 RF_UNLOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
1169 #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
1170
1171 usemirror = 0;
1172 if (RF_DEAD_DISK(disks[rowMirror][colMirror].status)) {
1173 usemirror = 0;
1174 }
1175 else if (RF_DEAD_DISK(disks[rowData][colData].status)) {
1176 usemirror = 1;
1177 }
1178 else if (dataQueueLength < mirrorQueueLength) {
1179 usemirror = 0;
1180 }
1181 else if (mirrorQueueLength < dataQueueLength) {
1182 usemirror = 1;
1183 }
1184 else {
1185 /* queues are equal length. attempt cleverness. */
1186 if (SNUM_DIFF(dataQueue->last_deq_sector,data_pda->startSector)
1187 <= SNUM_DIFF(mirrorQueue->last_deq_sector,mirror_pda->startSector))
1188 {
1189 usemirror = 0;
1190 }
1191 else {
1192 usemirror = 1;
1193 }
1194 }
1195
1196 if (usemirror) {
1197 /* use mirror (parity) disk, swap params 0 & 4 */
1198 tmp_pda = data_pda;
1199 node->params[0].p = mirror_pda;
1200 node->params[4].p = tmp_pda;
1201 }
1202 else {
1203 /* use data disk, leave param 0 unchanged */
1204 }
1205 /* printf("dataQueueLength %d, mirrorQueueLength %d\n",dataQueueLength, mirrorQueueLength); */
1206 }
1207
1208 /*
1209 * Do simple partitioning. This assumes that
1210 * the data and parity disks are laid out identically.
1211 */
1212 void rf_SelectMirrorDiskPartition(RF_DagNode_t *node)
1213 {
1214 RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr;
1215 RF_RowCol_t rowData, colData, rowMirror, colMirror;
1216 RF_PhysDiskAddr_t *data_pda = (RF_PhysDiskAddr_t *)node->params[0].p;
1217 RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *)node->params[4].p;
1218 RF_PhysDiskAddr_t *tmp_pda;
1219 RF_RaidDisk_t **disks = raidPtr->Disks;
1220 RF_DiskQueue_t **dqs = raidPtr->Queues, *dataQueue, *mirrorQueue;
1221 int usemirror;
1222
1223 /* return the [row col] of the disk with the shortest queue */
1224 rowData = data_pda->row;
1225 colData = data_pda->col;
1226 rowMirror = mirror_pda->row;
1227 colMirror = mirror_pda->col;
1228 dataQueue = &(dqs[rowData][colData]);
1229 mirrorQueue = &(dqs[rowMirror][colMirror]);
1230
1231 usemirror = 0;
1232 if (RF_DEAD_DISK(disks[rowMirror][colMirror].status)) {
1233 usemirror = 0;
1234 }
1235 else if (RF_DEAD_DISK(disks[rowData][colData].status)) {
1236 usemirror = 1;
1237 }
1238 else if (data_pda->startSector < (disks[rowData][colData].numBlocks / 2)) {
1239 usemirror = 0;
1240 }
1241 else {
1242 usemirror = 1;
1243 }
1244
1245 if (usemirror) {
1246 /* use mirror (parity) disk, swap params 0 & 4 */
1247 tmp_pda = data_pda;
1248 node->params[0].p = mirror_pda;
1249 node->params[4].p = tmp_pda;
1250 }
1251 else {
1252 /* use data disk, leave param 0 unchanged */
1253 }
1254 }
1255