Home | History | Annotate | Line # | Download | only in raidframe
rf_dagutils.c revision 1.2
      1 /*	$NetBSD: rf_dagutils.c,v 1.2 1999/01/26 02:33:54 oster Exp $	*/
      2 /*
      3  * Copyright (c) 1995 Carnegie-Mellon University.
      4  * All rights reserved.
      5  *
      6  * Authors: Mark Holland, William V. Courtright II, Jim Zelenka
      7  *
      8  * Permission to use, copy, modify and distribute this software and
      9  * its documentation is hereby granted, provided that both the copyright
     10  * notice and this permission notice appear in all copies of the
     11  * software, derivative works or modified versions, and any portions
     12  * thereof, and that both notices appear in supporting documentation.
     13  *
     14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     17  *
     18  * Carnegie Mellon requests users of this software to return to
     19  *
     20  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     21  *  School of Computer Science
     22  *  Carnegie Mellon University
     23  *  Pittsburgh PA 15213-3890
     24  *
     25  * any improvements or extensions that they make and grant Carnegie the
     26  * rights to redistribute these changes.
     27  */
     28 
     29 /******************************************************************************
     30  *
     31  * rf_dagutils.c -- utility routines for manipulating dags
     32  *
     33  *****************************************************************************/
     34 
     35 #include "rf_archs.h"
     36 #include "rf_types.h"
     37 #include "rf_threadstuff.h"
     38 #include "rf_raid.h"
     39 #include "rf_dag.h"
     40 #include "rf_dagutils.h"
     41 #include "rf_dagfuncs.h"
     42 #include "rf_general.h"
     43 #include "rf_freelist.h"
     44 #include "rf_map.h"
     45 #include "rf_shutdown.h"
     46 #include "rf_sys.h"
     47 
     48 #define SNUM_DIFF(_a_,_b_) (((_a_)>(_b_))?((_a_)-(_b_)):((_b_)-(_a_)))
     49 
     50 RF_RedFuncs_t rf_xorFuncs = {
     51 	rf_RegularXorFunc, "Reg Xr",
     52 	rf_SimpleXorFunc, "Simple Xr"};
     53 
     54 RF_RedFuncs_t rf_xorRecoveryFuncs = {
     55 	rf_RecoveryXorFunc, "Recovery Xr",
     56 	rf_RecoveryXorFunc, "Recovery Xr"};
     57 
     58 static void rf_RecurPrintDAG(RF_DagNode_t *, int, int);
     59 static void rf_PrintDAG(RF_DagHeader_t *);
     60 static int rf_ValidateBranch(RF_DagNode_t *, int *, int *,
     61 			     RF_DagNode_t **, int );
     62 static void rf_ValidateBranchVisitedBits(RF_DagNode_t *, int, int);
     63 static void rf_ValidateVisitedBits(RF_DagHeader_t *);
     64 
     65 /******************************************************************************
     66  *
     67  * InitNode - initialize a dag node
     68  *
     69  * the size of the propList array is always the same as that of the
     70  * successors array.
     71  *
     72  *****************************************************************************/
     73 void rf_InitNode(
     74   RF_DagNode_t         *node,
     75   RF_NodeStatus_t       initstatus,
     76   int                   commit,
     77   int                 (*doFunc)(RF_DagNode_t *node),
     78   int                 (*undoFunc)(RF_DagNode_t *node),
     79   int                 (*wakeFunc)(RF_DagNode_t *node,int status),
     80   int                   nSucc,
     81   int                   nAnte,
     82   int                   nParam,
     83   int                   nResult,
     84   RF_DagHeader_t       *hdr,
     85   char                 *name,
     86   RF_AllocListElem_t   *alist)
     87 {
     88   void **ptrs;
     89   int nptrs;
     90 
     91   if (nAnte > RF_MAX_ANTECEDENTS)
     92     RF_PANIC();
     93   node->status         = initstatus;
     94   node->commitNode     = commit;
     95   node->doFunc         = doFunc;
     96   node->undoFunc       = undoFunc;
     97   node->wakeFunc       = wakeFunc;
     98   node->numParams      = nParam;
     99   node->numResults     = nResult;
    100   node->numAntecedents = nAnte;
    101   node->numAntDone     = 0;
    102   node->next           = NULL;
    103   node->numSuccedents  = nSucc;
    104   node->name           = name;
    105   node->dagHdr         = hdr;
    106   node->visited        = 0;
    107 
    108   /* allocate all the pointers with one call to malloc */
    109   nptrs = nSucc+nAnte+nResult+nSucc;
    110 
    111   if (nptrs <= RF_DAG_PTRCACHESIZE) {
    112     /*
    113      * The dag_ptrs field of the node is basically some scribble
    114      * space to be used here. We could get rid of it, and always
    115      * allocate the range of pointers, but that's expensive. So,
    116      * we pick a "common case" size for the pointer cache. Hopefully,
    117      * we'll find that:
    118      * (1) Generally, nptrs doesn't exceed RF_DAG_PTRCACHESIZE by
    119      *     only a little bit (least efficient case)
    120      * (2) Generally, ntprs isn't a lot less than RF_DAG_PTRCACHESIZE
    121      *     (wasted memory)
    122      */
    123     ptrs = (void **)node->dag_ptrs;
    124   }
    125   else {
    126     RF_CallocAndAdd(ptrs, nptrs, sizeof(void *), (void **), alist);
    127   }
    128   node->succedents  = (nSucc)   ? (RF_DagNode_t         **) ptrs                  : NULL;
    129   node->antecedents = (nAnte)   ? (RF_DagNode_t         **) (ptrs+nSucc)          : NULL;
    130   node->results     = (nResult) ? (void            **) (ptrs+nSucc+nAnte)         : NULL;
    131   node->propList    = (nSucc)   ? (RF_PropHeader_t **) (ptrs+nSucc+nAnte+nResult) : NULL;
    132 
    133   if (nParam) {
    134     if (nParam <= RF_DAG_PARAMCACHESIZE) {
    135       node->params = (RF_DagParam_t *)node->dag_params;
    136     }
    137     else {
    138       RF_CallocAndAdd(node->params, nParam, sizeof(RF_DagParam_t), (RF_DagParam_t *), alist);
    139     }
    140   }
    141   else {
    142     node->params = NULL;
    143   }
    144 }
    145 
    146 
    147 
    148 /******************************************************************************
    149  *
    150  * allocation and deallocation routines
    151  *
    152  *****************************************************************************/
    153 
    154 void rf_FreeDAG(dag_h)
    155   RF_DagHeader_t  *dag_h;
    156 {
    157   RF_AccessStripeMapHeader_t *asmap, *t_asmap;
    158   RF_DagHeader_t *nextDag;
    159   int i;
    160 
    161   while (dag_h) {
    162     nextDag = dag_h->next;
    163     for (i=0; dag_h->memChunk[i] && i < RF_MAXCHUNKS; i++) {
    164       /* release mem chunks */
    165       rf_ReleaseMemChunk(dag_h->memChunk[i]);
    166       dag_h->memChunk[i] = NULL;
    167     }
    168 
    169     RF_ASSERT(i == dag_h->chunkIndex);
    170     if (dag_h->xtraChunkCnt > 0) {
    171       /* free xtraMemChunks */
    172       for (i=0; dag_h->xtraMemChunk[i] && i < dag_h->xtraChunkIndex; i++) {
    173         rf_ReleaseMemChunk(dag_h->xtraMemChunk[i]);
    174         dag_h->xtraMemChunk[i] = NULL;
    175       }
    176       RF_ASSERT(i == dag_h->xtraChunkIndex);
    177       /* free ptrs to xtraMemChunks */
    178       RF_Free(dag_h->xtraMemChunk, dag_h->xtraChunkCnt * sizeof(RF_ChunkDesc_t *));
    179     }
    180     rf_FreeAllocList(dag_h->allocList);
    181     for (asmap = dag_h->asmList; asmap;) {
    182       t_asmap = asmap;
    183       asmap = asmap->next;
    184       rf_FreeAccessStripeMap(t_asmap);
    185     }
    186     rf_FreeDAGHeader(dag_h);
    187     dag_h = nextDag;
    188   }
    189 }
    190 
    191 RF_PropHeader_t *rf_MakePropListEntry(
    192   RF_DagHeader_t      *dag_h,
    193   int                  resultNum,
    194   int                  paramNum,
    195   RF_PropHeader_t     *next,
    196   RF_AllocListElem_t  *allocList)
    197 {
    198   RF_PropHeader_t *p;
    199 
    200   RF_CallocAndAdd(p, 1, sizeof(RF_PropHeader_t),
    201     (RF_PropHeader_t *), allocList);
    202   p->resultNum = resultNum;
    203   p->paramNum = paramNum;
    204   p->next = next;
    205   return(p);
    206 }
    207 
    208 static RF_FreeList_t *rf_dagh_freelist;
    209 
    210 #define RF_MAX_FREE_DAGH 128
    211 #define RF_DAGH_INC       16
    212 #define RF_DAGH_INITIAL   32
    213 
    214 static void rf_ShutdownDAGs(void *);
    215 static void rf_ShutdownDAGs(ignored)
    216   void  *ignored;
    217 {
    218 	RF_FREELIST_DESTROY(rf_dagh_freelist,next,(RF_DagHeader_t *));
    219 }
    220 
    221 int rf_ConfigureDAGs(listp)
    222   RF_ShutdownList_t  **listp;
    223 {
    224 	int rc;
    225 
    226 	RF_FREELIST_CREATE(rf_dagh_freelist, RF_MAX_FREE_DAGH,
    227 		RF_DAGH_INC, sizeof(RF_DagHeader_t));
    228 	if (rf_dagh_freelist == NULL)
    229 		return(ENOMEM);
    230 	rc = rf_ShutdownCreate(listp, rf_ShutdownDAGs, NULL);
    231 	if (rc) {
    232 		RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
    233 			__FILE__, __LINE__, rc);
    234 		rf_ShutdownDAGs(NULL);
    235 		return(rc);
    236 	}
    237 	RF_FREELIST_PRIME(rf_dagh_freelist, RF_DAGH_INITIAL,next,
    238 		(RF_DagHeader_t *));
    239 	return(0);
    240 }
    241 
    242 RF_DagHeader_t *rf_AllocDAGHeader()
    243 {
    244 	RF_DagHeader_t *dh;
    245 
    246 	RF_FREELIST_GET(rf_dagh_freelist,dh,next,(RF_DagHeader_t *));
    247 	if (dh) {
    248 		bzero((char *)dh, sizeof(RF_DagHeader_t));
    249 	}
    250 	return(dh);
    251 }
    252 
    253 void rf_FreeDAGHeader(RF_DagHeader_t *dh)
    254 {
    255 	RF_FREELIST_FREE(rf_dagh_freelist,dh,next);
    256 }
    257 
    258 /* allocates a buffer big enough to hold the data described by pda */
    259 void *rf_AllocBuffer(
    260   RF_Raid_t           *raidPtr,
    261   RF_DagHeader_t      *dag_h,
    262   RF_PhysDiskAddr_t   *pda,
    263   RF_AllocListElem_t  *allocList)
    264 {
    265   char *p;
    266 
    267   RF_MallocAndAdd(p, pda->numSector << raidPtr->logBytesPerSector,
    268     (char *), allocList);
    269   return((void *)p);
    270 }
    271 
    272 /******************************************************************************
    273  *
    274  * debug routines
    275  *
    276  *****************************************************************************/
    277 
    278 char *rf_NodeStatusString(RF_DagNode_t *node)
    279 {
    280   switch (node->status) {
    281     case rf_wait: return("wait");
    282     case rf_fired: return("fired");
    283     case rf_good: return("good");
    284     case rf_bad: return("bad");
    285     default: return("?");
    286   }
    287 }
    288 
    289 void rf_PrintNodeInfoString(RF_DagNode_t *node)
    290 {
    291   RF_PhysDiskAddr_t *pda;
    292   int (*df)(RF_DagNode_t *) = node->doFunc;
    293   int i, lk, unlk;
    294   void *bufPtr;
    295 
    296   if ((df==rf_DiskReadFunc) || (df==rf_DiskWriteFunc)
    297       || (df==rf_DiskReadMirrorIdleFunc)
    298       || (df == rf_DiskReadMirrorPartitionFunc))
    299   {
    300     pda = (RF_PhysDiskAddr_t *)node->params[0].p;
    301     bufPtr = (void *)node->params[1].p;
    302     lk = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
    303     unlk = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
    304     RF_ASSERT( !(lk && unlk) );
    305     printf("r %d c %d offs %ld nsect %d buf 0x%lx %s\n", pda->row, pda->col,
    306        (long)pda->startSector, (int) pda->numSector, (long)bufPtr,
    307        (lk) ? "LOCK" : ((unlk) ? "UNLK" : " "));
    308     return;
    309   }
    310 
    311   if (df == rf_DiskUnlockFunc) {
    312     pda = (RF_PhysDiskAddr_t *)node->params[0].p;
    313     lk = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
    314     unlk = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
    315     RF_ASSERT( !(lk && unlk) );
    316     printf("r %d c %d %s\n", pda->row, pda->col,
    317       (lk) ? "LOCK" : ((unlk) ? "UNLK" : "nop"));
    318     return;
    319   }
    320 
    321   if ((df==rf_SimpleXorFunc) || (df==rf_RegularXorFunc)
    322       || (df==rf_RecoveryXorFunc))
    323   {
    324     printf("result buf 0x%lx\n",(long) node->results[0]);
    325     for (i=0; i<node->numParams-1; i+=2) {
    326       pda = (RF_PhysDiskAddr_t *)node->params[i].p;
    327       bufPtr = (RF_PhysDiskAddr_t *)node->params[i+1].p;
    328       printf("    buf 0x%lx r%d c%d offs %ld nsect %d\n",
    329 	     (long)bufPtr, pda->row, pda->col,
    330 	     (long)pda->startSector, (int)pda->numSector);
    331     }
    332     return;
    333   }
    334 
    335 #if RF_INCLUDE_PARITYLOGGING > 0
    336   if (df==rf_ParityLogOverwriteFunc || df==rf_ParityLogUpdateFunc) {
    337     for (i=0; i<node->numParams-1; i+=2) {
    338       pda = (RF_PhysDiskAddr_t *)node->params[i].p;
    339       bufPtr = (RF_PhysDiskAddr_t *)node->params[i+1].p;
    340       printf(" r%d c%d offs %ld nsect %d buf 0x%lx\n",
    341 	     pda->row, pda->col, (long) pda->startSector,
    342 	     (int) pda->numSector, (long) bufPtr);
    343     }
    344     return;
    345   }
    346 #endif /* RF_INCLUDE_PARITYLOGGING > 0 */
    347 
    348   if ((df==rf_TerminateFunc) || (df==rf_NullNodeFunc)) {
    349     printf("\n");
    350     return;
    351   }
    352 
    353   printf("?\n");
    354 }
    355 
    356 static void rf_RecurPrintDAG(node, depth, unvisited)
    357   RF_DagNode_t  *node;
    358   int            depth;
    359   int            unvisited;
    360 {
    361   char *anttype;
    362   int i;
    363 
    364   node->visited = (unvisited) ? 0 : 1;
    365   printf("(%d) %d C%d %s: %s,s%d %d/%d,a%d/%d,p%d,r%d S{", depth,
    366     node->nodeNum, node->commitNode, node->name, rf_NodeStatusString(node),
    367     node->numSuccedents, node->numSuccFired, node->numSuccDone,
    368 	node->numAntecedents, node->numAntDone, node->numParams,node->numResults);
    369   for (i=0; i<node->numSuccedents; i++) {
    370     printf("%d%s", node->succedents[i]->nodeNum,
    371       ((i==node->numSuccedents-1) ? "\0" : " "));
    372   }
    373   printf("} A{");
    374   for (i=0; i<node->numAntecedents; i++) {
    375     switch (node->antType[i]) {
    376     case rf_trueData :
    377       anttype = "T";
    378       break;
    379     case rf_antiData :
    380       anttype = "A";
    381       break;
    382     case rf_outputData :
    383       anttype = "O";
    384       break;
    385     case rf_control :
    386       anttype = "C";
    387       break;
    388     default :
    389       anttype = "?";
    390       break;
    391     }
    392     printf("%d(%s)%s", node->antecedents[i]->nodeNum, anttype, (i==node->numAntecedents-1) ? "\0" : " ");
    393   }
    394   printf("}; ");
    395   rf_PrintNodeInfoString(node);
    396   for (i=0; i<node->numSuccedents; i++) {
    397     if (node->succedents[i]->visited == unvisited)
    398       rf_RecurPrintDAG(node->succedents[i], depth+1, unvisited);
    399   }
    400 }
    401 
    402 static void rf_PrintDAG(dag_h)
    403   RF_DagHeader_t  *dag_h;
    404 {
    405   int unvisited, i;
    406   char *status;
    407 
    408   /* set dag status */
    409   switch (dag_h->status) {
    410   case rf_enable :
    411     status = "enable";
    412     break;
    413   case rf_rollForward :
    414     status = "rollForward";
    415     break;
    416   case rf_rollBackward :
    417     status = "rollBackward";
    418     break;
    419   default :
    420     status = "illegal!";
    421     break;
    422   }
    423   /* find out if visited bits are currently set or clear */
    424   unvisited = dag_h->succedents[0]->visited;
    425 
    426   printf("DAG type:  %s\n", dag_h->creator);
    427   printf("format is (depth) num commit type: status,nSucc nSuccFired/nSuccDone,nAnte/nAnteDone,nParam,nResult S{x} A{x(type)};  info\n");
    428   printf("(0) %d Hdr: %s, s%d, (commit %d/%d) S{", dag_h->nodeNum,
    429     status, dag_h->numSuccedents, dag_h->numCommitNodes, dag_h->numCommits);
    430   for (i=0; i<dag_h->numSuccedents; i++) {
    431     printf("%d%s", dag_h->succedents[i]->nodeNum,
    432       ((i==dag_h->numSuccedents-1) ? "\0" : " "));
    433   }
    434   printf("};\n");
    435   for (i=0; i<dag_h->numSuccedents; i++) {
    436     if (dag_h->succedents[i]->visited == unvisited)
    437       rf_RecurPrintDAG(dag_h->succedents[i], 1, unvisited);
    438   }
    439 }
    440 
    441 /* assigns node numbers */
    442 int rf_AssignNodeNums(RF_DagHeader_t *dag_h)
    443 {
    444   int unvisited, i, nnum;
    445   RF_DagNode_t *node;
    446 
    447   nnum = 0;
    448   unvisited = dag_h->succedents[0]->visited;
    449 
    450   dag_h->nodeNum = nnum++;
    451   for (i=0; i<dag_h->numSuccedents; i++) {
    452     node = dag_h->succedents[i];
    453     if (node->visited == unvisited) {
    454       nnum = rf_RecurAssignNodeNums(dag_h->succedents[i], nnum, unvisited);
    455     }
    456   }
    457   return(nnum);
    458 }
    459 
    460 int rf_RecurAssignNodeNums(node, num, unvisited)
    461   RF_DagNode_t  *node;
    462   int            num;
    463   int            unvisited;
    464 {
    465   int i;
    466 
    467   node->visited = (unvisited) ? 0 : 1;
    468 
    469   node->nodeNum = num++;
    470   for (i=0; i<node->numSuccedents; i++) {
    471     if (node->succedents[i]->visited == unvisited) {
    472       num = rf_RecurAssignNodeNums(node->succedents[i], num, unvisited);
    473     }
    474   }
    475   return(num);
    476 }
    477 
    478 /* set the header pointers in each node to "newptr" */
    479 void rf_ResetDAGHeaderPointers(dag_h, newptr)
    480   RF_DagHeader_t  *dag_h;
    481   RF_DagHeader_t  *newptr;
    482 {
    483   int i;
    484   for (i=0; i<dag_h->numSuccedents; i++)
    485     if (dag_h->succedents[i]->dagHdr != newptr)
    486       rf_RecurResetDAGHeaderPointers(dag_h->succedents[i], newptr);
    487 }
    488 
    489 void rf_RecurResetDAGHeaderPointers(node, newptr)
    490   RF_DagNode_t    *node;
    491   RF_DagHeader_t  *newptr;
    492 {
    493   int i;
    494   node->dagHdr = newptr;
    495   for (i=0; i<node->numSuccedents; i++)
    496     if (node->succedents[i]->dagHdr != newptr)
    497       rf_RecurResetDAGHeaderPointers(node->succedents[i], newptr);
    498 }
    499 
    500 
    501 void rf_PrintDAGList(RF_DagHeader_t *dag_h)
    502 {
    503   int i=0;
    504 
    505   for (; dag_h; dag_h=dag_h->next) {
    506     rf_AssignNodeNums(dag_h);
    507     printf("\n\nDAG %d IN LIST:\n",i++);
    508     rf_PrintDAG(dag_h);
    509   }
    510 }
    511 
    512 static int rf_ValidateBranch(node, scount, acount, nodes, unvisited)
    513   RF_DagNode_t   *node;
    514   int            *scount;
    515   int            *acount;
    516   RF_DagNode_t  **nodes;
    517   int             unvisited;
    518 {
    519   int i, retcode = 0;
    520 
    521   /* construct an array of node pointers indexed by node num */
    522   node->visited = (unvisited) ? 0 : 1;
    523   nodes[ node->nodeNum ] = node;
    524 
    525   if (node->next != NULL) {
    526     printf("INVALID DAG: next pointer in node is not NULL\n");
    527     retcode = 1;
    528   }
    529   if (node->status != rf_wait) {
    530     printf("INVALID DAG: Node status is not wait\n");
    531     retcode = 1;
    532   }
    533   if (node->numAntDone != 0) {
    534     printf("INVALID DAG: numAntDone is not zero\n");
    535     retcode = 1;
    536   }
    537   if (node->doFunc == rf_TerminateFunc) {
    538     if (node->numSuccedents != 0) {
    539       printf("INVALID DAG: Terminator node has succedents\n");
    540       retcode = 1;
    541     }
    542   } else {
    543     if (node->numSuccedents == 0) {
    544       printf("INVALID DAG: Non-terminator node has no succedents\n");
    545       retcode = 1;
    546     }
    547   }
    548   for (i=0; i<node->numSuccedents; i++)  {
    549     if (!node->succedents[i]) {
    550       printf("INVALID DAG: succedent %d of node %s is NULL\n",i,node->name);
    551       retcode = 1;
    552     }
    553     scount[ node->succedents[i]->nodeNum ]++;
    554   }
    555   for (i=0; i<node->numAntecedents; i++) {
    556     if (!node->antecedents[i]) {
    557       printf("INVALID DAG: antecedent %d of node %s is NULL\n",i,node->name);
    558       retcode = 1;
    559     }
    560     acount[ node->antecedents[i]->nodeNum ]++;
    561   }
    562   for (i=0; i<node->numSuccedents; i++) {
    563     if (node->succedents[i]->visited == unvisited) {
    564       if (rf_ValidateBranch(node->succedents[i], scount,
    565           acount, nodes, unvisited))
    566       {
    567         retcode = 1;
    568       }
    569     }
    570   }
    571   return(retcode);
    572 }
    573 
    574 static void rf_ValidateBranchVisitedBits(node, unvisited, rl)
    575   RF_DagNode_t  *node;
    576   int            unvisited;
    577   int            rl;
    578 {
    579   int i;
    580 
    581   RF_ASSERT(node->visited == unvisited);
    582   for (i=0; i<node->numSuccedents; i++) {
    583     if (node->succedents[i] == NULL) {
    584       printf("node=%lx node->succedents[%d] is NULL\n", (long)node, i);
    585       RF_ASSERT(0);
    586     }
    587     rf_ValidateBranchVisitedBits(node->succedents[i],unvisited, rl+1);
    588   }
    589 }
    590 
    591 /* NOTE:  never call this on a big dag, because it is exponential
    592  * in execution time
    593  */
    594 static void rf_ValidateVisitedBits(dag)
    595   RF_DagHeader_t  *dag;
    596 {
    597   int i, unvisited;
    598 
    599   unvisited = dag->succedents[0]->visited;
    600 
    601   for (i=0; i<dag->numSuccedents; i++) {
    602     if (dag->succedents[i] == NULL) {
    603       printf("dag=%lx dag->succedents[%d] is NULL\n", (long) dag, i);
    604       RF_ASSERT(0);
    605     }
    606     rf_ValidateBranchVisitedBits(dag->succedents[i],unvisited,0);
    607   }
    608 }
    609 
    610 /* validate a DAG.  _at entry_ verify that:
    611  *   -- numNodesCompleted is zero
    612  *   -- node queue is null
    613  *   -- dag status is rf_enable
    614  *   -- next pointer is null on every node
    615  *   -- all nodes have status wait
    616  *   -- numAntDone is zero in all nodes
    617  *   -- terminator node has zero successors
    618  *   -- no other node besides terminator has zero successors
    619  *   -- no successor or antecedent pointer in a node is NULL
    620  *   -- number of times that each node appears as a successor of another node
    621  *      is equal to the antecedent count on that node
    622  *   -- number of times that each node appears as an antecedent of another node
    623  *      is equal to the succedent count on that node
    624  *   -- what else?
    625  */
    626 int rf_ValidateDAG(dag_h)
    627   RF_DagHeader_t  *dag_h;
    628 {
    629   int i, nodecount;
    630   int *scount, *acount;        /* per-node successor and antecedent counts */
    631   RF_DagNode_t **nodes;        /* array of ptrs to nodes in dag */
    632   int retcode = 0;
    633   int unvisited;
    634   int commitNodeCount = 0;
    635 
    636   if (rf_validateVisitedDebug)
    637     rf_ValidateVisitedBits(dag_h);
    638 
    639   if (dag_h->numNodesCompleted != 0) {
    640     printf("INVALID DAG: num nodes completed is %d, should be 0\n",dag_h->numNodesCompleted);
    641     retcode = 1; goto validate_dag_bad;
    642   }
    643   if (dag_h->status != rf_enable) {
    644     printf("INVALID DAG: not enabled\n");
    645     retcode = 1; goto validate_dag_bad;
    646   }
    647   if (dag_h->numCommits != 0) {
    648     printf("INVALID DAG: numCommits != 0 (%d)\n",dag_h->numCommits);
    649     retcode = 1; goto validate_dag_bad;
    650   }
    651   if (dag_h->numSuccedents != 1) {
    652     /* currently, all dags must have only one succedent */
    653     printf("INVALID DAG: numSuccedents !1 (%d)\n",dag_h->numSuccedents);
    654     retcode = 1; goto validate_dag_bad;
    655   }
    656   nodecount = rf_AssignNodeNums(dag_h);
    657 
    658   unvisited = dag_h->succedents[0]->visited;
    659 
    660   RF_Calloc(scount, nodecount, sizeof(int), (int *));
    661   RF_Calloc(acount, nodecount, sizeof(int), (int *));
    662   RF_Calloc(nodes,  nodecount, sizeof(RF_DagNode_t *), (RF_DagNode_t **));
    663   for (i=0; i<dag_h->numSuccedents; i++) {
    664     if ((dag_h->succedents[i]->visited == unvisited)
    665         && rf_ValidateBranch(dag_h->succedents[i], scount,
    666         acount, nodes, unvisited))
    667     {
    668       retcode = 1;
    669     }
    670   }
    671   /* start at 1 to skip the header node */
    672   for (i=1; i<nodecount; i++) {
    673     if ( nodes[i]->commitNode )
    674       commitNodeCount++;
    675     if ( nodes[i]->doFunc == NULL ) {
    676       printf("INVALID DAG: node %s has an undefined doFunc\n", nodes[i]->name);
    677       retcode = 1;
    678       goto validate_dag_out;
    679     }
    680     if ( nodes[i]->undoFunc == NULL ) {
    681       printf("INVALID DAG: node %s has an undefined doFunc\n", nodes[i]->name);
    682       retcode = 1;
    683       goto validate_dag_out;
    684     }
    685     if ( nodes[i]->numAntecedents != scount[ nodes[i]->nodeNum ] ) {
    686       printf("INVALID DAG: node %s has %d antecedents but appears as a succedent %d times\n",
    687 	     nodes[i]->name, nodes[i]->numAntecedents, scount[nodes[i]->nodeNum]);
    688       retcode = 1;
    689       goto validate_dag_out;
    690     }
    691     if ( nodes[i]->numSuccedents != acount[ nodes[i]->nodeNum ] ) {
    692       printf("INVALID DAG: node %s has %d succedents but appears as an antecedent %d times\n",
    693 	     nodes[i]->name, nodes[i]->numSuccedents, acount[nodes[i]->nodeNum]);
    694       retcode = 1;
    695       goto validate_dag_out;
    696     }
    697   }
    698 
    699   if ( dag_h->numCommitNodes != commitNodeCount ) {
    700     printf("INVALID DAG: incorrect commit node count.  hdr->numCommitNodes (%d) found (%d) commit nodes in graph\n",
    701       dag_h->numCommitNodes, commitNodeCount);
    702     retcode = 1;
    703     goto validate_dag_out;
    704   }
    705 
    706 validate_dag_out:
    707   RF_Free(scount, nodecount*sizeof(int));
    708   RF_Free(acount, nodecount*sizeof(int));
    709   RF_Free(nodes,  nodecount*sizeof(RF_DagNode_t *));
    710   if (retcode)
    711     rf_PrintDAGList(dag_h);
    712 
    713   if (rf_validateVisitedDebug)
    714     rf_ValidateVisitedBits(dag_h);
    715 
    716   return(retcode);
    717 
    718 validate_dag_bad:
    719   rf_PrintDAGList(dag_h);
    720   return(retcode);
    721 }
    722 
    723 
    724 /******************************************************************************
    725  *
    726  * misc construction routines
    727  *
    728  *****************************************************************************/
    729 
    730 void rf_redirect_asm(
    731   RF_Raid_t             *raidPtr,
    732   RF_AccessStripeMap_t  *asmap)
    733 {
    734   int ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) ? 1 : 0;
    735   int row = asmap->physInfo->row;
    736   int fcol = raidPtr->reconControl[row]->fcol;
    737   int srow = raidPtr->reconControl[row]->spareRow;
    738   int scol = raidPtr->reconControl[row]->spareCol;
    739   RF_PhysDiskAddr_t *pda;
    740 
    741   RF_ASSERT( raidPtr->status[row] == rf_rs_reconstructing );
    742   for (pda = asmap->physInfo; pda; pda=pda->next) {
    743     if (pda->col == fcol) {
    744       if (rf_dagDebug) {
    745         if (!rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap,
    746           pda->startSector))
    747         {
    748           RF_PANIC();
    749         }
    750       }
    751       /*printf("Remapped data for large write\n");*/
    752       if (ds) {
    753         raidPtr->Layout.map->MapSector(raidPtr, pda->raidAddress,
    754           &pda->row, &pda->col, &pda->startSector, RF_REMAP);
    755       }
    756       else {
    757         pda->row = srow; pda->col = scol;
    758       }
    759     }
    760   }
    761   for (pda = asmap->parityInfo; pda; pda=pda->next) {
    762     if (pda->col == fcol) {
    763       if (rf_dagDebug) {
    764         if (!rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap, pda->startSector)) {
    765           RF_PANIC();
    766         }
    767       }
    768     }
    769     if (ds) {
    770       (raidPtr->Layout.map->MapParity)(raidPtr, pda->raidAddress, &pda->row, &pda->col, &pda->startSector, RF_REMAP);
    771     }
    772     else {
    773       pda->row = srow; pda->col = scol;
    774     }
    775   }
    776 }
    777 
    778 
    779 /* this routine allocates read buffers and generates stripe maps for the
    780  * regions of the array from the start of the stripe to the start of the
    781  * access, and from the end of the access to the end of the stripe.  It also
    782  * computes and returns the number of DAG nodes needed to read all this data.
    783  * Note that this routine does the wrong thing if the access is fully
    784  * contained within one stripe unit, so we RF_ASSERT against this case at the
    785  * start.
    786  */
    787 void rf_MapUnaccessedPortionOfStripe(
    788   RF_Raid_t                    *raidPtr,
    789   RF_RaidLayout_t              *layoutPtr, /* in: layout information */
    790   RF_AccessStripeMap_t         *asmap,     /* in: access stripe map */
    791   RF_DagHeader_t               *dag_h,     /* in: header of the dag to create */
    792   RF_AccessStripeMapHeader_t  **new_asm_h, /* in: ptr to array of 2 headers, to be filled in */
    793   int                          *nRodNodes, /* out: num nodes to be generated to read unaccessed data */
    794   char                        **sosBuffer, /* out: pointers to newly allocated buffer */
    795   char                        **eosBuffer,
    796   RF_AllocListElem_t           *allocList)
    797 {
    798   RF_RaidAddr_t sosRaidAddress, eosRaidAddress;
    799   RF_SectorNum_t sosNumSector, eosNumSector;
    800 
    801   RF_ASSERT( asmap->numStripeUnitsAccessed > (layoutPtr->numDataCol/2) );
    802   /* generate an access map for the region of the array from start of stripe
    803    * to start of access */
    804   new_asm_h[0] = new_asm_h[1] = NULL; *nRodNodes = 0;
    805   if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->raidAddress)) {
    806     sosRaidAddress = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
    807     sosNumSector   = asmap->raidAddress - sosRaidAddress;
    808     RF_MallocAndAdd(*sosBuffer, rf_RaidAddressToByte(raidPtr, sosNumSector), (char *), allocList);
    809     new_asm_h[0]   = rf_MapAccess(raidPtr, sosRaidAddress, sosNumSector, *sosBuffer, RF_DONT_REMAP);
    810     new_asm_h[0]->next = dag_h->asmList;
    811     dag_h->asmList = new_asm_h[0];
    812     *nRodNodes    += new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
    813 
    814     RF_ASSERT(new_asm_h[0]->stripeMap->next == NULL);
    815     /* we're totally within one stripe here */
    816     if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE)
    817       rf_redirect_asm(raidPtr, new_asm_h[0]->stripeMap);
    818   }
    819   /* generate an access map for the region of the array from end of access
    820    * to end of stripe */
    821   if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->endRaidAddress)) {
    822     eosRaidAddress = asmap->endRaidAddress;
    823     eosNumSector   = rf_RaidAddressOfNextStripeBoundary(layoutPtr, eosRaidAddress) - eosRaidAddress;
    824     RF_MallocAndAdd(*eosBuffer, rf_RaidAddressToByte(raidPtr, eosNumSector), (char *), allocList);
    825     new_asm_h[1]   = rf_MapAccess(raidPtr, eosRaidAddress, eosNumSector, *eosBuffer, RF_DONT_REMAP);
    826     new_asm_h[1]->next = dag_h->asmList;
    827     dag_h->asmList = new_asm_h[1];
    828     *nRodNodes    += new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
    829 
    830     RF_ASSERT(new_asm_h[1]->stripeMap->next == NULL);
    831     /* we're totally within one stripe here */
    832     if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE)
    833       rf_redirect_asm(raidPtr, new_asm_h[1]->stripeMap);
    834   }
    835 }
    836 
    837 
    838 
    839 /* returns non-zero if the indicated ranges of stripe unit offsets overlap */
    840 int rf_PDAOverlap(
    841   RF_RaidLayout_t    *layoutPtr,
    842   RF_PhysDiskAddr_t  *src,
    843   RF_PhysDiskAddr_t  *dest)
    844 {
    845   RF_SectorNum_t soffs = rf_StripeUnitOffset(layoutPtr, src->startSector);
    846   RF_SectorNum_t doffs = rf_StripeUnitOffset(layoutPtr, dest->startSector);
    847   /* use -1 to be sure we stay within SU */
    848   RF_SectorNum_t send = rf_StripeUnitOffset(layoutPtr, src->startSector + src->numSector-1);
    849   RF_SectorNum_t dend = rf_StripeUnitOffset(layoutPtr, dest->startSector + dest->numSector-1);
    850   return( (RF_MAX(soffs,doffs) <= RF_MIN(send,dend)) ? 1 : 0 );
    851 }
    852 
    853 
    854 /* GenerateFailedAccessASMs
    855  *
    856  * this routine figures out what portion of the stripe needs to be read
    857  * to effect the degraded read or write operation.  It's primary function
    858  * is to identify everything required to recover the data, and then
    859  * eliminate anything that is already being accessed by the user.
    860  *
    861  * The main result is two new ASMs, one for the region from the start of the
    862  * stripe to the start of the access, and one for the region from the end of
    863  * the access to the end of the stripe.  These ASMs describe everything that
    864  * needs to be read to effect the degraded access.  Other results are:
    865  *    nXorBufs -- the total number of buffers that need to be XORed together to
    866  *                recover the lost data,
    867  *    rpBufPtr -- ptr to a newly-allocated buffer to hold the parity.  If NULL
    868  *                at entry, not allocated.
    869  *    overlappingPDAs --
    870  *                describes which of the non-failed PDAs in the user access
    871  *                overlap data that needs to be read to effect recovery.
    872  *                overlappingPDAs[i]==1 if and only if, neglecting the failed
    873  *                PDA, the ith pda in the input asm overlaps data that needs
    874  *                to be read for recovery.
    875  */
    876  /* in: asm - ASM for the actual access, one stripe only */
    877  /* in: faildPDA - which component of the access has failed */
    878  /* in: dag_h - header of the DAG we're going to create */
    879  /* out: new_asm_h - the two new ASMs */
    880  /* out: nXorBufs - the total number of xor bufs required */
    881  /* out: rpBufPtr - a buffer for the parity read */
    882 void rf_GenerateFailedAccessASMs(
    883   RF_Raid_t                    *raidPtr,
    884   RF_AccessStripeMap_t         *asmap,
    885   RF_PhysDiskAddr_t            *failedPDA,
    886   RF_DagHeader_t               *dag_h,
    887   RF_AccessStripeMapHeader_t  **new_asm_h,
    888   int                          *nXorBufs,
    889   char                        **rpBufPtr,
    890   char                         *overlappingPDAs,
    891   RF_AllocListElem_t           *allocList)
    892 {
    893   RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    894 
    895   /* s=start, e=end, s=stripe, a=access, f=failed, su=stripe unit */
    896   RF_RaidAddr_t sosAddr, sosEndAddr, eosStartAddr, eosAddr;
    897 
    898   RF_SectorCount_t numSect[2], numParitySect;
    899   RF_PhysDiskAddr_t *pda;
    900   char *rdBuf, *bufP;
    901   int foundit, i;
    902 
    903   bufP = NULL;
    904   foundit = 0;
    905   /* first compute the following raid addresses:
    906        start of stripe,                            (sosAddr)
    907        MIN(start of access, start of failed SU),   (sosEndAddr)
    908        MAX(end of access, end of failed SU),       (eosStartAddr)
    909        end of stripe (i.e. start of next stripe)   (eosAddr)
    910   */
    911   sosAddr      = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
    912   sosEndAddr   = RF_MIN(asmap->raidAddress, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,failedPDA->raidAddress));
    913   eosStartAddr = RF_MAX(asmap->endRaidAddress, rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, failedPDA->raidAddress));
    914   eosAddr      = rf_RaidAddressOfNextStripeBoundary(layoutPtr, asmap->raidAddress);
    915 
    916   /* now generate access stripe maps for each of the above regions of the
    917    * stripe.  Use a dummy (NULL) buf ptr for now */
    918 
    919   new_asm_h[0] = (sosAddr != sosEndAddr)   ? rf_MapAccess(raidPtr, sosAddr,      sosEndAddr-sosAddr,   NULL, RF_DONT_REMAP) : NULL;
    920   new_asm_h[1] = (eosStartAddr != eosAddr) ? rf_MapAccess(raidPtr, eosStartAddr, eosAddr-eosStartAddr, NULL, RF_DONT_REMAP) : NULL;
    921 
    922   /* walk through the PDAs and range-restrict each SU to the region of the
    923    * SU touched on the failed PDA.  also compute total data buffer space
    924    * requirements in this step.  Ignore the parity for now. */
    925 
    926   numSect[0] = numSect[1] = 0;
    927   if (new_asm_h[0]) {
    928     new_asm_h[0]->next = dag_h->asmList; dag_h->asmList = new_asm_h[0];
    929     for (pda = new_asm_h[0]->stripeMap->physInfo; pda; pda = pda->next) {
    930       rf_RangeRestrictPDA(raidPtr,failedPDA, pda, RF_RESTRICT_NOBUFFER, 0); numSect[0] += pda->numSector;
    931     }
    932   }
    933   if (new_asm_h[1]) {
    934     new_asm_h[1]->next = dag_h->asmList; dag_h->asmList = new_asm_h[1];
    935     for (pda = new_asm_h[1]->stripeMap->physInfo; pda; pda = pda->next) {
    936       rf_RangeRestrictPDA(raidPtr,failedPDA, pda, RF_RESTRICT_NOBUFFER, 0); numSect[1] += pda->numSector;
    937     }
    938   }
    939   numParitySect = failedPDA->numSector;
    940 
    941   /* allocate buffer space for the data & parity we have to read to recover
    942    * from the failure */
    943 
    944   if (numSect[0]+numSect[1]+ ((rpBufPtr) ? numParitySect : 0)) {      /* don't allocate parity buf if not needed */
    945     RF_MallocAndAdd(rdBuf, rf_RaidAddressToByte(raidPtr,numSect[0]+numSect[1]+numParitySect), (char *), allocList);
    946     bufP = rdBuf;
    947     if (rf_degDagDebug) printf("Newly allocated buffer (%d bytes) is 0x%lx\n",
    948 			    (int)rf_RaidAddressToByte(raidPtr,numSect[0]+numSect[1]+numParitySect), (unsigned long) bufP);
    949   }
    950 
    951   /* now walk through the pdas one last time and assign buffer pointers
    952    * (ugh!).  Again, ignore the parity.  also, count nodes to find out how
    953    * many bufs need to be xored together */
    954   (*nXorBufs) = 1;   /* in read case, 1 is for parity.  In write case, 1 is for failed data */
    955   if (new_asm_h[0]) {
    956     for (pda=new_asm_h[0]->stripeMap->physInfo; pda; pda=pda->next) {pda->bufPtr = bufP; bufP += rf_RaidAddressToByte(raidPtr,pda->numSector);}
    957     *nXorBufs += new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
    958   }
    959   if (new_asm_h[1]) {
    960     for (pda=new_asm_h[1]->stripeMap->physInfo; pda; pda=pda->next) {pda->bufPtr = bufP; bufP += rf_RaidAddressToByte(raidPtr,pda->numSector);}
    961     (*nXorBufs) += new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
    962   }
    963   if (rpBufPtr) *rpBufPtr = bufP;     /* the rest of the buffer is for parity */
    964 
    965   /* the last step is to figure out how many more distinct buffers need to
    966    * get xor'd to produce the missing unit.  there's one for each user-data
    967    * read node that overlaps the portion of the failed unit being accessed */
    968 
    969   for (foundit=i=0,pda=asmap->physInfo; pda; i++,pda=pda->next) {
    970     if (pda == failedPDA) {i--; foundit=1; continue;}
    971     if (rf_PDAOverlap(layoutPtr, pda, failedPDA)) {
    972       overlappingPDAs[i] = 1;
    973       (*nXorBufs)++;
    974     }
    975   }
    976   if (!foundit) {RF_ERRORMSG("GenerateFailedAccessASMs: did not find failedPDA in asm list\n"); RF_ASSERT(0);}
    977 
    978   if (rf_degDagDebug) {
    979     if (new_asm_h[0]) {
    980       printf("First asm:\n"); rf_PrintFullAccessStripeMap(new_asm_h[0], 1);
    981     }
    982     if (new_asm_h[1]) {
    983       printf("Second asm:\n"); rf_PrintFullAccessStripeMap(new_asm_h[1], 1);
    984     }
    985   }
    986 }
    987 
    988 
    989 /* adjusts the offset and number of sectors in the destination pda so that
    990  * it covers at most the region of the SU covered by the source PDA.  This
    991  * is exclusively a restriction:  the number of sectors indicated by the
    992  * target PDA can only shrink.
    993  *
    994  * For example:  s = sectors within SU indicated by source PDA
    995  *               d = sectors within SU indicated by dest PDA
    996  *               r = results, stored in dest PDA
    997  *
    998  * |--------------- one stripe unit ---------------------|
    999  * |           sssssssssssssssssssssssssssssssss         |
   1000  * |    ddddddddddddddddddddddddddddddddddddddddddddd    |
   1001  * |           rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr         |
   1002  *
   1003  * Another example:
   1004  *
   1005  * |--------------- one stripe unit ---------------------|
   1006  * |           sssssssssssssssssssssssssssssssss         |
   1007  * |    ddddddddddddddddddddddd                          |
   1008  * |           rrrrrrrrrrrrrrrr                          |
   1009  *
   1010  */
   1011 void rf_RangeRestrictPDA(
   1012   RF_Raid_t          *raidPtr,
   1013   RF_PhysDiskAddr_t  *src,
   1014   RF_PhysDiskAddr_t  *dest,
   1015   int                 dobuffer,
   1016   int                 doraidaddr)
   1017 {
   1018   RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
   1019   RF_SectorNum_t soffs = rf_StripeUnitOffset(layoutPtr, src->startSector);
   1020   RF_SectorNum_t doffs = rf_StripeUnitOffset(layoutPtr, dest->startSector);
   1021   RF_SectorNum_t send = rf_StripeUnitOffset(layoutPtr, src->startSector + src->numSector-1);     /* use -1 to be sure we stay within SU */
   1022   RF_SectorNum_t dend = rf_StripeUnitOffset(layoutPtr, dest->startSector + dest->numSector-1);
   1023   RF_SectorNum_t subAddr = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, dest->startSector); /* stripe unit boundary */
   1024 
   1025   dest->startSector = subAddr + RF_MAX(soffs,doffs);
   1026   dest->numSector = subAddr + RF_MIN(send,dend) + 1 - dest->startSector;
   1027 
   1028   if (dobuffer)
   1029     dest->bufPtr += (soffs > doffs) ? rf_RaidAddressToByte(raidPtr,soffs-doffs) : 0;
   1030   if (doraidaddr) {
   1031     dest->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, dest->raidAddress) +
   1032                         rf_StripeUnitOffset(layoutPtr, dest->startSector);
   1033   }
   1034 }
   1035 
   1036 /*
   1037  * Want the highest of these primes to be the largest one
   1038  * less than the max expected number of columns (won't hurt
   1039  * to be too small or too large, but won't be optimal, either)
   1040  * --jimz
   1041  */
   1042 #define NLOWPRIMES 8
   1043 static int lowprimes[NLOWPRIMES] = {2,3,5,7,11,13,17,19};
   1044 
   1045 /*****************************************************************************
   1046  * compute the workload shift factor.  (chained declustering)
   1047  *
   1048  * return nonzero if access should shift to secondary, otherwise,
   1049  * access is to primary
   1050  *****************************************************************************/
   1051 int rf_compute_workload_shift(
   1052   RF_Raid_t          *raidPtr,
   1053   RF_PhysDiskAddr_t  *pda)
   1054 {
   1055   /*
   1056    * variables:
   1057    *  d   = column of disk containing primary
   1058    *  f   = column of failed disk
   1059    *  n   = number of disks in array
   1060    *  sd  = "shift distance" (number of columns that d is to the right of f)
   1061    *  row = row of array the access is in
   1062    *  v   = numerator of redirection ratio
   1063    *  k   = denominator of redirection ratio
   1064    */
   1065   RF_RowCol_t d, f, sd, row, n;
   1066   int k, v, ret, i;
   1067 
   1068   row = pda->row;
   1069   n = raidPtr->numCol;
   1070 
   1071   /* assign column of primary copy to d */
   1072   d = pda->col;
   1073 
   1074   /* assign column of dead disk to f */
   1075   for(f=0;((!RF_DEAD_DISK(raidPtr->Disks[row][f].status))&&(f<n));f++);
   1076 
   1077   RF_ASSERT(f < n);
   1078   RF_ASSERT(f != d);
   1079 
   1080   sd = (f > d) ? (n + d - f) : (d - f);
   1081   RF_ASSERT(sd < n);
   1082 
   1083   /*
   1084    * v of every k accesses should be redirected
   1085    *
   1086    * v/k := (n-1-sd)/(n-1)
   1087    */
   1088   v = (n-1-sd);
   1089   k = (n-1);
   1090 
   1091 #if 1
   1092   /*
   1093    * XXX
   1094    * Is this worth it?
   1095    *
   1096    * Now reduce the fraction, by repeatedly factoring
   1097    * out primes (just like they teach in elementary school!)
   1098    */
   1099   for(i=0;i<NLOWPRIMES;i++) {
   1100     if (lowprimes[i] > v)
   1101       break;
   1102     while (((v%lowprimes[i])==0) && ((k%lowprimes[i])==0)) {
   1103       v /= lowprimes[i];
   1104       k /= lowprimes[i];
   1105     }
   1106   }
   1107 #endif
   1108 
   1109   raidPtr->hist_diskreq[row][d]++;
   1110   if (raidPtr->hist_diskreq[row][d] > v) {
   1111     ret = 0; /* do not redirect */
   1112   }
   1113   else {
   1114     ret = 1; /* redirect */
   1115   }
   1116 
   1117 #if 0
   1118   printf("d=%d f=%d sd=%d v=%d k=%d ret=%d h=%d\n", d, f, sd, v, k, ret,
   1119     raidPtr->hist_diskreq[row][d]);
   1120 #endif
   1121 
   1122   if (raidPtr->hist_diskreq[row][d] >= k) {
   1123     /* reset counter */
   1124     raidPtr->hist_diskreq[row][d] = 0;
   1125   }
   1126 
   1127   return(ret);
   1128 }
   1129 
   1130 /*
   1131  * Disk selection routines
   1132  */
   1133 
   1134 /*
   1135  * Selects the disk with the shortest queue from a mirror pair.
   1136  * Both the disk I/Os queued in RAIDframe as well as those at the physical
   1137  * disk are counted as members of the "queue"
   1138  */
   1139 void rf_SelectMirrorDiskIdle(RF_DagNode_t *node)
   1140 {
   1141   RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr;
   1142   RF_RowCol_t rowData, colData, rowMirror, colMirror;
   1143   int dataQueueLength, mirrorQueueLength, usemirror;
   1144   RF_PhysDiskAddr_t *data_pda   = (RF_PhysDiskAddr_t *)node->params[0].p;
   1145   RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *)node->params[4].p;
   1146   RF_PhysDiskAddr_t *tmp_pda;
   1147   RF_RaidDisk_t **disks = raidPtr->Disks;
   1148   RF_DiskQueue_t **dqs = raidPtr->Queues, *dataQueue, *mirrorQueue;
   1149 
   1150   /* return the [row col] of the disk with the shortest queue */
   1151   rowData = data_pda->row;
   1152   colData = data_pda->col;
   1153   rowMirror = mirror_pda->row;
   1154   colMirror = mirror_pda->col;
   1155   dataQueue   = &(dqs[rowData][colData]);
   1156   mirrorQueue = &(dqs[rowMirror][colMirror]);
   1157 
   1158 #ifdef RF_LOCK_QUEUES_TO_READ_LEN
   1159   RF_LOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
   1160 #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
   1161   dataQueueLength = dataQueue->queueLength + dataQueue->numOutstanding;
   1162 #ifdef RF_LOCK_QUEUES_TO_READ_LEN
   1163   RF_UNLOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
   1164   RF_LOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
   1165 #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
   1166   mirrorQueueLength = mirrorQueue->queueLength + mirrorQueue->numOutstanding;
   1167 #ifdef RF_LOCK_QUEUES_TO_READ_LEN
   1168   RF_UNLOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
   1169 #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
   1170 
   1171   usemirror = 0;
   1172   if (RF_DEAD_DISK(disks[rowMirror][colMirror].status)) {
   1173     usemirror = 0;
   1174   }
   1175   else if (RF_DEAD_DISK(disks[rowData][colData].status)) {
   1176     usemirror = 1;
   1177   }
   1178   else if (dataQueueLength < mirrorQueueLength) {
   1179     usemirror = 0;
   1180   }
   1181   else if (mirrorQueueLength < dataQueueLength) {
   1182     usemirror = 1;
   1183   }
   1184   else {
   1185     /* queues are equal length. attempt cleverness. */
   1186     if (SNUM_DIFF(dataQueue->last_deq_sector,data_pda->startSector)
   1187         <= SNUM_DIFF(mirrorQueue->last_deq_sector,mirror_pda->startSector))
   1188     {
   1189       usemirror = 0;
   1190     }
   1191     else {
   1192       usemirror = 1;
   1193     }
   1194   }
   1195 
   1196   if (usemirror) {
   1197     /* use mirror (parity) disk, swap params 0 & 4 */
   1198     tmp_pda = data_pda;
   1199     node->params[0].p = mirror_pda;
   1200     node->params[4].p = tmp_pda;
   1201   }
   1202   else {
   1203     /* use data disk, leave param 0 unchanged */
   1204   }
   1205   /* printf("dataQueueLength %d, mirrorQueueLength %d\n",dataQueueLength, mirrorQueueLength); */
   1206 }
   1207 
   1208 /*
   1209  * Do simple partitioning. This assumes that
   1210  * the data and parity disks are laid out identically.
   1211  */
   1212 void rf_SelectMirrorDiskPartition(RF_DagNode_t *node)
   1213 {
   1214   RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr;
   1215   RF_RowCol_t rowData, colData, rowMirror, colMirror;
   1216   RF_PhysDiskAddr_t *data_pda   = (RF_PhysDiskAddr_t *)node->params[0].p;
   1217   RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *)node->params[4].p;
   1218   RF_PhysDiskAddr_t *tmp_pda;
   1219   RF_RaidDisk_t **disks = raidPtr->Disks;
   1220   RF_DiskQueue_t **dqs = raidPtr->Queues, *dataQueue, *mirrorQueue;
   1221   int usemirror;
   1222 
   1223   /* return the [row col] of the disk with the shortest queue */
   1224   rowData = data_pda->row;
   1225   colData = data_pda->col;
   1226   rowMirror = mirror_pda->row;
   1227   colMirror = mirror_pda->col;
   1228   dataQueue   = &(dqs[rowData][colData]);
   1229   mirrorQueue = &(dqs[rowMirror][colMirror]);
   1230 
   1231   usemirror = 0;
   1232   if (RF_DEAD_DISK(disks[rowMirror][colMirror].status)) {
   1233     usemirror = 0;
   1234   }
   1235   else if (RF_DEAD_DISK(disks[rowData][colData].status)) {
   1236     usemirror = 1;
   1237   }
   1238   else if (data_pda->startSector < (disks[rowData][colData].numBlocks / 2)) {
   1239     usemirror = 0;
   1240   }
   1241   else {
   1242     usemirror = 1;
   1243   }
   1244 
   1245   if (usemirror) {
   1246     /* use mirror (parity) disk, swap params 0 & 4 */
   1247     tmp_pda = data_pda;
   1248     node->params[0].p = mirror_pda;
   1249     node->params[4].p = tmp_pda;
   1250   }
   1251   else {
   1252     /* use data disk, leave param 0 unchanged */
   1253   }
   1254 }
   1255