rf_dagutils.c revision 1.1
      1 /*	$NetBSD: rf_dagutils.c,v 1.1 1998/11/13 04:20:28 oster Exp $	*/
      2 /*
      3  * Copyright (c) 1995 Carnegie-Mellon University.
      4  * All rights reserved.
      5  *
      6  * Authors: Mark Holland, William V. Courtright II, Jim Zelenka
      7  *
      8  * Permission to use, copy, modify and distribute this software and
      9  * its documentation is hereby granted, provided that both the copyright
     10  * notice and this permission notice appear in all copies of the
     11  * software, derivative works or modified versions, and any portions
     12  * thereof, and that both notices appear in supporting documentation.
     13  *
     14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     17  *
     18  * Carnegie Mellon requests users of this software to return to
     19  *
     20  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     21  *  School of Computer Science
     22  *  Carnegie Mellon University
     23  *  Pittsburgh PA 15213-3890
     24  *
     25  * any improvements or extensions that they make and grant Carnegie the
     26  * rights to redistribute these changes.
     27  */
     28 
     29 /******************************************************************************
     30  *
     31  * rf_dagutils.c -- utility routines for manipulating dags
     32  *
     33  *****************************************************************************/
     34 
     35 /*
     36  * :
     37  * Log: rf_dagutils.c,v
     38  * Revision 1.55  1996/08/22 14:39:47  jimz
     39  * reduce v/k fraction (better load balancing)
     40  *
     41  * Revision 1.54  1996/08/21  04:14:12  jimz
     42  * minor workload shift tweaking
     43  *
     44  * Revision 1.53  1996/08/20  23:41:16  jimz
     45  * fix up workload shift computation
     46  *
     47  * Revision 1.52  1996/08/20  22:34:16  jimz
     48  * first cut at fixing workload shift
     49  * needs work
     50  *
     51  * Revision 1.51  1996/08/20  16:51:16  jimz
     52  * comment more verbosely compute_workload_shift()
     53  *
     54  * Revision 1.50  1996/08/11  00:40:50  jimz
     55  * fix up broken comment
     56  *
     57  * Revision 1.49  1996/07/27  23:36:08  jimz
     58  * Solaris port of simulator
     59  *
     60  * Revision 1.48  1996/07/27  18:40:01  jimz
     61  * cleanup sweep
     62  *
     63  * Revision 1.47  1996/07/22  19:52:16  jimz
     64  * switched node params to RF_DagParam_t, a union of
     65  * a 64-bit int and a void *, for better portability
     66  * attempted hpux port, but failed partway through for
     67  * lack of a single C compiler capable of compiling all
     68  * source files
     69  *
     70  * Revision 1.46  1996/07/18  22:57:14  jimz
     71  * port simulator to AIX
     72  *
     73  * Revision 1.45  1996/06/17  03:24:59  jimz
     74  * include shutdown.h for define of now-macroized ShutdownCreate
     75  *
     76  * Revision 1.44  1996/06/10  12:50:57  jimz
     77  * Add counters to freelists to track number of allocations, frees,
     78  * grows, max size, etc. Adjust a couple sets of PRIME params based
     79  * on the results.
     80  *
     81  * Revision 1.43  1996/06/10  11:55:47  jimz
     82  * Straightened out some per-array/not-per-array distinctions, fixed
     83  * a couple bugs related to confusion. Added shutdown lists. Removed
     84  * layout shutdown function (now subsumed by shutdown lists).
     85  *
     86  * Revision 1.42  1996/06/07  21:33:04  jimz
     87  * begin using consistent types for sector numbers,
     88  * stripe numbers, row+col numbers, recon unit numbers
     89  *
     90  * Revision 1.41  1996/06/06  17:28:58  jimz
     91  * make PrintNodeInfoString aware of new mirroring funcs
     92  *
     93  * Revision 1.40  1996/06/05  18:06:02  jimz
     94  * Major code cleanup. The Great Renaming is now done.
     95  * Better modularity. Better typing. Fixed a bunch of
     96  * synchronization bugs. Made a lot of global stuff
     97  * per-desc or per-array. Removed dead code.
     98  *
     99  * Revision 1.39  1996/06/03  23:28:26  jimz
    100  * more bugfixes
    101  * check in tree to sync for IPDS runs with current bugfixes
    102  * there still may be a problem with threads in the script test
    103  * getting I/Os stuck- not trivially reproducible (runs ~50 times
    104  * in a row without getting stuck)
    105  *
    106  * Revision 1.38  1996/06/02  17:31:48  jimz
    107  * Moved a lot of global stuff into array structure, where it belongs.
    108  * Fixed up paritylogging, pss modules in this manner. Some general
    109  * code cleanup. Removed lots of dead code, some dead files.
    110  *
    111  * Revision 1.37  1996/05/31  22:26:54  jimz
    112  * fix a lot of mapping problems, memory allocation problems
    113  * found some weird lock issues, fixed 'em
    114  * more code cleanup
    115  *
    116  * Revision 1.36  1996/05/30  23:22:16  jimz
    117  * bugfixes of serialization, timing problems
    118  * more cleanup
    119  *
    120  * Revision 1.35  1996/05/30  11:29:41  jimz
    121  * Numerous bug fixes. Stripe lock release code disagreed with the taking code
    122  * about when stripes should be locked (I made it consistent: no parity, no lock)
    123  * There was a lot of extra serialization of I/Os which I've removed- a lot of
    124  * it was to calculate values for the cache code, which is no longer with us.
    125  * More types, function, macro cleanup. Added code to properly quiesce the array
    126  * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
    127  * before. Fixed memory allocation, freeing bugs.
    128  *
    129  * Revision 1.34  1996/05/27  18:56:37  jimz
    130  * more code cleanup
    131  * better typing
    132  * compiles in all 3 environments
    133  *
    134  * Revision 1.33  1996/05/24  22:17:04  jimz
    135  * continue code + namespace cleanup
    136  * typed a bunch of flags
    137  *
    138  * Revision 1.32  1996/05/24  04:28:55  jimz
    139  * release cleanup ckpt
    140  *
    141  * Revision 1.31  1996/05/23  21:46:35  jimz
    142  * checkpoint in code cleanup (release prep)
    143  * lots of types, function names have been fixed
    144  *
    145  * Revision 1.30  1996/05/23  00:33:23  jimz
    146  * code cleanup: move all debug decls to rf_options.c, all extern
    147  * debug decls to rf_options.h, all debug vars preceded by rf_
    148  *
    149  * Revision 1.29  1996/05/18  19:51:34  jimz
    150  * major code cleanup- fix syntax, make some types consistent,
    151  * add prototypes, clean out dead code, et cetera
    152  *
    153  * Revision 1.28  1996/05/16  23:05:52  jimz
    154  * changed InitNode() to use dag_ptrs field of node when appropriate
    155  * (see rf_dag.h or comments within InitNode() for details)
    156  *
    157  * Revision 1.27  1996/05/16  15:37:19  jimz
    158  * convert to RF_FREELIST stuff for dag headers
    159  *
    160  * Revision 1.26  1996/05/08  21:01:24  jimz
    161  * fixed up enum type names that were conflicting with other
    162  * enums and function names (ie, "panic")
    163  * future naming trends will be towards RF_ and rf_ for
    164  * everything raidframe-related
    165  *
    166  * Revision 1.25  1996/05/03  19:56:15  wvcii
    167  * added misc routines from old dag creation files
    168  *
    169  * Revision 1.24  1995/12/12  18:10:06  jimz
    170  * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
    171  * fix 80-column brain damage in comments
    172  *
    173  * Revision 1.23  1995/12/01  15:59:50  root
    174  * added copyright info
    175  *
    176  * Revision 1.22  1995/11/17  15:14:12  wvcii
    177  * PrintDAG now processes DiskReadMirrorFunc nodes
    178  *
    179  * Revision 1.21  1995/11/07  16:22:38  wvcii
    180  * InitNode and InitNodeFromBuf now initialize commit fields
    181  * beefed up ValidateDag
    182  * prettied up PrintDAGList
    183  *
    184  */
    185 
    186 #include "rf_archs.h"
    187 #include "rf_types.h"
    188 #include "rf_threadstuff.h"
    189 #include "rf_raid.h"
    190 #include "rf_dag.h"
    191 #include "rf_dagutils.h"
    192 #include "rf_dagfuncs.h"
    193 #include "rf_general.h"
    194 #include "rf_freelist.h"
    195 #include "rf_map.h"
    196 #include "rf_shutdown.h"
    197 #include "rf_sys.h"
    198 
    199 #define SNUM_DIFF(_a_,_b_) (((_a_)>(_b_))?((_a_)-(_b_)):((_b_)-(_a_)))
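         /*
          * SNUM_DIFF() is just the absolute difference of two sector numbers,
          * used below as a crude seek-distance estimate; e.g. (illustrative
          * values only) SNUM_DIFF(100,37) == SNUM_DIFF(37,100) == 63.
          */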
    200 
    201 RF_RedFuncs_t rf_xorFuncs = {
    202 	rf_RegularXorFunc, "Reg Xr",
    203 	rf_SimpleXorFunc, "Simple Xr"};
    204 
    205 RF_RedFuncs_t rf_xorRecoveryFuncs = {
    206 	rf_RecoveryXorFunc, "Recovery Xr",
    207 	rf_RecoveryXorFunc, "Recovery Xr"};
    208 
    209 static void rf_RecurPrintDAG(RF_DagNode_t *, int, int);
    210 static void rf_PrintDAG(RF_DagHeader_t *);
    211 static int rf_ValidateBranch(RF_DagNode_t *, int *, int *,
    212 			     RF_DagNode_t **, int );
    213 static void rf_ValidateBranchVisitedBits(RF_DagNode_t *, int, int);
    214 static void rf_ValidateVisitedBits(RF_DagHeader_t *);
    215 
    216 /******************************************************************************
    217  *
    218  * InitNode - initialize a dag node
    219  *
    220  * the size of the propList array is always the same as that of the
    221  * successors array.
    222  *
    223  *****************************************************************************/
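         /*
          * A minimal sketch of how a DAG creation routine might call
          * rf_InitNode() (the do/undo/wake function names, node array, and
          * "Xxx" label below are placeholders, not actual RAIDframe symbols):
          *
          *   rf_InitNode(&nodes[0], rf_wait, 0, myDoFunc, myUndoFunc,
          *       myWakeFunc, 1, 1, 4, 0, dag_h, "Xxx", allocList);
          *
          * i.e. a non-commit node created in the wait state with one
          * succedent, one antecedent, four params, and no results.
          */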
    224 void rf_InitNode(
    225   RF_DagNode_t         *node,
    226   RF_NodeStatus_t       initstatus,
    227   int                   commit,
    228   int                 (*doFunc)(RF_DagNode_t *node),
    229   int                 (*undoFunc)(RF_DagNode_t *node),
    230   int                 (*wakeFunc)(RF_DagNode_t *node,int status),
    231   int                   nSucc,
    232   int                   nAnte,
    233   int                   nParam,
    234   int                   nResult,
    235   RF_DagHeader_t       *hdr,
    236   char                 *name,
    237   RF_AllocListElem_t   *alist)
    238 {
    239   void **ptrs;
    240   int nptrs;
    241 
    242   if (nAnte > RF_MAX_ANTECEDENTS)
    243     RF_PANIC();
    244   node->status         = initstatus;
    245   node->commitNode     = commit;
    246   node->doFunc         = doFunc;
    247   node->undoFunc       = undoFunc;
    248   node->wakeFunc       = wakeFunc;
    249   node->numParams      = nParam;
    250   node->numResults     = nResult;
    251   node->numAntecedents = nAnte;
    252   node->numAntDone     = 0;
    253   node->next           = NULL;
    254   node->numSuccedents  = nSucc;
    255   node->name           = name;
    256   node->dagHdr         = hdr;
    257   node->visited        = 0;
    258 
    259   /* allocate all the pointers with one call to malloc */
    260   nptrs = nSucc+nAnte+nResult+nSucc;
    261 
    262   if (nptrs <= RF_DAG_PTRCACHESIZE) {
    263     /*
    264      * The dag_ptrs field of the node is basically some scribble
    265      * space to be used here. We could get rid of it, and always
    266      * allocate the range of pointers, but that's expensive. So,
    267      * we pick a "common case" size for the pointer cache. Hopefully,
    268      * we'll find that:
    269      * (1) Generally, nptrs doesn't exceed RF_DAG_PTRCACHESIZE by
    270      *     only a little bit (least efficient case)
     271      * (2) Generally, nptrs isn't a lot less than RF_DAG_PTRCACHESIZE
    272      *     (wasted memory)
    273      */
    274     ptrs = (void **)node->dag_ptrs;
    275   }
    276   else {
    277     RF_CallocAndAdd(ptrs, nptrs, sizeof(void *), (void **), alist);
    278   }
    279   node->succedents  = (nSucc)   ? (RF_DagNode_t         **) ptrs                  : NULL;
    280   node->antecedents = (nAnte)   ? (RF_DagNode_t         **) (ptrs+nSucc)          : NULL;
    281   node->results     = (nResult) ? (void            **) (ptrs+nSucc+nAnte)         : NULL;
    282   node->propList    = (nSucc)   ? (RF_PropHeader_t **) (ptrs+nSucc+nAnte+nResult) : NULL;
    283 
    284   if (nParam) {
    285     if (nParam <= RF_DAG_PARAMCACHESIZE) {
    286       node->params = (RF_DagParam_t *)node->dag_params;
    287     }
    288     else {
    289       RF_CallocAndAdd(node->params, nParam, sizeof(RF_DagParam_t), (RF_DagParam_t *), alist);
    290     }
    291   }
    292   else {
    293     node->params = NULL;
    294   }
    295 }
    296 
    297 
    298 
    299 /******************************************************************************
    300  *
    301  * allocation and deallocation routines
    302  *
    303  *****************************************************************************/
    304 
    305 void rf_FreeDAG(dag_h)
    306   RF_DagHeader_t  *dag_h;
    307 {
    308   RF_AccessStripeMapHeader_t *asmap, *t_asmap;
    309   RF_DagHeader_t *nextDag;
    310   int i;
    311 
    312   while (dag_h) {
    313     nextDag = dag_h->next;
     314     for (i=0; i < RF_MAXCHUNKS && dag_h->memChunk[i]; i++) {
    315       /* release mem chunks */
    316       rf_ReleaseMemChunk(dag_h->memChunk[i]);
    317       dag_h->memChunk[i] = NULL;
    318     }
    319 
    320     RF_ASSERT(i == dag_h->chunkIndex);
    321     if (dag_h->xtraChunkCnt > 0) {
    322       /* free xtraMemChunks */
     323       for (i=0; i < dag_h->xtraChunkIndex && dag_h->xtraMemChunk[i]; i++) {
    324         rf_ReleaseMemChunk(dag_h->xtraMemChunk[i]);
    325         dag_h->xtraMemChunk[i] = NULL;
    326       }
    327       RF_ASSERT(i == dag_h->xtraChunkIndex);
    328       /* free ptrs to xtraMemChunks */
    329       RF_Free(dag_h->xtraMemChunk, dag_h->xtraChunkCnt * sizeof(RF_ChunkDesc_t *));
    330     }
    331     rf_FreeAllocList(dag_h->allocList);
    332     for (asmap = dag_h->asmList; asmap;) {
    333       t_asmap = asmap;
    334       asmap = asmap->next;
    335       rf_FreeAccessStripeMap(t_asmap);
    336     }
    337     rf_FreeDAGHeader(dag_h);
    338     dag_h = nextDag;
    339   }
    340 }
    341 
    342 RF_PropHeader_t *rf_MakePropListEntry(
    343   RF_DagHeader_t      *dag_h,
    344   int                  resultNum,
    345   int                  paramNum,
    346   RF_PropHeader_t     *next,
    347   RF_AllocListElem_t  *allocList)
    348 {
    349   RF_PropHeader_t *p;
    350 
    351   RF_CallocAndAdd(p, 1, sizeof(RF_PropHeader_t),
    352     (RF_PropHeader_t *), allocList);
    353   p->resultNum = resultNum;
    354   p->paramNum = paramNum;
    355   p->next = next;
    356   return(p);
    357 }
    358 
    359 static RF_FreeList_t *rf_dagh_freelist;
    360 
    361 #define RF_MAX_FREE_DAGH 128
    362 #define RF_DAGH_INC       16
    363 #define RF_DAGH_INITIAL   32
    364 
    365 static void rf_ShutdownDAGs(void *);
    366 static void rf_ShutdownDAGs(ignored)
    367   void  *ignored;
    368 {
    369 	RF_FREELIST_DESTROY(rf_dagh_freelist,next,(RF_DagHeader_t *));
    370 }
    371 
    372 int rf_ConfigureDAGs(listp)
    373   RF_ShutdownList_t  **listp;
    374 {
    375 	int rc;
    376 
    377 	RF_FREELIST_CREATE(rf_dagh_freelist, RF_MAX_FREE_DAGH,
    378 		RF_DAGH_INC, sizeof(RF_DagHeader_t));
    379 	if (rf_dagh_freelist == NULL)
    380 		return(ENOMEM);
    381 	rc = rf_ShutdownCreate(listp, rf_ShutdownDAGs, NULL);
    382 	if (rc) {
    383 		RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
    384 			__FILE__, __LINE__, rc);
    385 		rf_ShutdownDAGs(NULL);
    386 		return(rc);
    387 	}
    388 	RF_FREELIST_PRIME(rf_dagh_freelist, RF_DAGH_INITIAL,next,
    389 		(RF_DagHeader_t *));
    390 	return(0);
    391 }
    392 
    393 RF_DagHeader_t *rf_AllocDAGHeader()
    394 {
    395 	RF_DagHeader_t *dh;
    396 
    397 	RF_FREELIST_GET(rf_dagh_freelist,dh,next,(RF_DagHeader_t *));
    398 	if (dh) {
    399 		bzero((char *)dh, sizeof(RF_DagHeader_t));
    400 	}
    401 	return(dh);
    402 }
    403 
    404 void rf_FreeDAGHeader(RF_DagHeader_t *dh)
    405 {
    406 	RF_FREELIST_FREE(rf_dagh_freelist,dh,next);
    407 }
    408 
    409 /* allocates a buffer big enough to hold the data described by pda */
    410 void *rf_AllocBuffer(
    411   RF_Raid_t           *raidPtr,
    412   RF_DagHeader_t      *dag_h,
    413   RF_PhysDiskAddr_t   *pda,
    414   RF_AllocListElem_t  *allocList)
    415 {
    416   char *p;
    417 
    418   RF_MallocAndAdd(p, pda->numSector << raidPtr->logBytesPerSector,
    419     (char *), allocList);
    420   return((void *)p);
    421 }
    422 
    423 /******************************************************************************
    424  *
    425  * debug routines
    426  *
    427  *****************************************************************************/
    428 
    429 char *rf_NodeStatusString(RF_DagNode_t *node)
    430 {
    431   switch (node->status) {
    432     case rf_wait: return("wait");
    433     case rf_fired: return("fired");
    434     case rf_good: return("good");
    435     case rf_bad: return("bad");
    436     default: return("?");
    437   }
    438 }
    439 
    440 void rf_PrintNodeInfoString(RF_DagNode_t *node)
    441 {
    442   RF_PhysDiskAddr_t *pda;
    443   int (*df)(RF_DagNode_t *) = node->doFunc;
    444   int i, lk, unlk;
    445   void *bufPtr;
    446 
    447   if ((df==rf_DiskReadFunc) || (df==rf_DiskWriteFunc)
    448       || (df==rf_DiskReadMirrorIdleFunc)
    449       || (df == rf_DiskReadMirrorPartitionFunc))
    450   {
    451     pda = (RF_PhysDiskAddr_t *)node->params[0].p;
    452     bufPtr = (void *)node->params[1].p;
    453     lk = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
    454     unlk = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
    455     RF_ASSERT( !(lk && unlk) );
    456     printf("r %d c %d offs %ld nsect %d buf 0x%lx %s\n", pda->row, pda->col,
    457        (long)pda->startSector, (int) pda->numSector, (long)bufPtr,
    458        (lk) ? "LOCK" : ((unlk) ? "UNLK" : " "));
    459     return;
    460   }
    461 
    462   if (df == rf_DiskUnlockFunc) {
    463     pda = (RF_PhysDiskAddr_t *)node->params[0].p;
    464     lk = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
    465     unlk = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
    466     RF_ASSERT( !(lk && unlk) );
    467     printf("r %d c %d %s\n", pda->row, pda->col,
    468       (lk) ? "LOCK" : ((unlk) ? "UNLK" : "nop"));
    469     return;
    470   }
    471 
    472   if ((df==rf_SimpleXorFunc) || (df==rf_RegularXorFunc)
    473       || (df==rf_RecoveryXorFunc))
    474   {
    475     printf("result buf 0x%lx\n",(long) node->results[0]);
    476     for (i=0; i<node->numParams-1; i+=2) {
    477       pda = (RF_PhysDiskAddr_t *)node->params[i].p;
    478       bufPtr = (RF_PhysDiskAddr_t *)node->params[i+1].p;
    479       printf("    buf 0x%lx r%d c%d offs %ld nsect %d\n",
    480 	     (long)bufPtr, pda->row, pda->col,
    481 	     (long)pda->startSector, (int)pda->numSector);
    482     }
    483     return;
    484   }
    485 
    486 #if RF_INCLUDE_PARITYLOGGING > 0
    487   if (df==rf_ParityLogOverwriteFunc || df==rf_ParityLogUpdateFunc) {
    488     for (i=0; i<node->numParams-1; i+=2) {
    489       pda = (RF_PhysDiskAddr_t *)node->params[i].p;
    490       bufPtr = (RF_PhysDiskAddr_t *)node->params[i+1].p;
    491       printf(" r%d c%d offs %ld nsect %d buf 0x%lx\n",
    492 	     pda->row, pda->col, (long) pda->startSector,
    493 	     (int) pda->numSector, (long) bufPtr);
    494     }
    495     return;
    496   }
    497 #endif /* RF_INCLUDE_PARITYLOGGING > 0 */
    498 
    499   if ((df==rf_TerminateFunc) || (df==rf_NullNodeFunc)) {
    500     printf("\n");
    501     return;
    502   }
    503 
    504   printf("?\n");
    505 }
    506 
    507 static void rf_RecurPrintDAG(node, depth, unvisited)
    508   RF_DagNode_t  *node;
    509   int            depth;
    510   int            unvisited;
    511 {
    512   char *anttype;
    513   int i;
    514 
    515   node->visited = (unvisited) ? 0 : 1;
    516   printf("(%d) %d C%d %s: %s,s%d %d/%d,a%d/%d,p%d,r%d S{", depth,
    517     node->nodeNum, node->commitNode, node->name, rf_NodeStatusString(node),
    518     node->numSuccedents, node->numSuccFired, node->numSuccDone,
    519 	node->numAntecedents, node->numAntDone, node->numParams,node->numResults);
    520   for (i=0; i<node->numSuccedents; i++) {
    521     printf("%d%s", node->succedents[i]->nodeNum,
    522       ((i==node->numSuccedents-1) ? "\0" : " "));
    523   }
    524   printf("} A{");
    525   for (i=0; i<node->numAntecedents; i++) {
    526     switch (node->antType[i]) {
    527     case rf_trueData :
    528       anttype = "T";
    529       break;
    530     case rf_antiData :
    531       anttype = "A";
    532       break;
    533     case rf_outputData :
    534       anttype = "O";
    535       break;
    536     case rf_control :
    537       anttype = "C";
    538       break;
    539     default :
    540       anttype = "?";
    541       break;
    542     }
    543     printf("%d(%s)%s", node->antecedents[i]->nodeNum, anttype, (i==node->numAntecedents-1) ? "\0" : " ");
    544   }
    545   printf("}; ");
    546   rf_PrintNodeInfoString(node);
    547   for (i=0; i<node->numSuccedents; i++) {
    548     if (node->succedents[i]->visited == unvisited)
    549       rf_RecurPrintDAG(node->succedents[i], depth+1, unvisited);
    550   }
    551 }
    552 
    553 static void rf_PrintDAG(dag_h)
    554   RF_DagHeader_t  *dag_h;
    555 {
    556   int unvisited, i;
    557   char *status;
    558 
    559   /* set dag status */
    560   switch (dag_h->status) {
    561   case rf_enable :
    562     status = "enable";
    563     break;
    564   case rf_rollForward :
    565     status = "rollForward";
    566     break;
    567   case rf_rollBackward :
    568     status = "rollBackward";
    569     break;
    570   default :
    571     status = "illegal!";
    572     break;
    573   }
    574   /* find out if visited bits are currently set or clear */
    575   unvisited = dag_h->succedents[0]->visited;
    576 
    577   printf("DAG type:  %s\n", dag_h->creator);
    578   printf("format is (depth) num commit type: status,nSucc nSuccFired/nSuccDone,nAnte/nAnteDone,nParam,nResult S{x} A{x(type)};  info\n");
    579   printf("(0) %d Hdr: %s, s%d, (commit %d/%d) S{", dag_h->nodeNum,
    580     status, dag_h->numSuccedents, dag_h->numCommitNodes, dag_h->numCommits);
    581   for (i=0; i<dag_h->numSuccedents; i++) {
    582     printf("%d%s", dag_h->succedents[i]->nodeNum,
    583       ((i==dag_h->numSuccedents-1) ? "\0" : " "));
    584   }
    585   printf("};\n");
    586   for (i=0; i<dag_h->numSuccedents; i++) {
    587     if (dag_h->succedents[i]->visited == unvisited)
    588       rf_RecurPrintDAG(dag_h->succedents[i], 1, unvisited);
    589   }
    590 }
    591 
    592 /* assigns node numbers */
    593 int rf_AssignNodeNums(RF_DagHeader_t *dag_h)
    594 {
    595   int unvisited, i, nnum;
    596   RF_DagNode_t *node;
    597 
    598   nnum = 0;
    599   unvisited = dag_h->succedents[0]->visited;
    600 
    601   dag_h->nodeNum = nnum++;
    602   for (i=0; i<dag_h->numSuccedents; i++) {
    603     node = dag_h->succedents[i];
    604     if (node->visited == unvisited) {
    605       nnum = rf_RecurAssignNodeNums(dag_h->succedents[i], nnum, unvisited);
    606     }
    607   }
    608   return(nnum);
    609 }
    610 
    611 int rf_RecurAssignNodeNums(node, num, unvisited)
    612   RF_DagNode_t  *node;
    613   int            num;
    614   int            unvisited;
    615 {
    616   int i;
    617 
    618   node->visited = (unvisited) ? 0 : 1;
    619 
    620   node->nodeNum = num++;
    621   for (i=0; i<node->numSuccedents; i++) {
    622     if (node->succedents[i]->visited == unvisited) {
    623       num = rf_RecurAssignNodeNums(node->succedents[i], num, unvisited);
    624     }
    625   }
    626   return(num);
    627 }
    628 
    629 /* set the header pointers in each node to "newptr" */
    630 void rf_ResetDAGHeaderPointers(dag_h, newptr)
    631   RF_DagHeader_t  *dag_h;
    632   RF_DagHeader_t  *newptr;
    633 {
    634   int i;
    635   for (i=0; i<dag_h->numSuccedents; i++)
    636     if (dag_h->succedents[i]->dagHdr != newptr)
    637       rf_RecurResetDAGHeaderPointers(dag_h->succedents[i], newptr);
    638 }
    639 
    640 void rf_RecurResetDAGHeaderPointers(node, newptr)
    641   RF_DagNode_t    *node;
    642   RF_DagHeader_t  *newptr;
    643 {
    644   int i;
    645   node->dagHdr = newptr;
    646   for (i=0; i<node->numSuccedents; i++)
    647     if (node->succedents[i]->dagHdr != newptr)
    648       rf_RecurResetDAGHeaderPointers(node->succedents[i], newptr);
    649 }
    650 
    651 
    652 void rf_PrintDAGList(RF_DagHeader_t *dag_h)
    653 {
    654   int i=0;
    655 
    656   for (; dag_h; dag_h=dag_h->next) {
    657     rf_AssignNodeNums(dag_h);
    658     printf("\n\nDAG %d IN LIST:\n",i++);
    659     rf_PrintDAG(dag_h);
    660   }
    661 }
    662 
    663 static int rf_ValidateBranch(node, scount, acount, nodes, unvisited)
    664   RF_DagNode_t   *node;
    665   int            *scount;
    666   int            *acount;
    667   RF_DagNode_t  **nodes;
    668   int             unvisited;
    669 {
    670   int i, retcode = 0;
    671 
    672   /* construct an array of node pointers indexed by node num */
    673   node->visited = (unvisited) ? 0 : 1;
    674   nodes[ node->nodeNum ] = node;
    675 
    676   if (node->next != NULL) {
    677     printf("INVALID DAG: next pointer in node is not NULL\n");
    678     retcode = 1;
    679   }
    680   if (node->status != rf_wait) {
    681     printf("INVALID DAG: Node status is not wait\n");
    682     retcode = 1;
    683   }
    684   if (node->numAntDone != 0) {
    685     printf("INVALID DAG: numAntDone is not zero\n");
    686     retcode = 1;
    687   }
    688   if (node->doFunc == rf_TerminateFunc) {
    689     if (node->numSuccedents != 0) {
    690       printf("INVALID DAG: Terminator node has succedents\n");
    691       retcode = 1;
    692     }
    693   } else {
    694     if (node->numSuccedents == 0) {
    695       printf("INVALID DAG: Non-terminator node has no succedents\n");
    696       retcode = 1;
    697     }
    698   }
    699   for (i=0; i<node->numSuccedents; i++)  {
    700     if (!node->succedents[i]) {
    701       printf("INVALID DAG: succedent %d of node %s is NULL\n",i,node->name);
    702       retcode = 1;
     703     } else
     704       scount[ node->succedents[i]->nodeNum ]++;
    705   }
    706   for (i=0; i<node->numAntecedents; i++) {
    707     if (!node->antecedents[i]) {
    708       printf("INVALID DAG: antecedent %d of node %s is NULL\n",i,node->name);
    709       retcode = 1;
     710     } else
     711       acount[ node->antecedents[i]->nodeNum ]++;
    712   }
    713   for (i=0; i<node->numSuccedents; i++) {
     714     if (node->succedents[i] && node->succedents[i]->visited == unvisited) {
    715       if (rf_ValidateBranch(node->succedents[i], scount,
    716           acount, nodes, unvisited))
    717       {
    718         retcode = 1;
    719       }
    720     }
    721   }
    722   return(retcode);
    723 }
    724 
    725 static void rf_ValidateBranchVisitedBits(node, unvisited, rl)
    726   RF_DagNode_t  *node;
    727   int            unvisited;
    728   int            rl;
    729 {
    730   int i;
    731 
    732   RF_ASSERT(node->visited == unvisited);
    733   for (i=0; i<node->numSuccedents; i++) {
    734     if (node->succedents[i] == NULL) {
    735       printf("node=%lx node->succedents[%d] is NULL\n", (long)node, i);
    736       RF_ASSERT(0);
    737     }
    738     rf_ValidateBranchVisitedBits(node->succedents[i],unvisited, rl+1);
    739   }
    740 }
    741 
    742 /* NOTE:  never call this on a big dag, because it is exponential
    743  * in execution time
    744  */
    745 static void rf_ValidateVisitedBits(dag)
    746   RF_DagHeader_t  *dag;
    747 {
    748   int i, unvisited;
    749 
    750   unvisited = dag->succedents[0]->visited;
    751 
    752   for (i=0; i<dag->numSuccedents; i++) {
    753     if (dag->succedents[i] == NULL) {
    754       printf("dag=%lx dag->succedents[%d] is NULL\n", (long) dag, i);
    755       RF_ASSERT(0);
    756     }
    757     rf_ValidateBranchVisitedBits(dag->succedents[i],unvisited,0);
    758   }
    759 }
    760 
    761 /* validate a DAG.  _at entry_ verify that:
    762  *   -- numNodesCompleted is zero
    763  *   -- node queue is null
    764  *   -- dag status is rf_enable
    765  *   -- next pointer is null on every node
    766  *   -- all nodes have status wait
    767  *   -- numAntDone is zero in all nodes
    768  *   -- terminator node has zero successors
    769  *   -- no other node besides terminator has zero successors
    770  *   -- no successor or antecedent pointer in a node is NULL
    771  *   -- number of times that each node appears as a successor of another node
    772  *      is equal to the antecedent count on that node
    773  *   -- number of times that each node appears as an antecedent of another node
    774  *      is equal to the succedent count on that node
    775  *   -- what else?
    776  */
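         /*
          * For example (a made-up node, purely for illustration): if a node
          * "Xor" has numAntecedents == 3, then exactly three other nodes must
          * name "Xor" in their succedents arrays; the scount[]/acount[]
          * cross-checks below enforce this in both directions.
          */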
    777 int rf_ValidateDAG(dag_h)
    778   RF_DagHeader_t  *dag_h;
    779 {
    780   int i, nodecount;
    781   int *scount, *acount;        /* per-node successor and antecedent counts */
    782   RF_DagNode_t **nodes;        /* array of ptrs to nodes in dag */
    783   int retcode = 0;
    784   int unvisited;
    785   int commitNodeCount = 0;
    786 
    787   if (rf_validateVisitedDebug)
    788     rf_ValidateVisitedBits(dag_h);
    789 
    790   if (dag_h->numNodesCompleted != 0) {
    791     printf("INVALID DAG: num nodes completed is %d, should be 0\n",dag_h->numNodesCompleted);
    792     retcode = 1; goto validate_dag_bad;
    793   }
    794   if (dag_h->status != rf_enable) {
    795     printf("INVALID DAG: not enabled\n");
    796     retcode = 1; goto validate_dag_bad;
    797   }
    798   if (dag_h->numCommits != 0) {
    799     printf("INVALID DAG: numCommits != 0 (%d)\n",dag_h->numCommits);
    800     retcode = 1; goto validate_dag_bad;
    801   }
    802   if (dag_h->numSuccedents != 1) {
    803     /* currently, all dags must have only one succedent */
     804     printf("INVALID DAG: numSuccedents != 1 (%d)\n",dag_h->numSuccedents);
    805     retcode = 1; goto validate_dag_bad;
    806   }
    807   nodecount = rf_AssignNodeNums(dag_h);
    808 
    809   unvisited = dag_h->succedents[0]->visited;
    810 
    811   RF_Calloc(scount, nodecount, sizeof(int), (int *));
    812   RF_Calloc(acount, nodecount, sizeof(int), (int *));
    813   RF_Calloc(nodes,  nodecount, sizeof(RF_DagNode_t *), (RF_DagNode_t **));
    814   for (i=0; i<dag_h->numSuccedents; i++) {
    815     if ((dag_h->succedents[i]->visited == unvisited)
    816         && rf_ValidateBranch(dag_h->succedents[i], scount,
    817         acount, nodes, unvisited))
    818     {
    819       retcode = 1;
    820     }
    821   }
    822   /* start at 1 to skip the header node */
    823   for (i=1; i<nodecount; i++) {
    824     if ( nodes[i]->commitNode )
    825       commitNodeCount++;
    826     if ( nodes[i]->doFunc == NULL ) {
    827       printf("INVALID DAG: node %s has an undefined doFunc\n", nodes[i]->name);
    828       retcode = 1;
    829       goto validate_dag_out;
    830     }
    831     if ( nodes[i]->undoFunc == NULL ) {
     832       printf("INVALID DAG: node %s has an undefined undoFunc\n", nodes[i]->name);
    833       retcode = 1;
    834       goto validate_dag_out;
    835     }
    836     if ( nodes[i]->numAntecedents != scount[ nodes[i]->nodeNum ] ) {
    837       printf("INVALID DAG: node %s has %d antecedents but appears as a succedent %d times\n",
    838 	     nodes[i]->name, nodes[i]->numAntecedents, scount[nodes[i]->nodeNum]);
    839       retcode = 1;
    840       goto validate_dag_out;
    841     }
    842     if ( nodes[i]->numSuccedents != acount[ nodes[i]->nodeNum ] ) {
    843       printf("INVALID DAG: node %s has %d succedents but appears as an antecedent %d times\n",
    844 	     nodes[i]->name, nodes[i]->numSuccedents, acount[nodes[i]->nodeNum]);
    845       retcode = 1;
    846       goto validate_dag_out;
    847     }
    848   }
    849 
    850   if ( dag_h->numCommitNodes != commitNodeCount ) {
    851     printf("INVALID DAG: incorrect commit node count.  hdr->numCommitNodes (%d) found (%d) commit nodes in graph\n",
    852       dag_h->numCommitNodes, commitNodeCount);
    853     retcode = 1;
    854     goto validate_dag_out;
    855   }
    856 
    857 validate_dag_out:
    858   RF_Free(scount, nodecount*sizeof(int));
    859   RF_Free(acount, nodecount*sizeof(int));
    860   RF_Free(nodes,  nodecount*sizeof(RF_DagNode_t *));
    861   if (retcode)
    862     rf_PrintDAGList(dag_h);
    863 
    864   if (rf_validateVisitedDebug)
    865     rf_ValidateVisitedBits(dag_h);
    866 
    867   return(retcode);
    868 
    869 validate_dag_bad:
    870   rf_PrintDAGList(dag_h);
    871   return(retcode);
    872 }
    873 
    874 
    875 /******************************************************************************
    876  *
    877  * misc construction routines
    878  *
    879  *****************************************************************************/
    880 
    881 void rf_redirect_asm(
    882   RF_Raid_t             *raidPtr,
    883   RF_AccessStripeMap_t  *asmap)
    884 {
    885   int ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) ? 1 : 0;
    886   int row = asmap->physInfo->row;
    887   int fcol = raidPtr->reconControl[row]->fcol;
    888   int srow = raidPtr->reconControl[row]->spareRow;
    889   int scol = raidPtr->reconControl[row]->spareCol;
    890   RF_PhysDiskAddr_t *pda;
    891 
    892   RF_ASSERT( raidPtr->status[row] == rf_rs_reconstructing );
    893   for (pda = asmap->physInfo; pda; pda=pda->next) {
    894     if (pda->col == fcol) {
    895       if (rf_dagDebug) {
    896         if (!rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap,
    897           pda->startSector))
    898         {
    899           RF_PANIC();
    900         }
    901       }
    902       /*printf("Remapped data for large write\n");*/
    903       if (ds) {
    904         raidPtr->Layout.map->MapSector(raidPtr, pda->raidAddress,
    905           &pda->row, &pda->col, &pda->startSector, RF_REMAP);
    906       }
    907       else {
    908         pda->row = srow; pda->col = scol;
    909       }
    910     }
    911   }
    912   for (pda = asmap->parityInfo; pda; pda=pda->next) {
    913     if (pda->col == fcol) {
    914       if (rf_dagDebug) {
    915         if (!rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap, pda->startSector)) {
    916           RF_PANIC();
    917         }
    918       }
    919     }
    920     if (ds) {
    921       (raidPtr->Layout.map->MapParity)(raidPtr, pda->raidAddress, &pda->row, &pda->col, &pda->startSector, RF_REMAP);
    922     }
    923     else {
    924       pda->row = srow; pda->col = scol;
    925     }
    926   }
    927 }
    928 
    929 
    930 /* this routine allocates read buffers and generates stripe maps for the
    931  * regions of the array from the start of the stripe to the start of the
    932  * access, and from the end of the access to the end of the stripe.  It also
    933  * computes and returns the number of DAG nodes needed to read all this data.
    934  * Note that this routine does the wrong thing if the access is fully
    935  * contained within one stripe unit, so we RF_ASSERT against this case at the
    936  * start.
    937  */
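         /*
          * Illustrative example (addresses assumed): with a stripe spanning
          * raid addresses 0..99 and a user access covering 30..69
          * (endRaidAddress 70), this routine maps a start-of-stripe region
          * 0..29 into new_asm_h[0] and an end-of-stripe region 70..99 into
          * new_asm_h[1], allocates *sosBuffer and *eosBuffer to hold them, and
          * adds one read node per stripe unit touched to *nRodNodes.
          */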
    938 void rf_MapUnaccessedPortionOfStripe(
    939   RF_Raid_t                    *raidPtr,
    940   RF_RaidLayout_t              *layoutPtr, /* in: layout information */
    941   RF_AccessStripeMap_t         *asmap,     /* in: access stripe map */
    942   RF_DagHeader_t               *dag_h,     /* in: header of the dag to create */
    943   RF_AccessStripeMapHeader_t  **new_asm_h, /* in: ptr to array of 2 headers, to be filled in */
    944   int                          *nRodNodes, /* out: num nodes to be generated to read unaccessed data */
    945   char                        **sosBuffer, /* out: pointers to newly allocated buffer */
    946   char                        **eosBuffer,
    947   RF_AllocListElem_t           *allocList)
    948 {
    949   RF_RaidAddr_t sosRaidAddress, eosRaidAddress;
    950   RF_SectorNum_t sosNumSector, eosNumSector;
    951 
    952   RF_ASSERT( asmap->numStripeUnitsAccessed > (layoutPtr->numDataCol/2) );
    953   /* generate an access map for the region of the array from start of stripe
    954    * to start of access */
    955   new_asm_h[0] = new_asm_h[1] = NULL; *nRodNodes = 0;
    956   if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->raidAddress)) {
    957     sosRaidAddress = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
    958     sosNumSector   = asmap->raidAddress - sosRaidAddress;
    959     RF_MallocAndAdd(*sosBuffer, rf_RaidAddressToByte(raidPtr, sosNumSector), (char *), allocList);
    960     new_asm_h[0]   = rf_MapAccess(raidPtr, sosRaidAddress, sosNumSector, *sosBuffer, RF_DONT_REMAP);
    961     new_asm_h[0]->next = dag_h->asmList;
    962     dag_h->asmList = new_asm_h[0];
    963     *nRodNodes    += new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
    964 
    965     RF_ASSERT(new_asm_h[0]->stripeMap->next == NULL);
    966     /* we're totally within one stripe here */
    967     if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE)
    968       rf_redirect_asm(raidPtr, new_asm_h[0]->stripeMap);
    969   }
    970   /* generate an access map for the region of the array from end of access
    971    * to end of stripe */
    972   if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->endRaidAddress)) {
    973     eosRaidAddress = asmap->endRaidAddress;
    974     eosNumSector   = rf_RaidAddressOfNextStripeBoundary(layoutPtr, eosRaidAddress) - eosRaidAddress;
    975     RF_MallocAndAdd(*eosBuffer, rf_RaidAddressToByte(raidPtr, eosNumSector), (char *), allocList);
    976     new_asm_h[1]   = rf_MapAccess(raidPtr, eosRaidAddress, eosNumSector, *eosBuffer, RF_DONT_REMAP);
    977     new_asm_h[1]->next = dag_h->asmList;
    978     dag_h->asmList = new_asm_h[1];
    979     *nRodNodes    += new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
    980 
    981     RF_ASSERT(new_asm_h[1]->stripeMap->next == NULL);
    982     /* we're totally within one stripe here */
    983     if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE)
    984       rf_redirect_asm(raidPtr, new_asm_h[1]->stripeMap);
    985   }
    986 }
    987 
    988 
    989 
    990 /* returns non-zero if the indicated ranges of stripe unit offsets overlap */
    991 int rf_PDAOverlap(
    992   RF_RaidLayout_t    *layoutPtr,
    993   RF_PhysDiskAddr_t  *src,
    994   RF_PhysDiskAddr_t  *dest)
    995 {
    996   RF_SectorNum_t soffs = rf_StripeUnitOffset(layoutPtr, src->startSector);
    997   RF_SectorNum_t doffs = rf_StripeUnitOffset(layoutPtr, dest->startSector);
    998   /* use -1 to be sure we stay within SU */
    999   RF_SectorNum_t send = rf_StripeUnitOffset(layoutPtr, src->startSector + src->numSector-1);
   1000   RF_SectorNum_t dend = rf_StripeUnitOffset(layoutPtr, dest->startSector + dest->numSector-1);
   1001   return( (RF_MAX(soffs,doffs) <= RF_MIN(send,dend)) ? 1 : 0 );
   1002 }
   1003 
   1004 
   1005 /* GenerateFailedAccessASMs
   1006  *
   1007  * this routine figures out what portion of the stripe needs to be read
    1008  * to effect the degraded read or write operation.  Its primary function
   1009  * is to identify everything required to recover the data, and then
   1010  * eliminate anything that is already being accessed by the user.
   1011  *
   1012  * The main result is two new ASMs, one for the region from the start of the
   1013  * stripe to the start of the access, and one for the region from the end of
   1014  * the access to the end of the stripe.  These ASMs describe everything that
   1015  * needs to be read to effect the degraded access.  Other results are:
   1016  *    nXorBufs -- the total number of buffers that need to be XORed together to
   1017  *                recover the lost data,
   1018  *    rpBufPtr -- ptr to a newly-allocated buffer to hold the parity.  If NULL
   1019  *                at entry, not allocated.
   1020  *    overlappingPDAs --
   1021  *                describes which of the non-failed PDAs in the user access
   1022  *                overlap data that needs to be read to effect recovery.
   1023  *                overlappingPDAs[i]==1 if and only if, neglecting the failed
   1024  *                PDA, the ith pda in the input asm overlaps data that needs
   1025  *                to be read for recovery.
   1026  */
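         /*
          * Sketch of the bookkeeping with assumed numbers: if the two new ASMs
          * touch 3 and 2 stripe units respectively and one surviving user PDA
          * overlaps the failed region, *nXorBufs comes out as
          * 1 (parity or failed data) + 3 + 2 + 1 = 7, and the single buffer
          * allocated below holds numSect[0] + numSect[1] + numParitySect
          * sectors, carved up among the new PDAs with the tail reserved for
          * *rpBufPtr.
          */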
   1027  /* in: asm - ASM for the actual access, one stripe only */
    1028  /* in: failedPDA - which component of the access has failed */
   1029  /* in: dag_h - header of the DAG we're going to create */
   1030  /* out: new_asm_h - the two new ASMs */
   1031  /* out: nXorBufs - the total number of xor bufs required */
   1032  /* out: rpBufPtr - a buffer for the parity read */
   1033 void rf_GenerateFailedAccessASMs(
   1034   RF_Raid_t                    *raidPtr,
   1035   RF_AccessStripeMap_t         *asmap,
   1036   RF_PhysDiskAddr_t            *failedPDA,
   1037   RF_DagHeader_t               *dag_h,
   1038   RF_AccessStripeMapHeader_t  **new_asm_h,
   1039   int                          *nXorBufs,
   1040   char                        **rpBufPtr,
   1041   char                         *overlappingPDAs,
   1042   RF_AllocListElem_t           *allocList)
   1043 {
   1044   RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
   1045 
   1046   /* s=start, e=end, s=stripe, a=access, f=failed, su=stripe unit */
   1047   RF_RaidAddr_t sosAddr, sosEndAddr, eosStartAddr, eosAddr;
   1048 
   1049   RF_SectorCount_t numSect[2], numParitySect;
   1050   RF_PhysDiskAddr_t *pda;
   1051   char *rdBuf, *bufP;
   1052   int foundit, i;
   1053 
   1054   bufP = NULL;
   1055   foundit = 0;
   1056   /* first compute the following raid addresses:
   1057        start of stripe,                            (sosAddr)
   1058        MIN(start of access, start of failed SU),   (sosEndAddr)
   1059        MAX(end of access, end of failed SU),       (eosStartAddr)
   1060        end of stripe (i.e. start of next stripe)   (eosAddr)
   1061   */
   1062   sosAddr      = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
   1063   sosEndAddr   = RF_MIN(asmap->raidAddress, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,failedPDA->raidAddress));
   1064   eosStartAddr = RF_MAX(asmap->endRaidAddress, rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, failedPDA->raidAddress));
   1065   eosAddr      = rf_RaidAddressOfNextStripeBoundary(layoutPtr, asmap->raidAddress);
   1066 
   1067   /* now generate access stripe maps for each of the above regions of the
   1068    * stripe.  Use a dummy (NULL) buf ptr for now */
   1069 
   1070   new_asm_h[0] = (sosAddr != sosEndAddr)   ? rf_MapAccess(raidPtr, sosAddr,      sosEndAddr-sosAddr,   NULL, RF_DONT_REMAP) : NULL;
   1071   new_asm_h[1] = (eosStartAddr != eosAddr) ? rf_MapAccess(raidPtr, eosStartAddr, eosAddr-eosStartAddr, NULL, RF_DONT_REMAP) : NULL;
   1072 
   1073   /* walk through the PDAs and range-restrict each SU to the region of the
   1074    * SU touched on the failed PDA.  also compute total data buffer space
   1075    * requirements in this step.  Ignore the parity for now. */
   1076 
   1077   numSect[0] = numSect[1] = 0;
   1078   if (new_asm_h[0]) {
   1079     new_asm_h[0]->next = dag_h->asmList; dag_h->asmList = new_asm_h[0];
   1080     for (pda = new_asm_h[0]->stripeMap->physInfo; pda; pda = pda->next) {
   1081       rf_RangeRestrictPDA(raidPtr,failedPDA, pda, RF_RESTRICT_NOBUFFER, 0); numSect[0] += pda->numSector;
   1082     }
   1083   }
   1084   if (new_asm_h[1]) {
   1085     new_asm_h[1]->next = dag_h->asmList; dag_h->asmList = new_asm_h[1];
   1086     for (pda = new_asm_h[1]->stripeMap->physInfo; pda; pda = pda->next) {
   1087       rf_RangeRestrictPDA(raidPtr,failedPDA, pda, RF_RESTRICT_NOBUFFER, 0); numSect[1] += pda->numSector;
   1088     }
   1089   }
   1090   numParitySect = failedPDA->numSector;
   1091 
   1092   /* allocate buffer space for the data & parity we have to read to recover
   1093    * from the failure */
   1094 
   1095   if (numSect[0]+numSect[1]+ ((rpBufPtr) ? numParitySect : 0)) {      /* don't allocate parity buf if not needed */
   1096     RF_MallocAndAdd(rdBuf, rf_RaidAddressToByte(raidPtr,numSect[0]+numSect[1]+numParitySect), (char *), allocList);
   1097     bufP = rdBuf;
   1098     if (rf_degDagDebug) printf("Newly allocated buffer (%d bytes) is 0x%lx\n",
   1099 			    (int)rf_RaidAddressToByte(raidPtr,numSect[0]+numSect[1]+numParitySect), (unsigned long) bufP);
   1100   }
   1101 
   1102   /* now walk through the pdas one last time and assign buffer pointers
   1103    * (ugh!).  Again, ignore the parity.  also, count nodes to find out how
   1104    * many bufs need to be xored together */
   1105   (*nXorBufs) = 1;   /* in read case, 1 is for parity.  In write case, 1 is for failed data */
   1106   if (new_asm_h[0]) {
   1107     for (pda=new_asm_h[0]->stripeMap->physInfo; pda; pda=pda->next) {pda->bufPtr = bufP; bufP += rf_RaidAddressToByte(raidPtr,pda->numSector);}
   1108     *nXorBufs += new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
   1109   }
   1110   if (new_asm_h[1]) {
   1111     for (pda=new_asm_h[1]->stripeMap->physInfo; pda; pda=pda->next) {pda->bufPtr = bufP; bufP += rf_RaidAddressToByte(raidPtr,pda->numSector);}
   1112     (*nXorBufs) += new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
   1113   }
   1114   if (rpBufPtr) *rpBufPtr = bufP;     /* the rest of the buffer is for parity */
   1115 
   1116   /* the last step is to figure out how many more distinct buffers need to
   1117    * get xor'd to produce the missing unit.  there's one for each user-data
   1118    * read node that overlaps the portion of the failed unit being accessed */
   1119 
   1120   for (foundit=i=0,pda=asmap->physInfo; pda; i++,pda=pda->next) {
   1121     if (pda == failedPDA) {i--; foundit=1; continue;}
   1122     if (rf_PDAOverlap(layoutPtr, pda, failedPDA)) {
   1123       overlappingPDAs[i] = 1;
   1124       (*nXorBufs)++;
   1125     }
   1126   }
   1127   if (!foundit) {RF_ERRORMSG("GenerateFailedAccessASMs: did not find failedPDA in asm list\n"); RF_ASSERT(0);}
   1128 
   1129   if (rf_degDagDebug) {
   1130     if (new_asm_h[0]) {
   1131       printf("First asm:\n"); rf_PrintFullAccessStripeMap(new_asm_h[0], 1);
   1132     }
   1133     if (new_asm_h[1]) {
   1134       printf("Second asm:\n"); rf_PrintFullAccessStripeMap(new_asm_h[1], 1);
   1135     }
   1136   }
   1137 }
   1138 
   1139 
   1140 /* adjusts the offset and number of sectors in the destination pda so that
   1141  * it covers at most the region of the SU covered by the source PDA.  This
   1142  * is exclusively a restriction:  the number of sectors indicated by the
   1143  * target PDA can only shrink.
   1144  *
   1145  * For example:  s = sectors within SU indicated by source PDA
   1146  *               d = sectors within SU indicated by dest PDA
   1147  *               r = results, stored in dest PDA
   1148  *
   1149  * |--------------- one stripe unit ---------------------|
   1150  * |           sssssssssssssssssssssssssssssssss         |
   1151  * |    ddddddddddddddddddddddddddddddddddddddddddddd    |
   1152  * |           rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr         |
   1153  *
   1154  * Another example:
   1155  *
   1156  * |--------------- one stripe unit ---------------------|
   1157  * |           sssssssssssssssssssssssssssssssss         |
   1158  * |    ddddddddddddddddddddddd                          |
   1159  * |           rrrrrrrrrrrrrrrr                          |
   1160  *
   1161  */
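         /*
          * In terms of the arithmetic below (all values assumed): if the SU
          * boundary is at sector 1000, src covers 1010..1029 and dest covers
          * 1004..1049, then soffs=10, doffs=4, send=29, dend=49, and dest is
          * clipped to startSector = 1000 + RF_MAX(10,4) = 1010 and
          * numSector = 1000 + RF_MIN(29,49) + 1 - 1010 = 20, i.e. exactly the
          * portion of dest that overlaps src.
          */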
   1162 void rf_RangeRestrictPDA(
   1163   RF_Raid_t          *raidPtr,
   1164   RF_PhysDiskAddr_t  *src,
   1165   RF_PhysDiskAddr_t  *dest,
   1166   int                 dobuffer,
   1167   int                 doraidaddr)
   1168 {
   1169   RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
   1170   RF_SectorNum_t soffs = rf_StripeUnitOffset(layoutPtr, src->startSector);
   1171   RF_SectorNum_t doffs = rf_StripeUnitOffset(layoutPtr, dest->startSector);
   1172   RF_SectorNum_t send = rf_StripeUnitOffset(layoutPtr, src->startSector + src->numSector-1);     /* use -1 to be sure we stay within SU */
   1173   RF_SectorNum_t dend = rf_StripeUnitOffset(layoutPtr, dest->startSector + dest->numSector-1);
   1174   RF_SectorNum_t subAddr = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, dest->startSector); /* stripe unit boundary */
   1175 
   1176   dest->startSector = subAddr + RF_MAX(soffs,doffs);
   1177   dest->numSector = subAddr + RF_MIN(send,dend) + 1 - dest->startSector;
   1178 
   1179   if (dobuffer)
   1180     dest->bufPtr += (soffs > doffs) ? rf_RaidAddressToByte(raidPtr,soffs-doffs) : 0;
   1181   if (doraidaddr) {
   1182     dest->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, dest->raidAddress) +
   1183                         rf_StripeUnitOffset(layoutPtr, dest->startSector);
   1184   }
   1185 }
   1186 
   1187 /*
   1188  * Want the highest of these primes to be the largest one
   1189  * less than the max expected number of columns (won't hurt
   1190  * to be too small or too large, but won't be optimal, either)
   1191  * --jimz
   1192  */
   1193 #define NLOWPRIMES 8
   1194 static int lowprimes[NLOWPRIMES] = {2,3,5,7,11,13,17,19};
   1195 
   1196 /*****************************************************************************
   1197  * compute the workload shift factor.  (chained declustering)
   1198  *
   1199  * return nonzero if access should shift to secondary, otherwise,
    1200  * returns nonzero if the access should shift to the secondary copy;
    1201  * otherwise the access goes to the primary
   1202 int rf_compute_workload_shift(
   1203   RF_Raid_t          *raidPtr,
   1204   RF_PhysDiskAddr_t  *pda)
   1205 {
   1206   /*
   1207    * variables:
   1208    *  d   = column of disk containing primary
   1209    *  f   = column of failed disk
   1210    *  n   = number of disks in array
   1211    *  sd  = "shift distance" (number of columns that d is to the right of f)
   1212    *  row = row of array the access is in
   1213    *  v   = numerator of redirection ratio
   1214    *  k   = denominator of redirection ratio
   1215    */
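           /*
            * Worked example (numbers assumed): n=5 disks, failed disk f=1,
            * primary copy on disk d=3, so sd=2 and v/k = (5-1-2)/(5-1) = 2/4,
            * reduced below to 1/2; i.e. 1 of every 2 accesses whose primary
            * lives on disk 3 is redirected to its secondary copy.
            */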
   1216   RF_RowCol_t d, f, sd, row, n;
   1217   int k, v, ret, i;
   1218 
   1219   row = pda->row;
   1220   n = raidPtr->numCol;
   1221 
   1222   /* assign column of primary copy to d */
   1223   d = pda->col;
   1224 
   1225   /* assign column of dead disk to f */
    1226   for(f=0;((f<n)&&(!RF_DEAD_DISK(raidPtr->Disks[row][f].status)));f++);
   1227 
   1228   RF_ASSERT(f < n);
   1229   RF_ASSERT(f != d);
   1230 
   1231   sd = (f > d) ? (n + d - f) : (d - f);
   1232   RF_ASSERT(sd < n);
   1233 
   1234   /*
   1235    * v of every k accesses should be redirected
   1236    *
   1237    * v/k := (n-1-sd)/(n-1)
   1238    */
   1239   v = (n-1-sd);
   1240   k = (n-1);
   1241 
   1242 #if 1
   1243   /*
   1244    * XXX
   1245    * Is this worth it?
   1246    *
   1247    * Now reduce the fraction, by repeatedly factoring
   1248    * out primes (just like they teach in elementary school!)
   1249    */
   1250   for(i=0;i<NLOWPRIMES;i++) {
   1251     if (lowprimes[i] > v)
   1252       break;
   1253     while (((v%lowprimes[i])==0) && ((k%lowprimes[i])==0)) {
   1254       v /= lowprimes[i];
   1255       k /= lowprimes[i];
   1256     }
   1257   }
   1258 #endif
   1259 
   1260   raidPtr->hist_diskreq[row][d]++;
   1261   if (raidPtr->hist_diskreq[row][d] > v) {
   1262     ret = 0; /* do not redirect */
   1263   }
   1264   else {
   1265     ret = 1; /* redirect */
   1266   }
   1267 
   1268 #if 0
   1269   printf("d=%d f=%d sd=%d v=%d k=%d ret=%d h=%d\n", d, f, sd, v, k, ret,
   1270     raidPtr->hist_diskreq[row][d]);
   1271 #endif
   1272 
   1273   if (raidPtr->hist_diskreq[row][d] >= k) {
   1274     /* reset counter */
   1275     raidPtr->hist_diskreq[row][d] = 0;
   1276   }
   1277 
   1278   return(ret);
   1279 }
   1280 
   1281 /*
   1282  * Disk selection routines
   1283  */
   1284 
   1285 /*
   1286  * Selects the disk with the shortest queue from a mirror pair.
    1287  * Both the disk I/Os queued in RAIDframe and those already outstanding
    1288  * at the physical disk are counted as members of the "queue".
   1289  */
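         /*
          * Illustrative example (queue lengths assumed): with 3 queued + 1
          * outstanding I/Os on the data disk (4 total) and 2 + 1 on the mirror
          * (3 total), the mirror is selected.  When the lengths are equal, the
          * copy whose last-dequeued sector is closer to the requested start
          * sector wins, as a rough seek-distance heuristic (with the data copy
          * winning further ties).
          */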
   1290 void rf_SelectMirrorDiskIdle(RF_DagNode_t *node)
   1291 {
   1292   RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr;
   1293   RF_RowCol_t rowData, colData, rowMirror, colMirror;
   1294   int dataQueueLength, mirrorQueueLength, usemirror;
   1295   RF_PhysDiskAddr_t *data_pda   = (RF_PhysDiskAddr_t *)node->params[0].p;
   1296   RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *)node->params[4].p;
   1297   RF_PhysDiskAddr_t *tmp_pda;
   1298   RF_RaidDisk_t **disks = raidPtr->Disks;
   1299   RF_DiskQueue_t **dqs = raidPtr->Queues, *dataQueue, *mirrorQueue;
   1300 
   1301   /* return the [row col] of the disk with the shortest queue */
   1302   rowData = data_pda->row;
   1303   colData = data_pda->col;
   1304   rowMirror = mirror_pda->row;
   1305   colMirror = mirror_pda->col;
   1306   dataQueue   = &(dqs[rowData][colData]);
   1307   mirrorQueue = &(dqs[rowMirror][colMirror]);
   1308 
   1309 #ifdef RF_LOCK_QUEUES_TO_READ_LEN
   1310   RF_LOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
   1311 #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
   1312   dataQueueLength = dataQueue->queueLength + dataQueue->numOutstanding;
   1313 #ifdef RF_LOCK_QUEUES_TO_READ_LEN
   1314   RF_UNLOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
   1315   RF_LOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
   1316 #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
   1317   mirrorQueueLength = mirrorQueue->queueLength + mirrorQueue->numOutstanding;
   1318 #ifdef RF_LOCK_QUEUES_TO_READ_LEN
   1319   RF_UNLOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
   1320 #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
   1321 
   1322   usemirror = 0;
   1323   if (RF_DEAD_DISK(disks[rowMirror][colMirror].status)) {
   1324     usemirror = 0;
   1325   }
   1326   else if (RF_DEAD_DISK(disks[rowData][colData].status)) {
   1327     usemirror = 1;
   1328   }
   1329   else if (dataQueueLength < mirrorQueueLength) {
   1330     usemirror = 0;
   1331   }
   1332   else if (mirrorQueueLength < dataQueueLength) {
   1333     usemirror = 1;
   1334   }
   1335   else {
   1336     /* queues are equal length. attempt cleverness. */
   1337     if (SNUM_DIFF(dataQueue->last_deq_sector,data_pda->startSector)
   1338         <= SNUM_DIFF(mirrorQueue->last_deq_sector,mirror_pda->startSector))
   1339     {
   1340       usemirror = 0;
   1341     }
   1342     else {
   1343       usemirror = 1;
   1344     }
   1345   }
   1346 
   1347   if (usemirror) {
   1348     /* use mirror (parity) disk, swap params 0 & 4 */
   1349     tmp_pda = data_pda;
   1350     node->params[0].p = mirror_pda;
   1351     node->params[4].p = tmp_pda;
   1352   }
   1353   else {
   1354     /* use data disk, leave param 0 unchanged */
   1355   }
   1356   /* printf("dataQueueLength %d, mirrorQueueLength %d\n",dataQueueLength, mirrorQueueLength); */
   1357 }
   1358 
   1359 /*
   1360  * Do simple partitioning. This assumes that
   1361  * the data and parity disks are laid out identically.
   1362  */
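         /*
          * Illustrative example (disk size assumed): on a 1000-block disk, a
          * read starting at block 200 (lower half) goes to the data copy and a
          * read starting at block 700 goes to the mirror, statically splitting
          * the read workload across the pair.
          */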
   1363 void rf_SelectMirrorDiskPartition(RF_DagNode_t *node)
   1364 {
   1365   RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr;
   1366   RF_RowCol_t rowData, colData, rowMirror, colMirror;
   1367   RF_PhysDiskAddr_t *data_pda   = (RF_PhysDiskAddr_t *)node->params[0].p;
   1368   RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *)node->params[4].p;
   1369   RF_PhysDiskAddr_t *tmp_pda;
   1370   RF_RaidDisk_t **disks = raidPtr->Disks;
   1371   RF_DiskQueue_t **dqs = raidPtr->Queues, *dataQueue, *mirrorQueue;
   1372   int usemirror;
   1373 
    1374   /* get the [row col] of the data copy and of its mirror */
   1375   rowData = data_pda->row;
   1376   colData = data_pda->col;
   1377   rowMirror = mirror_pda->row;
   1378   colMirror = mirror_pda->col;
   1379   dataQueue   = &(dqs[rowData][colData]);
   1380   mirrorQueue = &(dqs[rowMirror][colMirror]);
   1381 
   1382   usemirror = 0;
   1383   if (RF_DEAD_DISK(disks[rowMirror][colMirror].status)) {
   1384     usemirror = 0;
   1385   }
   1386   else if (RF_DEAD_DISK(disks[rowData][colData].status)) {
   1387     usemirror = 1;
   1388   }
   1389   else if (data_pda->startSector < (disks[rowData][colData].numBlocks / 2)) {
   1390     usemirror = 0;
   1391   }
   1392   else {
   1393     usemirror = 1;
   1394   }
   1395 
   1396   if (usemirror) {
   1397     /* use mirror (parity) disk, swap params 0 & 4 */
   1398     tmp_pda = data_pda;
   1399     node->params[0].p = mirror_pda;
   1400     node->params[4].p = tmp_pda;
   1401   }
   1402   else {
   1403     /* use data disk, leave param 0 unchanged */
   1404   }
   1405 }
   1406