Home | History | Annotate | Line # | Download | only in raidframe
rf_dagutils.c revision 1.1
      1  1.1  oster /*	$NetBSD: rf_dagutils.c,v 1.1 1998/11/13 04:20:28 oster Exp $	*/
      2  1.1  oster /*
      3  1.1  oster  * Copyright (c) 1995 Carnegie-Mellon University.
      4  1.1  oster  * All rights reserved.
      5  1.1  oster  *
      6  1.1  oster  * Authors: Mark Holland, William V. Courtright II, Jim Zelenka
      7  1.1  oster  *
      8  1.1  oster  * Permission to use, copy, modify and distribute this software and
      9  1.1  oster  * its documentation is hereby granted, provided that both the copyright
     10  1.1  oster  * notice and this permission notice appear in all copies of the
     11  1.1  oster  * software, derivative works or modified versions, and any portions
     12  1.1  oster  * thereof, and that both notices appear in supporting documentation.
     13  1.1  oster  *
     14  1.1  oster  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     15  1.1  oster  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     16  1.1  oster  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     17  1.1  oster  *
     18  1.1  oster  * Carnegie Mellon requests users of this software to return to
     19  1.1  oster  *
     20  1.1  oster  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     21  1.1  oster  *  School of Computer Science
     22  1.1  oster  *  Carnegie Mellon University
     23  1.1  oster  *  Pittsburgh PA 15213-3890
     24  1.1  oster  *
     25  1.1  oster  * any improvements or extensions that they make and grant Carnegie the
     26  1.1  oster  * rights to redistribute these changes.
     27  1.1  oster  */
     28  1.1  oster 
     29  1.1  oster /******************************************************************************
     30  1.1  oster  *
     31  1.1  oster  * rf_dagutils.c -- utility routines for manipulating dags
     32  1.1  oster  *
     33  1.1  oster  *****************************************************************************/
     34  1.1  oster 
     35  1.1  oster /*
     36  1.1  oster  * :
     37  1.1  oster  * Log: rf_dagutils.c,v
     38  1.1  oster  * Revision 1.55  1996/08/22 14:39:47  jimz
     39  1.1  oster  * reduce v/k fraction (better load balancing)
     40  1.1  oster  *
     41  1.1  oster  * Revision 1.54  1996/08/21  04:14:12  jimz
     42  1.1  oster  * minor workload shift tweaking
     43  1.1  oster  *
     44  1.1  oster  * Revision 1.53  1996/08/20  23:41:16  jimz
     45  1.1  oster  * fix up workload shift computation
     46  1.1  oster  *
     47  1.1  oster  * Revision 1.52  1996/08/20  22:34:16  jimz
     48  1.1  oster  * first cut at fixing workload shift
     49  1.1  oster  * needs work
     50  1.1  oster  *
     51  1.1  oster  * Revision 1.51  1996/08/20  16:51:16  jimz
     52  1.1  oster  * comment more verbosely compute_workload_shift()
     53  1.1  oster  *
     54  1.1  oster  * Revision 1.50  1996/08/11  00:40:50  jimz
     55  1.1  oster  * fix up broken comment
     56  1.1  oster  *
     57  1.1  oster  * Revision 1.49  1996/07/27  23:36:08  jimz
     58  1.1  oster  * Solaris port of simulator
     59  1.1  oster  *
     60  1.1  oster  * Revision 1.48  1996/07/27  18:40:01  jimz
     61  1.1  oster  * cleanup sweep
     62  1.1  oster  *
     63  1.1  oster  * Revision 1.47  1996/07/22  19:52:16  jimz
     64  1.1  oster  * switched node params to RF_DagParam_t, a union of
     65  1.1  oster  * a 64-bit int and a void *, for better portability
     66  1.1  oster  * attempted hpux port, but failed partway through for
     67  1.1  oster  * lack of a single C compiler capable of compiling all
     68  1.1  oster  * source files
     69  1.1  oster  *
     70  1.1  oster  * Revision 1.46  1996/07/18  22:57:14  jimz
     71  1.1  oster  * port simulator to AIX
     72  1.1  oster  *
     73  1.1  oster  * Revision 1.45  1996/06/17  03:24:59  jimz
     74  1.1  oster  * include shutdown.h for define of now-macroized ShutdownCreate
     75  1.1  oster  *
     76  1.1  oster  * Revision 1.44  1996/06/10  12:50:57  jimz
     77  1.1  oster  * Add counters to freelists to track number of allocations, frees,
     78  1.1  oster  * grows, max size, etc. Adjust a couple sets of PRIME params based
     79  1.1  oster  * on the results.
     80  1.1  oster  *
     81  1.1  oster  * Revision 1.43  1996/06/10  11:55:47  jimz
     82  1.1  oster  * Straightened out some per-array/not-per-array distinctions, fixed
     83  1.1  oster  * a couple bugs related to confusion. Added shutdown lists. Removed
     84  1.1  oster  * layout shutdown function (now subsumed by shutdown lists).
     85  1.1  oster  *
     86  1.1  oster  * Revision 1.42  1996/06/07  21:33:04  jimz
     87  1.1  oster  * begin using consistent types for sector numbers,
     88  1.1  oster  * stripe numbers, row+col numbers, recon unit numbers
     89  1.1  oster  *
     90  1.1  oster  * Revision 1.41  1996/06/06  17:28:58  jimz
     91  1.1  oster  * make PrintNodeInfoString aware of new mirroring funcs
     92  1.1  oster  *
     93  1.1  oster  * Revision 1.40  1996/06/05  18:06:02  jimz
     94  1.1  oster  * Major code cleanup. The Great Renaming is now done.
     95  1.1  oster  * Better modularity. Better typing. Fixed a bunch of
     96  1.1  oster  * synchronization bugs. Made a lot of global stuff
     97  1.1  oster  * per-desc or per-array. Removed dead code.
     98  1.1  oster  *
     99  1.1  oster  * Revision 1.39  1996/06/03  23:28:26  jimz
    100  1.1  oster  * more bugfixes
    101  1.1  oster  * check in tree to sync for IPDS runs with current bugfixes
    102  1.1  oster  * there still may be a problem with threads in the script test
    103  1.1  oster  * getting I/Os stuck- not trivially reproducible (runs ~50 times
    104  1.1  oster  * in a row without getting stuck)
    105  1.1  oster  *
    106  1.1  oster  * Revision 1.38  1996/06/02  17:31:48  jimz
    107  1.1  oster  * Moved a lot of global stuff into array structure, where it belongs.
    108  1.1  oster  * Fixed up paritylogging, pss modules in this manner. Some general
    109  1.1  oster  * code cleanup. Removed lots of dead code, some dead files.
    110  1.1  oster  *
    111  1.1  oster  * Revision 1.37  1996/05/31  22:26:54  jimz
    112  1.1  oster  * fix a lot of mapping problems, memory allocation problems
    113  1.1  oster  * found some weird lock issues, fixed 'em
    114  1.1  oster  * more code cleanup
    115  1.1  oster  *
    116  1.1  oster  * Revision 1.36  1996/05/30  23:22:16  jimz
    117  1.1  oster  * bugfixes of serialization, timing problems
    118  1.1  oster  * more cleanup
    119  1.1  oster  *
    120  1.1  oster  * Revision 1.35  1996/05/30  11:29:41  jimz
    121  1.1  oster  * Numerous bug fixes. Stripe lock release code disagreed with the taking code
    122  1.1  oster  * about when stripes should be locked (I made it consistent: no parity, no lock)
    123  1.1  oster  * There was a lot of extra serialization of I/Os which I've removed- a lot of
    124  1.1  oster  * it was to calculate values for the cache code, which is no longer with us.
    125  1.1  oster  * More types, function, macro cleanup. Added code to properly quiesce the array
    126  1.1  oster  * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
    127  1.1  oster  * before. Fixed memory allocation, freeing bugs.
    128  1.1  oster  *
    129  1.1  oster  * Revision 1.34  1996/05/27  18:56:37  jimz
    130  1.1  oster  * more code cleanup
    131  1.1  oster  * better typing
    132  1.1  oster  * compiles in all 3 environments
    133  1.1  oster  *
    134  1.1  oster  * Revision 1.33  1996/05/24  22:17:04  jimz
    135  1.1  oster  * continue code + namespace cleanup
    136  1.1  oster  * typed a bunch of flags
    137  1.1  oster  *
    138  1.1  oster  * Revision 1.32  1996/05/24  04:28:55  jimz
    139  1.1  oster  * release cleanup ckpt
    140  1.1  oster  *
    141  1.1  oster  * Revision 1.31  1996/05/23  21:46:35  jimz
    142  1.1  oster  * checkpoint in code cleanup (release prep)
    143  1.1  oster  * lots of types, function names have been fixed
    144  1.1  oster  *
    145  1.1  oster  * Revision 1.30  1996/05/23  00:33:23  jimz
    146  1.1  oster  * code cleanup: move all debug decls to rf_options.c, all extern
    147  1.1  oster  * debug decls to rf_options.h, all debug vars preceded by rf_
    148  1.1  oster  *
    149  1.1  oster  * Revision 1.29  1996/05/18  19:51:34  jimz
    150  1.1  oster  * major code cleanup- fix syntax, make some types consistent,
    151  1.1  oster  * add prototypes, clean out dead code, et cetera
    152  1.1  oster  *
    153  1.1  oster  * Revision 1.28  1996/05/16  23:05:52  jimz
    154  1.1  oster  * changed InitNode() to use dag_ptrs field of node when appropriate
    155  1.1  oster  * (see rf_dag.h or comments within InitNode() for details)
    156  1.1  oster  *
    157  1.1  oster  * Revision 1.27  1996/05/16  15:37:19  jimz
    158  1.1  oster  * convert to RF_FREELIST stuff for dag headers
    159  1.1  oster  *
    160  1.1  oster  * Revision 1.26  1996/05/08  21:01:24  jimz
    161  1.1  oster  * fixed up enum type names that were conflicting with other
    162  1.1  oster  * enums and function names (ie, "panic")
    163  1.1  oster  * future naming trends will be towards RF_ and rf_ for
    164  1.1  oster  * everything raidframe-related
    165  1.1  oster  *
    166  1.1  oster  * Revision 1.25  1996/05/03  19:56:15  wvcii
    167  1.1  oster  * added misc routines from old dag creation files
    168  1.1  oster  *
    169  1.1  oster  * Revision 1.24  1995/12/12  18:10:06  jimz
    170  1.1  oster  * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
    171  1.1  oster  * fix 80-column brain damage in comments
    172  1.1  oster  *
    173  1.1  oster  * Revision 1.23  1995/12/01  15:59:50  root
    174  1.1  oster  * added copyright info
    175  1.1  oster  *
    176  1.1  oster  * Revision 1.22  1995/11/17  15:14:12  wvcii
    177  1.1  oster  * PrintDAG now processes DiskReadMirrorFunc nodes
    178  1.1  oster  *
    179  1.1  oster  * Revision 1.21  1995/11/07  16:22:38  wvcii
    180  1.1  oster  * InitNode and InitNodeFromBuf now initialize commit fields
    181  1.1  oster  * beefed up ValidateDag
    182  1.1  oster  * prettied up PrintDAGList
    183  1.1  oster  *
    184  1.1  oster  */
    185  1.1  oster 
    186  1.1  oster #include "rf_archs.h"
    187  1.1  oster #include "rf_types.h"
    188  1.1  oster #include "rf_threadstuff.h"
    189  1.1  oster #include "rf_raid.h"
    190  1.1  oster #include "rf_dag.h"
    191  1.1  oster #include "rf_dagutils.h"
    192  1.1  oster #include "rf_dagfuncs.h"
    193  1.1  oster #include "rf_general.h"
    194  1.1  oster #include "rf_freelist.h"
    195  1.1  oster #include "rf_map.h"
    196  1.1  oster #include "rf_shutdown.h"
    197  1.1  oster #include "rf_sys.h"
    198  1.1  oster 
    199  1.1  oster #define SNUM_DIFF(_a_,_b_) (((_a_)>(_b_))?((_a_)-(_b_)):((_b_)-(_a_)))
    200  1.1  oster 
    201  1.1  oster RF_RedFuncs_t rf_xorFuncs = {
    202  1.1  oster 	rf_RegularXorFunc, "Reg Xr",
    203  1.1  oster 	rf_SimpleXorFunc, "Simple Xr"};
    204  1.1  oster 
    205  1.1  oster RF_RedFuncs_t rf_xorRecoveryFuncs = {
    206  1.1  oster 	rf_RecoveryXorFunc, "Recovery Xr",
    207  1.1  oster 	rf_RecoveryXorFunc, "Recovery Xr"};
    208  1.1  oster 
    209  1.1  oster static void rf_RecurPrintDAG(RF_DagNode_t *, int, int);
    210  1.1  oster static void rf_PrintDAG(RF_DagHeader_t *);
    211  1.1  oster static int rf_ValidateBranch(RF_DagNode_t *, int *, int *,
    212  1.1  oster 			     RF_DagNode_t **, int );
    213  1.1  oster static void rf_ValidateBranchVisitedBits(RF_DagNode_t *, int, int);
    214  1.1  oster static void rf_ValidateVisitedBits(RF_DagHeader_t *);
    215  1.1  oster 
    216  1.1  oster /******************************************************************************
    217  1.1  oster  *
    218  1.1  oster  * InitNode - initialize a dag node
    219  1.1  oster  *
    220  1.1  oster  * the size of the propList array is always the same as that of the
    221  1.1  oster  * successors array.
    222  1.1  oster  *
    223  1.1  oster  *****************************************************************************/
    224  1.1  oster void rf_InitNode(
    225  1.1  oster   RF_DagNode_t         *node,
    226  1.1  oster   RF_NodeStatus_t       initstatus,
    227  1.1  oster   int                   commit,
    228  1.1  oster   int                 (*doFunc)(RF_DagNode_t *node),
    229  1.1  oster   int                 (*undoFunc)(RF_DagNode_t *node),
    230  1.1  oster   int                 (*wakeFunc)(RF_DagNode_t *node,int status),
    231  1.1  oster   int                   nSucc,
    232  1.1  oster   int                   nAnte,
    233  1.1  oster   int                   nParam,
    234  1.1  oster   int                   nResult,
    235  1.1  oster   RF_DagHeader_t       *hdr,
    236  1.1  oster   char                 *name,
    237  1.1  oster   RF_AllocListElem_t   *alist)
    238  1.1  oster {
    239  1.1  oster   void **ptrs;
    240  1.1  oster   int nptrs;
    241  1.1  oster 
    242  1.1  oster   if (nAnte > RF_MAX_ANTECEDENTS)
    243  1.1  oster     RF_PANIC();
    244  1.1  oster   node->status         = initstatus;
    245  1.1  oster   node->commitNode     = commit;
    246  1.1  oster   node->doFunc         = doFunc;
    247  1.1  oster   node->undoFunc       = undoFunc;
    248  1.1  oster   node->wakeFunc       = wakeFunc;
    249  1.1  oster   node->numParams      = nParam;
    250  1.1  oster   node->numResults     = nResult;
    251  1.1  oster   node->numAntecedents = nAnte;
    252  1.1  oster   node->numAntDone     = 0;
    253  1.1  oster   node->next           = NULL;
    254  1.1  oster   node->numSuccedents  = nSucc;
    255  1.1  oster   node->name           = name;
    256  1.1  oster   node->dagHdr         = hdr;
    257  1.1  oster   node->visited        = 0;
    258  1.1  oster 
    259  1.1  oster   /* allocate all the pointers with one call to malloc */
    260  1.1  oster   nptrs = nSucc+nAnte+nResult+nSucc;
    261  1.1  oster 
    262  1.1  oster   if (nptrs <= RF_DAG_PTRCACHESIZE) {
    263  1.1  oster     /*
    264  1.1  oster      * The dag_ptrs field of the node is basically some scribble
    265  1.1  oster      * space to be used here. We could get rid of it, and always
    266  1.1  oster      * allocate the range of pointers, but that's expensive. So,
    267  1.1  oster      * we pick a "common case" size for the pointer cache. Hopefully,
    268  1.1  oster      * we'll find that:
    269  1.1  oster      * (1) Generally, nptrs doesn't exceed RF_DAG_PTRCACHESIZE by
    270  1.1  oster      *     only a little bit (least efficient case)
    271  1.1  oster      * (2) Generally, ntprs isn't a lot less than RF_DAG_PTRCACHESIZE
    272  1.1  oster      *     (wasted memory)
    273  1.1  oster      */
    274  1.1  oster     ptrs = (void **)node->dag_ptrs;
    275  1.1  oster   }
    276  1.1  oster   else {
    277  1.1  oster     RF_CallocAndAdd(ptrs, nptrs, sizeof(void *), (void **), alist);
    278  1.1  oster   }
    279  1.1  oster   node->succedents  = (nSucc)   ? (RF_DagNode_t         **) ptrs                  : NULL;
    280  1.1  oster   node->antecedents = (nAnte)   ? (RF_DagNode_t         **) (ptrs+nSucc)          : NULL;
    281  1.1  oster   node->results     = (nResult) ? (void            **) (ptrs+nSucc+nAnte)         : NULL;
    282  1.1  oster   node->propList    = (nSucc)   ? (RF_PropHeader_t **) (ptrs+nSucc+nAnte+nResult) : NULL;
    283  1.1  oster 
    284  1.1  oster   if (nParam) {
    285  1.1  oster     if (nParam <= RF_DAG_PARAMCACHESIZE) {
    286  1.1  oster       node->params = (RF_DagParam_t *)node->dag_params;
    287  1.1  oster     }
    288  1.1  oster     else {
    289  1.1  oster       RF_CallocAndAdd(node->params, nParam, sizeof(RF_DagParam_t), (RF_DagParam_t *), alist);
    290  1.1  oster     }
    291  1.1  oster   }
    292  1.1  oster   else {
    293  1.1  oster     node->params = NULL;
    294  1.1  oster   }
    295  1.1  oster }
    296  1.1  oster 
    297  1.1  oster 
    298  1.1  oster 
    299  1.1  oster /******************************************************************************
    300  1.1  oster  *
    301  1.1  oster  * allocation and deallocation routines
    302  1.1  oster  *
    303  1.1  oster  *****************************************************************************/
    304  1.1  oster 
    305  1.1  oster void rf_FreeDAG(dag_h)
    306  1.1  oster   RF_DagHeader_t  *dag_h;
    307  1.1  oster {
    308  1.1  oster   RF_AccessStripeMapHeader_t *asmap, *t_asmap;
    309  1.1  oster   RF_DagHeader_t *nextDag;
    310  1.1  oster   int i;
    311  1.1  oster 
    312  1.1  oster   while (dag_h) {
    313  1.1  oster     nextDag = dag_h->next;
    314  1.1  oster     for (i=0; dag_h->memChunk[i] && i < RF_MAXCHUNKS; i++) {
    315  1.1  oster       /* release mem chunks */
    316  1.1  oster       rf_ReleaseMemChunk(dag_h->memChunk[i]);
    317  1.1  oster       dag_h->memChunk[i] = NULL;
    318  1.1  oster     }
    319  1.1  oster 
    320  1.1  oster     RF_ASSERT(i == dag_h->chunkIndex);
    321  1.1  oster     if (dag_h->xtraChunkCnt > 0) {
    322  1.1  oster       /* free xtraMemChunks */
    323  1.1  oster       for (i=0; dag_h->xtraMemChunk[i] && i < dag_h->xtraChunkIndex; i++) {
    324  1.1  oster         rf_ReleaseMemChunk(dag_h->xtraMemChunk[i]);
    325  1.1  oster         dag_h->xtraMemChunk[i] = NULL;
    326  1.1  oster       }
    327  1.1  oster       RF_ASSERT(i == dag_h->xtraChunkIndex);
    328  1.1  oster       /* free ptrs to xtraMemChunks */
    329  1.1  oster       RF_Free(dag_h->xtraMemChunk, dag_h->xtraChunkCnt * sizeof(RF_ChunkDesc_t *));
    330  1.1  oster     }
    331  1.1  oster     rf_FreeAllocList(dag_h->allocList);
    332  1.1  oster     for (asmap = dag_h->asmList; asmap;) {
    333  1.1  oster       t_asmap = asmap;
    334  1.1  oster       asmap = asmap->next;
    335  1.1  oster       rf_FreeAccessStripeMap(t_asmap);
    336  1.1  oster     }
    337  1.1  oster     rf_FreeDAGHeader(dag_h);
    338  1.1  oster     dag_h = nextDag;
    339  1.1  oster   }
    340  1.1  oster }
    341  1.1  oster 
    342  1.1  oster RF_PropHeader_t *rf_MakePropListEntry(
    343  1.1  oster   RF_DagHeader_t      *dag_h,
    344  1.1  oster   int                  resultNum,
    345  1.1  oster   int                  paramNum,
    346  1.1  oster   RF_PropHeader_t     *next,
    347  1.1  oster   RF_AllocListElem_t  *allocList)
    348  1.1  oster {
    349  1.1  oster   RF_PropHeader_t *p;
    350  1.1  oster 
    351  1.1  oster   RF_CallocAndAdd(p, 1, sizeof(RF_PropHeader_t),
    352  1.1  oster     (RF_PropHeader_t *), allocList);
    353  1.1  oster   p->resultNum = resultNum;
    354  1.1  oster   p->paramNum = paramNum;
    355  1.1  oster   p->next = next;
    356  1.1  oster   return(p);
    357  1.1  oster }
    358  1.1  oster 
    359  1.1  oster static RF_FreeList_t *rf_dagh_freelist;
    360  1.1  oster 
    361  1.1  oster #define RF_MAX_FREE_DAGH 128
    362  1.1  oster #define RF_DAGH_INC       16
    363  1.1  oster #define RF_DAGH_INITIAL   32
    364  1.1  oster 
    365  1.1  oster static void rf_ShutdownDAGs(void *);
    366  1.1  oster static void rf_ShutdownDAGs(ignored)
    367  1.1  oster   void  *ignored;
    368  1.1  oster {
    369  1.1  oster 	RF_FREELIST_DESTROY(rf_dagh_freelist,next,(RF_DagHeader_t *));
    370  1.1  oster }
    371  1.1  oster 
    372  1.1  oster int rf_ConfigureDAGs(listp)
    373  1.1  oster   RF_ShutdownList_t  **listp;
    374  1.1  oster {
    375  1.1  oster 	int rc;
    376  1.1  oster 
    377  1.1  oster 	RF_FREELIST_CREATE(rf_dagh_freelist, RF_MAX_FREE_DAGH,
    378  1.1  oster 		RF_DAGH_INC, sizeof(RF_DagHeader_t));
    379  1.1  oster 	if (rf_dagh_freelist == NULL)
    380  1.1  oster 		return(ENOMEM);
    381  1.1  oster 	rc = rf_ShutdownCreate(listp, rf_ShutdownDAGs, NULL);
    382  1.1  oster 	if (rc) {
    383  1.1  oster 		RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
    384  1.1  oster 			__FILE__, __LINE__, rc);
    385  1.1  oster 		rf_ShutdownDAGs(NULL);
    386  1.1  oster 		return(rc);
    387  1.1  oster 	}
    388  1.1  oster 	RF_FREELIST_PRIME(rf_dagh_freelist, RF_DAGH_INITIAL,next,
    389  1.1  oster 		(RF_DagHeader_t *));
    390  1.1  oster 	return(0);
    391  1.1  oster }
    392  1.1  oster 
    393  1.1  oster RF_DagHeader_t *rf_AllocDAGHeader()
    394  1.1  oster {
    395  1.1  oster 	RF_DagHeader_t *dh;
    396  1.1  oster 
    397  1.1  oster 	RF_FREELIST_GET(rf_dagh_freelist,dh,next,(RF_DagHeader_t *));
    398  1.1  oster 	if (dh) {
    399  1.1  oster 		bzero((char *)dh, sizeof(RF_DagHeader_t));
    400  1.1  oster 	}
    401  1.1  oster 	return(dh);
    402  1.1  oster }
    403  1.1  oster 
    404  1.1  oster void rf_FreeDAGHeader(RF_DagHeader_t *dh)
    405  1.1  oster {
    406  1.1  oster 	RF_FREELIST_FREE(rf_dagh_freelist,dh,next);
    407  1.1  oster }
    408  1.1  oster 
    409  1.1  oster /* allocates a buffer big enough to hold the data described by pda */
    410  1.1  oster void *rf_AllocBuffer(
    411  1.1  oster   RF_Raid_t           *raidPtr,
    412  1.1  oster   RF_DagHeader_t      *dag_h,
    413  1.1  oster   RF_PhysDiskAddr_t   *pda,
    414  1.1  oster   RF_AllocListElem_t  *allocList)
    415  1.1  oster {
    416  1.1  oster   char *p;
    417  1.1  oster 
    418  1.1  oster   RF_MallocAndAdd(p, pda->numSector << raidPtr->logBytesPerSector,
    419  1.1  oster     (char *), allocList);
    420  1.1  oster   return((void *)p);
    421  1.1  oster }
    422  1.1  oster 
    423  1.1  oster /******************************************************************************
    424  1.1  oster  *
    425  1.1  oster  * debug routines
    426  1.1  oster  *
    427  1.1  oster  *****************************************************************************/
    428  1.1  oster 
    429  1.1  oster char *rf_NodeStatusString(RF_DagNode_t *node)
    430  1.1  oster {
    431  1.1  oster   switch (node->status) {
    432  1.1  oster     case rf_wait: return("wait");
    433  1.1  oster     case rf_fired: return("fired");
    434  1.1  oster     case rf_good: return("good");
    435  1.1  oster     case rf_bad: return("bad");
    436  1.1  oster     default: return("?");
    437  1.1  oster   }
    438  1.1  oster }
    439  1.1  oster 
    440  1.1  oster void rf_PrintNodeInfoString(RF_DagNode_t *node)
    441  1.1  oster {
    442  1.1  oster   RF_PhysDiskAddr_t *pda;
    443  1.1  oster   int (*df)(RF_DagNode_t *) = node->doFunc;
    444  1.1  oster   int i, lk, unlk;
    445  1.1  oster   void *bufPtr;
    446  1.1  oster 
    447  1.1  oster   if ((df==rf_DiskReadFunc) || (df==rf_DiskWriteFunc)
    448  1.1  oster       || (df==rf_DiskReadMirrorIdleFunc)
    449  1.1  oster       || (df == rf_DiskReadMirrorPartitionFunc))
    450  1.1  oster   {
    451  1.1  oster     pda = (RF_PhysDiskAddr_t *)node->params[0].p;
    452  1.1  oster     bufPtr = (void *)node->params[1].p;
    453  1.1  oster     lk = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
    454  1.1  oster     unlk = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
    455  1.1  oster     RF_ASSERT( !(lk && unlk) );
    456  1.1  oster     printf("r %d c %d offs %ld nsect %d buf 0x%lx %s\n", pda->row, pda->col,
    457  1.1  oster        (long)pda->startSector, (int) pda->numSector, (long)bufPtr,
    458  1.1  oster        (lk) ? "LOCK" : ((unlk) ? "UNLK" : " "));
    459  1.1  oster     return;
    460  1.1  oster   }
    461  1.1  oster 
    462  1.1  oster   if (df == rf_DiskUnlockFunc) {
    463  1.1  oster     pda = (RF_PhysDiskAddr_t *)node->params[0].p;
    464  1.1  oster     lk = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
    465  1.1  oster     unlk = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
    466  1.1  oster     RF_ASSERT( !(lk && unlk) );
    467  1.1  oster     printf("r %d c %d %s\n", pda->row, pda->col,
    468  1.1  oster       (lk) ? "LOCK" : ((unlk) ? "UNLK" : "nop"));
    469  1.1  oster     return;
    470  1.1  oster   }
    471  1.1  oster 
    472  1.1  oster   if ((df==rf_SimpleXorFunc) || (df==rf_RegularXorFunc)
    473  1.1  oster       || (df==rf_RecoveryXorFunc))
    474  1.1  oster   {
    475  1.1  oster     printf("result buf 0x%lx\n",(long) node->results[0]);
    476  1.1  oster     for (i=0; i<node->numParams-1; i+=2) {
    477  1.1  oster       pda = (RF_PhysDiskAddr_t *)node->params[i].p;
    478  1.1  oster       bufPtr = (RF_PhysDiskAddr_t *)node->params[i+1].p;
    479  1.1  oster       printf("    buf 0x%lx r%d c%d offs %ld nsect %d\n",
    480  1.1  oster 	     (long)bufPtr, pda->row, pda->col,
    481  1.1  oster 	     (long)pda->startSector, (int)pda->numSector);
    482  1.1  oster     }
    483  1.1  oster     return;
    484  1.1  oster   }
    485  1.1  oster 
    486  1.1  oster #if RF_INCLUDE_PARITYLOGGING > 0
    487  1.1  oster   if (df==rf_ParityLogOverwriteFunc || df==rf_ParityLogUpdateFunc) {
    488  1.1  oster     for (i=0; i<node->numParams-1; i+=2) {
    489  1.1  oster       pda = (RF_PhysDiskAddr_t *)node->params[i].p;
    490  1.1  oster       bufPtr = (RF_PhysDiskAddr_t *)node->params[i+1].p;
    491  1.1  oster       printf(" r%d c%d offs %ld nsect %d buf 0x%lx\n",
    492  1.1  oster 	     pda->row, pda->col, (long) pda->startSector,
    493  1.1  oster 	     (int) pda->numSector, (long) bufPtr);
    494  1.1  oster     }
    495  1.1  oster     return;
    496  1.1  oster   }
    497  1.1  oster #endif /* RF_INCLUDE_PARITYLOGGING > 0 */
    498  1.1  oster 
    499  1.1  oster   if ((df==rf_TerminateFunc) || (df==rf_NullNodeFunc)) {
    500  1.1  oster     printf("\n");
    501  1.1  oster     return;
    502  1.1  oster   }
    503  1.1  oster 
    504  1.1  oster   printf("?\n");
    505  1.1  oster }
    506  1.1  oster 
    507  1.1  oster static void rf_RecurPrintDAG(node, depth, unvisited)
    508  1.1  oster   RF_DagNode_t  *node;
    509  1.1  oster   int            depth;
    510  1.1  oster   int            unvisited;
    511  1.1  oster {
    512  1.1  oster   char *anttype;
    513  1.1  oster   int i;
    514  1.1  oster 
    515  1.1  oster   node->visited = (unvisited) ? 0 : 1;
    516  1.1  oster   printf("(%d) %d C%d %s: %s,s%d %d/%d,a%d/%d,p%d,r%d S{", depth,
    517  1.1  oster     node->nodeNum, node->commitNode, node->name, rf_NodeStatusString(node),
    518  1.1  oster     node->numSuccedents, node->numSuccFired, node->numSuccDone,
    519  1.1  oster 	node->numAntecedents, node->numAntDone, node->numParams,node->numResults);
    520  1.1  oster   for (i=0; i<node->numSuccedents; i++) {
    521  1.1  oster     printf("%d%s", node->succedents[i]->nodeNum,
    522  1.1  oster       ((i==node->numSuccedents-1) ? "\0" : " "));
    523  1.1  oster   }
    524  1.1  oster   printf("} A{");
    525  1.1  oster   for (i=0; i<node->numAntecedents; i++) {
    526  1.1  oster     switch (node->antType[i]) {
    527  1.1  oster     case rf_trueData :
    528  1.1  oster       anttype = "T";
    529  1.1  oster       break;
    530  1.1  oster     case rf_antiData :
    531  1.1  oster       anttype = "A";
    532  1.1  oster       break;
    533  1.1  oster     case rf_outputData :
    534  1.1  oster       anttype = "O";
    535  1.1  oster       break;
    536  1.1  oster     case rf_control :
    537  1.1  oster       anttype = "C";
    538  1.1  oster       break;
    539  1.1  oster     default :
    540  1.1  oster       anttype = "?";
    541  1.1  oster       break;
    542  1.1  oster     }
    543  1.1  oster     printf("%d(%s)%s", node->antecedents[i]->nodeNum, anttype, (i==node->numAntecedents-1) ? "\0" : " ");
    544  1.1  oster   }
    545  1.1  oster   printf("}; ");
    546  1.1  oster   rf_PrintNodeInfoString(node);
    547  1.1  oster   for (i=0; i<node->numSuccedents; i++) {
    548  1.1  oster     if (node->succedents[i]->visited == unvisited)
    549  1.1  oster       rf_RecurPrintDAG(node->succedents[i], depth+1, unvisited);
    550  1.1  oster   }
    551  1.1  oster }
    552  1.1  oster 
    553  1.1  oster static void rf_PrintDAG(dag_h)
    554  1.1  oster   RF_DagHeader_t  *dag_h;
    555  1.1  oster {
    556  1.1  oster   int unvisited, i;
    557  1.1  oster   char *status;
    558  1.1  oster 
    559  1.1  oster   /* set dag status */
    560  1.1  oster   switch (dag_h->status) {
    561  1.1  oster   case rf_enable :
    562  1.1  oster     status = "enable";
    563  1.1  oster     break;
    564  1.1  oster   case rf_rollForward :
    565  1.1  oster     status = "rollForward";
    566  1.1  oster     break;
    567  1.1  oster   case rf_rollBackward :
    568  1.1  oster     status = "rollBackward";
    569  1.1  oster     break;
    570  1.1  oster   default :
    571  1.1  oster     status = "illegal!";
    572  1.1  oster     break;
    573  1.1  oster   }
    574  1.1  oster   /* find out if visited bits are currently set or clear */
    575  1.1  oster   unvisited = dag_h->succedents[0]->visited;
    576  1.1  oster 
    577  1.1  oster   printf("DAG type:  %s\n", dag_h->creator);
    578  1.1  oster   printf("format is (depth) num commit type: status,nSucc nSuccFired/nSuccDone,nAnte/nAnteDone,nParam,nResult S{x} A{x(type)};  info\n");
    579  1.1  oster   printf("(0) %d Hdr: %s, s%d, (commit %d/%d) S{", dag_h->nodeNum,
    580  1.1  oster     status, dag_h->numSuccedents, dag_h->numCommitNodes, dag_h->numCommits);
    581  1.1  oster   for (i=0; i<dag_h->numSuccedents; i++) {
    582  1.1  oster     printf("%d%s", dag_h->succedents[i]->nodeNum,
    583  1.1  oster       ((i==dag_h->numSuccedents-1) ? "\0" : " "));
    584  1.1  oster   }
    585  1.1  oster   printf("};\n");
    586  1.1  oster   for (i=0; i<dag_h->numSuccedents; i++) {
    587  1.1  oster     if (dag_h->succedents[i]->visited == unvisited)
    588  1.1  oster       rf_RecurPrintDAG(dag_h->succedents[i], 1, unvisited);
    589  1.1  oster   }
    590  1.1  oster }
    591  1.1  oster 
    592  1.1  oster /* assigns node numbers */
    593  1.1  oster int rf_AssignNodeNums(RF_DagHeader_t *dag_h)
    594  1.1  oster {
    595  1.1  oster   int unvisited, i, nnum;
    596  1.1  oster   RF_DagNode_t *node;
    597  1.1  oster 
    598  1.1  oster   nnum = 0;
    599  1.1  oster   unvisited = dag_h->succedents[0]->visited;
    600  1.1  oster 
    601  1.1  oster   dag_h->nodeNum = nnum++;
    602  1.1  oster   for (i=0; i<dag_h->numSuccedents; i++) {
    603  1.1  oster     node = dag_h->succedents[i];
    604  1.1  oster     if (node->visited == unvisited) {
    605  1.1  oster       nnum = rf_RecurAssignNodeNums(dag_h->succedents[i], nnum, unvisited);
    606  1.1  oster     }
    607  1.1  oster   }
    608  1.1  oster   return(nnum);
    609  1.1  oster }
    610  1.1  oster 
    611  1.1  oster int rf_RecurAssignNodeNums(node, num, unvisited)
    612  1.1  oster   RF_DagNode_t  *node;
    613  1.1  oster   int            num;
    614  1.1  oster   int            unvisited;
    615  1.1  oster {
    616  1.1  oster   int i;
    617  1.1  oster 
    618  1.1  oster   node->visited = (unvisited) ? 0 : 1;
    619  1.1  oster 
    620  1.1  oster   node->nodeNum = num++;
    621  1.1  oster   for (i=0; i<node->numSuccedents; i++) {
    622  1.1  oster     if (node->succedents[i]->visited == unvisited) {
    623  1.1  oster       num = rf_RecurAssignNodeNums(node->succedents[i], num, unvisited);
    624  1.1  oster     }
    625  1.1  oster   }
    626  1.1  oster   return(num);
    627  1.1  oster }
    628  1.1  oster 
    629  1.1  oster /* set the header pointers in each node to "newptr" */
    630  1.1  oster void rf_ResetDAGHeaderPointers(dag_h, newptr)
    631  1.1  oster   RF_DagHeader_t  *dag_h;
    632  1.1  oster   RF_DagHeader_t  *newptr;
    633  1.1  oster {
    634  1.1  oster   int i;
    635  1.1  oster   for (i=0; i<dag_h->numSuccedents; i++)
    636  1.1  oster     if (dag_h->succedents[i]->dagHdr != newptr)
    637  1.1  oster       rf_RecurResetDAGHeaderPointers(dag_h->succedents[i], newptr);
    638  1.1  oster }
    639  1.1  oster 
    640  1.1  oster void rf_RecurResetDAGHeaderPointers(node, newptr)
    641  1.1  oster   RF_DagNode_t    *node;
    642  1.1  oster   RF_DagHeader_t  *newptr;
    643  1.1  oster {
    644  1.1  oster   int i;
    645  1.1  oster   node->dagHdr = newptr;
    646  1.1  oster   for (i=0; i<node->numSuccedents; i++)
    647  1.1  oster     if (node->succedents[i]->dagHdr != newptr)
    648  1.1  oster       rf_RecurResetDAGHeaderPointers(node->succedents[i], newptr);
    649  1.1  oster }
    650  1.1  oster 
    651  1.1  oster 
    652  1.1  oster void rf_PrintDAGList(RF_DagHeader_t *dag_h)
    653  1.1  oster {
    654  1.1  oster   int i=0;
    655  1.1  oster 
    656  1.1  oster   for (; dag_h; dag_h=dag_h->next) {
    657  1.1  oster     rf_AssignNodeNums(dag_h);
    658  1.1  oster     printf("\n\nDAG %d IN LIST:\n",i++);
    659  1.1  oster     rf_PrintDAG(dag_h);
    660  1.1  oster   }
    661  1.1  oster }
    662  1.1  oster 
    663  1.1  oster static int rf_ValidateBranch(node, scount, acount, nodes, unvisited)
    664  1.1  oster   RF_DagNode_t   *node;
    665  1.1  oster   int            *scount;
    666  1.1  oster   int            *acount;
    667  1.1  oster   RF_DagNode_t  **nodes;
    668  1.1  oster   int             unvisited;
    669  1.1  oster {
    670  1.1  oster   int i, retcode = 0;
    671  1.1  oster 
    672  1.1  oster   /* construct an array of node pointers indexed by node num */
    673  1.1  oster   node->visited = (unvisited) ? 0 : 1;
    674  1.1  oster   nodes[ node->nodeNum ] = node;
    675  1.1  oster 
    676  1.1  oster   if (node->next != NULL) {
    677  1.1  oster     printf("INVALID DAG: next pointer in node is not NULL\n");
    678  1.1  oster     retcode = 1;
    679  1.1  oster   }
    680  1.1  oster   if (node->status != rf_wait) {
    681  1.1  oster     printf("INVALID DAG: Node status is not wait\n");
    682  1.1  oster     retcode = 1;
    683  1.1  oster   }
    684  1.1  oster   if (node->numAntDone != 0) {
    685  1.1  oster     printf("INVALID DAG: numAntDone is not zero\n");
    686  1.1  oster     retcode = 1;
    687  1.1  oster   }
    688  1.1  oster   if (node->doFunc == rf_TerminateFunc) {
    689  1.1  oster     if (node->numSuccedents != 0) {
    690  1.1  oster       printf("INVALID DAG: Terminator node has succedents\n");
    691  1.1  oster       retcode = 1;
    692  1.1  oster     }
    693  1.1  oster   } else {
    694  1.1  oster     if (node->numSuccedents == 0) {
    695  1.1  oster       printf("INVALID DAG: Non-terminator node has no succedents\n");
    696  1.1  oster       retcode = 1;
    697  1.1  oster     }
    698  1.1  oster   }
    699  1.1  oster   for (i=0; i<node->numSuccedents; i++)  {
    700  1.1  oster     if (!node->succedents[i]) {
    701  1.1  oster       printf("INVALID DAG: succedent %d of node %s is NULL\n",i,node->name);
    702  1.1  oster       retcode = 1;
    703  1.1  oster     }
    704  1.1  oster     scount[ node->succedents[i]->nodeNum ]++;
    705  1.1  oster   }
    706  1.1  oster   for (i=0; i<node->numAntecedents; i++) {
    707  1.1  oster     if (!node->antecedents[i]) {
    708  1.1  oster       printf("INVALID DAG: antecedent %d of node %s is NULL\n",i,node->name);
    709  1.1  oster       retcode = 1;
    710  1.1  oster     }
    711  1.1  oster     acount[ node->antecedents[i]->nodeNum ]++;
    712  1.1  oster   }
    713  1.1  oster   for (i=0; i<node->numSuccedents; i++) {
    714  1.1  oster     if (node->succedents[i]->visited == unvisited) {
    715  1.1  oster       if (rf_ValidateBranch(node->succedents[i], scount,
    716  1.1  oster           acount, nodes, unvisited))
    717  1.1  oster       {
    718  1.1  oster         retcode = 1;
    719  1.1  oster       }
    720  1.1  oster     }
    721  1.1  oster   }
    722  1.1  oster   return(retcode);
    723  1.1  oster }
    724  1.1  oster 
    725  1.1  oster static void rf_ValidateBranchVisitedBits(node, unvisited, rl)
    726  1.1  oster   RF_DagNode_t  *node;
    727  1.1  oster   int            unvisited;
    728  1.1  oster   int            rl;
    729  1.1  oster {
    730  1.1  oster   int i;
    731  1.1  oster 
    732  1.1  oster   RF_ASSERT(node->visited == unvisited);
    733  1.1  oster   for (i=0; i<node->numSuccedents; i++) {
    734  1.1  oster     if (node->succedents[i] == NULL) {
    735  1.1  oster       printf("node=%lx node->succedents[%d] is NULL\n", (long)node, i);
    736  1.1  oster       RF_ASSERT(0);
    737  1.1  oster     }
    738  1.1  oster     rf_ValidateBranchVisitedBits(node->succedents[i],unvisited, rl+1);
    739  1.1  oster   }
    740  1.1  oster }
    741  1.1  oster 
    742  1.1  oster /* NOTE:  never call this on a big dag, because it is exponential
    743  1.1  oster  * in execution time
    744  1.1  oster  */
    745  1.1  oster static void rf_ValidateVisitedBits(dag)
    746  1.1  oster   RF_DagHeader_t  *dag;
    747  1.1  oster {
    748  1.1  oster   int i, unvisited;
    749  1.1  oster 
    750  1.1  oster   unvisited = dag->succedents[0]->visited;
    751  1.1  oster 
    752  1.1  oster   for (i=0; i<dag->numSuccedents; i++) {
    753  1.1  oster     if (dag->succedents[i] == NULL) {
    754  1.1  oster       printf("dag=%lx dag->succedents[%d] is NULL\n", (long) dag, i);
    755  1.1  oster       RF_ASSERT(0);
    756  1.1  oster     }
    757  1.1  oster     rf_ValidateBranchVisitedBits(dag->succedents[i],unvisited,0);
    758  1.1  oster   }
    759  1.1  oster }
    760  1.1  oster 
    761  1.1  oster /* validate a DAG.  _at entry_ verify that:
    762  1.1  oster  *   -- numNodesCompleted is zero
    763  1.1  oster  *   -- node queue is null
    764  1.1  oster  *   -- dag status is rf_enable
    765  1.1  oster  *   -- next pointer is null on every node
    766  1.1  oster  *   -- all nodes have status wait
    767  1.1  oster  *   -- numAntDone is zero in all nodes
    768  1.1  oster  *   -- terminator node has zero successors
    769  1.1  oster  *   -- no other node besides terminator has zero successors
    770  1.1  oster  *   -- no successor or antecedent pointer in a node is NULL
    771  1.1  oster  *   -- number of times that each node appears as a successor of another node
    772  1.1  oster  *      is equal to the antecedent count on that node
    773  1.1  oster  *   -- number of times that each node appears as an antecedent of another node
    774  1.1  oster  *      is equal to the succedent count on that node
    775  1.1  oster  *   -- what else?
    776  1.1  oster  */
    777  1.1  oster int rf_ValidateDAG(dag_h)
    778  1.1  oster   RF_DagHeader_t  *dag_h;
    779  1.1  oster {
    780  1.1  oster   int i, nodecount;
    781  1.1  oster   int *scount, *acount;        /* per-node successor and antecedent counts */
    782  1.1  oster   RF_DagNode_t **nodes;        /* array of ptrs to nodes in dag */
    783  1.1  oster   int retcode = 0;
    784  1.1  oster   int unvisited;
    785  1.1  oster   int commitNodeCount = 0;
    786  1.1  oster 
    787  1.1  oster   if (rf_validateVisitedDebug)
    788  1.1  oster     rf_ValidateVisitedBits(dag_h);
    789  1.1  oster 
    790  1.1  oster   if (dag_h->numNodesCompleted != 0) {
    791  1.1  oster     printf("INVALID DAG: num nodes completed is %d, should be 0\n",dag_h->numNodesCompleted);
    792  1.1  oster     retcode = 1; goto validate_dag_bad;
    793  1.1  oster   }
    794  1.1  oster   if (dag_h->status != rf_enable) {
    795  1.1  oster     printf("INVALID DAG: not enabled\n");
    796  1.1  oster     retcode = 1; goto validate_dag_bad;
    797  1.1  oster   }
    798  1.1  oster   if (dag_h->numCommits != 0) {
    799  1.1  oster     printf("INVALID DAG: numCommits != 0 (%d)\n",dag_h->numCommits);
    800  1.1  oster     retcode = 1; goto validate_dag_bad;
    801  1.1  oster   }
    802  1.1  oster   if (dag_h->numSuccedents != 1) {
    803  1.1  oster     /* currently, all dags must have only one succedent */
    804  1.1  oster     printf("INVALID DAG: numSuccedents !1 (%d)\n",dag_h->numSuccedents);
    805  1.1  oster     retcode = 1; goto validate_dag_bad;
    806  1.1  oster   }
    807  1.1  oster   nodecount = rf_AssignNodeNums(dag_h);
    808  1.1  oster 
    809  1.1  oster   unvisited = dag_h->succedents[0]->visited;
    810  1.1  oster 
    811  1.1  oster   RF_Calloc(scount, nodecount, sizeof(int), (int *));
    812  1.1  oster   RF_Calloc(acount, nodecount, sizeof(int), (int *));
    813  1.1  oster   RF_Calloc(nodes,  nodecount, sizeof(RF_DagNode_t *), (RF_DagNode_t **));
    814  1.1  oster   for (i=0; i<dag_h->numSuccedents; i++) {
    815  1.1  oster     if ((dag_h->succedents[i]->visited == unvisited)
    816  1.1  oster         && rf_ValidateBranch(dag_h->succedents[i], scount,
    817  1.1  oster         acount, nodes, unvisited))
    818  1.1  oster     {
    819  1.1  oster       retcode = 1;
    820  1.1  oster     }
    821  1.1  oster   }
    822  1.1  oster   /* start at 1 to skip the header node */
    823  1.1  oster   for (i=1; i<nodecount; i++) {
    824  1.1  oster     if ( nodes[i]->commitNode )
    825  1.1  oster       commitNodeCount++;
    826  1.1  oster     if ( nodes[i]->doFunc == NULL ) {
    827  1.1  oster       printf("INVALID DAG: node %s has an undefined doFunc\n", nodes[i]->name);
    828  1.1  oster       retcode = 1;
    829  1.1  oster       goto validate_dag_out;
    830  1.1  oster     }
    831  1.1  oster     if ( nodes[i]->undoFunc == NULL ) {
    832  1.1  oster       printf("INVALID DAG: node %s has an undefined doFunc\n", nodes[i]->name);
    833  1.1  oster       retcode = 1;
    834  1.1  oster       goto validate_dag_out;
    835  1.1  oster     }
    836  1.1  oster     if ( nodes[i]->numAntecedents != scount[ nodes[i]->nodeNum ] ) {
    837  1.1  oster       printf("INVALID DAG: node %s has %d antecedents but appears as a succedent %d times\n",
    838  1.1  oster 	     nodes[i]->name, nodes[i]->numAntecedents, scount[nodes[i]->nodeNum]);
    839  1.1  oster       retcode = 1;
    840  1.1  oster       goto validate_dag_out;
    841  1.1  oster     }
    842  1.1  oster     if ( nodes[i]->numSuccedents != acount[ nodes[i]->nodeNum ] ) {
    843  1.1  oster       printf("INVALID DAG: node %s has %d succedents but appears as an antecedent %d times\n",
    844  1.1  oster 	     nodes[i]->name, nodes[i]->numSuccedents, acount[nodes[i]->nodeNum]);
    845  1.1  oster       retcode = 1;
    846  1.1  oster       goto validate_dag_out;
    847  1.1  oster     }
    848  1.1  oster   }
    849  1.1  oster 
    850  1.1  oster   if ( dag_h->numCommitNodes != commitNodeCount ) {
    851  1.1  oster     printf("INVALID DAG: incorrect commit node count.  hdr->numCommitNodes (%d) found (%d) commit nodes in graph\n",
    852  1.1  oster       dag_h->numCommitNodes, commitNodeCount);
    853  1.1  oster     retcode = 1;
    854  1.1  oster     goto validate_dag_out;
    855  1.1  oster   }
    856  1.1  oster 
    857  1.1  oster validate_dag_out:
    858  1.1  oster   RF_Free(scount, nodecount*sizeof(int));
    859  1.1  oster   RF_Free(acount, nodecount*sizeof(int));
    860  1.1  oster   RF_Free(nodes,  nodecount*sizeof(RF_DagNode_t *));
    861  1.1  oster   if (retcode)
    862  1.1  oster     rf_PrintDAGList(dag_h);
    863  1.1  oster 
    864  1.1  oster   if (rf_validateVisitedDebug)
    865  1.1  oster     rf_ValidateVisitedBits(dag_h);
    866  1.1  oster 
    867  1.1  oster   return(retcode);
    868  1.1  oster 
    869  1.1  oster validate_dag_bad:
    870  1.1  oster   rf_PrintDAGList(dag_h);
    871  1.1  oster   return(retcode);
    872  1.1  oster }
    873  1.1  oster 
    874  1.1  oster 
    875  1.1  oster /******************************************************************************
    876  1.1  oster  *
    877  1.1  oster  * misc construction routines
    878  1.1  oster  *
    879  1.1  oster  *****************************************************************************/
    880  1.1  oster 
    881  1.1  oster void rf_redirect_asm(
    882  1.1  oster   RF_Raid_t             *raidPtr,
    883  1.1  oster   RF_AccessStripeMap_t  *asmap)
    884  1.1  oster {
    885  1.1  oster   int ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) ? 1 : 0;
    886  1.1  oster   int row = asmap->physInfo->row;
    887  1.1  oster   int fcol = raidPtr->reconControl[row]->fcol;
    888  1.1  oster   int srow = raidPtr->reconControl[row]->spareRow;
    889  1.1  oster   int scol = raidPtr->reconControl[row]->spareCol;
    890  1.1  oster   RF_PhysDiskAddr_t *pda;
    891  1.1  oster 
    892  1.1  oster   RF_ASSERT( raidPtr->status[row] == rf_rs_reconstructing );
    893  1.1  oster   for (pda = asmap->physInfo; pda; pda=pda->next) {
    894  1.1  oster     if (pda->col == fcol) {
    895  1.1  oster       if (rf_dagDebug) {
    896  1.1  oster         if (!rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap,
    897  1.1  oster           pda->startSector))
    898  1.1  oster         {
    899  1.1  oster           RF_PANIC();
    900  1.1  oster         }
    901  1.1  oster       }
    902  1.1  oster       /*printf("Remapped data for large write\n");*/
    903  1.1  oster       if (ds) {
    904  1.1  oster         raidPtr->Layout.map->MapSector(raidPtr, pda->raidAddress,
    905  1.1  oster           &pda->row, &pda->col, &pda->startSector, RF_REMAP);
    906  1.1  oster       }
    907  1.1  oster       else {
    908  1.1  oster         pda->row = srow; pda->col = scol;
    909  1.1  oster       }
    910  1.1  oster     }
    911  1.1  oster   }
    912  1.1  oster   for (pda = asmap->parityInfo; pda; pda=pda->next) {
    913  1.1  oster     if (pda->col == fcol) {
    914  1.1  oster       if (rf_dagDebug) {
    915  1.1  oster         if (!rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap, pda->startSector)) {
    916  1.1  oster           RF_PANIC();
    917  1.1  oster         }
    918  1.1  oster       }
    919  1.1  oster     }
    920  1.1  oster     if (ds) {
    921  1.1  oster       (raidPtr->Layout.map->MapParity)(raidPtr, pda->raidAddress, &pda->row, &pda->col, &pda->startSector, RF_REMAP);
    922  1.1  oster     }
    923  1.1  oster     else {
    924  1.1  oster       pda->row = srow; pda->col = scol;
    925  1.1  oster     }
    926  1.1  oster   }
    927  1.1  oster }
    928  1.1  oster 
    929  1.1  oster 
    930  1.1  oster /* this routine allocates read buffers and generates stripe maps for the
    931  1.1  oster  * regions of the array from the start of the stripe to the start of the
    932  1.1  oster  * access, and from the end of the access to the end of the stripe.  It also
    933  1.1  oster  * computes and returns the number of DAG nodes needed to read all this data.
    934  1.1  oster  * Note that this routine does the wrong thing if the access is fully
    935  1.1  oster  * contained within one stripe unit, so we RF_ASSERT against this case at the
    936  1.1  oster  * start.
    937  1.1  oster  */
    938  1.1  oster void rf_MapUnaccessedPortionOfStripe(
    939  1.1  oster   RF_Raid_t                    *raidPtr,
    940  1.1  oster   RF_RaidLayout_t              *layoutPtr, /* in: layout information */
    941  1.1  oster   RF_AccessStripeMap_t         *asmap,     /* in: access stripe map */
    942  1.1  oster   RF_DagHeader_t               *dag_h,     /* in: header of the dag to create */
    943  1.1  oster   RF_AccessStripeMapHeader_t  **new_asm_h, /* in: ptr to array of 2 headers, to be filled in */
    944  1.1  oster   int                          *nRodNodes, /* out: num nodes to be generated to read unaccessed data */
    945  1.1  oster   char                        **sosBuffer, /* out: pointers to newly allocated buffer */
    946  1.1  oster   char                        **eosBuffer,
    947  1.1  oster   RF_AllocListElem_t           *allocList)
    948  1.1  oster {
    949  1.1  oster   RF_RaidAddr_t sosRaidAddress, eosRaidAddress;
    950  1.1  oster   RF_SectorNum_t sosNumSector, eosNumSector;
    951  1.1  oster 
    952  1.1  oster   RF_ASSERT( asmap->numStripeUnitsAccessed > (layoutPtr->numDataCol/2) );
    953  1.1  oster   /* generate an access map for the region of the array from start of stripe
    954  1.1  oster    * to start of access */
    955  1.1  oster   new_asm_h[0] = new_asm_h[1] = NULL; *nRodNodes = 0;
    956  1.1  oster   if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->raidAddress)) {
    957  1.1  oster     sosRaidAddress = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
    958  1.1  oster     sosNumSector   = asmap->raidAddress - sosRaidAddress;
    959  1.1  oster     RF_MallocAndAdd(*sosBuffer, rf_RaidAddressToByte(raidPtr, sosNumSector), (char *), allocList);
    960  1.1  oster     new_asm_h[0]   = rf_MapAccess(raidPtr, sosRaidAddress, sosNumSector, *sosBuffer, RF_DONT_REMAP);
    961  1.1  oster     new_asm_h[0]->next = dag_h->asmList;
    962  1.1  oster     dag_h->asmList = new_asm_h[0];
    963  1.1  oster     *nRodNodes    += new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
    964  1.1  oster 
    965  1.1  oster     RF_ASSERT(new_asm_h[0]->stripeMap->next == NULL);
    966  1.1  oster     /* we're totally within one stripe here */
    967  1.1  oster     if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE)
    968  1.1  oster       rf_redirect_asm(raidPtr, new_asm_h[0]->stripeMap);
    969  1.1  oster   }
    970  1.1  oster   /* generate an access map for the region of the array from end of access
    971  1.1  oster    * to end of stripe */
    972  1.1  oster   if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->endRaidAddress)) {
    973  1.1  oster     eosRaidAddress = asmap->endRaidAddress;
    974  1.1  oster     eosNumSector   = rf_RaidAddressOfNextStripeBoundary(layoutPtr, eosRaidAddress) - eosRaidAddress;
    975  1.1  oster     RF_MallocAndAdd(*eosBuffer, rf_RaidAddressToByte(raidPtr, eosNumSector), (char *), allocList);
    976  1.1  oster     new_asm_h[1]   = rf_MapAccess(raidPtr, eosRaidAddress, eosNumSector, *eosBuffer, RF_DONT_REMAP);
    977  1.1  oster     new_asm_h[1]->next = dag_h->asmList;
    978  1.1  oster     dag_h->asmList = new_asm_h[1];
    979  1.1  oster     *nRodNodes    += new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
    980  1.1  oster 
    981  1.1  oster     RF_ASSERT(new_asm_h[1]->stripeMap->next == NULL);
    982  1.1  oster     /* we're totally within one stripe here */
    983  1.1  oster     if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE)
    984  1.1  oster       rf_redirect_asm(raidPtr, new_asm_h[1]->stripeMap);
    985  1.1  oster   }
    986  1.1  oster }
    987  1.1  oster 
    988  1.1  oster 
    989  1.1  oster 
    990  1.1  oster /* returns non-zero if the indicated ranges of stripe unit offsets overlap */
    991  1.1  oster int rf_PDAOverlap(
    992  1.1  oster   RF_RaidLayout_t    *layoutPtr,
    993  1.1  oster   RF_PhysDiskAddr_t  *src,
    994  1.1  oster   RF_PhysDiskAddr_t  *dest)
    995  1.1  oster {
    996  1.1  oster   RF_SectorNum_t soffs = rf_StripeUnitOffset(layoutPtr, src->startSector);
    997  1.1  oster   RF_SectorNum_t doffs = rf_StripeUnitOffset(layoutPtr, dest->startSector);
    998  1.1  oster   /* use -1 to be sure we stay within SU */
    999  1.1  oster   RF_SectorNum_t send = rf_StripeUnitOffset(layoutPtr, src->startSector + src->numSector-1);
   1000  1.1  oster   RF_SectorNum_t dend = rf_StripeUnitOffset(layoutPtr, dest->startSector + dest->numSector-1);
   1001  1.1  oster   return( (RF_MAX(soffs,doffs) <= RF_MIN(send,dend)) ? 1 : 0 );
   1002  1.1  oster }
   1003  1.1  oster 
   1004  1.1  oster 
   1005  1.1  oster /* GenerateFailedAccessASMs
   1006  1.1  oster  *
   1007  1.1  oster  * this routine figures out what portion of the stripe needs to be read
   1008  1.1  oster  * to effect the degraded read or write operation.  It's primary function
   1009  1.1  oster  * is to identify everything required to recover the data, and then
   1010  1.1  oster  * eliminate anything that is already being accessed by the user.
   1011  1.1  oster  *
   1012  1.1  oster  * The main result is two new ASMs, one for the region from the start of the
   1013  1.1  oster  * stripe to the start of the access, and one for the region from the end of
   1014  1.1  oster  * the access to the end of the stripe.  These ASMs describe everything that
   1015  1.1  oster  * needs to be read to effect the degraded access.  Other results are:
   1016  1.1  oster  *    nXorBufs -- the total number of buffers that need to be XORed together to
   1017  1.1  oster  *                recover the lost data,
   1018  1.1  oster  *    rpBufPtr -- ptr to a newly-allocated buffer to hold the parity.  If NULL
   1019  1.1  oster  *                at entry, not allocated.
   1020  1.1  oster  *    overlappingPDAs --
   1021  1.1  oster  *                describes which of the non-failed PDAs in the user access
   1022  1.1  oster  *                overlap data that needs to be read to effect recovery.
   1023  1.1  oster  *                overlappingPDAs[i]==1 if and only if, neglecting the failed
   1024  1.1  oster  *                PDA, the ith pda in the input asm overlaps data that needs
   1025  1.1  oster  *                to be read for recovery.
   1026  1.1  oster  */
   1027  1.1  oster  /* in: asm - ASM for the actual access, one stripe only */
   1028  1.1  oster  /* in: faildPDA - which component of the access has failed */
   1029  1.1  oster  /* in: dag_h - header of the DAG we're going to create */
   1030  1.1  oster  /* out: new_asm_h - the two new ASMs */
   1031  1.1  oster  /* out: nXorBufs - the total number of xor bufs required */
   1032  1.1  oster  /* out: rpBufPtr - a buffer for the parity read */
   1033  1.1  oster void rf_GenerateFailedAccessASMs(
   1034  1.1  oster   RF_Raid_t                    *raidPtr,
   1035  1.1  oster   RF_AccessStripeMap_t         *asmap,
   1036  1.1  oster   RF_PhysDiskAddr_t            *failedPDA,
   1037  1.1  oster   RF_DagHeader_t               *dag_h,
   1038  1.1  oster   RF_AccessStripeMapHeader_t  **new_asm_h,
   1039  1.1  oster   int                          *nXorBufs,
   1040  1.1  oster   char                        **rpBufPtr,
   1041  1.1  oster   char                         *overlappingPDAs,
   1042  1.1  oster   RF_AllocListElem_t           *allocList)
   1043  1.1  oster {
   1044  1.1  oster   RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
   1045  1.1  oster 
   1046  1.1  oster   /* s=start, e=end, s=stripe, a=access, f=failed, su=stripe unit */
   1047  1.1  oster   RF_RaidAddr_t sosAddr, sosEndAddr, eosStartAddr, eosAddr;
   1048  1.1  oster 
   1049  1.1  oster   RF_SectorCount_t numSect[2], numParitySect;
   1050  1.1  oster   RF_PhysDiskAddr_t *pda;
   1051  1.1  oster   char *rdBuf, *bufP;
   1052  1.1  oster   int foundit, i;
   1053  1.1  oster 
   1054  1.1  oster   bufP = NULL;
   1055  1.1  oster   foundit = 0;
   1056  1.1  oster   /* first compute the following raid addresses:
   1057  1.1  oster        start of stripe,                            (sosAddr)
   1058  1.1  oster        MIN(start of access, start of failed SU),   (sosEndAddr)
   1059  1.1  oster        MAX(end of access, end of failed SU),       (eosStartAddr)
   1060  1.1  oster        end of stripe (i.e. start of next stripe)   (eosAddr)
   1061  1.1  oster   */
   1062  1.1  oster   sosAddr      = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
   1063  1.1  oster   sosEndAddr   = RF_MIN(asmap->raidAddress, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,failedPDA->raidAddress));
   1064  1.1  oster   eosStartAddr = RF_MAX(asmap->endRaidAddress, rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, failedPDA->raidAddress));
   1065  1.1  oster   eosAddr      = rf_RaidAddressOfNextStripeBoundary(layoutPtr, asmap->raidAddress);
   1066  1.1  oster 
   1067  1.1  oster   /* now generate access stripe maps for each of the above regions of the
   1068  1.1  oster    * stripe.  Use a dummy (NULL) buf ptr for now */
   1069  1.1  oster 
   1070  1.1  oster   new_asm_h[0] = (sosAddr != sosEndAddr)   ? rf_MapAccess(raidPtr, sosAddr,      sosEndAddr-sosAddr,   NULL, RF_DONT_REMAP) : NULL;
   1071  1.1  oster   new_asm_h[1] = (eosStartAddr != eosAddr) ? rf_MapAccess(raidPtr, eosStartAddr, eosAddr-eosStartAddr, NULL, RF_DONT_REMAP) : NULL;
   1072  1.1  oster 
   1073  1.1  oster   /* walk through the PDAs and range-restrict each SU to the region of the
   1074  1.1  oster    * SU touched on the failed PDA.  also compute total data buffer space
   1075  1.1  oster    * requirements in this step.  Ignore the parity for now. */
   1076  1.1  oster 
   1077  1.1  oster   numSect[0] = numSect[1] = 0;
   1078  1.1  oster   if (new_asm_h[0]) {
   1079  1.1  oster     new_asm_h[0]->next = dag_h->asmList; dag_h->asmList = new_asm_h[0];
   1080  1.1  oster     for (pda = new_asm_h[0]->stripeMap->physInfo; pda; pda = pda->next) {
   1081  1.1  oster       rf_RangeRestrictPDA(raidPtr,failedPDA, pda, RF_RESTRICT_NOBUFFER, 0); numSect[0] += pda->numSector;
   1082  1.1  oster     }
   1083  1.1  oster   }
   1084  1.1  oster   if (new_asm_h[1]) {
   1085  1.1  oster     new_asm_h[1]->next = dag_h->asmList; dag_h->asmList = new_asm_h[1];
   1086  1.1  oster     for (pda = new_asm_h[1]->stripeMap->physInfo; pda; pda = pda->next) {
   1087  1.1  oster       rf_RangeRestrictPDA(raidPtr,failedPDA, pda, RF_RESTRICT_NOBUFFER, 0); numSect[1] += pda->numSector;
   1088  1.1  oster     }
   1089  1.1  oster   }
   1090  1.1  oster   numParitySect = failedPDA->numSector;
   1091  1.1  oster 
   1092  1.1  oster   /* allocate buffer space for the data & parity we have to read to recover
   1093  1.1  oster    * from the failure */
   1094  1.1  oster 
   1095  1.1  oster   if (numSect[0]+numSect[1]+ ((rpBufPtr) ? numParitySect : 0)) {      /* don't allocate parity buf if not needed */
   1096  1.1  oster     RF_MallocAndAdd(rdBuf, rf_RaidAddressToByte(raidPtr,numSect[0]+numSect[1]+numParitySect), (char *), allocList);
   1097  1.1  oster     bufP = rdBuf;
   1098  1.1  oster     if (rf_degDagDebug) printf("Newly allocated buffer (%d bytes) is 0x%lx\n",
   1099  1.1  oster 			    (int)rf_RaidAddressToByte(raidPtr,numSect[0]+numSect[1]+numParitySect), (unsigned long) bufP);
   1100  1.1  oster   }
   1101  1.1  oster 
   1102  1.1  oster   /* now walk through the pdas one last time and assign buffer pointers
   1103  1.1  oster    * (ugh!).  Again, ignore the parity.  also, count nodes to find out how
   1104  1.1  oster    * many bufs need to be xored together */
   1105  1.1  oster   (*nXorBufs) = 1;   /* in read case, 1 is for parity.  In write case, 1 is for failed data */
   1106  1.1  oster   if (new_asm_h[0]) {
   1107  1.1  oster     for (pda=new_asm_h[0]->stripeMap->physInfo; pda; pda=pda->next) {pda->bufPtr = bufP; bufP += rf_RaidAddressToByte(raidPtr,pda->numSector);}
   1108  1.1  oster     *nXorBufs += new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
   1109  1.1  oster   }
   1110  1.1  oster   if (new_asm_h[1]) {
   1111  1.1  oster     for (pda=new_asm_h[1]->stripeMap->physInfo; pda; pda=pda->next) {pda->bufPtr = bufP; bufP += rf_RaidAddressToByte(raidPtr,pda->numSector);}
   1112  1.1  oster     (*nXorBufs) += new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
   1113  1.1  oster   }
   1114  1.1  oster   if (rpBufPtr) *rpBufPtr = bufP;     /* the rest of the buffer is for parity */
   1115  1.1  oster 
   1116  1.1  oster   /* the last step is to figure out how many more distinct buffers need to
   1117  1.1  oster    * get xor'd to produce the missing unit.  there's one for each user-data
   1118  1.1  oster    * read node that overlaps the portion of the failed unit being accessed */
   1119  1.1  oster 
   1120  1.1  oster   for (foundit=i=0,pda=asmap->physInfo; pda; i++,pda=pda->next) {
   1121  1.1  oster     if (pda == failedPDA) {i--; foundit=1; continue;}
   1122  1.1  oster     if (rf_PDAOverlap(layoutPtr, pda, failedPDA)) {
   1123  1.1  oster       overlappingPDAs[i] = 1;
   1124  1.1  oster       (*nXorBufs)++;
   1125  1.1  oster     }
   1126  1.1  oster   }
   1127  1.1  oster   if (!foundit) {RF_ERRORMSG("GenerateFailedAccessASMs: did not find failedPDA in asm list\n"); RF_ASSERT(0);}
   1128  1.1  oster 
   1129  1.1  oster   if (rf_degDagDebug) {
   1130  1.1  oster     if (new_asm_h[0]) {
   1131  1.1  oster       printf("First asm:\n"); rf_PrintFullAccessStripeMap(new_asm_h[0], 1);
   1132  1.1  oster     }
   1133  1.1  oster     if (new_asm_h[1]) {
   1134  1.1  oster       printf("Second asm:\n"); rf_PrintFullAccessStripeMap(new_asm_h[1], 1);
   1135  1.1  oster     }
   1136  1.1  oster   }
   1137  1.1  oster }
   1138  1.1  oster 
   1139  1.1  oster 
   1140  1.1  oster /* adjusts the offset and number of sectors in the destination pda so that
   1141  1.1  oster  * it covers at most the region of the SU covered by the source PDA.  This
   1142  1.1  oster  * is exclusively a restriction:  the number of sectors indicated by the
   1143  1.1  oster  * target PDA can only shrink.
   1144  1.1  oster  *
   1145  1.1  oster  * For example:  s = sectors within SU indicated by source PDA
   1146  1.1  oster  *               d = sectors within SU indicated by dest PDA
   1147  1.1  oster  *               r = results, stored in dest PDA
   1148  1.1  oster  *
   1149  1.1  oster  * |--------------- one stripe unit ---------------------|
   1150  1.1  oster  * |           sssssssssssssssssssssssssssssssss         |
   1151  1.1  oster  * |    ddddddddddddddddddddddddddddddddddddddddddddd    |
   1152  1.1  oster  * |           rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr         |
   1153  1.1  oster  *
   1154  1.1  oster  * Another example:
   1155  1.1  oster  *
   1156  1.1  oster  * |--------------- one stripe unit ---------------------|
   1157  1.1  oster  * |           sssssssssssssssssssssssssssssssss         |
   1158  1.1  oster  * |    ddddddddddddddddddddddd                          |
   1159  1.1  oster  * |           rrrrrrrrrrrrrrrr                          |
   1160  1.1  oster  *
   1161  1.1  oster  */
   1162  1.1  oster void rf_RangeRestrictPDA(
   1163  1.1  oster   RF_Raid_t          *raidPtr,
   1164  1.1  oster   RF_PhysDiskAddr_t  *src,
   1165  1.1  oster   RF_PhysDiskAddr_t  *dest,
   1166  1.1  oster   int                 dobuffer,
   1167  1.1  oster   int                 doraidaddr)
   1168  1.1  oster {
   1169  1.1  oster   RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
   1170  1.1  oster   RF_SectorNum_t soffs = rf_StripeUnitOffset(layoutPtr, src->startSector);
   1171  1.1  oster   RF_SectorNum_t doffs = rf_StripeUnitOffset(layoutPtr, dest->startSector);
   1172  1.1  oster   RF_SectorNum_t send = rf_StripeUnitOffset(layoutPtr, src->startSector + src->numSector-1);     /* use -1 to be sure we stay within SU */
   1173  1.1  oster   RF_SectorNum_t dend = rf_StripeUnitOffset(layoutPtr, dest->startSector + dest->numSector-1);
   1174  1.1  oster   RF_SectorNum_t subAddr = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, dest->startSector); /* stripe unit boundary */
   1175  1.1  oster 
   1176  1.1  oster   dest->startSector = subAddr + RF_MAX(soffs,doffs);
   1177  1.1  oster   dest->numSector = subAddr + RF_MIN(send,dend) + 1 - dest->startSector;
   1178  1.1  oster 
   1179  1.1  oster   if (dobuffer)
   1180  1.1  oster     dest->bufPtr += (soffs > doffs) ? rf_RaidAddressToByte(raidPtr,soffs-doffs) : 0;
   1181  1.1  oster   if (doraidaddr) {
   1182  1.1  oster     dest->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, dest->raidAddress) +
   1183  1.1  oster                         rf_StripeUnitOffset(layoutPtr, dest->startSector);
   1184  1.1  oster   }
   1185  1.1  oster }
   1186  1.1  oster 
   1187  1.1  oster /*
   1188  1.1  oster  * Want the highest of these primes to be the largest one
   1189  1.1  oster  * less than the max expected number of columns (won't hurt
   1190  1.1  oster  * to be too small or too large, but won't be optimal, either)
   1191  1.1  oster  * --jimz
   1192  1.1  oster  */
   1193  1.1  oster #define NLOWPRIMES 8
   1194  1.1  oster static int lowprimes[NLOWPRIMES] = {2,3,5,7,11,13,17,19};
   1195  1.1  oster 
   1196  1.1  oster /*****************************************************************************
   1197  1.1  oster  * compute the workload shift factor.  (chained declustering)
   1198  1.1  oster  *
   1199  1.1  oster  * return nonzero if access should shift to secondary, otherwise,
   1200  1.1  oster  * access is to primary
   1201  1.1  oster  *****************************************************************************/
   1202  1.1  oster int rf_compute_workload_shift(
   1203  1.1  oster   RF_Raid_t          *raidPtr,
   1204  1.1  oster   RF_PhysDiskAddr_t  *pda)
   1205  1.1  oster {
   1206  1.1  oster   /*
   1207  1.1  oster    * variables:
   1208  1.1  oster    *  d   = column of disk containing primary
   1209  1.1  oster    *  f   = column of failed disk
   1210  1.1  oster    *  n   = number of disks in array
   1211  1.1  oster    *  sd  = "shift distance" (number of columns that d is to the right of f)
   1212  1.1  oster    *  row = row of array the access is in
   1213  1.1  oster    *  v   = numerator of redirection ratio
   1214  1.1  oster    *  k   = denominator of redirection ratio
   1215  1.1  oster    */
   1216  1.1  oster   RF_RowCol_t d, f, sd, row, n;
   1217  1.1  oster   int k, v, ret, i;
   1218  1.1  oster 
   1219  1.1  oster   row = pda->row;
   1220  1.1  oster   n = raidPtr->numCol;
   1221  1.1  oster 
   1222  1.1  oster   /* assign column of primary copy to d */
   1223  1.1  oster   d = pda->col;
   1224  1.1  oster 
   1225  1.1  oster   /* assign column of dead disk to f */
   1226  1.1  oster   for(f=0;((!RF_DEAD_DISK(raidPtr->Disks[row][f].status))&&(f<n));f++);
   1227  1.1  oster 
   1228  1.1  oster   RF_ASSERT(f < n);
   1229  1.1  oster   RF_ASSERT(f != d);
   1230  1.1  oster 
   1231  1.1  oster   sd = (f > d) ? (n + d - f) : (d - f);
   1232  1.1  oster   RF_ASSERT(sd < n);
   1233  1.1  oster 
   1234  1.1  oster   /*
   1235  1.1  oster    * v of every k accesses should be redirected
   1236  1.1  oster    *
   1237  1.1  oster    * v/k := (n-1-sd)/(n-1)
   1238  1.1  oster    */
   1239  1.1  oster   v = (n-1-sd);
   1240  1.1  oster   k = (n-1);
   1241  1.1  oster 
   1242  1.1  oster #if 1
   1243  1.1  oster   /*
   1244  1.1  oster    * XXX
   1245  1.1  oster    * Is this worth it?
   1246  1.1  oster    *
   1247  1.1  oster    * Now reduce the fraction, by repeatedly factoring
   1248  1.1  oster    * out primes (just like they teach in elementary school!)
   1249  1.1  oster    */
   1250  1.1  oster   for(i=0;i<NLOWPRIMES;i++) {
   1251  1.1  oster     if (lowprimes[i] > v)
   1252  1.1  oster       break;
   1253  1.1  oster     while (((v%lowprimes[i])==0) && ((k%lowprimes[i])==0)) {
   1254  1.1  oster       v /= lowprimes[i];
   1255  1.1  oster       k /= lowprimes[i];
   1256  1.1  oster     }
   1257  1.1  oster   }
   1258  1.1  oster #endif
   1259  1.1  oster 
   1260  1.1  oster   raidPtr->hist_diskreq[row][d]++;
   1261  1.1  oster   if (raidPtr->hist_diskreq[row][d] > v) {
   1262  1.1  oster     ret = 0; /* do not redirect */
   1263  1.1  oster   }
   1264  1.1  oster   else {
   1265  1.1  oster     ret = 1; /* redirect */
   1266  1.1  oster   }
   1267  1.1  oster 
   1268  1.1  oster #if 0
   1269  1.1  oster   printf("d=%d f=%d sd=%d v=%d k=%d ret=%d h=%d\n", d, f, sd, v, k, ret,
   1270  1.1  oster     raidPtr->hist_diskreq[row][d]);
   1271  1.1  oster #endif
   1272  1.1  oster 
   1273  1.1  oster   if (raidPtr->hist_diskreq[row][d] >= k) {
   1274  1.1  oster     /* reset counter */
   1275  1.1  oster     raidPtr->hist_diskreq[row][d] = 0;
   1276  1.1  oster   }
   1277  1.1  oster 
   1278  1.1  oster   return(ret);
   1279  1.1  oster }
   1280  1.1  oster 
   1281  1.1  oster /*
   1282  1.1  oster  * Disk selection routines
   1283  1.1  oster  */
   1284  1.1  oster 
   1285  1.1  oster /*
   1286  1.1  oster  * Selects the disk with the shortest queue from a mirror pair.
   1287  1.1  oster  * Both the disk I/Os queued in RAIDframe as well as those at the physical
   1288  1.1  oster  * disk are counted as members of the "queue"
   1289  1.1  oster  */
   1290  1.1  oster void rf_SelectMirrorDiskIdle(RF_DagNode_t *node)
   1291  1.1  oster {
   1292  1.1  oster   RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr;
   1293  1.1  oster   RF_RowCol_t rowData, colData, rowMirror, colMirror;
   1294  1.1  oster   int dataQueueLength, mirrorQueueLength, usemirror;
   1295  1.1  oster   RF_PhysDiskAddr_t *data_pda   = (RF_PhysDiskAddr_t *)node->params[0].p;
   1296  1.1  oster   RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *)node->params[4].p;
   1297  1.1  oster   RF_PhysDiskAddr_t *tmp_pda;
   1298  1.1  oster   RF_RaidDisk_t **disks = raidPtr->Disks;
   1299  1.1  oster   RF_DiskQueue_t **dqs = raidPtr->Queues, *dataQueue, *mirrorQueue;
   1300  1.1  oster 
   1301  1.1  oster   /* return the [row col] of the disk with the shortest queue */
   1302  1.1  oster   rowData = data_pda->row;
   1303  1.1  oster   colData = data_pda->col;
   1304  1.1  oster   rowMirror = mirror_pda->row;
   1305  1.1  oster   colMirror = mirror_pda->col;
   1306  1.1  oster   dataQueue   = &(dqs[rowData][colData]);
   1307  1.1  oster   mirrorQueue = &(dqs[rowMirror][colMirror]);
   1308  1.1  oster 
   1309  1.1  oster #ifdef RF_LOCK_QUEUES_TO_READ_LEN
   1310  1.1  oster   RF_LOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
   1311  1.1  oster #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
   1312  1.1  oster   dataQueueLength = dataQueue->queueLength + dataQueue->numOutstanding;
   1313  1.1  oster #ifdef RF_LOCK_QUEUES_TO_READ_LEN
   1314  1.1  oster   RF_UNLOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
   1315  1.1  oster   RF_LOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
   1316  1.1  oster #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
   1317  1.1  oster   mirrorQueueLength = mirrorQueue->queueLength + mirrorQueue->numOutstanding;
   1318  1.1  oster #ifdef RF_LOCK_QUEUES_TO_READ_LEN
   1319  1.1  oster   RF_UNLOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
   1320  1.1  oster #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
   1321  1.1  oster 
   1322  1.1  oster   usemirror = 0;
   1323  1.1  oster   if (RF_DEAD_DISK(disks[rowMirror][colMirror].status)) {
   1324  1.1  oster     usemirror = 0;
   1325  1.1  oster   }
   1326  1.1  oster   else if (RF_DEAD_DISK(disks[rowData][colData].status)) {
   1327  1.1  oster     usemirror = 1;
   1328  1.1  oster   }
   1329  1.1  oster   else if (dataQueueLength < mirrorQueueLength) {
   1330  1.1  oster     usemirror = 0;
   1331  1.1  oster   }
   1332  1.1  oster   else if (mirrorQueueLength < dataQueueLength) {
   1333  1.1  oster     usemirror = 1;
   1334  1.1  oster   }
   1335  1.1  oster   else {
   1336  1.1  oster     /* queues are equal length. attempt cleverness. */
   1337  1.1  oster     if (SNUM_DIFF(dataQueue->last_deq_sector,data_pda->startSector)
   1338  1.1  oster         <= SNUM_DIFF(mirrorQueue->last_deq_sector,mirror_pda->startSector))
   1339  1.1  oster     {
   1340  1.1  oster       usemirror = 0;
   1341  1.1  oster     }
   1342  1.1  oster     else {
   1343  1.1  oster       usemirror = 1;
   1344  1.1  oster     }
   1345  1.1  oster   }
   1346  1.1  oster 
   1347  1.1  oster   if (usemirror) {
   1348  1.1  oster     /* use mirror (parity) disk, swap params 0 & 4 */
   1349  1.1  oster     tmp_pda = data_pda;
   1350  1.1  oster     node->params[0].p = mirror_pda;
   1351  1.1  oster     node->params[4].p = tmp_pda;
   1352  1.1  oster   }
   1353  1.1  oster   else {
   1354  1.1  oster     /* use data disk, leave param 0 unchanged */
   1355  1.1  oster   }
   1356  1.1  oster   /* printf("dataQueueLength %d, mirrorQueueLength %d\n",dataQueueLength, mirrorQueueLength); */
   1357  1.1  oster }
   1358  1.1  oster 
   1359  1.1  oster /*
   1360  1.1  oster  * Do simple partitioning. This assumes that
   1361  1.1  oster  * the data and parity disks are laid out identically.
   1362  1.1  oster  */
   1363  1.1  oster void rf_SelectMirrorDiskPartition(RF_DagNode_t *node)
   1364  1.1  oster {
   1365  1.1  oster   RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr;
   1366  1.1  oster   RF_RowCol_t rowData, colData, rowMirror, colMirror;
   1367  1.1  oster   RF_PhysDiskAddr_t *data_pda   = (RF_PhysDiskAddr_t *)node->params[0].p;
   1368  1.1  oster   RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *)node->params[4].p;
   1369  1.1  oster   RF_PhysDiskAddr_t *tmp_pda;
   1370  1.1  oster   RF_RaidDisk_t **disks = raidPtr->Disks;
   1371  1.1  oster   RF_DiskQueue_t **dqs = raidPtr->Queues, *dataQueue, *mirrorQueue;
   1372  1.1  oster   int usemirror;
   1373  1.1  oster 
   1374  1.1  oster   /* return the [row col] of the disk with the shortest queue */
   1375  1.1  oster   rowData = data_pda->row;
   1376  1.1  oster   colData = data_pda->col;
   1377  1.1  oster   rowMirror = mirror_pda->row;
   1378  1.1  oster   colMirror = mirror_pda->col;
   1379  1.1  oster   dataQueue   = &(dqs[rowData][colData]);
   1380  1.1  oster   mirrorQueue = &(dqs[rowMirror][colMirror]);
   1381  1.1  oster 
   1382  1.1  oster   usemirror = 0;
   1383  1.1  oster   if (RF_DEAD_DISK(disks[rowMirror][colMirror].status)) {
   1384  1.1  oster     usemirror = 0;
   1385  1.1  oster   }
   1386  1.1  oster   else if (RF_DEAD_DISK(disks[rowData][colData].status)) {
   1387  1.1  oster     usemirror = 1;
   1388  1.1  oster   }
   1389  1.1  oster   else if (data_pda->startSector < (disks[rowData][colData].numBlocks / 2)) {
   1390  1.1  oster     usemirror = 0;
   1391  1.1  oster   }
   1392  1.1  oster   else {
   1393  1.1  oster     usemirror = 1;
   1394  1.1  oster   }
   1395  1.1  oster 
   1396  1.1  oster   if (usemirror) {
   1397  1.1  oster     /* use mirror (parity) disk, swap params 0 & 4 */
   1398  1.1  oster     tmp_pda = data_pda;
   1399  1.1  oster     node->params[0].p = mirror_pda;
   1400  1.1  oster     node->params[4].p = tmp_pda;
   1401  1.1  oster   }
   1402  1.1  oster   else {
   1403  1.1  oster     /* use data disk, leave param 0 unchanged */
   1404  1.1  oster   }
   1405  1.1  oster }
   1406