Home | History | Annotate | Line # | Download | only in raidframe
rf_dagffrd.c revision 1.1
      1 /*	$NetBSD: rf_dagffrd.c,v 1.1 1998/11/13 04:20:27 oster Exp $	*/
      2 /*
      3  * Copyright (c) 1995 Carnegie-Mellon University.
      4  * All rights reserved.
      5  *
      6  * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
      7  *
      8  * Permission to use, copy, modify and distribute this software and
      9  * its documentation is hereby granted, provided that both the copyright
     10  * notice and this permission notice appear in all copies of the
     11  * software, derivative works or modified versions, and any portions
     12  * thereof, and that both notices appear in supporting documentation.
     13  *
     14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     17  *
     18  * Carnegie Mellon requests users of this software to return to
     19  *
     20  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     21  *  School of Computer Science
     22  *  Carnegie Mellon University
     23  *  Pittsburgh PA 15213-3890
     24  *
     25  * any improvements or extensions that they make and grant Carnegie the
     26  * rights to redistribute these changes.
     27  */
     28 
     29 /*
     30  * rf_dagffrd.c
     31  *
     32  * code for creating fault-free read DAGs
     33  *
     34  * :
     35  * Log: rf_dagffrd.c,v
     36  * Revision 1.14  1996/07/28 20:31:39  jimz
     37  * i386netbsd port
     38  * true/false fixup
     39  *
     40  * Revision 1.13  1996/07/22  19:52:16  jimz
     41  * switched node params to RF_DagParam_t, a union of
     42  * a 64-bit int and a void *, for better portability
     43  * attempted hpux port, but failed partway through for
     44  * lack of a single C compiler capable of compiling all
     45  * source files
     46  *
     47  * Revision 1.12  1996/06/09  02:36:46  jimz
     48  * lots of little crufty cleanup- fixup whitespace
     49  * issues, comment #ifdefs, improve typing in some
     50  * places (esp size-related)
     51  *
     52  * Revision 1.11  1996/06/06  17:30:44  jimz
     53  * turn old Raid1 mirror read creation into a more generic function
     54  * parameterized by an addtional parameter: type of mirrored read
     55  * this is now used by other dag creation routines so chained declustering
     56  * and raid1 can share dag creation code, but have different mirroring
     57  * policies
     58  *
     59  * Revision 1.10  1996/05/31  22:26:54  jimz
     60  * fix a lot of mapping problems, memory allocation problems
     61  * found some weird lock issues, fixed 'em
     62  * more code cleanup
     63  *
     64  * Revision 1.9  1996/05/30  11:29:41  jimz
     65  * Numerous bug fixes. Stripe lock release code disagreed with the taking code
     66  * about when stripes should be locked (I made it consistent: no parity, no lock)
     67  * There was a lot of extra serialization of I/Os which I've removed- a lot of
     68  * it was to calculate values for the cache code, which is no longer with us.
     69  * More types, function, macro cleanup. Added code to properly quiesce the array
     70  * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
     71  * before. Fixed memory allocation, freeing bugs.
     72  *
     73  * Revision 1.8  1996/05/27  18:56:37  jimz
     74  * more code cleanup
     75  * better typing
     76  * compiles in all 3 environments
     77  *
     78  * Revision 1.7  1996/05/24  22:17:04  jimz
     79  * continue code + namespace cleanup
     80  * typed a bunch of flags
     81  *
     82  * Revision 1.6  1996/05/24  04:28:55  jimz
     83  * release cleanup ckpt
     84  *
     85  * Revision 1.5  1996/05/23  21:46:35  jimz
     86  * checkpoint in code cleanup (release prep)
     87  * lots of types, function names have been fixed
     88  *
     89  * Revision 1.4  1996/05/23  00:33:23  jimz
     90  * code cleanup: move all debug decls to rf_options.c, all extern
     91  * debug decls to rf_options.h, all debug vars preceded by rf_
     92  *
     93  * Revision 1.3  1996/05/18  19:51:34  jimz
     94  * major code cleanup- fix syntax, make some types consistent,
     95  * add prototypes, clean out dead code, et cetera
     96  *
     97  * Revision 1.2  1996/05/08  21:01:24  jimz
     98  * fixed up enum type names that were conflicting with other
     99  * enums and function names (ie, "panic")
    100  * future naming trends will be towards RF_ and rf_ for
    101  * everything raidframe-related
    102  *
    103  * Revision 1.1  1996/05/03  19:19:20  wvcii
    104  * Initial revision
    105  *
    106  */
    107 
    108 #include "rf_types.h"
    109 #include "rf_raid.h"
    110 #include "rf_dag.h"
    111 #include "rf_dagutils.h"
    112 #include "rf_dagfuncs.h"
    113 #include "rf_threadid.h"
    114 #include "rf_debugMem.h"
    115 #include "rf_memchunk.h"
    116 #include "rf_general.h"
    117 #include "rf_dagffrd.h"
    118 
    119 /******************************************************************************
    120  *
    121  * General comments on DAG creation:
    122  *
    123  * All DAGs in this file use roll-away error recovery.  Each DAG has a single
    124  * commit node, usually called "Cmt."  If an error occurs before the Cmt node
    125  * is reached, the execution engine will halt forward execution and work
    126  * backward through the graph, executing the undo functions.  Assuming that
    127  * each node in the graph prior to the Cmt node are undoable and atomic - or -
    128  * does not make changes to permanent state, the graph will fail atomically.
    129  * If an error occurs after the Cmt node executes, the engine will roll-forward
    130  * through the graph, blindly executing nodes until it reaches the end.
    131  * If a graph reaches the end, it is assumed to have completed successfully.
    132  *
    133  * A graph has only 1 Cmt node.
    134  *
    135  */
    136 
    137 
    138 /******************************************************************************
    139  *
    140  * The following wrappers map the standard DAG creation interface to the
    141  * DAG creation routines.  Additionally, these wrappers enable experimentation
    142  * with new DAG structures by providing an extra level of indirection, allowing
    143  * the DAG creation routines to be replaced at this single point.
    144  */
    145 
    146 void rf_CreateFaultFreeReadDAG(
    147   RF_Raid_t             *raidPtr,
    148   RF_AccessStripeMap_t  *asmap,
    149   RF_DagHeader_t        *dag_h,
    150   void                  *bp,
    151   RF_RaidAccessFlags_t   flags,
    152   RF_AllocListElem_t    *allocList)
    153 {
    154   rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
    155     RF_IO_TYPE_READ);
    156 }
    157 
    158 
    159 /******************************************************************************
    160  *
    161  * DAG creation code begins here
    162  */
    163 
    164 /******************************************************************************
    165  *
    166  * creates a DAG to perform a nonredundant read or write of data within one
    167  * stripe.
    168  * For reads, this DAG is as follows:
    169  *
    170  *                   /---- read ----\
    171  *    Header -- Block ---- read ---- Commit -- Terminate
    172  *                   \---- read ----/
    173  *
    174  * For writes, this DAG is as follows:
    175  *
    176  *                    /---- write ----\
    177  *    Header -- Commit ---- write ---- Block -- Terminate
    178  *                    \---- write ----/
    179  *
    180  * There is one disk node per stripe unit accessed, and all disk nodes are in
    181  * parallel.
    182  *
    183  * Tricky point here:  The first disk node (read or write) is created
    184  * normally.  Subsequent disk nodes are created by copying the first one,
    185  * and modifying a few params.  The "succedents" and "antecedents" fields are
    186  * _not_ re-created in each node, but rather left pointing to the same array
    187  * that was malloc'd when the first node was created.  Thus, it's essential
    188  * that when this DAG is freed, the succedents and antecedents fields be freed
    189  * in ONLY ONE of the read nodes.  This does not apply to the "params" field
    190  * because it is recreated for each READ node.
    191  *
    192  * Note that normal-priority accesses do not need to be tagged with their
    193  * parity stripe ID, because they will never be promoted.  Hence, I've
    194  * commented-out the code to do this, and marked it with UNNEEDED.
    195  *
    196  *****************************************************************************/
    197 
    198 void rf_CreateNonredundantDAG(
    199   RF_Raid_t             *raidPtr,
    200   RF_AccessStripeMap_t  *asmap,
    201   RF_DagHeader_t        *dag_h,
    202   void                  *bp,
    203   RF_RaidAccessFlags_t   flags,
    204   RF_AllocListElem_t    *allocList,
    205   RF_IoType_t            type)
    206 {
    207   RF_DagNode_t *nodes, *diskNodes, *blockNode, *commitNode, *termNode;
    208   RF_PhysDiskAddr_t *pda = asmap->physInfo;
    209   int (*doFunc)(RF_DagNode_t *), (*undoFunc)(RF_DagNode_t *);
    210   int i, n, totalNumNodes;
    211   char *name;
    212 
    213   n = asmap->numStripeUnitsAccessed;
    214   dag_h->creator = "NonredundantDAG";
    215 
    216   RF_ASSERT(RF_IO_IS_R_OR_W(type));
    217   switch (type) {
    218     case RF_IO_TYPE_READ:
    219       doFunc = rf_DiskReadFunc;
    220       undoFunc = rf_DiskReadUndoFunc;
    221       name = "R  ";
    222       if (rf_dagDebug) printf("[Creating non-redundant read DAG]\n");
    223       break;
    224     case RF_IO_TYPE_WRITE:
    225       doFunc = rf_DiskWriteFunc;
    226       undoFunc = rf_DiskWriteUndoFunc;
    227       name = "W  ";
    228       if (rf_dagDebug) printf("[Creating non-redundant write DAG]\n");
    229       break;
    230     default:
    231       RF_PANIC();
    232   }
    233 
    234   /*
    235    * For reads, the dag can not commit until the block node is reached.
    236    * for writes, the dag commits immediately.
    237    */
    238   dag_h->numCommitNodes = 1;
    239   dag_h->numCommits = 0;
    240   dag_h->numSuccedents = 1;
    241 
    242   /*
    243    * Node count:
    244    * 1 block node
    245    * n data reads (or writes)
    246    * 1 commit node
    247    * 1 terminator node
    248    */
    249   RF_ASSERT(n > 0);
    250   totalNumNodes = n + 3;
    251   RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t),
    252     (RF_DagNode_t *), allocList);
    253   i = 0;
    254   diskNodes   = &nodes[i]; i += n;
    255   blockNode   = &nodes[i]; i += 1;
    256   commitNode   = &nodes[i]; i += 1;
    257   termNode    = &nodes[i];  i += 1;
    258   RF_ASSERT(i == totalNumNodes);
    259 
    260   /* initialize nodes */
    261   switch (type) {
    262     case RF_IO_TYPE_READ:
    263       rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
    264         NULL, n, 0, 0, 0, dag_h, "Nil", allocList);
    265       rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
    266         NULL, 1, n, 0, 0, dag_h, "Cmt", allocList);
    267       rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
    268         NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
    269       break;
    270     case RF_IO_TYPE_WRITE:
    271       rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
    272         NULL, 1, 0, 0, 0, dag_h, "Nil", allocList);
    273       rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
    274         NULL, n, 1, 0, 0, dag_h, "Cmt", allocList);
    275       rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
    276         NULL, 0, n, 0, 0, dag_h, "Trm", allocList);
    277       break;
    278     default:
    279       RF_PANIC();
    280   }
    281 
    282   for (i = 0; i < n; i++) {
    283     RF_ASSERT(pda != NULL);
    284     rf_InitNode(&diskNodes[i], rf_wait, RF_FALSE, doFunc, undoFunc, rf_GenericWakeupFunc,
    285       1, 1, 4, 0, dag_h, name, allocList);
    286     diskNodes[i].params[0].p  = pda;
    287     diskNodes[i].params[1].p  = pda->bufPtr;
    288     /* parity stripe id is not necessary */
    289     diskNodes[i].params[2].v  = 0;
    290     diskNodes[i].params[3].v  = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, 0);
    291     pda = pda->next;
    292   }
    293 
    294   /*
    295    * Connect nodes.
    296    */
    297 
    298   /* connect hdr to block node */
    299   RF_ASSERT(blockNode->numAntecedents == 0);
    300   dag_h->succedents[0] = blockNode;
    301 
    302   if (type == RF_IO_TYPE_READ) {
    303     /* connecting a nonredundant read DAG */
    304     RF_ASSERT(blockNode->numSuccedents == n);
    305     RF_ASSERT(commitNode->numAntecedents == n);
    306     for (i=0; i < n; i++) {
    307       /* connect block node to each read node */
    308       RF_ASSERT(diskNodes[i].numAntecedents == 1);
    309       blockNode->succedents[i] = &diskNodes[i];
    310       diskNodes[i].antecedents[0] = blockNode;
    311       diskNodes[i].antType[0] = rf_control;
    312 
    313       /* connect each read node to the commit node */
    314       RF_ASSERT(diskNodes[i].numSuccedents == 1);
    315       diskNodes[i].succedents[0] = commitNode;
    316       commitNode->antecedents[i] = &diskNodes[i];
    317       commitNode->antType[i] = rf_control;
    318     }
    319     /* connect the commit node to the term node */
    320     RF_ASSERT(commitNode->numSuccedents == 1);
    321     RF_ASSERT(termNode->numAntecedents == 1);
    322     RF_ASSERT(termNode->numSuccedents == 0);
    323     commitNode->succedents[0] = termNode;
    324     termNode->antecedents[0] = commitNode;
    325     termNode->antType[0] = rf_control;
    326   }
    327   else {
    328     /* connecting a nonredundant write DAG */
    329     /* connect the block node to the commit node */
    330     RF_ASSERT(blockNode->numSuccedents == 1);
    331     RF_ASSERT(commitNode->numAntecedents == 1);
    332     blockNode->succedents[0] = commitNode;
    333     commitNode->antecedents[0] = blockNode;
    334     commitNode->antType[0] = rf_control;
    335 
    336     RF_ASSERT(commitNode->numSuccedents == n);
    337     RF_ASSERT(termNode->numAntecedents == n);
    338     RF_ASSERT(termNode->numSuccedents == 0);
    339     for (i=0; i < n; i++) {
    340       /* connect the commit node to each write node */
    341       RF_ASSERT(diskNodes[i].numAntecedents == 1);
    342       commitNode->succedents[i] = &diskNodes[i];
    343       diskNodes[i].antecedents[0] = commitNode;
    344       diskNodes[i].antType[0] = rf_control;
    345 
    346       /* connect each write node to the term node */
    347       RF_ASSERT(diskNodes[i].numSuccedents == 1);
    348       diskNodes[i].succedents[0] = termNode;
    349       termNode->antecedents[i] = &diskNodes[i];
    350       termNode->antType[i] = rf_control;
    351     }
    352   }
    353 }
    354 
    355 /******************************************************************************
    356  * Create a fault-free read DAG for RAID level 1
    357  *
    358  * Hdr -> Nil -> Rmir -> Cmt -> Trm
    359  *
    360  * The "Rmir" node schedules a read from the disk in the mirror pair with the
    361  * shortest disk queue.  the proper queue is selected at Rmir execution.  this
    362  * deferred mapping is unlike other archs in RAIDframe which generally fix
    363  * mapping at DAG creation time.
    364  *
    365  * Parameters:  raidPtr   - description of the physical array
    366  *              asmap     - logical & physical addresses for this access
    367  *              bp        - buffer ptr (for holding read data)
    368  *              flags     - general flags (e.g. disk locking)
    369  *              allocList - list of memory allocated in DAG creation
    370  *****************************************************************************/
    371 
    372 static void CreateMirrorReadDAG(
    373   RF_Raid_t             *raidPtr,
    374   RF_AccessStripeMap_t  *asmap,
    375   RF_DagHeader_t        *dag_h,
    376   void                  *bp,
    377   RF_RaidAccessFlags_t   flags,
    378   RF_AllocListElem_t    *allocList,
    379   int                   (*readfunc)(RF_DagNode_t *node))
    380 {
    381   RF_DagNode_t *readNodes, *nodes, *blockNode, *commitNode, *termNode;
    382   RF_PhysDiskAddr_t *data_pda = asmap->physInfo;
    383   RF_PhysDiskAddr_t *parity_pda = asmap->parityInfo;
    384   int i, n, totalNumNodes;
    385 
    386   n = asmap->numStripeUnitsAccessed;
    387   dag_h->creator = "RaidOneReadDAG";
    388   if (rf_dagDebug) {
    389     printf("[Creating RAID level 1 read DAG]\n");
    390   }
    391 
    392   /*
    393    * This dag can not commit until the commit node is reached
    394    * errors prior to the commit point imply the dag has failed.
    395    */
    396   dag_h->numCommitNodes = 1;
    397   dag_h->numCommits = 0;
    398   dag_h->numSuccedents = 1;
    399 
    400   /*
    401    * Node count:
    402    * n data reads
    403    * 1 block node
    404    * 1 commit node
    405    * 1 terminator node
    406    */
    407   RF_ASSERT(n > 0);
    408   totalNumNodes = n + 3;
    409   RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t),
    410     (RF_DagNode_t *), allocList);
    411   i = 0;
    412   readNodes   = &nodes[i]; i += n;
    413   blockNode   = &nodes[i]; i += 1;
    414   commitNode = &nodes[i]; i += 1;
    415   termNode    = &nodes[i]; i += 1;
    416   RF_ASSERT(i == totalNumNodes);
    417 
    418   /* initialize nodes */
    419   rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
    420     rf_NullNodeUndoFunc, NULL, n, 0, 0, 0, dag_h, "Nil", allocList);
    421   rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
    422     rf_NullNodeUndoFunc, NULL, 1, n, 0, 0, dag_h, "Cmt", allocList);
    423   rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
    424     rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
    425 
    426   for (i = 0; i < n; i++) {
    427     RF_ASSERT(data_pda != NULL);
    428     RF_ASSERT(parity_pda != NULL);
    429     rf_InitNode(&readNodes[i], rf_wait, RF_FALSE, readfunc,
    430       rf_DiskReadMirrorUndoFunc, rf_GenericWakeupFunc, 1, 1, 5, 0, dag_h,
    431       "Rmir", allocList);
    432     readNodes[i].params[0].p = data_pda;
    433     readNodes[i].params[1].p = data_pda->bufPtr;
    434     /* parity stripe id is not necessary */
    435     readNodes[i].params[2].p = 0;
    436     readNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, 0);
    437     readNodes[i].params[4].p = parity_pda;
    438     data_pda = data_pda->next;
    439     parity_pda = parity_pda->next;
    440   }
    441 
    442   /*
    443    * Connect nodes
    444    */
    445 
    446   /* connect hdr to block node */
    447   RF_ASSERT(blockNode->numAntecedents == 0);
    448   dag_h->succedents[0] = blockNode;
    449 
    450   /* connect block node to read nodes */
    451   RF_ASSERT(blockNode->numSuccedents == n);
    452   for (i=0; i < n; i++) {
    453     RF_ASSERT(readNodes[i].numAntecedents == 1);
    454     blockNode->succedents[i] = &readNodes[i];
    455     readNodes[i].antecedents[0] = blockNode;
    456     readNodes[i].antType[0] = rf_control;
    457   }
    458 
    459   /* connect read nodes to commit node */
    460   RF_ASSERT(commitNode->numAntecedents == n);
    461   for (i=0; i < n; i++) {
    462     RF_ASSERT(readNodes[i].numSuccedents == 1);
    463     readNodes[i].succedents[0] = commitNode;
    464     commitNode->antecedents[i] = &readNodes[i];
    465     commitNode->antType[i] = rf_control;
    466   }
    467 
    468   /* connect commit node to term node */
    469   RF_ASSERT(commitNode->numSuccedents == 1);
    470   RF_ASSERT(termNode->numAntecedents == 1);
    471   RF_ASSERT(termNode->numSuccedents == 0);
    472   commitNode->succedents[0] = termNode;
    473   termNode->antecedents[0] = commitNode;
    474   termNode->antType[0] = rf_control;
    475 }
    476 
    477 void rf_CreateMirrorIdleReadDAG(
    478   RF_Raid_t             *raidPtr,
    479   RF_AccessStripeMap_t  *asmap,
    480   RF_DagHeader_t        *dag_h,
    481   void                  *bp,
    482   RF_RaidAccessFlags_t   flags,
    483   RF_AllocListElem_t    *allocList)
    484 {
    485   CreateMirrorReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
    486     rf_DiskReadMirrorIdleFunc);
    487 }
    488 
    489 void rf_CreateMirrorPartitionReadDAG(
    490   RF_Raid_t             *raidPtr,
    491   RF_AccessStripeMap_t  *asmap,
    492   RF_DagHeader_t        *dag_h,
    493   void                  *bp,
    494   RF_RaidAccessFlags_t   flags,
    495   RF_AllocListElem_t    *allocList)
    496 {
    497   CreateMirrorReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
    498     rf_DiskReadMirrorPartitionFunc);
    499 }
    500