Home | History | Annotate | Line # | Download | only in raidframe
rf_dagffrd.c revision 1.2
      1 /*	$NetBSD: rf_dagffrd.c,v 1.2 1999/01/26 02:33:52 oster Exp $	*/
      2 /*
      3  * Copyright (c) 1995 Carnegie-Mellon University.
      4  * All rights reserved.
      5  *
      6  * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
      7  *
      8  * Permission to use, copy, modify and distribute this software and
      9  * its documentation is hereby granted, provided that both the copyright
     10  * notice and this permission notice appear in all copies of the
     11  * software, derivative works or modified versions, and any portions
     12  * thereof, and that both notices appear in supporting documentation.
     13  *
     14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     17  *
     18  * Carnegie Mellon requests users of this software to return to
     19  *
     20  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     21  *  School of Computer Science
     22  *  Carnegie Mellon University
     23  *  Pittsburgh PA 15213-3890
     24  *
     25  * any improvements or extensions that they make and grant Carnegie the
     26  * rights to redistribute these changes.
     27  */
     28 
     29 /*
     30  * rf_dagffrd.c
     31  *
     32  * code for creating fault-free read DAGs
     33  *
     34  */
     35 
     36 #include "rf_types.h"
     37 #include "rf_raid.h"
     38 #include "rf_dag.h"
     39 #include "rf_dagutils.h"
     40 #include "rf_dagfuncs.h"
     41 #include "rf_threadid.h"
     42 #include "rf_debugMem.h"
     43 #include "rf_memchunk.h"
     44 #include "rf_general.h"
     45 #include "rf_dagffrd.h"
     46 
     47 /******************************************************************************
     48  *
     49  * General comments on DAG creation:
     50  *
     51  * All DAGs in this file use roll-away error recovery.  Each DAG has a single
     52  * commit node, usually called "Cmt."  If an error occurs before the Cmt node
     53  * is reached, the execution engine will halt forward execution and work
     54  * backward through the graph, executing the undo functions.  Assuming that
     55  * each node in the graph prior to the Cmt node are undoable and atomic - or -
     56  * does not make changes to permanent state, the graph will fail atomically.
     57  * If an error occurs after the Cmt node executes, the engine will roll-forward
     58  * through the graph, blindly executing nodes until it reaches the end.
     59  * If a graph reaches the end, it is assumed to have completed successfully.
     60  *
     61  * A graph has only 1 Cmt node.
     62  *
     63  */
     64 
     65 
     66 /******************************************************************************
     67  *
     68  * The following wrappers map the standard DAG creation interface to the
     69  * DAG creation routines.  Additionally, these wrappers enable experimentation
     70  * with new DAG structures by providing an extra level of indirection, allowing
     71  * the DAG creation routines to be replaced at this single point.
     72  */
     73 
     74 void rf_CreateFaultFreeReadDAG(
     75   RF_Raid_t             *raidPtr,
     76   RF_AccessStripeMap_t  *asmap,
     77   RF_DagHeader_t        *dag_h,
     78   void                  *bp,
     79   RF_RaidAccessFlags_t   flags,
     80   RF_AllocListElem_t    *allocList)
     81 {
     82   rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
     83     RF_IO_TYPE_READ);
     84 }
     85 
     86 
     87 /******************************************************************************
     88  *
     89  * DAG creation code begins here
     90  */
     91 
     92 /******************************************************************************
     93  *
     94  * creates a DAG to perform a nonredundant read or write of data within one
     95  * stripe.
     96  * For reads, this DAG is as follows:
     97  *
     98  *                   /---- read ----\
     99  *    Header -- Block ---- read ---- Commit -- Terminate
    100  *                   \---- read ----/
    101  *
    102  * For writes, this DAG is as follows:
    103  *
    104  *                    /---- write ----\
    105  *    Header -- Commit ---- write ---- Block -- Terminate
    106  *                    \---- write ----/
    107  *
    108  * There is one disk node per stripe unit accessed, and all disk nodes are in
    109  * parallel.
    110  *
    111  * Tricky point here:  The first disk node (read or write) is created
    112  * normally.  Subsequent disk nodes are created by copying the first one,
    113  * and modifying a few params.  The "succedents" and "antecedents" fields are
    114  * _not_ re-created in each node, but rather left pointing to the same array
    115  * that was malloc'd when the first node was created.  Thus, it's essential
    116  * that when this DAG is freed, the succedents and antecedents fields be freed
    117  * in ONLY ONE of the read nodes.  This does not apply to the "params" field
    118  * because it is recreated for each READ node.
    119  *
    120  * Note that normal-priority accesses do not need to be tagged with their
    121  * parity stripe ID, because they will never be promoted.  Hence, I've
    122  * commented-out the code to do this, and marked it with UNNEEDED.
    123  *
    124  *****************************************************************************/
    125 
    126 void rf_CreateNonredundantDAG(
    127   RF_Raid_t             *raidPtr,
    128   RF_AccessStripeMap_t  *asmap,
    129   RF_DagHeader_t        *dag_h,
    130   void                  *bp,
    131   RF_RaidAccessFlags_t   flags,
    132   RF_AllocListElem_t    *allocList,
    133   RF_IoType_t            type)
    134 {
    135   RF_DagNode_t *nodes, *diskNodes, *blockNode, *commitNode, *termNode;
    136   RF_PhysDiskAddr_t *pda = asmap->physInfo;
    137   int (*doFunc)(RF_DagNode_t *), (*undoFunc)(RF_DagNode_t *);
    138   int i, n, totalNumNodes;
    139   char *name;
    140 
    141   n = asmap->numStripeUnitsAccessed;
    142   dag_h->creator = "NonredundantDAG";
    143 
    144   RF_ASSERT(RF_IO_IS_R_OR_W(type));
    145   switch (type) {
    146     case RF_IO_TYPE_READ:
    147       doFunc = rf_DiskReadFunc;
    148       undoFunc = rf_DiskReadUndoFunc;
    149       name = "R  ";
    150       if (rf_dagDebug) printf("[Creating non-redundant read DAG]\n");
    151       break;
    152     case RF_IO_TYPE_WRITE:
    153       doFunc = rf_DiskWriteFunc;
    154       undoFunc = rf_DiskWriteUndoFunc;
    155       name = "W  ";
    156       if (rf_dagDebug) printf("[Creating non-redundant write DAG]\n");
    157       break;
    158     default:
    159       RF_PANIC();
    160   }
    161 
    162   /*
    163    * For reads, the dag can not commit until the block node is reached.
    164    * for writes, the dag commits immediately.
    165    */
    166   dag_h->numCommitNodes = 1;
    167   dag_h->numCommits = 0;
    168   dag_h->numSuccedents = 1;
    169 
    170   /*
    171    * Node count:
    172    * 1 block node
    173    * n data reads (or writes)
    174    * 1 commit node
    175    * 1 terminator node
    176    */
    177   RF_ASSERT(n > 0);
    178   totalNumNodes = n + 3;
    179   RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t),
    180     (RF_DagNode_t *), allocList);
    181   i = 0;
    182   diskNodes   = &nodes[i]; i += n;
    183   blockNode   = &nodes[i]; i += 1;
    184   commitNode   = &nodes[i]; i += 1;
    185   termNode    = &nodes[i];  i += 1;
    186   RF_ASSERT(i == totalNumNodes);
    187 
    188   /* initialize nodes */
    189   switch (type) {
    190     case RF_IO_TYPE_READ:
    191       rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
    192         NULL, n, 0, 0, 0, dag_h, "Nil", allocList);
    193       rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
    194         NULL, 1, n, 0, 0, dag_h, "Cmt", allocList);
    195       rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
    196         NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
    197       break;
    198     case RF_IO_TYPE_WRITE:
    199       rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
    200         NULL, 1, 0, 0, 0, dag_h, "Nil", allocList);
    201       rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
    202         NULL, n, 1, 0, 0, dag_h, "Cmt", allocList);
    203       rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
    204         NULL, 0, n, 0, 0, dag_h, "Trm", allocList);
    205       break;
    206     default:
    207       RF_PANIC();
    208   }
    209 
    210   for (i = 0; i < n; i++) {
    211     RF_ASSERT(pda != NULL);
    212     rf_InitNode(&diskNodes[i], rf_wait, RF_FALSE, doFunc, undoFunc, rf_GenericWakeupFunc,
    213       1, 1, 4, 0, dag_h, name, allocList);
    214     diskNodes[i].params[0].p  = pda;
    215     diskNodes[i].params[1].p  = pda->bufPtr;
    216     /* parity stripe id is not necessary */
    217     diskNodes[i].params[2].v  = 0;
    218     diskNodes[i].params[3].v  = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, 0);
    219     pda = pda->next;
    220   }
    221 
    222   /*
    223    * Connect nodes.
    224    */
    225 
    226   /* connect hdr to block node */
    227   RF_ASSERT(blockNode->numAntecedents == 0);
    228   dag_h->succedents[0] = blockNode;
    229 
    230   if (type == RF_IO_TYPE_READ) {
    231     /* connecting a nonredundant read DAG */
    232     RF_ASSERT(blockNode->numSuccedents == n);
    233     RF_ASSERT(commitNode->numAntecedents == n);
    234     for (i=0; i < n; i++) {
    235       /* connect block node to each read node */
    236       RF_ASSERT(diskNodes[i].numAntecedents == 1);
    237       blockNode->succedents[i] = &diskNodes[i];
    238       diskNodes[i].antecedents[0] = blockNode;
    239       diskNodes[i].antType[0] = rf_control;
    240 
    241       /* connect each read node to the commit node */
    242       RF_ASSERT(diskNodes[i].numSuccedents == 1);
    243       diskNodes[i].succedents[0] = commitNode;
    244       commitNode->antecedents[i] = &diskNodes[i];
    245       commitNode->antType[i] = rf_control;
    246     }
    247     /* connect the commit node to the term node */
    248     RF_ASSERT(commitNode->numSuccedents == 1);
    249     RF_ASSERT(termNode->numAntecedents == 1);
    250     RF_ASSERT(termNode->numSuccedents == 0);
    251     commitNode->succedents[0] = termNode;
    252     termNode->antecedents[0] = commitNode;
    253     termNode->antType[0] = rf_control;
    254   }
    255   else {
    256     /* connecting a nonredundant write DAG */
    257     /* connect the block node to the commit node */
    258     RF_ASSERT(blockNode->numSuccedents == 1);
    259     RF_ASSERT(commitNode->numAntecedents == 1);
    260     blockNode->succedents[0] = commitNode;
    261     commitNode->antecedents[0] = blockNode;
    262     commitNode->antType[0] = rf_control;
    263 
    264     RF_ASSERT(commitNode->numSuccedents == n);
    265     RF_ASSERT(termNode->numAntecedents == n);
    266     RF_ASSERT(termNode->numSuccedents == 0);
    267     for (i=0; i < n; i++) {
    268       /* connect the commit node to each write node */
    269       RF_ASSERT(diskNodes[i].numAntecedents == 1);
    270       commitNode->succedents[i] = &diskNodes[i];
    271       diskNodes[i].antecedents[0] = commitNode;
    272       diskNodes[i].antType[0] = rf_control;
    273 
    274       /* connect each write node to the term node */
    275       RF_ASSERT(diskNodes[i].numSuccedents == 1);
    276       diskNodes[i].succedents[0] = termNode;
    277       termNode->antecedents[i] = &diskNodes[i];
    278       termNode->antType[i] = rf_control;
    279     }
    280   }
    281 }
    282 
    283 /******************************************************************************
    284  * Create a fault-free read DAG for RAID level 1
    285  *
    286  * Hdr -> Nil -> Rmir -> Cmt -> Trm
    287  *
    288  * The "Rmir" node schedules a read from the disk in the mirror pair with the
    289  * shortest disk queue.  the proper queue is selected at Rmir execution.  this
    290  * deferred mapping is unlike other archs in RAIDframe which generally fix
    291  * mapping at DAG creation time.
    292  *
    293  * Parameters:  raidPtr   - description of the physical array
    294  *              asmap     - logical & physical addresses for this access
    295  *              bp        - buffer ptr (for holding read data)
    296  *              flags     - general flags (e.g. disk locking)
    297  *              allocList - list of memory allocated in DAG creation
    298  *****************************************************************************/
    299 
    300 static void CreateMirrorReadDAG(
    301   RF_Raid_t             *raidPtr,
    302   RF_AccessStripeMap_t  *asmap,
    303   RF_DagHeader_t        *dag_h,
    304   void                  *bp,
    305   RF_RaidAccessFlags_t   flags,
    306   RF_AllocListElem_t    *allocList,
    307   int                   (*readfunc)(RF_DagNode_t *node))
    308 {
    309   RF_DagNode_t *readNodes, *nodes, *blockNode, *commitNode, *termNode;
    310   RF_PhysDiskAddr_t *data_pda = asmap->physInfo;
    311   RF_PhysDiskAddr_t *parity_pda = asmap->parityInfo;
    312   int i, n, totalNumNodes;
    313 
    314   n = asmap->numStripeUnitsAccessed;
    315   dag_h->creator = "RaidOneReadDAG";
    316   if (rf_dagDebug) {
    317     printf("[Creating RAID level 1 read DAG]\n");
    318   }
    319 
    320   /*
    321    * This dag can not commit until the commit node is reached
    322    * errors prior to the commit point imply the dag has failed.
    323    */
    324   dag_h->numCommitNodes = 1;
    325   dag_h->numCommits = 0;
    326   dag_h->numSuccedents = 1;
    327 
    328   /*
    329    * Node count:
    330    * n data reads
    331    * 1 block node
    332    * 1 commit node
    333    * 1 terminator node
    334    */
    335   RF_ASSERT(n > 0);
    336   totalNumNodes = n + 3;
    337   RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t),
    338     (RF_DagNode_t *), allocList);
    339   i = 0;
    340   readNodes   = &nodes[i]; i += n;
    341   blockNode   = &nodes[i]; i += 1;
    342   commitNode = &nodes[i]; i += 1;
    343   termNode    = &nodes[i]; i += 1;
    344   RF_ASSERT(i == totalNumNodes);
    345 
    346   /* initialize nodes */
    347   rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
    348     rf_NullNodeUndoFunc, NULL, n, 0, 0, 0, dag_h, "Nil", allocList);
    349   rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
    350     rf_NullNodeUndoFunc, NULL, 1, n, 0, 0, dag_h, "Cmt", allocList);
    351   rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
    352     rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
    353 
    354   for (i = 0; i < n; i++) {
    355     RF_ASSERT(data_pda != NULL);
    356     RF_ASSERT(parity_pda != NULL);
    357     rf_InitNode(&readNodes[i], rf_wait, RF_FALSE, readfunc,
    358       rf_DiskReadMirrorUndoFunc, rf_GenericWakeupFunc, 1, 1, 5, 0, dag_h,
    359       "Rmir", allocList);
    360     readNodes[i].params[0].p = data_pda;
    361     readNodes[i].params[1].p = data_pda->bufPtr;
    362     /* parity stripe id is not necessary */
    363     readNodes[i].params[2].p = 0;
    364     readNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, 0);
    365     readNodes[i].params[4].p = parity_pda;
    366     data_pda = data_pda->next;
    367     parity_pda = parity_pda->next;
    368   }
    369 
    370   /*
    371    * Connect nodes
    372    */
    373 
    374   /* connect hdr to block node */
    375   RF_ASSERT(blockNode->numAntecedents == 0);
    376   dag_h->succedents[0] = blockNode;
    377 
    378   /* connect block node to read nodes */
    379   RF_ASSERT(blockNode->numSuccedents == n);
    380   for (i=0; i < n; i++) {
    381     RF_ASSERT(readNodes[i].numAntecedents == 1);
    382     blockNode->succedents[i] = &readNodes[i];
    383     readNodes[i].antecedents[0] = blockNode;
    384     readNodes[i].antType[0] = rf_control;
    385   }
    386 
    387   /* connect read nodes to commit node */
    388   RF_ASSERT(commitNode->numAntecedents == n);
    389   for (i=0; i < n; i++) {
    390     RF_ASSERT(readNodes[i].numSuccedents == 1);
    391     readNodes[i].succedents[0] = commitNode;
    392     commitNode->antecedents[i] = &readNodes[i];
    393     commitNode->antType[i] = rf_control;
    394   }
    395 
    396   /* connect commit node to term node */
    397   RF_ASSERT(commitNode->numSuccedents == 1);
    398   RF_ASSERT(termNode->numAntecedents == 1);
    399   RF_ASSERT(termNode->numSuccedents == 0);
    400   commitNode->succedents[0] = termNode;
    401   termNode->antecedents[0] = commitNode;
    402   termNode->antType[0] = rf_control;
    403 }
    404 
    405 void rf_CreateMirrorIdleReadDAG(
    406   RF_Raid_t             *raidPtr,
    407   RF_AccessStripeMap_t  *asmap,
    408   RF_DagHeader_t        *dag_h,
    409   void                  *bp,
    410   RF_RaidAccessFlags_t   flags,
    411   RF_AllocListElem_t    *allocList)
    412 {
    413   CreateMirrorReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
    414     rf_DiskReadMirrorIdleFunc);
    415 }
    416 
    417 void rf_CreateMirrorPartitionReadDAG(
    418   RF_Raid_t             *raidPtr,
    419   RF_AccessStripeMap_t  *asmap,
    420   RF_DagHeader_t        *dag_h,
    421   void                  *bp,
    422   RF_RaidAccessFlags_t   flags,
    423   RF_AllocListElem_t    *allocList)
    424 {
    425   CreateMirrorReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
    426     rf_DiskReadMirrorPartitionFunc);
    427 }
    428