/*	$NetBSD: rf_parityloggingdags.c,v 1.19.30.1 2014/05/18 17:45:46 rmind Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: William V. Courtright II
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * DAGs specific to parity logging are created here.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_parityloggingdags.c,v 1.19.30.1 2014/05/18 17:45:46 rmind Exp $");

#ifdef _KERNEL_OPT
#include "opt_raid_diagnostic.h"
#endif

#include "rf_archs.h"

#if RF_INCLUDE_PARITYLOGGING > 0

#include <dev/raidframe/raidframevar.h>

#include "rf_raid.h"
#include "rf_dag.h"
#include "rf_dagutils.h"
#include "rf_dagfuncs.h"
#include "rf_debugMem.h"
#include "rf_paritylog.h"
#include "rf_general.h"

#include "rf_parityloggingdags.h"

/******************************************************************************
 *
 * creates a DAG to perform a large-write operation:
 *
 *         / Rod \     / Wnd \
 * H -- NIL- Rod - NIL - Wnd ------ NIL - T
 *         \ Rod /     \ Xor - Lpo /
 *
 * The writes are not started until all reads complete: if they ran in
 * parallel, a failure on one of the reads could leave the parity in an
 * inconsistent state, and a retry with a new DAG would then produce
 * erroneous parity.
 *
 * Note:  this DAG has the nasty property that none of the buffers allocated
 *        for reading old data can be freed until the XOR node fires.
 *        Need to fix this.
 *
 * The last two arguments are the number of faults tolerated and the
 * function used for the redundancy calculation.  The undo for the
 * redundancy calculation is assumed to be null.
 *
 *****************************************************************************/

void
rf_CommonCreateParityLoggingLargeWriteDAG(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList,
    int nfaults,
    int (*redFunc) (RF_DagNode_t *))
{
	RF_DagNode_t *nodes, *wndNodes, *rodNodes = NULL, *syncNode, *xorNode,
	       *lpoNode, *blockNode, *unblockNode, *termNode;
	int     nWndNodes, nRodNodes, i;
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_AccessStripeMapHeader_t *new_asm_h[2];
	int     nodeNum, asmNum;
	RF_ReconUnitNum_t which_ru;
	char   *sosBuffer, *eosBuffer;
	RF_PhysDiskAddr_t *pda;
	RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru);

	if (rf_dagDebug)
		printf("[Creating parity-logging large-write DAG]\n");
	RF_ASSERT(nfaults == 1);	/* this arch only single fault tolerant */
	dag_h->creator = "ParityLoggingLargeWriteDAG";

	/* alloc the Wnd nodes, the xor node, and the Lpo node */
	nWndNodes = asmap->numStripeUnitsAccessed;
	RF_MallocAndAdd(nodes, (nWndNodes + 6) * sizeof(RF_DagNode_t),
			(RF_DagNode_t *), allocList);
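	/* carve the single allocation into the individual nodes: the Wnd
	 * nodes first, then the Xor, Lpo, block, sync, unblock and
	 * terminator nodes */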
	i = 0;
	wndNodes = &nodes[i];
	i += nWndNodes;
	xorNode = &nodes[i];
	i += 1;
	lpoNode = &nodes[i];
	i += 1;
	blockNode = &nodes[i];
	i += 1;
	syncNode = &nodes[i];
	i += 1;
	unblockNode = &nodes[i];
	i += 1;
	termNode = &nodes[i];
	i += 1;

	dag_h->numCommitNodes = nWndNodes + 1;
	dag_h->numCommits = 0;
	dag_h->numSuccedents = 1;

	rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h, new_asm_h, &nRodNodes, &sosBuffer, &eosBuffer, allocList);
	if (nRodNodes > 0)
		RF_MallocAndAdd(rodNodes, nRodNodes * sizeof(RF_DagNode_t),
				(RF_DagNode_t *), allocList);

	/* begin node initialization */
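	/* the rf_InitNode calls below take, in order: node, initial status,
	 * commit flag, do/undo/wakeup functions, number of successors,
	 * number of antecedents, number of parameters, number of results,
	 * DAG header, node name, allocation list */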
	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nRodNodes + 1, 0, 0, 0, dag_h, "Nil", allocList);
	rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nWndNodes + 1, 0, 0, dag_h, "Nil", allocList);
	rf_InitNode(syncNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nWndNodes + 1, nRodNodes + 1, 0, 0, dag_h, "Nil", allocList);
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);

	/* initialize the Rod nodes */
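	/* each disk I/O node carries four parameters: the physical disk
	 * address, the I/O buffer, the parity stripe ID, and a priority /
	 * reconstruction-unit pair packed by RF_CREATE_PARAM3 */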
	for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) {
		if (new_asm_h[asmNum]) {
			pda = new_asm_h[asmNum]->stripeMap->physInfo;
			while (pda) {
				rf_InitNode(&rodNodes[nodeNum], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rod", allocList);
				rodNodes[nodeNum].params[0].p = pda;
				rodNodes[nodeNum].params[1].p = pda->bufPtr;
				rodNodes[nodeNum].params[2].v = parityStripeID;
				rodNodes[nodeNum].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
				nodeNum++;
				pda = pda->next;
			}
		}
	}
	RF_ASSERT(nodeNum == nRodNodes);

	/* initialize the wnd nodes */
	pda = asmap->physInfo;
	for (i = 0; i < nWndNodes; i++) {
		rf_InitNode(&wndNodes[i], rf_wait, RF_TRUE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
		RF_ASSERT(pda != NULL);
		wndNodes[i].params[0].p = pda;
		wndNodes[i].params[1].p = pda->bufPtr;
		wndNodes[i].params[2].v = parityStripeID;
		wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
		pda = pda->next;
	}

	/* initialize the redundancy node */
	rf_InitNode(xorNode, rf_wait, RF_TRUE, redFunc, rf_NullNodeUndoFunc, NULL, 1, 1, 2 * (nWndNodes + nRodNodes) + 1, 1, dag_h, "Xr ", allocList);
	xorNode->flags |= RF_DAGNODE_FLAG_YIELD;
	for (i = 0; i < nWndNodes; i++) {
		xorNode->params[2 * i + 0] = wndNodes[i].params[0];	/* pda */
		xorNode->params[2 * i + 1] = wndNodes[i].params[1];	/* buf ptr */
	}
	for (i = 0; i < nRodNodes; i++) {
		xorNode->params[2 * (nWndNodes + i) + 0] = rodNodes[i].params[0];	/* pda */
		xorNode->params[2 * (nWndNodes + i) + 1] = rodNodes[i].params[1];	/* buf ptr */
	}
	xorNode->params[2 * (nWndNodes + nRodNodes)].p = raidPtr;	/* xor node needs to get
									 * at RAID information */

	/* look for an Rod node that reads a complete SU.  If none, alloc a
	 * buffer to receive the parity info.  Note that we can't use a new
	 * data buffer because it will not have gotten written when the xor
	 * occurs. */
	for (i = 0; i < nRodNodes; i++)
		if (((RF_PhysDiskAddr_t *) rodNodes[i].params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit)
			break;
	if (i == nRodNodes) {
		RF_MallocAndAdd(xorNode->results[0],
				rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), (void *), allocList);
	} else {
		xorNode->results[0] = rodNodes[i].params[1].p;
	}

	/* initialize the Lpo node */
	rf_InitNode(lpoNode, rf_wait, RF_FALSE, rf_ParityLogOverwriteFunc, rf_ParityLogOverwriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Lpo", allocList);

	lpoNode->params[0].p = asmap->parityInfo;
	lpoNode->params[1].p = xorNode->results[0];
	RF_ASSERT(asmap->parityInfo->next == NULL);	/* parityInfo must
							 * describe entire
							 * parity unit */

	/* connect nodes to form graph */

	/* connect dag header to block node */
	RF_ASSERT(dag_h->numSuccedents == 1);
	RF_ASSERT(blockNode->numAntecedents == 0);
	dag_h->succedents[0] = blockNode;

	/* connect the block node to the Rod nodes */
	RF_ASSERT(blockNode->numSuccedents == nRodNodes + 1);
	for (i = 0; i < nRodNodes; i++) {
		RF_ASSERT(rodNodes[i].numAntecedents == 1);
		blockNode->succedents[i] = &rodNodes[i];
		rodNodes[i].antecedents[0] = blockNode;
		rodNodes[i].antType[0] = rf_control;
	}

	/* connect the block node to the sync node */
	/* necessary if nRodNodes == 0 */
	RF_ASSERT(syncNode->numAntecedents == nRodNodes + 1);
	blockNode->succedents[nRodNodes] = syncNode;
	syncNode->antecedents[0] = blockNode;
	syncNode->antType[0] = rf_control;

	/* connect the Rod nodes to the syncNode */
	for (i = 0; i < nRodNodes; i++) {
		rodNodes[i].succedents[0] = syncNode;
		syncNode->antecedents[1 + i] = &rodNodes[i];
		syncNode->antType[1 + i] = rf_control;
	}

	/* connect the sync node to the xor node */
	RF_ASSERT(syncNode->numSuccedents == nWndNodes + 1);
	RF_ASSERT(xorNode->numAntecedents == 1);
	syncNode->succedents[0] = xorNode;
	xorNode->antecedents[0] = syncNode;
	xorNode->antType[0] = rf_trueData;	/* carry forward from sync */

	/* connect the sync node to the Wnd nodes */
	for (i = 0; i < nWndNodes; i++) {
		RF_ASSERT(wndNodes[i].numAntecedents == 1);
		syncNode->succedents[1 + i] = &wndNodes[i];
		wndNodes[i].antecedents[0] = syncNode;
		wndNodes[i].antType[0] = rf_control;
	}

	/* connect the xor node to the Lpo node */
	RF_ASSERT(xorNode->numSuccedents == 1);
	RF_ASSERT(lpoNode->numAntecedents == 1);
	xorNode->succedents[0] = lpoNode;
	lpoNode->antecedents[0] = xorNode;
	lpoNode->antType[0] = rf_trueData;

	/* connect the Wnd nodes to the unblock node */
	RF_ASSERT(unblockNode->numAntecedents == nWndNodes + 1);
	for (i = 0; i < nWndNodes; i++) {
		RF_ASSERT(wndNodes[i].numSuccedents == 1);
		wndNodes[i].succedents[0] = unblockNode;
		unblockNode->antecedents[i] = &wndNodes[i];
		unblockNode->antType[i] = rf_control;
	}

	/* connect the Lpo node to the unblock node */
	RF_ASSERT(lpoNode->numSuccedents == 1);
	lpoNode->succedents[0] = unblockNode;
	unblockNode->antecedents[nWndNodes] = lpoNode;
	unblockNode->antType[nWndNodes] = rf_control;

	/* connect unblock node to terminator */
	RF_ASSERT(unblockNode->numSuccedents == 1);
	RF_ASSERT(termNode->numAntecedents == 1);
	RF_ASSERT(termNode->numSuccedents == 0);
	unblockNode->succedents[0] = termNode;
	termNode->antecedents[0] = unblockNode;
	termNode->antType[0] = rf_control;
}




/******************************************************************************
 *
 * creates a DAG to perform a small-write operation (either RAID 5 or PQ),
 * which is as follows:
 *
 *                                     Header
 *                                       |
 *                                     Block
 *                                 / |  ... \   \
 *                                /  |       \   \
 *                             Rod  Rod      Rod  Rop
 *                             | \ /| \    / |  \/ |
 *                             |    |        |  /\ |
 *                             Wnd  Wnd      Wnd   X
 *                              |    \       /     |
 *                              |     \     /      |
 *                               \     \   /      Lpu
 *                                \     \ /       /
 *                                 +-> Unblock <-+
 *                                       |
 *                                       T
 *
 *
 * R = Read, W = Write, X = Xor, o = old, n = new, d = data, p = parity,
 * Lpu = parity log update.
 * When the access spans a stripe unit boundary and is less than one SU in
 * size, there will be two Rop -- X -- Lpu branches.  I call this the
 * "double-XOR" case.
 * The second output from each Rod node goes to the X node.  In the
 * double-XOR case, there are exactly 2 Rod nodes, and each sends one output
 * to one X node.
 * There is one Rod -- Wnd -- T branch for each stripe unit being updated.
 *
 * The block and unblock nodes are unused.  See comment above
 * CreateFaultFreeReadDAG.
 *
 * Note:  this DAG ignores all the optimizations related to making the RMWs
 *        atomic.  It also has the nasty property that none of the buffers
 *        allocated for reading old data & parity can be freed until the
 *        XOR node fires.  Need to fix this.
 *
 * A null qfuncs indicates single fault tolerant.
 *****************************************************************************/

void
rf_CommonCreateParityLoggingSmallWriteDAG(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList,
    const RF_RedFuncs_t * pfuncs,
    const RF_RedFuncs_t * qfuncs)
{
	RF_DagNode_t *xorNodes, *blockNode, *unblockNode, *nodes;
	RF_DagNode_t *readDataNodes, *readParityNodes;
	RF_DagNode_t *writeDataNodes, *lpuNodes;
	RF_DagNode_t *termNode;
	RF_PhysDiskAddr_t *pda = asmap->physInfo;
	int     numDataNodes = asmap->numStripeUnitsAccessed;
	int     numParityNodes = (asmap->parityInfo->next) ? 2 : 1;
	int     i, j, nNodes, totalNumNodes;
	RF_ReconUnitNum_t which_ru;
	int     (*func) (RF_DagNode_t * node), (*undoFunc) (RF_DagNode_t * node);
	const char   *name;
	RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru);
	long    nfaults __unused = qfuncs ? 2 : 1;

	if (rf_dagDebug)
		printf("[Creating parity-logging small-write DAG]\n");
	RF_ASSERT(numDataNodes > 0);
	RF_ASSERT(nfaults == 1);
	dag_h->creator = "ParityLoggingSmallWriteDAG";

	/* DAG creation occurs in four steps: 1. count the number of nodes
	 * in the DAG, 2. create the nodes, 3. initialize the nodes,
	 * 4. connect the nodes. */

	/* Step 1. compute number of nodes in the graph */

	/* Number of nodes: a read (Rod) and a write (Wnd) for each data
	 * unit, a redundancy computation (Xor) node for each parity unit,
	 * a read (Rop) and a log update (Lpu) for each parity unit, plus a
	 * block, an unblock and a terminator node. */
	totalNumNodes = (2 * numDataNodes) + numParityNodes + (2 * numParityNodes) + 3;

	nNodes = numDataNodes + numParityNodes;

	dag_h->numCommitNodes = numDataNodes + numParityNodes;
	dag_h->numCommits = 0;
	dag_h->numSuccedents = 1;

	/* Step 2. create the nodes */
	RF_MallocAndAdd(nodes, totalNumNodes * sizeof(RF_DagNode_t),
			(RF_DagNode_t *), allocList);
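	/* carve the single allocation into the individual nodes: block,
	 * unblock, Rod, Rop, Wnd, Lpu, Xor and terminator, in that order */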
	i = 0;
	blockNode = &nodes[i];
	i += 1;
	unblockNode = &nodes[i];
	i += 1;
	readDataNodes = &nodes[i];
	i += numDataNodes;
	readParityNodes = &nodes[i];
	i += numParityNodes;
	writeDataNodes = &nodes[i];
	i += numDataNodes;
	lpuNodes = &nodes[i];
	i += numParityNodes;
	xorNodes = &nodes[i];
	i += numParityNodes;
	termNode = &nodes[i];
	i += 1;

	RF_ASSERT(i == totalNumNodes);

	/* Step 3. initialize the nodes */
	/* initialize block node (Nil) */
	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0, dag_h, "Nil", allocList);

	/* initialize unblock node (Nil) */
	rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nNodes, 0, 0, dag_h, "Nil", allocList);

	/* initialize terminator node (Trm) */
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);

	/* initialize nodes which read old data (Rod) */
	for (i = 0; i < numDataNodes; i++) {
		rf_InitNode(&readDataNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, nNodes, 1, 4, 0, dag_h, "Rod", allocList);
		RF_ASSERT(pda != NULL);
		readDataNodes[i].params[0].p = pda;	/* physical disk addr
							 * desc */
		readDataNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda->numSector << raidPtr->logBytesPerSector);	/* buffer to hold old data */
		readDataNodes[i].params[2].v = parityStripeID;
		readDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
		pda = pda->next;
		readDataNodes[i].propList[0] = NULL;
		readDataNodes[i].propList[1] = NULL;
	}

	/* initialize nodes which read old parity (Rop) */
	pda = asmap->parityInfo;
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(pda != NULL);
		rf_InitNode(&readParityNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, nNodes, 1, 4, 0, dag_h, "Rop", allocList);
		readParityNodes[i].params[0].p = pda;
		readParityNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda->numSector << raidPtr->logBytesPerSector);	/* buffer to hold old parity */
		readParityNodes[i].params[2].v = parityStripeID;
		readParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
		readParityNodes[i].propList[0] = NULL;
		pda = pda->next;
	}

	/* initialize nodes which write new data (Wnd) */
	pda = asmap->physInfo;
	for (i = 0; i < numDataNodes; i++) {
		RF_ASSERT(pda != NULL);
		rf_InitNode(&writeDataNodes[i], rf_wait, RF_TRUE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, nNodes, 4, 0, dag_h, "Wnd", allocList);
		writeDataNodes[i].params[0].p = pda;	/* physical disk addr
							 * desc */
		writeDataNodes[i].params[1].p = pda->bufPtr;	/* buffer holding new
								 * data to be written */
		writeDataNodes[i].params[2].v = parityStripeID;
		writeDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);

		pda = pda->next;
	}


	/* initialize nodes which compute new parity */
	/* we use the simple XOR func in the double-XOR case, and when we're
	 * accessing only a portion of one stripe unit.  the distinction
	 * between the two is that the regular XOR func assumes that the
	 * targbuf is a full SU in size, and examines the pda associated with
	 * the buffer to decide where within the buffer to XOR the data,
	 * whereas the simple XOR func just XORs the data into the start of
	 * the buffer. */
	if ((numParityNodes == 2) || ((numDataNodes == 1) && (asmap->totalSectorsAccessed < raidPtr->Layout.sectorsPerStripeUnit))) {
		func = pfuncs->simple;
		undoFunc = rf_NullNodeUndoFunc;
		name = pfuncs->SimpleName;
	} else {
		func = pfuncs->regular;
		undoFunc = rf_NullNodeUndoFunc;
		name = pfuncs->RegularName;
	}
	/* initialize the xor nodes: params are {pda,buf} from {Rod,Wnd,Rop}
	 * nodes, and raidPtr */
	if (numParityNodes == 2) {	/* double-xor case */
		for (i = 0; i < numParityNodes; i++) {
			rf_InitNode(&xorNodes[i], rf_wait, RF_TRUE, func, undoFunc, NULL, 1, nNodes, 7, 1, dag_h, name, allocList);	/* no wakeup func for
																	 * xor */
			xorNodes[i].flags |= RF_DAGNODE_FLAG_YIELD;
			xorNodes[i].params[0] = readDataNodes[i].params[0];
			xorNodes[i].params[1] = readDataNodes[i].params[1];
			xorNodes[i].params[2] = readParityNodes[i].params[0];
			xorNodes[i].params[3] = readParityNodes[i].params[1];
			xorNodes[i].params[4] = writeDataNodes[i].params[0];
			xorNodes[i].params[5] = writeDataNodes[i].params[1];
			xorNodes[i].params[6].p = raidPtr;
			xorNodes[i].results[0] = readParityNodes[i].params[1].p;	/* use old parity buf as
											 * target buf */
		}
	} else {
		/* there is only one xor node in this case */
		rf_InitNode(&xorNodes[0], rf_wait, RF_TRUE, func, undoFunc, NULL, 1, nNodes, (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h, name, allocList);
		xorNodes[0].flags |= RF_DAGNODE_FLAG_YIELD;
		for (i = 0; i < numDataNodes + 1; i++) {
			/* set up params related to Rod and Rop nodes.  The
			 * final iteration (i == numDataNodes) picks up
			 * readParityNodes[0], which immediately follows the
			 * Rod nodes in the contiguous node array. */
			xorNodes[0].params[2 * i + 0] = readDataNodes[i].params[0];	/* pda */
			xorNodes[0].params[2 * i + 1] = readDataNodes[i].params[1];	/* buffer pointer */
		}
		for (i = 0; i < numDataNodes; i++) {
			/* set up params related to Wnd nodes */
			xorNodes[0].params[2 * (numDataNodes + 1 + i) + 0] = writeDataNodes[i].params[0];	/* pda */
			xorNodes[0].params[2 * (numDataNodes + 1 + i) + 1] = writeDataNodes[i].params[1];	/* buffer pointer */
		}
		xorNodes[0].params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr;	/* xor node needs to get
											 * at RAID information */
		xorNodes[0].results[0] = readParityNodes[0].params[1].p;
	}
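	/* parameter counts above: in the double-XOR case each Xor node
	 * takes three {pda,buf} pairs (Rod, Rop, Wnd) plus the raidPtr,
	 * i.e. 7 params; in the single-XOR case it takes 2 * (numDataNodes
	 * + 1) read params and 2 * numDataNodes write params plus the
	 * raidPtr */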

	/* initialize the log node(s) */
	pda = asmap->parityInfo;
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(pda);
		rf_InitNode(&lpuNodes[i], rf_wait, RF_FALSE, rf_ParityLogUpdateFunc, rf_ParityLogUpdateUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Lpu", allocList);
		lpuNodes[i].params[0].p = pda;	/* PhysDiskAddr of parity */
		lpuNodes[i].params[1].p = xorNodes[i].results[0];	/* buffer pointer to
									 * parity */
		pda = pda->next;
	}


	/* Step 4. connect the nodes */
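	/* edge types: rf_control edges impose ordering only; rf_trueData
	 * edges carry a buffer from a producer to a consumer; rf_antiData
	 * marks a read that must complete before the same location is
	 * overwritten */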

	/* connect header to block node */
	RF_ASSERT(dag_h->numSuccedents == 1);
	RF_ASSERT(blockNode->numAntecedents == 0);
	dag_h->succedents[0] = blockNode;

	/* connect block node to read old data nodes */
	RF_ASSERT(blockNode->numSuccedents == (numDataNodes + numParityNodes));
	for (i = 0; i < numDataNodes; i++) {
		blockNode->succedents[i] = &readDataNodes[i];
		RF_ASSERT(readDataNodes[i].numAntecedents == 1);
		readDataNodes[i].antecedents[0] = blockNode;
		readDataNodes[i].antType[0] = rf_control;
	}

	/* connect block node to read old parity nodes */
	for (i = 0; i < numParityNodes; i++) {
		blockNode->succedents[numDataNodes + i] = &readParityNodes[i];
		RF_ASSERT(readParityNodes[i].numAntecedents == 1);
		readParityNodes[i].antecedents[0] = blockNode;
		readParityNodes[i].antType[0] = rf_control;
	}

	/* connect read old data nodes to write new data nodes */
	for (i = 0; i < numDataNodes; i++) {
		RF_ASSERT(readDataNodes[i].numSuccedents == numDataNodes + numParityNodes);
		for (j = 0; j < numDataNodes; j++) {
			RF_ASSERT(writeDataNodes[j].numAntecedents == numDataNodes + numParityNodes);
			readDataNodes[i].succedents[j] = &writeDataNodes[j];
			writeDataNodes[j].antecedents[i] = &readDataNodes[i];
			if (i == j)
				writeDataNodes[j].antType[i] = rf_antiData;
			else
				writeDataNodes[j].antType[i] = rf_control;
		}
	}

	/* connect read old data nodes to xor nodes */
	for (i = 0; i < numDataNodes; i++)
		for (j = 0; j < numParityNodes; j++) {
			RF_ASSERT(xorNodes[j].numAntecedents == numDataNodes + numParityNodes);
			readDataNodes[i].succedents[numDataNodes + j] = &xorNodes[j];
			xorNodes[j].antecedents[i] = &readDataNodes[i];
			xorNodes[j].antType[i] = rf_trueData;
		}

	/* connect read old parity nodes to write new data nodes */
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(readParityNodes[i].numSuccedents == numDataNodes + numParityNodes);
		for (j = 0; j < numDataNodes; j++) {
			readParityNodes[i].succedents[j] = &writeDataNodes[j];
			writeDataNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
			writeDataNodes[j].antType[numDataNodes + i] = rf_control;
		}
	}

	/* connect read old parity nodes to xor nodes */
	for (i = 0; i < numParityNodes; i++)
		for (j = 0; j < numParityNodes; j++) {
			readParityNodes[i].succedents[numDataNodes + j] = &xorNodes[j];
			xorNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
			xorNodes[j].antType[numDataNodes + i] = rf_trueData;
		}

	/* connect xor nodes to parity log update nodes */
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(xorNodes[i].numSuccedents == 1);
		RF_ASSERT(lpuNodes[i].numAntecedents == 1);
		xorNodes[i].succedents[0] = &lpuNodes[i];
		lpuNodes[i].antecedents[0] = &xorNodes[i];
		lpuNodes[i].antType[0] = rf_trueData;
	}

	for (i = 0; i < numDataNodes; i++) {
		/* connect write new data nodes to unblock node */
		RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
		RF_ASSERT(unblockNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
		writeDataNodes[i].succedents[0] = unblockNode;
		unblockNode->antecedents[i] = &writeDataNodes[i];
		unblockNode->antType[i] = rf_control;
	}

	/* connect parity log update nodes to unblock node */
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(lpuNodes[i].numSuccedents == 1);
		lpuNodes[i].succedents[0] = unblockNode;
		unblockNode->antecedents[numDataNodes + i] = &lpuNodes[i];
		unblockNode->antType[numDataNodes + i] = rf_control;
	}

	/* connect unblock node to terminator */
	RF_ASSERT(unblockNode->numSuccedents == 1);
	RF_ASSERT(termNode->numAntecedents == 1);
	RF_ASSERT(termNode->numSuccedents == 0);
	unblockNode->succedents[0] = termNode;
	termNode->antecedents[0] = unblockNode;
	termNode->antType[0] = rf_control;
}


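/*
 * The two wrappers below are the externally visible DAG creation entry
 * points: they fix the redundancy functions and fault count and call the
 * common routines above.  They are presumably installed by the parity
 * logging layout's DAG-selection code; note that the pfuncs/qfuncs and
 * nfaults/redFunc arguments they receive are ignored in favor of the
 * single-fault-tolerant XOR defaults.
 */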
void
rf_CreateParityLoggingSmallWriteDAG(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList,
    const RF_RedFuncs_t * pfuncs,
    const RF_RedFuncs_t * qfuncs)
{
	dag_h->creator = "ParityLoggingSmallWriteDAG";
	rf_CommonCreateParityLoggingSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_xorFuncs, NULL);
}


void
rf_CreateParityLoggingLargeWriteDAG(
    RF_Raid_t * raidPtr,
    RF_AccessStripeMap_t * asmap,
    RF_DagHeader_t * dag_h,
    void *bp,
    RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t * allocList,
    int nfaults,
    int (*redFunc) (RF_DagNode_t *))
{
	dag_h->creator = "ParityLoggingLargeWriteDAG";
	rf_CommonCreateParityLoggingLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 1, rf_RegularXorFunc);
}
#endif				/* RF_INCLUDE_PARITYLOGGING > 0 */