      1 /*	$NetBSD: rf_decluster.c,v 1.1 1998/11/13 04:20:28 oster Exp $	*/
      2 /*
      3  * Copyright (c) 1995 Carnegie-Mellon University.
      4  * All rights reserved.
      5  *
      6  * Author: Mark Holland
      7  *
      8  * Permission to use, copy, modify and distribute this software and
      9  * its documentation is hereby granted, provided that both the copyright
     10  * notice and this permission notice appear in all copies of the
     11  * software, derivative works or modified versions, and any portions
     12  * thereof, and that both notices appear in supporting documentation.
     13  *
     14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     17  *
     18  * Carnegie Mellon requests users of this software to return to
     19  *
     20  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
     21  *  School of Computer Science
     22  *  Carnegie Mellon University
     23  *  Pittsburgh PA 15213-3890
     24  *
     25  * any improvements or extensions that they make and grant Carnegie the
     26  * rights to redistribute these changes.
     27  */
     28 
     29 /*----------------------------------------------------------------------
     30  *
     31  * rf_decluster.c -- code related to the declustered layout
     32  *
     33  * Created 10-21-92 (MCH)
     34  *
     35  * Nov 93:  adding support for distributed sparing.  This code is a little
     36  *          complex:  the basic layout used is as follows:
     37  *          let F = (v-1)/GCD(r,v-1).  The spare space for each set of
     38  *          F consecutive fulltables is grouped together and placed after
     39  *          that set of tables.
     40  *                   +------------------------------+
     41  *                   |        F fulltables          |
     42  *                   |        Spare Space           |
     43  *                   |        F fulltables          |
     44  *                   |        Spare Space           |
     45  *                   |            ...               |
     46  *                   +------------------------------+
     47  *
     48  *--------------------------------------------------------------------*/
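
        /*
         * Worked example (editorial illustration only; these parameters are not
         * taken from any shipped RAIDframe block design): for the Fano-plane
         * design with v = 7 disks, b = 7 blocks, k = 3, and r = 3, we get
         * F = (v-1)/GCD(r,v-1) = 6/3 = 2, so a chunk of spare space follows
         * every 2 fulltables.  For the complete block design on v = 5 disks
         * (b = 5, k = 4, r = 4), F = 4/4 = 1 and spare space follows every
         * single fulltable.
         */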
     49 
     50 /*
     51  *
     52  * Log: rf_decluster.c,v
     53  * Revision 1.51  1996/08/21 19:47:10  jimz
     54  * fix bogus return values from config
     55  *
     56  * Revision 1.50  1996/08/20  22:41:42  jimz
     57  * better diagnostics for bad blockdesigns
     58  *
     59  * Revision 1.49  1996/07/31  16:56:18  jimz
     60  * dataBytesPerStripe, sectorsPerDisk init arch-indep.
     61  *
     62  * Revision 1.48  1996/07/29  14:05:12  jimz
     63  * fix numPUs/numRUs confusion (everything is now numRUs)
     64  * clean up some commenting, return values
     65  *
     66  * Revision 1.47  1996/07/27  23:36:08  jimz
     67  * Solaris port of simulator
     68  *
     69  * Revision 1.46  1996/07/27  18:40:11  jimz
     70  * cleanup sweep
     71  *
     72  * Revision 1.45  1996/07/18  22:57:14  jimz
     73  * port simulator to AIX
     74  *
     75  * Revision 1.44  1996/07/13  00:00:59  jimz
     76  * sanitized generalized reconstruction architecture
     77  * cleaned up head sep, rbuf problems
     78  *
     79  * Revision 1.43  1996/06/19  17:53:48  jimz
     80  * move GetNumSparePUs, InstallSpareTable ops into layout switch
     81  *
     82  * Revision 1.42  1996/06/17  03:23:48  jimz
     83  * switch DeclusteredDS typing
     84  *
     85  * Revision 1.41  1996/06/11  08:55:15  jimz
     86  * improved error-checking at configuration time
     87  *
     88  * Revision 1.40  1996/06/10  11:55:47  jimz
     89  * Straightened out some per-array/not-per-array distinctions, fixed
     90  * a couple bugs related to confusion. Added shutdown lists. Removed
     91  * layout shutdown function (now subsumed by shutdown lists).
     92  *
     93  * Revision 1.39  1996/06/09  02:36:46  jimz
     94  * lots of little crufty cleanup- fixup whitespace
     95  * issues, comment #ifdefs, improve typing in some
     96  * places (esp size-related)
     97  *
     98  * Revision 1.38  1996/06/07  22:26:27  jimz
     99  * type-ify which_ru (RF_ReconUnitNum_t)
    100  *
    101  * Revision 1.37  1996/06/07  21:33:04  jimz
    102  * begin using consistent types for sector numbers,
    103  * stripe numbers, row+col numbers, recon unit numbers
    104  *
    105  * Revision 1.36  1996/06/03  23:28:26  jimz
    106  * more bugfixes
    107  * check in tree to sync for IPDS runs with current bugfixes
    108  * there still may be a problem with threads in the script test
    109  * getting I/Os stuck- not trivially reproducible (runs ~50 times
    110  * in a row without getting stuck)
    111  *
    112  * Revision 1.35  1996/06/02  17:31:48  jimz
    113  * Moved a lot of global stuff into array structure, where it belongs.
    114  * Fixed up paritylogging, pss modules in this manner. Some general
    115  * code cleanup. Removed lots of dead code, some dead files.
    116  *
    117  * Revision 1.34  1996/05/30  23:22:16  jimz
    118  * bugfixes of serialization, timing problems
    119  * more cleanup
    120  *
    121  * Revision 1.33  1996/05/30  11:29:41  jimz
    122  * Numerous bug fixes. Stripe lock release code disagreed with the taking code
    123  * about when stripes should be locked (I made it consistent: no parity, no lock)
    124  * There was a lot of extra serialization of I/Os which I've removed- a lot of
    125  * it was to calculate values for the cache code, which is no longer with us.
    126  * More types, function, macro cleanup. Added code to properly quiesce the array
    127  * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
    128  * before. Fixed memory allocation, freeing bugs.
    129  *
    130  * Revision 1.32  1996/05/27  18:56:37  jimz
    131  * more code cleanup
    132  * better typing
    133  * compiles in all 3 environments
    134  *
    135  * Revision 1.31  1996/05/24  01:59:45  jimz
    136  * another checkpoint in code cleanup for release
    137  * time to sync kernel tree
    138  *
    139  * Revision 1.30  1996/05/23  00:33:23  jimz
    140  * code cleanup: move all debug decls to rf_options.c, all extern
    141  * debug decls to rf_options.h, all debug vars preceded by rf_
    142  *
    143  * Revision 1.29  1996/05/18  19:51:34  jimz
    144  * major code cleanup- fix syntax, make some types consistent,
    145  * add prototypes, clean out dead code, et cetera
    146  *
    147  * Revision 1.28  1995/12/12  18:10:06  jimz
    148  * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
    149  * fix 80-column brain damage in comments
    150  *
    151  * Revision 1.27  1995/12/01  16:00:08  root
    152  * added copyright info
    153  *
    154  * Revision 1.26  1995/11/28  21:35:12  amiri
    155  * set the RF_BD_DECLUSTERED flag
    156  *
    157  * Revision 1.25  1995/11/17  18:56:00  wvcii
    158  * added prototyping to MapParity
    159  *
    160  * Revision 1.24  1995/07/04  22:25:33  holland
    161  * increased default num bufs
    162  *
    163  * Revision 1.23  1995/07/03  20:23:51  holland
    164  * changed floating recon bufs & head sep yet again
    165  *
    166  * Revision 1.22  1995/07/03  18:12:14  holland
    167  * changed the way the number of floating recon bufs & the head sep
    168  * limit are set
    169  *
    170  * Revision 1.21  1995/07/02  15:07:42  holland
    171  * bug fixes related to getting distributed sparing numbers
    172  *
    173  * Revision 1.20  1995/06/23  13:41:28  robby
    174  * updated to prototypes in rf_layout.h
    175  *
    176  */
    177 
    178 #ifdef _KERNEL
    179 #define KERNEL
    180 #endif
    181 
    182 
    183 #include "rf_types.h"
    184 #include "rf_raid.h"
    185 #include "rf_raidframe.h"
    186 #include "rf_configure.h"
    187 #include "rf_decluster.h"
    188 #include "rf_debugMem.h"
    189 #include "rf_utils.h"
    190 #include "rf_alloclist.h"
    191 #include "rf_general.h"
    192 #include "rf_shutdown.h"
    193 #include "rf_sys.h"
    194 
    195 extern int rf_copyback_in_progress;                /* debug only */
    196 
    197 /* found in rf_kintf.c */
    198 int rf_GetSpareTableFromDaemon(RF_SparetWait_t  *req);
    199 
    200 /* configuration code */
    201 
    202 int rf_ConfigureDeclustered(
    203   RF_ShutdownList_t  **listp,
    204   RF_Raid_t           *raidPtr,
    205   RF_Config_t         *cfgPtr)
    206 {
    207     RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    208     int b, v, k, r, lambda;				/* block design params */
    209     int i, j;
    210     RF_RowCol_t *first_avail_slot;
    211     RF_StripeCount_t complete_FT_count, numCompleteFullTablesPerDisk;
    212     RF_DeclusteredConfigInfo_t *info;
    213     RF_StripeCount_t PUsPerDisk, spareRegionDepthInPUs, numCompleteSpareRegionsPerDisk, extraPUsPerDisk;
    214     RF_StripeCount_t totSparePUsPerDisk;
    215     RF_SectorNum_t diskOffsetOfLastFullTableInSUs;
    216     RF_SectorCount_t SpareSpaceInSUs;
    217     char *cfgBuf = (char *) (cfgPtr->layoutSpecific);
    218     RF_StripeNum_t l, SUID;
    219 
    220     SUID = l = 0;
    221     numCompleteSpareRegionsPerDisk = 0;
    222 
    223     /* 1. create layout specific structure */
    224     RF_MallocAndAdd(info, sizeof(RF_DeclusteredConfigInfo_t), (RF_DeclusteredConfigInfo_t *), raidPtr->cleanupList);
    225     if (info == NULL)
    226       return(ENOMEM);
    227     layoutPtr->layoutSpecificInfo = (void *) info;
    228     info->SpareTable = NULL;
    229 
    230     /* 2. extract parameters from the config structure */
    231     if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
    232       (void) bcopy(cfgBuf, info->sparemap_fname, RF_SPAREMAP_NAME_LEN);
    233     }
    234     cfgBuf += RF_SPAREMAP_NAME_LEN;
    235 
    236     b        = *( (int *) cfgBuf);   cfgBuf += sizeof(int);
    237     v        = *( (int *) cfgBuf);   cfgBuf += sizeof(int);
    238     k        = *( (int *) cfgBuf);   cfgBuf += sizeof(int);
    239     r        = *( (int *) cfgBuf);   cfgBuf += sizeof(int);
    240     lambda   = *( (int *) cfgBuf);   cfgBuf += sizeof(int);
    241     raidPtr->noRotate = *( (int *) cfgBuf);   cfgBuf += sizeof(int);
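
            /*
             * Editorial summary of the layout-specific config blob consumed here
             * and below (derived from this routine itself, not from separate docs):
             *   RF_SPAREMAP_NAME_LEN bytes : sparemap file name (used only when
             *                                RF_DISTRIBUTE_SPARE is set, but the
             *                                space is always present)
             *   6 ints                     : b, v, k, r, lambda, noRotate
             *   b*k bytes                  : the block design table, one byte per
             *                                entry, row by row (copied into
             *                                LayoutTable further down)
             */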
    242 
    243     /* the sparemaps are generated assuming that parity is rotated, so we issue
    244      * a warning if both distributed sparing and no-rotate are on at the same time
    245      */
    246     if ((layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) && raidPtr->noRotate) {
    247 	RF_ERRORMSG("Warning:  distributed sparing specified without parity rotation.\n");
    248     }
    249 
    250     if (raidPtr->numCol != v) {
    251         RF_ERRORMSG2("RAID: config error: table element count (%d) not equal to no. of cols (%d)\n", v, raidPtr->numCol);
    252         return(EINVAL);
    253     }
    254 
    255     /* 3.  set up the values used in the mapping code */
    256     info->BlocksPerTable = b;
    257     info->Lambda = lambda;
    258     info->NumParityReps = info->groupSize = k;
    259     info->SUsPerTable = b * (k-1) * layoutPtr->SUsPerPU;/* b blks, k-1 SUs each */
    260     info->SUsPerFullTable = k * info->SUsPerTable;	/* rot k times */
    261     info->PUsPerBlock = k-1;
    262     info->SUsPerBlock = info->PUsPerBlock * layoutPtr->SUsPerPU;
    263     info->TableDepthInPUs = (b*k) / v;
    264     info->FullTableDepthInPUs = info->TableDepthInPUs * k;		/* k repetitions */
    265 
    266     /* used only in distributed sparing case */
    267     info->FullTablesPerSpareRegion = (v-1) / rf_gcd(r, v-1);		/* (v-1)/gcd fulltables */
    268     info->TablesPerSpareRegion = k * info->FullTablesPerSpareRegion;
    269     info->SpareSpaceDepthPerRegionInSUs = (r * info->TablesPerSpareRegion / (v-1)) * layoutPtr->SUsPerPU;
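
            /*
             * Editorial note on the formula above: within one spare region each
             * disk holds TablesPerSpareRegion * TableDepthInPUs PUs of data and
             * parity (see spareRegionDepthInPUs below).  Reserving 1/(v-1) of
             * that depth on every surviving disk is enough to absorb one failed
             * disk's share of the region, and since b*k = v*r for a balanced
             * block design, TableDepthInPUs = b*k/v = r.  That gives a reserved
             * depth of r * TablesPerSpareRegion / (v-1) PUs, i.e. the SU count
             * computed above.
             */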
    270 
    271     /* check to make sure the block design is sufficiently small */
    272     if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
    273         if (info->FullTableDepthInPUs * layoutPtr->SUsPerPU + info->SpareSpaceDepthPerRegionInSUs > layoutPtr->stripeUnitsPerDisk) {
    274 	    RF_ERRORMSG3("RAID: config error: Full Table depth (%d) + Spare Space (%d) larger than disk size (%d) (BD too big)\n",
    275 			 (int)info->FullTableDepthInPUs,
    276 			 (int)info->SpareSpaceDepthPerRegionInSUs,
    277 			 (int)layoutPtr->stripeUnitsPerDisk);
    278 	    return(EINVAL);
    279 	}
    280     } else {
    281 	if (info->TableDepthInPUs * layoutPtr->SUsPerPU > layoutPtr->stripeUnitsPerDisk) {
    282 	    RF_ERRORMSG2("RAID: config error: Table depth (%d) larger than disk size (%d) (BD too big)\n",
    283 			 (int)(info->TableDepthInPUs * layoutPtr->SUsPerPU),
    284 			 (int)layoutPtr->stripeUnitsPerDisk);
    285 	    return(EINVAL);
    286 	}
    287     }
    288 
    289 
    290     /* compute the size of each disk, and the number of tables in the last fulltable (which
    291      * need not be complete)
    292      */
    293     if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
    294 
    295 	PUsPerDisk = layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU;
    296 	spareRegionDepthInPUs = (info->TablesPerSpareRegion * info->TableDepthInPUs +
    297 				 (info->TablesPerSpareRegion * info->TableDepthInPUs) / (v-1));
    298 	info->SpareRegionDepthInSUs = spareRegionDepthInPUs * layoutPtr->SUsPerPU;
    299 
    300 	numCompleteSpareRegionsPerDisk = PUsPerDisk / spareRegionDepthInPUs;
    301 	info->NumCompleteSRs = numCompleteSpareRegionsPerDisk;
    302 	extraPUsPerDisk = PUsPerDisk % spareRegionDepthInPUs;
    303 
    304 	/* assume conservatively that we need the full amount of spare space in one region in order
    305 	 * to provide spares for the partial spare region at the end of the array.  We set "i" to
    306 	 * the number of tables in the partial spare region.  This may actually include some fulltables.
    307 	 */
    308 	extraPUsPerDisk -= (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU);
    309 	if (extraPUsPerDisk <= 0) i = 0;
    310 	else i = extraPUsPerDisk/info->TableDepthInPUs;
    311 
    312 	complete_FT_count = raidPtr->numRow * (numCompleteSpareRegionsPerDisk * (info->TablesPerSpareRegion/k) + i/k);
    313         info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable;
    314 	info->ExtraTablesPerDisk = i % k;
    315 
    316 	/* note that in the last spare region, the spare space is complete even though data/parity space is not */
    317 	totSparePUsPerDisk = (numCompleteSpareRegionsPerDisk+1) * (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU);
    318 	info->TotSparePUsPerDisk = totSparePUsPerDisk;
    319 
    320 	layoutPtr->stripeUnitsPerDisk =
    321 	    ((complete_FT_count/raidPtr->numRow) * info->FullTableDepthInPUs +	 	/* data & parity space */
    322 	     info->ExtraTablesPerDisk * info->TableDepthInPUs +
    323 	     totSparePUsPerDisk								/* spare space */
    324 	    ) * layoutPtr->SUsPerPU;
    325 	layoutPtr->dataStripeUnitsPerDisk =
    326 	    (complete_FT_count * info->FullTableDepthInPUs + info->ExtraTablesPerDisk * info->TableDepthInPUs)
    327 	    * layoutPtr->SUsPerPU * (k-1) / k;
    328 
    329     } else {
    330         /* non-dist spare case:  force each disk to contain an integral number of tables */
    331         layoutPtr->stripeUnitsPerDisk /= (info->TableDepthInPUs * layoutPtr->SUsPerPU);
    332         layoutPtr->stripeUnitsPerDisk *= (info->TableDepthInPUs * layoutPtr->SUsPerPU);
    333 
    334 	/* compute the number of tables in the last fulltable, which need not be complete */
    335         complete_FT_count =
    336             ((layoutPtr->stripeUnitsPerDisk/layoutPtr->SUsPerPU) / info->FullTableDepthInPUs) * raidPtr->numRow;
    337 
    338         info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable;
    339         info->ExtraTablesPerDisk =
    340 		((layoutPtr->stripeUnitsPerDisk/layoutPtr->SUsPerPU) / info->TableDepthInPUs) % k;
    341     }
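
            /*
             * Worked example for the non-distributed-sparing branch above
             * (editorial, made-up numbers): with stripeUnitsPerDisk = 100,
             * SUsPerPU = 1, TableDepthInPUs = 4, k = 4 and numRow = 1, the disk
             * size is already a multiple of TableDepthInPUs * SUsPerPU, so it
             * stays 100.  FullTableDepthInPUs = 16, so complete_FT_count =
             * (100/16) * 1 = 6 complete fulltables (96 SUs), and
             * ExtraTablesPerDisk = (100/4) % 4 = 1, i.e. one leftover table
             * (4 SUs) in the final, partial fulltable.
             */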
    342 
    343     raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;
    344 
    345     /* find the disk offset of the stripe unit where the last fulltable starts */
    346     numCompleteFullTablesPerDisk = complete_FT_count / raidPtr->numRow;
    347     diskOffsetOfLastFullTableInSUs = numCompleteFullTablesPerDisk * info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
    348     if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
    349         SpareSpaceInSUs  = numCompleteSpareRegionsPerDisk * info->SpareSpaceDepthPerRegionInSUs;
    350         diskOffsetOfLastFullTableInSUs += SpareSpaceInSUs;
    351         info->DiskOffsetOfLastSpareSpaceChunkInSUs =
    352 	    diskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU;
    353     }
    354     info->DiskOffsetOfLastFullTableInSUs = diskOffsetOfLastFullTableInSUs;
    355     info->numCompleteFullTablesPerDisk = numCompleteFullTablesPerDisk;
    356 
    357     /* 4.  create and initialize the lookup tables */
    358     info->LayoutTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
    359     if (info->LayoutTable == NULL)
    360       return(ENOMEM);
    361     info->OffsetTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
    362     if (info->OffsetTable == NULL)
    363       return(ENOMEM);
    364     info->BlockTable  =	rf_make_2d_array(info->TableDepthInPUs*layoutPtr->SUsPerPU, raidPtr->numCol, raidPtr->cleanupList);
    365     if (info->BlockTable == NULL)
    366       return(ENOMEM);
    367 
    368     first_avail_slot = rf_make_1d_array(v, NULL);
    369     if (first_avail_slot == NULL)
    370       return(ENOMEM);
    371 
    372     for (i=0; i<b; i++)
    373       for (j=0; j<k; j++)
    374         info->LayoutTable[i][j] = *cfgBuf++;
    375 
    376     /* initialize offset table */
    377     for (i=0; i<b; i++) for (j=0; j<k; j++) {
    378         info->OffsetTable[i][j] = first_avail_slot[ info->LayoutTable[i][j] ];
    379         first_avail_slot[ info->LayoutTable[i][j] ]++;
    380     }
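
            /*
             * Toy illustration (editorial) of the tables being built: take the
             * trivial "all pairs" block design on v = 3 disks with b = 3, k = 2,
             * and LayoutTable = { {0,1}, {1,2}, {0,2} }.  The loop above then
             * yields OffsetTable = { {0,0}, {1,0}, {1,1} }: entry [i][j] counts
             * how many earlier blocks already placed a unit on disk
             * LayoutTable[i][j], i.e. block i's vertical position on that disk
             * within the table.
             */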
    381 
    382     /* initialize block table */
    383     for (SUID=l=0; l<layoutPtr->SUsPerPU; l++) {
    384         for (i=0; i<b; i++) {
    385             for (j=0; j<k; j++) {
    386                 info->BlockTable[ (info->OffsetTable[i][j] * layoutPtr->SUsPerPU) + l ]
    387 		                [ info->LayoutTable[i][j] ] = SUID;
    388             }
    389             SUID++;
    390         }
    391     }
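
            /*
             * Continuing the toy example (editorial) with SUsPerPU = 1:
             * TableDepthInPUs = b*k/v = 2, so BlockTable has 2 rows and v = 3
             * columns, and the loops above fill it as { {0,0,1}, {2,1,2} }
             * (stripe unit IDs within the table, which coincide with block
             * numbers here since SUsPerPU = 1).  Reading down a column gives the
             * blocks stored on that disk, top to bottom: disk 0 holds blocks 0
             * and 2, disk 1 holds blocks 0 and 1, disk 2 holds blocks 1 and 2.
             */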
    392 
    393     rf_free_1d_array(first_avail_slot, v);
    394 
    395     /* 5.  set up the remaining redundant-but-useful parameters */
    396 
    397     raidPtr->totalSectors = (k*complete_FT_count + raidPtr->numRow*info->ExtraTablesPerDisk) *
    398     			  info->SUsPerTable * layoutPtr->sectorsPerStripeUnit;
    399     layoutPtr->numStripe = (raidPtr->totalSectors / layoutPtr->sectorsPerStripeUnit) / (k-1);
    400 
    401     /* strange evaluation order below to try and minimize overflow problems */
    402 
    403     layoutPtr->dataSectorsPerStripe = (k-1) * layoutPtr->sectorsPerStripeUnit;
    404     layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
    405     layoutPtr->numDataCol = k-1;
    406     layoutPtr->numParityCol = 1;
    407 
    408     return(0);
    409 }
    410 
    411 /* declustering with distributed sparing */
    412 static void rf_ShutdownDeclusteredDS(RF_ThreadArg_t);
    413 static void rf_ShutdownDeclusteredDS(arg)
    414   RF_ThreadArg_t  arg;
    415 {
    416   RF_DeclusteredConfigInfo_t *info;
    417   RF_Raid_t *raidPtr;
    418 
    419   raidPtr = (RF_Raid_t *)arg;
    420   info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
    421   if (info->SpareTable)
    422     rf_FreeSpareTable(raidPtr);
    423 }
    424 
    425 int rf_ConfigureDeclusteredDS(
    426   RF_ShutdownList_t  **listp,
    427   RF_Raid_t           *raidPtr,
    428   RF_Config_t         *cfgPtr)
    429 {
    430   int rc;
    431 
    432   rc = rf_ConfigureDeclustered(listp, raidPtr, cfgPtr);
    433   if (rc)
    434     return(rc);
    435   rc = rf_ShutdownCreate(listp, rf_ShutdownDeclusteredDS, raidPtr);
    436   if (rc) {
    437     RF_ERRORMSG1("Got %d adding shutdown event for DeclusteredDS\n", rc);
    438     rf_ShutdownDeclusteredDS(raidPtr);
    439     return(rc);
    440   }
    441   return(0);
    442 }
    443 
    444 void rf_MapSectorDeclustered(raidPtr, raidSector, row, col, diskSector, remap)
    445   RF_Raid_t       *raidPtr;
    446   RF_RaidAddr_t    raidSector;
    447   RF_RowCol_t     *row;
    448   RF_RowCol_t     *col;
    449   RF_SectorNum_t  *diskSector;
    450   int              remap;
    451 {
    452     RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    453     RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
    454     RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
    455     RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
    456     RF_StripeNum_t BlockID, BlockOffset, RepIndex;
    457     RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
    458     RF_StripeCount_t fulltable_depth  = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
    459     RF_StripeNum_t base_suid = 0, outSU, SpareRegion=0, SpareSpace=0;
    460 
    461     rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);
    462 
    463     FullTableID     = SUID / sus_per_fulltable;		/* fulltable ID within array (across rows) */
    464     if (raidPtr->numRow == 1) *row = 0;                 /* avoid a mod and a div in the common case */
    465     else {
    466       *row            = FullTableID % raidPtr->numRow;
    467       FullTableID    /= raidPtr->numRow;			/* convert to fulltable ID on this disk */
    468     }
    469     if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
    470 	SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
    471         SpareSpace  = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
    472     }
    473     FullTableOffset = SUID % sus_per_fulltable;
    474     TableID         = FullTableOffset / info->SUsPerTable;
    475     TableOffset     = FullTableOffset - TableID * info->SUsPerTable;
    476     BlockID         = TableOffset / info->PUsPerBlock;
    477     BlockOffset     = TableOffset - BlockID * info->PUsPerBlock;
    478     BlockID        %= info->BlocksPerTable;
    479     RepIndex        = info->PUsPerBlock - TableID;
    480     if (!raidPtr->noRotate) BlockOffset    += ((BlockOffset >= RepIndex) ? 1 : 0);
    481     *col            = info->LayoutTable[BlockID][BlockOffset];
    482 
    483     /* remap to distributed spare space if indicated */
    484     if (remap) {
    485       RF_ASSERT( raidPtr->Disks[*row][*col].status == rf_ds_reconstructing || raidPtr->Disks[*row][*col].status == rf_ds_dist_spared ||
    486 	     (rf_copyback_in_progress && raidPtr->Disks[*row][*col].status == rf_ds_optimal));
    487       rf_remap_to_spare_space(layoutPtr, info, *row, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU);
    488     } else {
    489 
    490         outSU	    = base_suid;
    491         outSU      += FullTableID * fulltable_depth;  				        /* offs to strt of FT */
    492         outSU	   += SpareSpace;						        /* skip rsvd spare space */
    493         outSU      += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU;   	        /* offs to strt of tble */
    494         outSU      += info->OffsetTable[BlockID][BlockOffset] * layoutPtr->SUsPerPU;	/* offs to the PU */
    495     }
    496     outSU          += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);	        /* offs to the SU within a PU */
    497 
    498     /* convert SUs to sectors, and, if not aligned to SU boundary, add in offset to sector.  */
    499     *diskSector     = outSU*layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit);
    500 
    501     RF_ASSERT( *col != -1 );
    502 }
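
        /*
         * Worked example of the mapping above (editorial), using the 3-disk toy
         * design from the comments in rf_ConfigureDeclustered (v=3, b=3, k=2,
         * SUsPerPU=1, one row, no distributed sparing, parity rotation on), so
         * SUsPerTable = 3, SUsPerFullTable = 6, TableDepthInPUs = 2.  For
         * SUID = 4: FullTableID = 0, FullTableOffset = 4, TableID = 1,
         * TableOffset = 1, BlockID = 1, BlockOffset = 0, RepIndex = 1-1 = 0,
         * and since BlockOffset >= RepIndex it is bumped to 1.  The data
         * therefore lands on disk LayoutTable[1][1] = 2 at SU offset
         * 1*2 + OffsetTable[1][1] = 2.  rf_MapParityDeclustered below uses
         * RepIndex instead, placing the parity on disk LayoutTable[1][0] = 1
         * at SU offset 1*2 + OffsetTable[1][0] = 3.
         */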
    503 
    504 
    505 /* prototyping this inexplicably causes the compile of the layout table (rf_layout.c) to fail */
    506 void rf_MapParityDeclustered(
    507   RF_Raid_t       *raidPtr,
    508   RF_RaidAddr_t    raidSector,
    509   RF_RowCol_t     *row,
    510   RF_RowCol_t     *col,
    511   RF_SectorNum_t  *diskSector,
    512   int              remap)
    513 {
    514     RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    515     RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
    516     RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
    517     RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
    518     RF_StripeNum_t BlockID, BlockOffset, RepIndex;
    519     RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
    520     RF_StripeCount_t fulltable_depth  = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
    521     RF_StripeNum_t base_suid = 0, outSU, SpareRegion=0, SpareSpace=0;
    522 
    523     rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);
    524 
    525     /* compute row & (possibly) spare space exactly as before */
    526     FullTableID     = SUID / sus_per_fulltable;
    527     if (raidPtr->numRow == 1) *row = 0;                         /* avoid a mod and a div in the common case */
    528     else {
    529       *row            = FullTableID % raidPtr->numRow;
    530       FullTableID    /= raidPtr->numRow;			/* convert to fulltable ID on this disk */
    531     }
    532     if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
    533 	SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
    534         SpareSpace  = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
    535     }
    536 
    537     /* compute BlockID and RepIndex exactly as before */
    538     FullTableOffset = SUID % sus_per_fulltable;
    539     TableID         = FullTableOffset / info->SUsPerTable;
    540     TableOffset     = FullTableOffset - TableID * info->SUsPerTable;
    541     /*TableOffset     = FullTableOffset % info->SUsPerTable;*/
    542     /*BlockID         = (TableOffset / info->PUsPerBlock) % info->BlocksPerTable;*/
    543     BlockID         = TableOffset / info->PUsPerBlock;
    544     /*BlockOffset     = TableOffset % info->PUsPerBlock;*/
    545     BlockOffset     = TableOffset - BlockID * info->PUsPerBlock;
    546     BlockID        %= info->BlocksPerTable;
    547 
    548     /* the parity block is in the position indicated by RepIndex */
    549     RepIndex        = (raidPtr->noRotate) ? info->PUsPerBlock : info->PUsPerBlock - TableID;
    550     *col	    = info->LayoutTable[BlockID][RepIndex];
    551 
    552     if (remap) {
    553       RF_ASSERT( raidPtr->Disks[*row][*col].status == rf_ds_reconstructing || raidPtr->Disks[*row][*col].status == rf_ds_dist_spared ||
    554 	     (rf_copyback_in_progress && raidPtr->Disks[*row][*col].status == rf_ds_optimal));
    555       rf_remap_to_spare_space(layoutPtr, info, *row, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU);
    556     } else {
    557 
    558         /* compute sector as before, except use RepIndex instead of BlockOffset */
    559         outSU        = base_suid;
    560         outSU       += FullTableID * fulltable_depth;
    561         outSU	    += SpareSpace;						/* skip rsvd spare space */
    562         outSU       += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU;
    563         outSU       += info->OffsetTable[BlockID][RepIndex] * layoutPtr->SUsPerPU;
    564     }
    565 
    566     outSU       += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);
    567     *diskSector  = outSU*layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit);
    568 
    569     RF_ASSERT( *col != -1 );
    570 }
    571 
    572 /* Returns, via the diskids out-parameter, an array of RF_RowCol_t identifying the
    573  * disks that make up the stripe containing the indicated address.  The caller must
    574  * _never_ attempt to modify this array. */
    575 void rf_IdentifyStripeDeclustered(
    576   RF_Raid_t        *raidPtr,
    577   RF_RaidAddr_t     addr,
    578   RF_RowCol_t     **diskids,
    579   RF_RowCol_t      *outRow)
    580 {
    581   RF_RaidLayout_t *layoutPtr           = &(raidPtr->Layout);
    582   RF_DeclusteredConfigInfo_t *info     = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
    583   RF_StripeCount_t sus_per_fulltable   = info->SUsPerFullTable;
    584   RF_StripeCount_t fulltable_depth     = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
    585   RF_StripeNum_t  base_suid            = 0;
    586   RF_StripeNum_t SUID                  = rf_RaidAddressToStripeUnitID(layoutPtr, addr);
    587   RF_StripeNum_t stripeID, FullTableID;
    588   int tableOffset;
    589 
    590   rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);
    591   FullTableID     = SUID / sus_per_fulltable;		/* fulltable ID within array (across rows) */
    592   *outRow         = FullTableID % raidPtr->numRow;
    593   stripeID        = rf_StripeUnitIDToStripeID(layoutPtr, SUID);                     /* find stripe offset into array */
    594   tableOffset     = (stripeID % info->BlocksPerTable);                        /* find offset into block design table */
    595   *diskids        = info->LayoutTable[tableOffset];
    596 }
    597 
    598 /* This returns the default head-separation limit, which is measured
    599  * in "required units for reconstruction".  Each time a disk fetches
    600  * a unit, it bumps a counter.  The head-sep code prohibits any disk
    601  * from getting more than headSepLimit counter values ahead of any
    602  * other.
    603  *
    604  * We assume here that the number of floating recon buffers is already
    605  * set.  There are r stripes to be reconstructed in each table, and so
    606  * if we have a total of B buffers, we can have at most B/r tables
    607  * under recon at any one time.  In each table, lambda units are required
    608  * from each disk, so given B buffers, the head sep limit has to be
    609  * (lambda*B)/r units.  (The code below divides by TableDepthInPUs, which
         * equals r, since b*k = v*r for a balanced block design.)  We subtract
         * one to avoid weird boundary cases.
    610  *
    611  * for example, suppose we're given 50 buffers, r=19, and lambda=4 as in
    612  * the 20.5 design.  There are 19 stripes/table to be reconstructed, so
    613  * we can have 50/19 tables concurrently under reconstruction, which means
    614  * we can allow the fastest disk to get 50/19 tables ahead of the slower
    615  * disk.  There are lambda "required units" for each disk, so the fastest
    616  * disk can get 4*50/19 = 10 counter values ahead of the slowest.
    617  *
    618  * If numBufsToAccumulate is not 1, we need to limit the head sep further
    619  * because multiple bufs will be required for each stripe under recon.
    620  */
    621 RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitDeclustered(
    622   RF_Raid_t  *raidPtr)
    623 {
    624   RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
    625 
    626   return(info->Lambda * raidPtr->numFloatingReconBufs / info->TableDepthInPUs / rf_numBufsToAccumulate);
    627 }
    628 
    629 /* returns the default number of recon buffers to use.  The value
    630  * is somewhat arbitrary...it's intended to be large enough to allow
    631  * for a reasonably large head-sep limit, but small enough that you
    632  * don't use up all your system memory with buffers.
    633  */
    634 int rf_GetDefaultNumFloatingReconBuffersDeclustered(RF_Raid_t *raidPtr)
    635 {
    636   return(100 * rf_numBufsToAccumulate);
    637 }
    638 
    639 /* sectors in the last fulltable of the array need to be handled
    640  * specially since this fulltable can be incomplete.  this function
    641  * changes the values of certain params to handle this.
    642  *
    643  * the idea here is that MapSector et al. figure out which disk the
    644  * addressed unit lives on by computing the modulos of the unit number
    645  * with the number of units per fulltable, table, etc.  In the last
    646  * fulltable, there are fewer units per fulltable, so we need to adjust
    647  * the number of user data units per fulltable to reflect this.
    648  *
    649  * so, we (1) convert the fulltable size and depth parameters to
    650  * the size of the partial fulltable at the end, (2) compute the
    651  * disk sector offset where this fulltable starts, and (3) convert
    652  * the user's stripe unit number from an offset into the array to
    653  * an offset into the last fulltable.
    654  */
    655 void rf_decluster_adjust_params(
    656   RF_RaidLayout_t   *layoutPtr,
    657   RF_StripeNum_t    *SUID,
    658   RF_StripeCount_t  *sus_per_fulltable,
    659   RF_StripeCount_t  *fulltable_depth,
    660   RF_StripeNum_t    *base_suid)
    661 {
    662     RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
    663 #if defined(__NetBSD__) && defined(_KERNEL)
    664     /* Nothing! */
    665 #else
    666     char pc = layoutPtr->map->parityConfig;
    667 #endif
    668 
    669     if (*SUID >= info->FullTableLimitSUID) {
    670 	/* new full table size is size of last full table on disk */
    671 	*sus_per_fulltable = info->ExtraTablesPerDisk * info->SUsPerTable;
    672 
    673 	/* new full table depth is corresponding depth */
    674 	*fulltable_depth = info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU;
    675 
    676 	/* set up the new base offset */
    677 	*base_suid = info->DiskOffsetOfLastFullTableInSUs;
    678 
    679 	/* convert users array address to an offset into the last fulltable */
    680 	*SUID -= info->FullTableLimitSUID;
    681     }
    682 }
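
        /*
         * Editorial example, continuing the toy design (v=3, b=3, k=2,
         * SUsPerPU=1, one row, no distributed sparing): with 4 complete
         * fulltables per disk and ExtraTablesPerDisk = 1, FullTableLimitSUID =
         * 4*6 = 24 and DiskOffsetOfLastFullTableInSUs = 4*4 = 16.  An access to
         * SUID = 26 is past the limit, so the routine above shrinks the
         * fulltable to 1*3 = 3 SUs and 1*2 = 2 SUs of depth, sets base_suid to
         * 16, and rewrites the SUID to 26 - 24 = 2, an offset within the
         * partial fulltable.
         */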
    683 
    684 /*
    685  * map a stripe ID to a parity stripe ID.
    686  * See comment above RaidAddressToParityStripeID in layout.c.
    687  */
    688 void rf_MapSIDToPSIDDeclustered(
    689   RF_RaidLayout_t    *layoutPtr,
    690   RF_StripeNum_t      stripeID,
    691   RF_StripeNum_t     *psID,
    692   RF_ReconUnitNum_t  *which_ru)
    693 {
    694     RF_DeclusteredConfigInfo_t *info;
    695 
    696     info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
    697 
    698     *psID = (stripeID / (layoutPtr->SUsPerPU * info->BlocksPerTable))
    699         * info->BlocksPerTable + (stripeID % info->BlocksPerTable);
    700     *which_ru = (stripeID % (info->BlocksPerTable * layoutPtr->SUsPerPU))
    701         / info->BlocksPerTable;
    702     RF_ASSERT( (*which_ru) < layoutPtr->SUsPerPU/layoutPtr->SUsPerRU);
    703 }
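
        /*
         * Editorial example: with BlocksPerTable = 5 and SUsPerPU = 2, stripes
         * come in groups of 10 (5 blocks, 2 recon units each).  For
         * stripeID = 17: psID = (17/10)*5 + 17%5 = 7 and
         * which_ru = (17%10)/5 = 1, i.e. stripe 17 is recon unit 1 of parity
         * stripe 7.
         */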
    704 
    705 /*
    706  * Called from MapSector and MapParity to retarget an access at the spare unit.
    707  * Modifies the "col" and "outSU" parameters only.
    708  */
    709 void rf_remap_to_spare_space(
    710   RF_RaidLayout_t             *layoutPtr,
    711   RF_DeclusteredConfigInfo_t  *info,
    712   RF_RowCol_t                  row,
    713   RF_StripeNum_t               FullTableID,
    714   RF_StripeNum_t               TableID,
    715   RF_SectorNum_t               BlockID,
    716   RF_StripeNum_t               base_suid,
    717   RF_StripeNum_t               SpareRegion,
    718   RF_RowCol_t                 *outCol,
    719   RF_StripeNum_t              *outSU)
    720 {
    721     RF_StripeNum_t ftID, spareTableStartSU, TableInSpareRegion, lastSROffset, which_ft;
    722 
    723     /*
    724      * note that FullTableID and hence SpareRegion may have gotten
    725      * tweaked by rf_decluster_adjust_params. We detect this by
    726      * noticing that base_suid is not 0.
    727      */
    728     if (base_suid == 0) {
    729       ftID = FullTableID;
    730     }
    731     else {
    732       /*
    733        * There may be > 1.0 full tables in the last (i.e. partial)
    734        * spare region.  find out which of these we're in.
    735        */
    736       lastSROffset = info->NumCompleteSRs * info->SpareRegionDepthInSUs;
    737       which_ft = (info->DiskOffsetOfLastFullTableInSUs - lastSROffset) / (info->FullTableDepthInPUs * layoutPtr->SUsPerPU);
    738 
    739       /* compute the actual full table ID */
    740       ftID = info->DiskOffsetOfLastFullTableInSUs / (info->FullTableDepthInPUs * layoutPtr->SUsPerPU) + which_ft;
    741       SpareRegion = info->NumCompleteSRs;
    742     }
    743     TableInSpareRegion = (ftID * info->NumParityReps + TableID) % info->TablesPerSpareRegion;
    744 
    745     *outCol = info->SpareTable[TableInSpareRegion][BlockID].spareDisk;
    746     RF_ASSERT( *outCol != -1);
    747 
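            /*
             * Editorial note: for a complete spare region S, the spare space
             * occupies the last SpareSpaceDepthPerRegionInSUs stripe units of
             * that region (the second arm of the conditional below).  For the
             * final, partial region (SpareRegion == NumCompleteSRs) it instead
             * sits after the leftover tables at the very end of the disk, the
             * offset recorded as DiskOffsetOfLastSpareSpaceChunkInSUs at
             * configuration time.
             */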
    748     spareTableStartSU = (SpareRegion == info->NumCompleteSRs) ?
    749 	    info->DiskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU :
    750 	    (SpareRegion+1) * info->SpareRegionDepthInSUs - info->SpareSpaceDepthPerRegionInSUs;
    751     *outSU = spareTableStartSU + info->SpareTable[TableInSpareRegion][BlockID].spareBlockOffsetInSUs;
    752     if (*outSU >= layoutPtr->stripeUnitsPerDisk) {
    753 	printf("rf_remap_to_spare_space: invalid remapped disk SU offset %ld\n",(long)*outSU);
    754     }
    755 }
    756 
    757 int rf_InstallSpareTable(
    758   RF_Raid_t    *raidPtr,
    759   RF_RowCol_t   frow,
    760   RF_RowCol_t   fcol)
    761 {
    762   RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
    763   RF_SparetWait_t *req;
    764   int retcode;
    765 
    766   RF_Malloc(req, sizeof(*req), (RF_SparetWait_t *));
    767   req->C                             = raidPtr->numCol;
    768   req->G                             = raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol;
    769   req->fcol                          = fcol;
    770   req->SUsPerPU                      = raidPtr->Layout.SUsPerPU;
    771   req->TablesPerSpareRegion          = info->TablesPerSpareRegion;
    772   req->BlocksPerTable                = info->BlocksPerTable;
    773   req->TableDepthInPUs               = info->TableDepthInPUs;
    774   req->SpareSpaceDepthPerRegionInSUs = info->SpareSpaceDepthPerRegionInSUs;
    775 
    776 #ifndef KERNEL
    777   info->SpareTable = rf_ReadSpareTable(req, info->sparemap_fname);
    778   RF_Free(req, sizeof(*req));
    779   retcode = (info->SpareTable) ? 0 : 1;
    780 #else /* !KERNEL */
    781   retcode = rf_GetSpareTableFromDaemon(req);
    782   RF_ASSERT(!retcode);                                     /* XXX -- fix this to recover gracefully -- XXX */
    783 #endif /* !KERNEL */
    784 
    785   return(retcode);
    786 }
    787 
    788 #ifdef KERNEL
    789 /*
    790  * Invoked via ioctl to install a spare table in the kernel.
    791  */
    792 int rf_SetSpareTable(raidPtr, data)
    793   RF_Raid_t  *raidPtr;
    794   void       *data;
    795 {
    796   RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
    797   RF_SpareTableEntry_t **ptrs;
    798   int i, retcode;
    799 
    800   /* what we need to copyin is a 2-d array, so first copyin the user pointers to the rows in the table */
    801   RF_Malloc(ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **));
    802   retcode = copyin((caddr_t) data, (caddr_t) ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *));
    803 
    804   if (retcode) {
            RF_Free(ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *));  /* don't leak the temporary pointer array */
            return(retcode);
          }
    805 
    806   /* now allocate kernel space for the row pointers */
    807   RF_Malloc(info->SpareTable, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **));
    808 
    809   /* now allocate kernel space for each row in the table, and copy it in from user space */
    810   for (i=0; i<info->TablesPerSpareRegion; i++) {
    811     RF_Malloc(info->SpareTable[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t), (RF_SpareTableEntry_t *));
    812     retcode = copyin(ptrs[i], info->SpareTable[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t));
    813     if (retcode) {
    814       /* free everything allocated so far rather than leaking it */
              RF_Free(info->SpareTable[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t));
              while (i > 0) {
                i--;
                RF_Free(info->SpareTable[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t));
              }
              RF_Free(info->SpareTable, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *));
              RF_Free(ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *));
              info->SpareTable = NULL;
    815       return(retcode);
    816     }
    817   }
    818 
    819   /* free up the temporary array we used */
    820   RF_Free(ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *));
    821 
    822   return(0);
    823 }
    824 #endif /* KERNEL */
    825 
    826 RF_ReconUnitCount_t rf_GetNumSpareRUsDeclustered(raidPtr)
    827   RF_Raid_t *raidPtr;
    828 {
    829   RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
    830 
    831   return( ((RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo)->TotSparePUsPerDisk );
    832 }
    833 
    834 
    835 void rf_FreeSpareTable(raidPtr)
    836   RF_Raid_t  *raidPtr;
    837 {
    838   long i;
    839   RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
    840   RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
    841   RF_SpareTableEntry_t **table = info->SpareTable;
    842 
    843   for (i=0; i<info->TablesPerSpareRegion; i++) {RF_Free(table[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t));}
    844   RF_Free(table, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *));
    845   info->SpareTable = (RF_SpareTableEntry_t **) NULL;
    846 }
    847