      1 /*	$NetBSD: rf_paritylogging.c,v 1.35 2019/02/09 03:34:00 christos Exp $	*/
      2 /*
      3  * Copyright (c) 1995 Carnegie-Mellon University.
      4  * All rights reserved.
      5  *
      6  * Author: William V. Courtright II
      7  *
      8  * Permission to use, copy, modify and distribute this software and
      9  * its documentation is hereby granted, provided that both the copyright
     10  * notice and this permission notice appear in all copies of the
     11  * software, derivative works or modified versions, and any portions
     12  * thereof, and that both notices appear in supporting documentation.
     13  *
     14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     17  *
     18  * Carnegie Mellon requests users of this software to return to
     19  *
      20  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
     21  *  School of Computer Science
     22  *  Carnegie Mellon University
     23  *  Pittsburgh PA 15213-3890
     24  *
     25  * any improvements or extensions that they make and grant Carnegie the
     26  * rights to redistribute these changes.
     27  */
     28 
     29 
     30 /*
      31   parity logging configuration, DAG selection, and mapping are implemented here
     32  */
     33 
     34 #include <sys/cdefs.h>
     35 __KERNEL_RCSID(0, "$NetBSD: rf_paritylogging.c,v 1.35 2019/02/09 03:34:00 christos Exp $");
     36 
     37 #include "rf_archs.h"
     38 
     39 #if RF_INCLUDE_PARITYLOGGING > 0
     40 
     41 #include <dev/raidframe/raidframevar.h>
     42 
     43 #include "rf_raid.h"
     44 #include "rf_dag.h"
     45 #include "rf_dagutils.h"
     46 #include "rf_dagfuncs.h"
     47 #include "rf_dagffrd.h"
     48 #include "rf_dagffwr.h"
     49 #include "rf_dagdegrd.h"
     50 #include "rf_dagdegwr.h"
     51 #include "rf_paritylog.h"
     52 #include "rf_paritylogDiskMgr.h"
     53 #include "rf_paritylogging.h"
     54 #include "rf_parityloggingdags.h"
     55 #include "rf_general.h"
     56 #include "rf_map.h"
     57 #include "rf_utils.h"
     58 #include "rf_shutdown.h"
     59 
     60 typedef struct RF_ParityLoggingConfigInfo_s {
     61 	RF_RowCol_t **stripeIdentifier;	/* filled in at config time & used by
     62 					 * IdentifyStripe */
     63 }       RF_ParityLoggingConfigInfo_t;
     64 
     65 static void FreeRegionInfo(RF_Raid_t * raidPtr, RF_RegionId_t regionID);
     66 static void rf_ShutdownParityLogging(RF_ThreadArg_t arg);
     67 static void rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg);
     68 static void rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg);
     69 static void rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg);
     70 static void rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg);
     71 static void rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg);
     72 
     73 int
     74 rf_ConfigureParityLogging(
     75     RF_ShutdownList_t ** listp,
     76     RF_Raid_t * raidPtr,
     77     RF_Config_t * cfgPtr)
     78 {
     79 	int     i, j, startdisk, rc;
     80 	RF_SectorCount_t totalLogCapacity, fragmentation, lastRegionCapacity;
     81 	RF_SectorCount_t parityBufferCapacity, maxRegionParityRange;
     82 	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
     83 	RF_ParityLoggingConfigInfo_t *info;
     84 	RF_ParityLog_t *l = NULL, *next;
     85 	void *lHeapPtr;
     86 
     87 	if (rf_numParityRegions <= 0)
     88 		return(EINVAL);
     89 
     90 	/*
     91          * We create multiple entries on the shutdown list here, since
     92          * this configuration routine is fairly complicated in and of
     93          * itself, and this makes backing out of a failed configuration
     94          * much simpler.
     95          */
     96 
     97 	raidPtr->numSectorsPerLog = RF_DEFAULT_NUM_SECTORS_PER_LOG;
     98 
     99 	/* create a parity logging configuration structure */
    100 	info = RF_MallocAndAdd(sizeof(*info), raidPtr->cleanupList);
    101 	if (info == NULL)
    102 		return (ENOMEM);
    103 	layoutPtr->layoutSpecificInfo = (void *) info;
    104 
    105 	/* the stripe identifier must identify the disks in each stripe, IN
    106 	 * THE ORDER THAT THEY APPEAR IN THE STRIPE. */
    107 	info->stripeIdentifier = rf_make_2d_array((raidPtr->numCol),
    108 						  (raidPtr->numCol),
    109 						  raidPtr->cleanupList);
    110 	if (info->stripeIdentifier == NULL)
    111 		return (ENOMEM);
    112 
    113 	startdisk = 0;
    114 	for (i = 0; i < (raidPtr->numCol); i++) {
    115 		for (j = 0; j < (raidPtr->numCol); j++) {
    116 			info->stripeIdentifier[i][j] = (startdisk + j) %
    117 				(raidPtr->numCol - 1);
    118 		}
    119 		if ((--startdisk) < 0)
    120 			startdisk = raidPtr->numCol - 1 - 1;
    121 	}
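
	/*
	 * Illustration only (hypothetical 4-column array, numCol == 4):
	 * the loop above fills
	 *
	 *     stripeIdentifier[0] = { 0, 1, 2, 0 }
	 *     stripeIdentifier[1] = { 2, 0, 1, 2 }
	 *     stripeIdentifier[2] = { 1, 2, 0, 1 }
	 *     stripeIdentifier[3] = { 0, 1, 2, 0 }
	 *
	 * Rows are selected in rf_IdentifyStripeParityLogging by
	 * stripeID % numCol; because entries are taken modulo numCol - 1,
	 * the dedicated parity log column (numCol - 1) never appears here.
	 */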
    122 
    123 	/* fill in the remaining layout parameters */
    124 	layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk;
    125 	layoutPtr->numParityCol = 1;
    126 	layoutPtr->numParityLogCol = 1;
    127 	layoutPtr->numDataCol = raidPtr->numCol - layoutPtr->numParityCol -
    128 		layoutPtr->numParityLogCol;
    129 	layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol *
    130 		layoutPtr->sectorsPerStripeUnit;
    131 	layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk;
    132 	raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk *
    133 		layoutPtr->sectorsPerStripeUnit;
    134 
    135 	raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk *
    136 		layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
    137 
    138 	/* configure parity log parameters
    139 	 *
    140 	 * parameter               comment/constraints
    141 	 * -------------------------------------------
    142 	 * numParityRegions*       all regions (except possibly last)
    143 	 *                         of equal size
    144 	 * totalInCoreLogCapacity* amount of memory in bytes available
    145 	 *                         for in-core logs (default 1 MB)
    146 	 * numSectorsPerLog#       capacity of an in-core log in sectors
    147 	 *                         (1 * disk track)
    148 	 * numParityLogs           total number of in-core logs,
    149 	 *                         should be at least numParityRegions
    150 	 * regionLogCapacity       size of a region log (except possibly
    151 	 *                         last one) in sectors
    152 	 * totalLogCapacity        total amount of log space in sectors
    153 	 *
     154 	 * where '*' denotes a user-settable parameter and '#' a value
     155 	 * fixed at compile time.  Note that logs are fixed to be the
     156 	 * size of a disk track, a value #defined in rf_paritylog.h.
    157 	 *
    158 	 */
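
	/*
	 * Worked example (illustrative numbers only, not defaults): with
	 * stripeUnitsPerDisk = 1024, sectorsPerStripeUnit = 64 and
	 * numParityLogCol = 1,
	 *
	 *     totalLogCapacity  = 1024 * 64 * 1 = 65536 sectors
	 *
	 * and with rf_numParityRegions = 8,
	 *
	 *     regionLogCapacity = 65536 / 8 = 8192 sectors.
	 */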
    159 
    160 	totalLogCapacity = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit * layoutPtr->numParityLogCol;
    161 	raidPtr->regionLogCapacity = totalLogCapacity / rf_numParityRegions;
    162 	if (rf_parityLogDebug)
    163 		printf("bytes per sector %d\n", raidPtr->bytesPerSector);
    164 
    165 	/* reduce fragmentation within a disk region by adjusting the number
    166 	 * of regions in an attempt to allow an integral number of logs to fit
    167 	 * into a disk region */
    168 	fragmentation = raidPtr->regionLogCapacity % raidPtr->numSectorsPerLog;
    169 	if (fragmentation > 0)
    170 		for (i = 1; i < (raidPtr->numSectorsPerLog / 2); i++) {
    171 			if (((totalLogCapacity / (rf_numParityRegions + i)) %
    172 			     raidPtr->numSectorsPerLog) < fragmentation) {
    173 				rf_numParityRegions++;
    174 				raidPtr->regionLogCapacity = totalLogCapacity /
    175 					rf_numParityRegions;
    176 				fragmentation = raidPtr->regionLogCapacity %
    177 					raidPtr->numSectorsPerLog;
    178 			}
    179 			if (((totalLogCapacity / (rf_numParityRegions - i)) %
    180 			     raidPtr->numSectorsPerLog) < fragmentation) {
    181 				rf_numParityRegions--;
    182 				raidPtr->regionLogCapacity = totalLogCapacity /
    183 					rf_numParityRegions;
    184 				fragmentation = raidPtr->regionLogCapacity %
    185 					raidPtr->numSectorsPerLog;
    186 			}
    187 		}
     188 	/* ensure an integral number of logs per region */
    189 	raidPtr->regionLogCapacity = (raidPtr->regionLogCapacity /
    190 				      raidPtr->numSectorsPerLog) *
    191 		raidPtr->numSectorsPerLog;
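
	/*
	 * Illustration (hypothetical values): with numSectorsPerLog = 64
	 * and an initial regionLogCapacity of 8200 sectors, fragmentation
	 * is 8200 % 64 = 8.  The search above tries neighbouring region
	 * counts in the hope of a smaller remainder; whatever remainder is
	 * left is removed by the rounding immediately above, which
	 * truncates regionLogCapacity to a whole number of logs
	 * (8192 sectors here if no better region count is found).
	 */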
    192 
    193 	raidPtr->numParityLogs = rf_totalInCoreLogCapacity /
    194 		(raidPtr->bytesPerSector * raidPtr->numSectorsPerLog);
    195 	/* to avoid deadlock, must ensure that enough logs exist for each
    196 	 * region to have one simultaneously */
    197 	if (raidPtr->numParityLogs < rf_numParityRegions)
    198 		raidPtr->numParityLogs = rf_numParityRegions;
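
	/*
	 * Illustration: with the default 1 MB of in-core log space and
	 * hypothetical bytesPerSector = 512, numSectorsPerLog = 64, each
	 * in-core log occupies 32 KB, so numParityLogs = 1048576 / 32768
	 * = 32; if rf_numParityRegions exceeded 32, numParityLogs would
	 * be raised to match it.
	 */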
    199 
    200 	/* create region information structs */
    201 	printf("Allocating %d bytes for in-core parity region info\n",
    202 	       (int) (rf_numParityRegions * sizeof(RF_RegionInfo_t)));
    203 	raidPtr->regionInfo = RF_Malloc(
    204 	    rf_numParityRegions * sizeof(*raidPtr->regionInfo));
    205 	if (raidPtr->regionInfo == NULL)
    206 		return (ENOMEM);
    207 
    208 	/* last region may not be full capacity */
    209 	lastRegionCapacity = raidPtr->regionLogCapacity;
    210 	while ((rf_numParityRegions - 1) * raidPtr->regionLogCapacity +
    211 	       lastRegionCapacity > totalLogCapacity)
    212 		lastRegionCapacity = lastRegionCapacity -
    213 			raidPtr->numSectorsPerLog;
    214 
    215 	raidPtr->regionParityRange = raidPtr->sectorsPerDisk /
    216 		rf_numParityRegions;
    217 	maxRegionParityRange = raidPtr->regionParityRange;
    218 
    219 /* i can't remember why this line is in the code -wvcii 6/30/95 */
    220 /*  if (raidPtr->sectorsPerDisk % rf_numParityRegions > 0)
    221     regionParityRange++; */
    222 
    223 	/* build pool of unused parity logs */
    224 	printf("Allocating %d bytes for %d parity logs\n",
    225 	       raidPtr->numParityLogs * raidPtr->numSectorsPerLog *
    226 	       raidPtr->bytesPerSector,
    227 	       raidPtr->numParityLogs);
    228 	raidPtr->parityLogBufferHeap = RF_Malloc(raidPtr->numParityLogs
    229 	    * raidPtr->numSectorsPerLog * raidPtr->bytesPerSector);
    230 	if (raidPtr->parityLogBufferHeap == NULL)
    231 		return (ENOMEM);
    232 	lHeapPtr = raidPtr->parityLogBufferHeap;
    233 	rf_init_mutex2(raidPtr->parityLogPool.mutex, IPL_VM);
    234 	for (i = 0; i < raidPtr->numParityLogs; i++) {
    235 		if (i == 0) {
    236 			raidPtr->parityLogPool.parityLogs =
    237 			    RF_Malloc(
    238 			    sizeof(*raidPtr->parityLogPool.parityLogs));
    239 			if (raidPtr->parityLogPool.parityLogs == NULL) {
    240 				RF_Free(raidPtr->parityLogBufferHeap,
    241 					raidPtr->numParityLogs *
    242 					raidPtr->numSectorsPerLog *
    243 					raidPtr->bytesPerSector);
    244 				return (ENOMEM);
    245 			}
    246 			l = raidPtr->parityLogPool.parityLogs;
    247 		} else {
    248 			l->next = RF_Malloc(sizeof(*l->next));
    249 			if (l->next == NULL) {
    250 				RF_Free(raidPtr->parityLogBufferHeap,
    251 					raidPtr->numParityLogs *
    252 					raidPtr->numSectorsPerLog *
    253 					raidPtr->bytesPerSector);
    254 				for (l = raidPtr->parityLogPool.parityLogs;
    255 				     l;
    256 				     l = next) {
    257 					next = l->next;
    258 					if (l->records)
    259 						RF_Free(l->records, (raidPtr->numSectorsPerLog * sizeof(RF_ParityLogRecord_t)));
    260 					RF_Free(l, sizeof(RF_ParityLog_t));
    261 				}
    262 				return (ENOMEM);
    263 			}
    264 			l = l->next;
    265 		}
    266 		l->bufPtr = lHeapPtr;
    267 		lHeapPtr = (char *)lHeapPtr + raidPtr->numSectorsPerLog *
    268 			raidPtr->bytesPerSector;
    269 		l->records = RF_Malloc(raidPtr->numSectorsPerLog *
    270 		    sizeof(*l->records));
    271 		if (l->records == NULL) {
    272 			RF_Free(raidPtr->parityLogBufferHeap,
    273 				raidPtr->numParityLogs *
    274 				raidPtr->numSectorsPerLog *
    275 				raidPtr->bytesPerSector);
    276 			for (l = raidPtr->parityLogPool.parityLogs;
    277 			     l;
    278 			     l = next) {
    279 				next = l->next;
    280 				if (l->records)
    281 					RF_Free(l->records,
    282 						(raidPtr->numSectorsPerLog *
    283 						 sizeof(RF_ParityLogRecord_t)));
    284 				RF_Free(l, sizeof(RF_ParityLog_t));
    285 			}
    286 			return (ENOMEM);
    287 		}
    288 	}
    289 	rf_ShutdownCreate(listp, rf_ShutdownParityLoggingPool, raidPtr);
    290 	/* build pool of region buffers */
    291 	rf_init_mutex2(raidPtr->regionBufferPool.mutex, IPL_VM);
    292 	rf_init_cond2(raidPtr->regionBufferPool.cond, "rfrbpl");
    293 	raidPtr->regionBufferPool.bufferSize = raidPtr->regionLogCapacity *
    294 		raidPtr->bytesPerSector;
    295 	printf("regionBufferPool.bufferSize %d\n",
    296 	       raidPtr->regionBufferPool.bufferSize);
    297 
    298 	/* for now, only one region at a time may be reintegrated */
    299 	raidPtr->regionBufferPool.totalBuffers = 1;
    300 
    301 	raidPtr->regionBufferPool.availableBuffers =
    302 		raidPtr->regionBufferPool.totalBuffers;
    303 	raidPtr->regionBufferPool.availBuffersIndex = 0;
    304 	raidPtr->regionBufferPool.emptyBuffersIndex = 0;
    305 	printf("Allocating %d bytes for regionBufferPool\n",
    306 	       (int) (raidPtr->regionBufferPool.totalBuffers *
    307 		      sizeof(void *)));
    308 	raidPtr->regionBufferPool.buffers =  RF_Malloc(
    309 	    raidPtr->regionBufferPool.totalBuffers *
    310 	    sizeof(*raidPtr->regionBufferPool.buffers));
    311 	if (raidPtr->regionBufferPool.buffers == NULL) {
    312 		return (ENOMEM);
    313 	}
    314 	for (i = 0; i < raidPtr->regionBufferPool.totalBuffers; i++) {
    315 		printf("Allocating %d bytes for regionBufferPool#%d\n",
    316 		       (int) (raidPtr->regionBufferPool.bufferSize *
    317 			      sizeof(char)), i);
    318 		raidPtr->regionBufferPool.buffers[i] =
    319 		    RF_Malloc(raidPtr->regionBufferPool.bufferSize);
    320 		if (raidPtr->regionBufferPool.buffers[i] == NULL) {
    321 			for (j = 0; j < i; j++) {
     322 				RF_Free(raidPtr->regionBufferPool.buffers[j],
    323 					raidPtr->regionBufferPool.bufferSize *
    324 					sizeof(char));
    325 			}
    326 			RF_Free(raidPtr->regionBufferPool.buffers,
    327 				raidPtr->regionBufferPool.totalBuffers *
    328 				sizeof(void *));
    329 			return (ENOMEM);
    330 		}
    331 		printf("raidPtr->regionBufferPool.buffers[%d] = %lx\n", i,
    332 		    (long) raidPtr->regionBufferPool.buffers[i]);
    333 	}
    334 	rf_ShutdownCreate(listp,
    335 			  rf_ShutdownParityLoggingRegionBufferPool,
    336 			  raidPtr);
    337 	/* build pool of parity buffers */
    338 	parityBufferCapacity = maxRegionParityRange;
    339 	rf_init_mutex2(raidPtr->parityBufferPool.mutex, IPL_VM);
    340 	rf_init_cond2(raidPtr->parityBufferPool.cond, "rfpbpl");
    341 	raidPtr->parityBufferPool.bufferSize = parityBufferCapacity *
    342 		raidPtr->bytesPerSector;
    343 	printf("parityBufferPool.bufferSize %d\n",
    344 	       raidPtr->parityBufferPool.bufferSize);
    345 
    346 	/* for now, only one region at a time may be reintegrated */
    347 	raidPtr->parityBufferPool.totalBuffers = 1;
    348 
    349 	raidPtr->parityBufferPool.availableBuffers =
    350 		raidPtr->parityBufferPool.totalBuffers;
    351 	raidPtr->parityBufferPool.availBuffersIndex = 0;
    352 	raidPtr->parityBufferPool.emptyBuffersIndex = 0;
    353 	printf("Allocating %d bytes for parityBufferPool of %d units\n",
    354 	       (int) (raidPtr->parityBufferPool.totalBuffers *
    355 		      sizeof(void *)),
    356 	       raidPtr->parityBufferPool.totalBuffers );
    357 	raidPtr->parityBufferPool.buffers = RF_Malloc(
    358 	    raidPtr->parityBufferPool.totalBuffers *
    359 	    sizeof(*raidPtr->parityBufferPool.buffers));
    360 	if (raidPtr->parityBufferPool.buffers == NULL) {
    361 		return (ENOMEM);
    362 	}
    363 	for (i = 0; i < raidPtr->parityBufferPool.totalBuffers; i++) {
    364 		printf("Allocating %d bytes for parityBufferPool#%d\n",
    365 		       (int) (raidPtr->parityBufferPool.bufferSize *
    366 			      sizeof(char)),i);
    367 		raidPtr->parityBufferPool.buffers[i] = RF_Malloc(
    368 		    raidPtr->parityBufferPool.bufferSize);
     369 		if (raidPtr->parityBufferPool.buffers[i] == NULL) {
     370 			for (j = 0; j < i; j++) {
     371 				RF_Free(raidPtr->parityBufferPool.buffers[j],
     372 					raidPtr->parityBufferPool.bufferSize *
     373 					sizeof(char));
     374 			}
     375 			RF_Free(raidPtr->parityBufferPool.buffers,
     376 				raidPtr->parityBufferPool.totalBuffers *
     377 				sizeof(void *));
    378 			return (ENOMEM);
    379 		}
    380 		printf("parityBufferPool.buffers[%d] = %lx\n", i,
    381 		    (long) raidPtr->parityBufferPool.buffers[i]);
    382 	}
    383 	rf_ShutdownCreate(listp,
    384 			  rf_ShutdownParityLoggingParityBufferPool,
    385 			  raidPtr);
    386 	/* initialize parityLogDiskQueue */
    387 	rf_init_mutex2(raidPtr->parityLogDiskQueue.mutex, IPL_VM);
    388 	rf_init_cond2(raidPtr->parityLogDiskQueue.cond, "rfpldq");
    389 	raidPtr->parityLogDiskQueue.flushQueue = NULL;
    390 	raidPtr->parityLogDiskQueue.reintQueue = NULL;
    391 	raidPtr->parityLogDiskQueue.bufHead = NULL;
    392 	raidPtr->parityLogDiskQueue.bufTail = NULL;
    393 	raidPtr->parityLogDiskQueue.reintHead = NULL;
    394 	raidPtr->parityLogDiskQueue.reintTail = NULL;
    395 	raidPtr->parityLogDiskQueue.logBlockHead = NULL;
    396 	raidPtr->parityLogDiskQueue.logBlockTail = NULL;
    397 	raidPtr->parityLogDiskQueue.reintBlockHead = NULL;
    398 	raidPtr->parityLogDiskQueue.reintBlockTail = NULL;
    399 	raidPtr->parityLogDiskQueue.freeDataList = NULL;
    400 	raidPtr->parityLogDiskQueue.freeCommonList = NULL;
    401 
    402 	rf_ShutdownCreate(listp,
    403 			  rf_ShutdownParityLoggingDiskQueue,
    404 			  raidPtr);
    405 	for (i = 0; i < rf_numParityRegions; i++) {
    406 		rf_init_mutex2(raidPtr->regionInfo[i].mutex, IPL_VM);
    407 		rf_init_mutex2(raidPtr->regionInfo[i].reintMutex, IPL_VM);
    408 		raidPtr->regionInfo[i].reintInProgress = RF_FALSE;
    409 		raidPtr->regionInfo[i].regionStartAddr =
    410 			raidPtr->regionLogCapacity * i;
    411 		raidPtr->regionInfo[i].parityStartAddr =
    412 			raidPtr->regionParityRange * i;
    413 		if (i < rf_numParityRegions - 1) {
    414 			raidPtr->regionInfo[i].capacity =
    415 				raidPtr->regionLogCapacity;
    416 			raidPtr->regionInfo[i].numSectorsParity =
    417 				raidPtr->regionParityRange;
    418 		} else {
    419 			raidPtr->regionInfo[i].capacity =
    420 				lastRegionCapacity;
    421 			raidPtr->regionInfo[i].numSectorsParity =
    422 				raidPtr->sectorsPerDisk -
    423 				raidPtr->regionParityRange * i;
    424 			if (raidPtr->regionInfo[i].numSectorsParity >
    425 			    maxRegionParityRange)
    426 				maxRegionParityRange =
    427 					raidPtr->regionInfo[i].numSectorsParity;
    428 		}
    429 		raidPtr->regionInfo[i].diskCount = 0;
    430 		RF_ASSERT(raidPtr->regionInfo[i].capacity +
    431 			  raidPtr->regionInfo[i].regionStartAddr <=
    432 			  totalLogCapacity);
    433 		RF_ASSERT(raidPtr->regionInfo[i].parityStartAddr +
    434 			  raidPtr->regionInfo[i].numSectorsParity <=
    435 			  raidPtr->sectorsPerDisk);
    436 		printf("Allocating %d bytes for region %d\n",
    437 		       (int) (raidPtr->regionInfo[i].capacity *
    438 			   sizeof(RF_DiskMap_t)), i);
    439 		raidPtr->regionInfo[i].diskMap = RF_Malloc(
    440 		    raidPtr->regionInfo[i].capacity *
    441 		    sizeof(*raidPtr->regionInfo[i].diskMap));
    442 		if (raidPtr->regionInfo[i].diskMap == NULL) {
    443 			for (j = 0; j < i; j++)
    444 				FreeRegionInfo(raidPtr, j);
    445 			RF_Free(raidPtr->regionInfo,
    446 				(rf_numParityRegions *
    447 				 sizeof(RF_RegionInfo_t)));
    448 			return (ENOMEM);
    449 		}
    450 		raidPtr->regionInfo[i].loggingEnabled = RF_FALSE;
    451 		raidPtr->regionInfo[i].coreLog = NULL;
    452 	}
    453 	rf_ShutdownCreate(listp,
    454 			  rf_ShutdownParityLoggingRegionInfo,
    455 			  raidPtr);
    456 	RF_ASSERT(raidPtr->parityLogDiskQueue.threadState == 0);
    457 	raidPtr->parityLogDiskQueue.threadState = RF_PLOG_CREATED;
    458 	rc = RF_CREATE_THREAD(raidPtr->pLogDiskThreadHandle,
    459 			      rf_ParityLoggingDiskManager, raidPtr,"rf_log");
    460 	if (rc) {
    461 		raidPtr->parityLogDiskQueue.threadState = 0;
     462 		RF_ERRORMSG3("Unable to create parity logging disk thread, file %s line %d rc=%d\n",
    463 		    __FILE__, __LINE__, rc);
    464 		return (ENOMEM);
    465 	}
    466 	/* wait for thread to start */
    467 	rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
    468 	while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_RUNNING)) {
    469 		rf_wait_cond2(raidPtr->parityLogDiskQueue.cond,
    470 			      raidPtr->parityLogDiskQueue.mutex);
    471 	}
    472 	rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
    473 
    474 	rf_ShutdownCreate(listp, rf_ShutdownParityLogging, raidPtr);
    475 	if (rf_parityLogDebug) {
    476 		printf("                            size of disk log in sectors: %d\n",
    477 		    (int) totalLogCapacity);
    478 		printf("                            total number of parity regions is %d\n", (int) rf_numParityRegions);
    479 		printf("                            nominal sectors of log per parity region is %d\n", (int) raidPtr->regionLogCapacity);
    480 		printf("                            nominal region fragmentation is %d sectors\n", (int) fragmentation);
    481 		printf("                            total number of parity logs is %d\n", raidPtr->numParityLogs);
    482 		printf("                            parity log size is %d sectors\n", raidPtr->numSectorsPerLog);
    483 		printf("                            total in-core log space is %d bytes\n", (int) rf_totalInCoreLogCapacity);
    484 	}
    485 	rf_EnableParityLogging(raidPtr);
    486 
    487 	return (0);
    488 }
    489 
    490 static void
    491 FreeRegionInfo(
    492     RF_Raid_t * raidPtr,
    493     RF_RegionId_t regionID)
    494 {
    495 	RF_Free(raidPtr->regionInfo[regionID].diskMap,
    496 		(raidPtr->regionInfo[regionID].capacity *
    497 		 sizeof(RF_DiskMap_t)));
    498 	if (!rf_forceParityLogReint && raidPtr->regionInfo[regionID].coreLog) {
    499 		rf_ReleaseParityLogs(raidPtr,
    500 				     raidPtr->regionInfo[regionID].coreLog);
    501 		raidPtr->regionInfo[regionID].coreLog = NULL;
    502 	} else {
    503 		RF_ASSERT(raidPtr->regionInfo[regionID].coreLog == NULL);
    504 		RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == 0);
    505 	}
    506 	rf_destroy_mutex2(raidPtr->regionInfo[regionID].reintMutex);
    507 	rf_destroy_mutex2(raidPtr->regionInfo[regionID].mutex);
    508 }
    509 
    510 
    511 static void
    512 FreeParityLogQueue(RF_Raid_t * raidPtr)
    513 {
    514 	RF_ParityLog_t *l1, *l2;
    515 
    516 	l1 = raidPtr->parityLogPool.parityLogs;
    517 	while (l1) {
    518 		l2 = l1;
    519 		l1 = l2->next;
    520 		RF_Free(l2->records, (raidPtr->numSectorsPerLog *
    521 				      sizeof(RF_ParityLogRecord_t)));
    522 		RF_Free(l2, sizeof(RF_ParityLog_t));
    523 	}
    524 	rf_destroy_mutex2(raidPtr->parityLogPool.mutex);
    525 }
    526 
    527 
    528 static void
    529 FreeRegionBufferQueue(RF_RegionBufferQueue_t * queue)
    530 {
    531 	int     i;
    532 
    533 	if (queue->availableBuffers != queue->totalBuffers) {
    534 		printf("Attempt to free region queue which is still in use!\n");
    535 		RF_ASSERT(0);
    536 	}
    537 	for (i = 0; i < queue->totalBuffers; i++)
    538 		RF_Free(queue->buffers[i], queue->bufferSize);
    539 	RF_Free(queue->buffers, queue->totalBuffers * sizeof(void *));
    540 	rf_destroy_mutex2(queue->mutex);
    541 	rf_destroy_cond2(queue->cond);
    542 }
    543 
    544 static void
    545 rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg)
    546 {
    547 	RF_Raid_t *raidPtr;
    548 	RF_RegionId_t i;
    549 
    550 	raidPtr = (RF_Raid_t *) arg;
    551 	if (rf_parityLogDebug) {
    552 		printf("raid%d: ShutdownParityLoggingRegionInfo\n",
    553 		       raidPtr->raidid);
    554 	}
    555 	/* free region information structs */
    556 	for (i = 0; i < rf_numParityRegions; i++)
    557 		FreeRegionInfo(raidPtr, i);
    558 	RF_Free(raidPtr->regionInfo, (rf_numParityRegions *
     559 				      sizeof(*raidPtr->regionInfo)));
    560 	raidPtr->regionInfo = NULL;
    561 }
    562 
    563 static void
    564 rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg)
    565 {
    566 	RF_Raid_t *raidPtr;
    567 
    568 	raidPtr = (RF_Raid_t *) arg;
    569 	if (rf_parityLogDebug) {
    570 		printf("raid%d: ShutdownParityLoggingPool\n", raidPtr->raidid);
    571 	}
    572 	/* free contents of parityLogPool */
    573 	FreeParityLogQueue(raidPtr);
    574 	RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs *
    575 		raidPtr->numSectorsPerLog * raidPtr->bytesPerSector);
    576 }
    577 
    578 static void
    579 rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg)
    580 {
    581 	RF_Raid_t *raidPtr;
    582 
    583 	raidPtr = (RF_Raid_t *) arg;
    584 	if (rf_parityLogDebug) {
    585 		printf("raid%d: ShutdownParityLoggingRegionBufferPool\n",
    586 		       raidPtr->raidid);
    587 	}
    588 	FreeRegionBufferQueue(&raidPtr->regionBufferPool);
    589 }
    590 
    591 static void
    592 rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg)
    593 {
    594 	RF_Raid_t *raidPtr;
    595 
    596 	raidPtr = (RF_Raid_t *) arg;
    597 	if (rf_parityLogDebug) {
    598 		printf("raid%d: ShutdownParityLoggingParityBufferPool\n",
    599 		       raidPtr->raidid);
    600 	}
    601 	FreeRegionBufferQueue(&raidPtr->parityBufferPool);
    602 }
    603 
    604 static void
    605 rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg)
    606 {
    607 	RF_ParityLogData_t *d;
    608 	RF_CommonLogData_t *c;
    609 	RF_Raid_t *raidPtr;
    610 
    611 	raidPtr = (RF_Raid_t *) arg;
    612 	if (rf_parityLogDebug) {
    613 		printf("raid%d: ShutdownParityLoggingDiskQueue\n",
    614 		       raidPtr->raidid);
    615 	}
    616 	/* free disk manager stuff */
    617 	RF_ASSERT(raidPtr->parityLogDiskQueue.bufHead == NULL);
    618 	RF_ASSERT(raidPtr->parityLogDiskQueue.bufTail == NULL);
    619 	RF_ASSERT(raidPtr->parityLogDiskQueue.reintHead == NULL);
    620 	RF_ASSERT(raidPtr->parityLogDiskQueue.reintTail == NULL);
    621 	while (raidPtr->parityLogDiskQueue.freeDataList) {
    622 		d = raidPtr->parityLogDiskQueue.freeDataList;
    623 		raidPtr->parityLogDiskQueue.freeDataList =
    624 			raidPtr->parityLogDiskQueue.freeDataList->next;
    625 		RF_Free(d, sizeof(RF_ParityLogData_t));
    626 	}
    627 	while (raidPtr->parityLogDiskQueue.freeCommonList) {
    628 		c = raidPtr->parityLogDiskQueue.freeCommonList;
    629 		raidPtr->parityLogDiskQueue.freeCommonList = c->next;
    630 		/* init is in rf_paritylog.c */
    631 		rf_destroy_mutex2(c->mutex);
    632 		RF_Free(c, sizeof(RF_CommonLogData_t));
    633 	}
    634 
    635 	rf_destroy_mutex2(raidPtr->parityLogDiskQueue.mutex);
    636 	rf_destroy_cond2(raidPtr->parityLogDiskQueue.cond);
    637 }
    638 
    639 static void
    640 rf_ShutdownParityLogging(RF_ThreadArg_t arg)
    641 {
    642 	RF_Raid_t *raidPtr;
    643 
    644 	raidPtr = (RF_Raid_t *) arg;
    645 	if (rf_parityLogDebug) {
    646 		printf("raid%d: ShutdownParityLogging\n", raidPtr->raidid);
    647 	}
    648 	/* shutdown disk thread */
    649 	/* This has the desirable side-effect of forcing all regions to be
    650 	 * reintegrated.  This is necessary since all parity log maps are
    651 	 * currently held in volatile memory. */
    652 
    653 	rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
    654 	raidPtr->parityLogDiskQueue.threadState |= RF_PLOG_TERMINATE;
    655 	rf_signal_cond2(raidPtr->parityLogDiskQueue.cond);
    656 	rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
     657 	/*
     658          * pLogDiskThread will now terminate once its queues are cleared;
     659          * wait here until it signals that it is done.
     660          */
    661 	rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
    662 	while (!(raidPtr->parityLogDiskQueue.threadState & RF_PLOG_SHUTDOWN)) {
    663 		rf_wait_cond2(raidPtr->parityLogDiskQueue.cond,
    664 			      raidPtr->parityLogDiskQueue.mutex);
    665 	}
    666 	rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
    667 	if (rf_parityLogDebug) {
    668 		printf("raid%d: ShutdownParityLogging done (thread completed)\n", raidPtr->raidid);
    669 	}
    670 }
    671 
    672 int
    673 rf_GetDefaultNumFloatingReconBuffersParityLogging(RF_Raid_t * raidPtr)
    674 {
    675 	return (20);
    676 }
    677 
    678 RF_HeadSepLimit_t
    679 rf_GetDefaultHeadSepLimitParityLogging(RF_Raid_t * raidPtr)
    680 {
    681 	return (10);
    682 }
    683 /* return the region ID for a given RAID address */
    684 RF_RegionId_t
    685 rf_MapRegionIDParityLogging(
    686     RF_Raid_t * raidPtr,
    687     RF_SectorNum_t address)
    688 {
    689 	RF_RegionId_t regionID;
    690 
    691 /*  regionID = address / (raidPtr->regionParityRange * raidPtr->Layout.numDataCol); */
    692 	regionID = address / raidPtr->regionParityRange;
    693 	if (regionID == rf_numParityRegions) {
    694 		/* last region may be larger than other regions */
    695 		regionID--;
    696 	}
    697 	RF_ASSERT(address >= raidPtr->regionInfo[regionID].parityStartAddr);
    698 	RF_ASSERT(address < raidPtr->regionInfo[regionID].parityStartAddr +
    699 		  raidPtr->regionInfo[regionID].numSectorsParity);
    700 	RF_ASSERT(regionID < rf_numParityRegions);
    701 	return (regionID);
    702 }
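
/*
 * Illustration (hypothetical values): with rf_numParityRegions = 8,
 * sectorsPerDisk = 8006 and therefore regionParityRange = 1000, address
 * 7500 maps to region 7, while address 8003 first computes 8003 / 1000 = 8
 * and is then clamped back to region 7, the enlarged last region.
 */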
    703 
    704 
    705 /* given a logical RAID sector, determine physical disk address of data */
    706 void
    707 rf_MapSectorParityLogging(
    708     RF_Raid_t * raidPtr,
    709     RF_RaidAddr_t raidSector,
    710     RF_RowCol_t * col,
    711     RF_SectorNum_t * diskSector,
    712     int remap)
    713 {
    714 	RF_StripeNum_t SUID = raidSector /
    715 		raidPtr->Layout.sectorsPerStripeUnit;
    716 	/* *col = (SUID % (raidPtr->numCol -
    717 	 * raidPtr->Layout.numParityLogCol)); */
    718 	*col = SUID % raidPtr->Layout.numDataCol;
    719 	*diskSector = (SUID / (raidPtr->Layout.numDataCol)) *
    720 		raidPtr->Layout.sectorsPerStripeUnit +
    721 		(raidSector % raidPtr->Layout.sectorsPerStripeUnit);
    722 }
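
/*
 * Illustration (hypothetical values): with sectorsPerStripeUnit = 32 and
 * numDataCol = 3, raidSector 200 gives SUID = 200 / 32 = 6, hence
 * col = 6 % 3 = 0 and diskSector = (6 / 3) * 32 + (200 % 32) = 72.
 */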
    723 
    724 
    725 /* given a logical RAID sector, determine physical disk address of parity  */
    726 void
    727 rf_MapParityParityLogging(
    728     RF_Raid_t * raidPtr,
    729     RF_RaidAddr_t raidSector,
    730     RF_RowCol_t * col,
    731     RF_SectorNum_t * diskSector,
    732     int remap)
    733 {
    734 	RF_StripeNum_t SUID = raidSector /
    735 		raidPtr->Layout.sectorsPerStripeUnit;
    736 
     737 	/* *col = raidPtr->Layout.numDataCol -
     738 	 *     (SUID / raidPtr->Layout.numDataCol) %
     739 	 *     (raidPtr->numCol - raidPtr->Layout.numParityLogCol); */
    740 	*col = raidPtr->Layout.numDataCol;
    741 	*diskSector = (SUID / (raidPtr->Layout.numDataCol)) *
    742 		raidPtr->Layout.sectorsPerStripeUnit +
    743 		(raidSector % raidPtr->Layout.sectorsPerStripeUnit);
    744 }
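
/*
 * Note on column layout: data occupies columns 0 .. numDataCol - 1, the
 * dedicated parity column is numDataCol (== numCol - 2, cf.
 * rf_MapRegionParity below), and the dedicated parity log column is
 * numCol - 1 (cf. rf_MapLogParityLogging below).
 */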
    745 
    746 
    747 /* given a regionID and sector offset, determine the physical disk address of the parity log */
    748 void
    749 rf_MapLogParityLogging(
    750     RF_Raid_t * raidPtr,
    751     RF_RegionId_t regionID,
    752     RF_SectorNum_t regionOffset,
    753     RF_RowCol_t * col,
    754     RF_SectorNum_t * startSector)
    755 {
    756 	*col = raidPtr->numCol - 1;
    757 	*startSector = raidPtr->regionInfo[regionID].regionStartAddr + regionOffset;
    758 }
    759 
    760 
    761 /* given a regionID, determine the physical disk address of the logged
    762    parity for that region */
    763 void
    764 rf_MapRegionParity(
    765     RF_Raid_t * raidPtr,
    766     RF_RegionId_t regionID,
    767     RF_RowCol_t * col,
    768     RF_SectorNum_t * startSector,
    769     RF_SectorCount_t * numSector)
    770 {
    771 	*col = raidPtr->numCol - 2;
    772 	*startSector = raidPtr->regionInfo[regionID].parityStartAddr;
    773 	*numSector = raidPtr->regionInfo[regionID].numSectorsParity;
    774 }
    775 
    776 
    777 /* given a logical RAID address, determine the participating disks in
    778    the stripe */
    779 void
    780 rf_IdentifyStripeParityLogging(
    781     RF_Raid_t * raidPtr,
    782     RF_RaidAddr_t addr,
    783     RF_RowCol_t ** diskids)
    784 {
    785 	RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout,
    786 							   addr);
    787 	RF_ParityLoggingConfigInfo_t *info = (RF_ParityLoggingConfigInfo_t *)
    788 		raidPtr->Layout.layoutSpecificInfo;
    789 	*diskids = info->stripeIdentifier[stripeID % raidPtr->numCol];
    790 }
    791 
    792 
    793 void
    794 rf_MapSIDToPSIDParityLogging(
    795     RF_RaidLayout_t * layoutPtr,
    796     RF_StripeNum_t stripeID,
    797     RF_StripeNum_t * psID,
    798     RF_ReconUnitNum_t * which_ru)
    799 {
    800 	*which_ru = 0;
    801 	*psID = stripeID;
    802 }
    803 
    804 
     805 /* Select an algorithm for performing an access.  Returns, via
     806  * createFunc, a pointer to a function that will create the DAG
     807  * for this access.
     808  */
    809 void
    810 rf_ParityLoggingDagSelect(
    811     RF_Raid_t * raidPtr,
    812     RF_IoType_t type,
    813     RF_AccessStripeMap_t * asmp,
    814     RF_VoidFuncPtr * createFunc)
    815 {
    816 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
    817 	RF_PhysDiskAddr_t *failedPDA = NULL;
    818 	RF_RowCol_t fcol;
    819 	RF_RowStatus_t rstat;
    820 	int     prior_recon;
    821 
    822 	RF_ASSERT(RF_IO_IS_R_OR_W(type));
    823 
    824 	if (asmp->numDataFailed + asmp->numParityFailed > 1) {
    825 		RF_ERRORMSG("Multiple disks failed in a single group!  Aborting I/O operation.\n");
    826 		*createFunc = NULL;
    827 		return;
    828 	} else
    829 		if (asmp->numDataFailed + asmp->numParityFailed == 1) {
    830 
    831 			/* if under recon & already reconstructed, redirect
    832 			 * the access to the spare drive and eliminate the
    833 			 * failure indication */
    834 			failedPDA = asmp->failedPDAs[0];
    835 			fcol = failedPDA->col;
    836 			rstat = raidPtr->status;
    837 			prior_recon = (rstat == rf_rs_reconfigured) || (
    838 			    (rstat == rf_rs_reconstructing) ?
    839 			    rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, failedPDA->startSector) : 0
    840 			    );
    841 			if (prior_recon) {
    842 				RF_RowCol_t oc = failedPDA->col;
    843 				RF_SectorNum_t oo = failedPDA->startSector;
    844 				if (layoutPtr->map->flags &
    845 				    RF_DISTRIBUTE_SPARE) {
    846 					/* redirect to dist spare space */
    847 
    848 					if (failedPDA == asmp->parityInfo) {
    849 
    850 						/* parity has failed */
    851 						(layoutPtr->map->MapParity) (raidPtr, failedPDA->raidAddress,
    852 						    &failedPDA->col, &failedPDA->startSector, RF_REMAP);
    853 
    854 						if (asmp->parityInfo->next) {	/* redir 2nd component,
    855 										 * if any */
    856 							RF_PhysDiskAddr_t *p = asmp->parityInfo->next;
    857 							RF_SectorNum_t SUoffs = p->startSector % layoutPtr->sectorsPerStripeUnit;
    858 							p->col = failedPDA->col;
    859 							p->startSector = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->startSector) +
    860 							    SUoffs;	/* cheating:
    861 									 * startSector is not
    862 									 * really a RAID address */
    863 						}
    864 					} else
    865 						if (asmp->parityInfo->next && failedPDA == asmp->parityInfo->next) {
    866 							RF_ASSERT(0);	/* should not ever
    867 									 * happen */
    868 						} else {
    869 
    870 							/* data has failed */
    871 							(layoutPtr->map->MapSector) (raidPtr, failedPDA->raidAddress,
    872 							    &failedPDA->col, &failedPDA->startSector, RF_REMAP);
    873 
    874 						}
    875 
    876 				} else {
    877 					/* redirect to dedicated spare space */
    878 
    879 					failedPDA->col = raidPtr->Disks[fcol].spareCol;
    880 
    881 					/* the parity may have two distinct
    882 					 * components, both of which may need
    883 					 * to be redirected */
    884 					if (asmp->parityInfo->next) {
    885 						if (failedPDA == asmp->parityInfo) {
    886 							failedPDA->next->col = failedPDA->col;
    887 						} else
    888 							if (failedPDA == asmp->parityInfo->next) {	/* paranoid:  should never occur */
    889 								asmp->parityInfo->col = failedPDA->col;
    890 							}
    891 					}
    892 				}
    893 
    894 				RF_ASSERT(failedPDA->col != -1);
    895 
    896 				if (rf_dagDebug || rf_mapDebug) {
    897 					printf("raid%d: Redirected type '%c' c %d o %ld -> c %d o %ld\n",
    898 					    raidPtr->raidid, type, oc, (long) oo, failedPDA->col, (long) failedPDA->startSector);
    899 				}
    900 				asmp->numDataFailed = asmp->numParityFailed = 0;
    901 			}
    902 		}
    903 	if (type == RF_IO_TYPE_READ) {
    904 
    905 		if (asmp->numDataFailed == 0)
    906 			*createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;
    907 		else
    908 			*createFunc = (RF_VoidFuncPtr) rf_CreateRaidFiveDegradedReadDAG;
    909 
    910 	} else {
    911 
    912 
    913 		/* if mirroring, always use large writes.  If the access
    914 		 * requires two distinct parity updates, always do a small
    915 		 * write.  If the stripe contains a failure but the access
    916 		 * does not, do a small write. The first conditional
    917 		 * (numStripeUnitsAccessed <= numDataCol/2) uses a
    918 		 * less-than-or-equal rather than just a less-than because
    919 		 * when G is 3 or 4, numDataCol/2 is 1, and I want
    920 		 * single-stripe-unit updates to use just one disk. */
    921 		if ((asmp->numDataFailed + asmp->numParityFailed) == 0) {
    922 			if (((asmp->numStripeUnitsAccessed <=
    923 			      (layoutPtr->numDataCol / 2)) &&
    924 			     (layoutPtr->numDataCol != 1)) ||
    925 			    (asmp->parityInfo->next != NULL) ||
    926 			    rf_CheckStripeForFailures(raidPtr, asmp)) {
    927 				*createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingSmallWriteDAG;
    928 			} else
    929 				*createFunc = (RF_VoidFuncPtr) rf_CreateParityLoggingLargeWriteDAG;
    930 		} else
    931 			if (asmp->numParityFailed == 1)
    932 				*createFunc = (RF_VoidFuncPtr) rf_CreateNonRedundantWriteDAG;
    933 			else
    934 				if (asmp->numStripeUnitsAccessed != 1 && failedPDA->numSector != layoutPtr->sectorsPerStripeUnit)
    935 					*createFunc = NULL;
    936 				else
    937 					*createFunc = (RF_VoidFuncPtr) rf_CreateDegradedWriteDAG;
    938 	}
    939 }
    940 #endif				/* RF_INCLUDE_PARITYLOGGING > 0 */
    941