Home | History | Annotate | Line # | Download | only in raidframe
rf_driver.c revision 1.7
      1 /*	$NetBSD: rf_driver.c,v 1.7 1999/02/23 23:55:29 oster Exp $	*/
      2 /*
      3  * Copyright (c) 1995 Carnegie-Mellon University.
      4  * All rights reserved.
      5  *
      6  * Author: Mark Holland, Khalil Amiri, Claudson Bornstein, William V. Courtright II,
      7  *         Robby Findler, Daniel Stodolsky, Rachad Youssef, Jim Zelenka
      8  *
      9  * Permission to use, copy, modify and distribute this software and
     10  * its documentation is hereby granted, provided that both the copyright
     11  * notice and this permission notice appear in all copies of the
     12  * software, derivative works or modified versions, and any portions
     13  * thereof, and that both notices appear in supporting documentation.
     14  *
     15  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     16  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     17  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     18  *
     19  * Carnegie Mellon requests users of this software to return to
     20  *
     21  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     22  *  School of Computer Science
     23  *  Carnegie Mellon University
     24  *  Pittsburgh PA 15213-3890
     25  *
     26  * any improvements or extensions that they make and grant Carnegie the
     27  * rights to redistribute these changes.
     28  */
     29 
     30 /******************************************************************************
     31  *
     32  * rf_driver.c -- main setup, teardown, and access routines for the RAID driver
     33  *
     34  * all routines are prefixed with rf_ (raidframe), to avoid conflicts.
     35  *
     36  ******************************************************************************/
     37 
     38 
     39 #include <sys/types.h>
     40 #include <sys/param.h>
     41 #include <sys/systm.h>
     42 #include <sys/ioctl.h>
     43 #include <sys/fcntl.h>
     44 #include <sys/vnode.h>
     45 
     46 
     47 #include "rf_archs.h"
     48 #include "rf_threadstuff.h"
     49 
     50 #include <sys/errno.h>
     51 
     52 #include "rf_raid.h"
     53 #include "rf_dag.h"
     54 #include "rf_aselect.h"
     55 #include "rf_diskqueue.h"
     56 #include "rf_parityscan.h"
     57 #include "rf_alloclist.h"
     58 #include "rf_threadid.h"
     59 #include "rf_dagutils.h"
     60 #include "rf_utils.h"
     61 #include "rf_etimer.h"
     62 #include "rf_acctrace.h"
     63 #include "rf_configure.h"
     64 #include "rf_general.h"
     65 #include "rf_desc.h"
     66 #include "rf_states.h"
     67 #include "rf_freelist.h"
     68 #include "rf_decluster.h"
     69 #include "rf_map.h"
     70 #include "rf_diskthreads.h"
     71 #include "rf_revent.h"
     72 #include "rf_callback.h"
     73 #include "rf_engine.h"
     74 #include "rf_memchunk.h"
     75 #include "rf_mcpair.h"
     76 #include "rf_nwayxor.h"
     77 #include "rf_debugprint.h"
     78 #include "rf_copyback.h"
     79 #if !defined(__NetBSD__)
     80 #include "rf_camlayer.h"
     81 #endif
     82 #include "rf_driver.h"
     83 #include "rf_options.h"
     84 #include "rf_shutdown.h"
     85 #include "rf_sys.h"
     86 #include "rf_cpuutil.h"
     87 
     88 #include <sys/buf.h>
     89 
     90 #if DKUSAGE > 0
     91 #include <sys/dkusage.h>
     92 #include <io/common/iotypes.h>
     93 #include <io/cam/dec_cam.h>
     94 #include <io/cam/cam.h>
     95 #include <io/cam/pdrv.h>
     96 #endif				/* DKUSAGE > 0 */
     97 
     98 /* rad == RF_RaidAccessDesc_t */
     99 static RF_FreeList_t *rf_rad_freelist;
    100 #define RF_MAX_FREE_RAD 128
    101 #define RF_RAD_INC       16
    102 #define RF_RAD_INITIAL   32
    103 
    104 /* debug variables */
    105 char    rf_panicbuf[2048];	/* a buffer to hold an error msg when we panic */
    106 
    107 /* main configuration routines */
    108 static int raidframe_booted = 0;
    109 
    110 static void rf_ConfigureDebug(RF_Config_t * cfgPtr);
    111 static void set_debug_option(char *name, long val);
    112 static void rf_UnconfigureArray(void);
    113 static int init_rad(RF_RaidAccessDesc_t *);
    114 static void clean_rad(RF_RaidAccessDesc_t *);
    115 static void rf_ShutdownRDFreeList(void *);
    116 static int rf_ConfigureRDFreeList(RF_ShutdownList_t **);
    117 void rf_UnconfigureVnodes( RF_Raid_t * );
    118 
    119 
/*
 * File-scope lock and thread-id declarations, both produced by macros.
 * rf_printf_mutex is created in rf_Configure() on the global shutdown list.
 * NOTE(review): neither invocation is followed by a semicolon here;
 * presumably the macros supply their own -- confirm in the headers.
 */
RF_DECLARE_MUTEX(rf_printf_mutex)	/* debug only:  avoids interleaved
					 * printfs by different stripes */
RF_DECLARE_GLOBAL_THREADID	/* declarations for threadid.h */
    123 
    124 
/*
 * Quiescence sleep/wakeup helpers.  Both use the address of the array's
 * accesses_suspended field as the sleep channel.
 *
 * NOTE: the WAIT_FOR_QUIESCENCE expansion already ends in ';', so it must
 * only be used as a complete statement inside a braced block.
 */
#define SIGNAL_QUIESCENT_COND(_raid_)  wakeup(&((_raid_)->accesses_suspended))
#define WAIT_FOR_QUIESCENCE(_raid_) \
	tsleep(&((_raid_)->accesses_suspended),PRIBIO|PCATCH,"raidframe quiesce", 0);

/*
 * IO_BUF_ERR(bp, err, unit): fail a buf.  Sets B_ERROR, marks the whole
 * transfer as residual, stores err in b_error, calls RF_DKU_END_IO and
 * then biodone().  The two variants differ only in the arguments given to
 * RF_DKU_END_IO.  The expansion is a bare { } block (not do/while(0)) and
 * bp is not parenthesized, so pass a plain identifier and do not place it
 * directly before an `else'.
 */
#if DKUSAGE > 0
#define IO_BUF_ERR(bp, err, unit) { \
	bp->b_flags |= B_ERROR; \
	bp->b_resid = bp->b_bcount; \
	bp->b_error = err; \
	RF_DKU_END_IO(unit, bp); \
	biodone(bp); \
}
#else
#define IO_BUF_ERR(bp, err, unit) { \
	bp->b_flags |= B_ERROR; \
	bp->b_resid = bp->b_bcount; \
	bp->b_error = err; \
	RF_DKU_END_IO(unit); \
	biodone(bp); \
}
#endif				/* DKUSAGE > 0 */
    146 
    147 	static int configureCount = 0;	/* number of active configurations */
    148 	static int isconfigged = 0;	/* is basic raidframe (non per-array)
    149 					 * stuff configged */
    150 RF_DECLARE_STATIC_MUTEX(configureMutex)	/* used to lock the configuration
    151 					 * stuff */
    152 	static RF_ShutdownList_t *globalShutdown;	/* non array-specific
    153 							 * stuff */
    154 
    155 	static int rf_ConfigureRDFreeList(RF_ShutdownList_t ** listp);
    156 
    157 /* called at system boot time */
    158 int
    159 rf_BootRaidframe()
    160 {
    161 	int     rc;
    162 
    163 	if (raidframe_booted)
    164 		return (EBUSY);
    165 	raidframe_booted = 1;
    166 
    167 #if RF_DEBUG_ATOMIC > 0
    168 	rf_atent_init();
    169 #endif				/* RF_DEBUG_ATOMIC > 0 */
    170 
    171 	rf_setup_threadid();
    172 	rf_assign_threadid();
    173 
    174 	rc = rf_mutex_init(&configureMutex);
    175 	if (rc) {
    176 		RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
    177 		    __LINE__, rc);
    178 		RF_PANIC();
    179 	}
    180 	configureCount = 0;
    181 	isconfigged = 0;
    182 	globalShutdown = NULL;
    183 	return (0);
    184 }
    185 /*
    186  * This function is really just for debugging user-level stuff: it
    187  * frees up all memory, other RAIDframe resources which might otherwise
    188  * be kept around. This is used with systems like "sentinel" to detect
    189  * memory leaks.
    190  */
    191 int
    192 rf_UnbootRaidframe()
    193 {
    194 	int     rc;
    195 
    196 	RF_LOCK_MUTEX(configureMutex);
    197 	if (configureCount) {
    198 		RF_UNLOCK_MUTEX(configureMutex);
    199 		return (EBUSY);
    200 	}
    201 	raidframe_booted = 0;
    202 	RF_UNLOCK_MUTEX(configureMutex);
    203 	rc = rf_mutex_destroy(&configureMutex);
    204 	if (rc) {
    205 		RF_ERRORMSG3("Unable to destroy mutex file %s line %d rc=%d\n", __FILE__,
    206 		    __LINE__, rc);
    207 		RF_PANIC();
    208 	}
    209 #if RF_DEBUG_ATOMIC > 0
    210 	rf_atent_shutdown();
    211 #endif				/* RF_DEBUG_ATOMIC > 0 */
    212 	return (0);
    213 }
    214 /*
    215  * Called whenever an array is shutdown
    216  */
    217 static void
    218 rf_UnconfigureArray()
    219 {
    220 	int     rc;
    221 
    222 	RF_LOCK_MUTEX(configureMutex);
    223 	if (--configureCount == 0) {	/* if no active configurations, shut
    224 					 * everything down */
    225 		isconfigged = 0;
    226 
    227 		rc = rf_ShutdownList(&globalShutdown);
    228 		if (rc) {
    229 			RF_ERRORMSG1("RAIDFRAME: unable to do global shutdown, rc=%d\n", rc);
    230 		}
    231 		rf_shutdown_threadid();
    232 
    233 		/*
    234 	         * We must wait until now, because the AllocList module
    235 	         * uses the DebugMem module.
    236 	         */
    237 		if (rf_memDebug)
    238 			rf_print_unfreed();
    239 	}
    240 	RF_UNLOCK_MUTEX(configureMutex);
    241 }
    242 /*
    243  * Called to shut down an array.
    244  */
    245 int
    246 rf_Shutdown(raidPtr)
    247 	RF_Raid_t *raidPtr;
    248 {
    249 
    250 	if (!raidPtr->valid) {
    251 		RF_ERRORMSG("Attempt to shut down unconfigured RAIDframe driver.  Aborting shutdown\n");
    252 		return (EINVAL);
    253 	}
    254 	/*
    255          * wait for outstanding IOs to land
    256          * As described in rf_raid.h, we use the rad_freelist lock
    257          * to protect the per-array info about outstanding descs
    258          * since we need to do freelist locking anyway, and this
    259          * cuts down on the amount of serialization we've got going
    260          * on.
    261          */
    262 	RF_FREELIST_DO_LOCK(rf_rad_freelist);
    263 	if (raidPtr->waitShutdown) {
    264 		RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
    265 		return (EBUSY);
    266 	}
    267 	raidPtr->waitShutdown = 1;
    268 	while (raidPtr->nAccOutstanding) {
    269 		RF_WAIT_COND(raidPtr->outstandingCond, RF_FREELIST_MUTEX_OF(rf_rad_freelist));
    270 	}
    271 	RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
    272 
    273 	raidPtr->valid = 0;
    274 
    275 
    276 	rf_UnconfigureVnodes(raidPtr);
    277 
    278 	rf_ShutdownList(&raidPtr->shutdownList);
    279 
    280 	rf_UnconfigureArray();
    281 
    282 	return (0);
    283 }
    284 
    285 void
    286 rf_UnconfigureVnodes( raidPtr )
    287 	RF_Raid_t *raidPtr;
    288 {
    289 	int r,c;
    290 	struct proc *p;
    291 
    292 
    293 	/* We take this opportunity to close the vnodes like we should.. */
    294 
    295 	p = raidPtr->proc;	/* XXX */
    296 
    297 	for (r = 0; r < raidPtr->numRow; r++) {
    298 		for (c = 0; c < raidPtr->numCol; c++) {
    299 			printf("Closing vnode for row: %d col: %d\n", r, c);
    300 			if (raidPtr->raid_cinfo[r][c].ci_vp) {
    301 				VOP_UNLOCK(raidPtr->raid_cinfo[r][c].ci_vp, 0);
    302 				(void) vn_close(raidPtr->raid_cinfo[r][c].ci_vp,
    303 				    FREAD | FWRITE, p->p_ucred, p);
    304 			} else {
    305 				printf("vnode was NULL\n");
    306 			}
    307 
    308 		}
    309 	}
    310 	for (r = 0; r < raidPtr->numSpare; r++) {
    311 		printf("Closing vnode for spare: %d\n", r);
    312 		if (raidPtr->raid_cinfo[0][raidPtr->numCol + r].ci_vp) {
    313 			VOP_UNLOCK(raidPtr->raid_cinfo[0][raidPtr->numCol + r].ci_vp, 0);
    314 			(void) vn_close(raidPtr->raid_cinfo[0][raidPtr->numCol + r].ci_vp,
    315 			    FREAD | FWRITE, p->p_ucred, p);
    316 		} else {
    317 			printf("vnode was NULL\n");
    318 		}
    319 	}
    320 
    321 
    322 }
    323 
    324 
/*
 * Helper macros for rf_Configure().  All of them assign to a local `rc'
 * in the expanding function and RETURN from that function on failure;
 * the per-array ones also expect locals named `raidPtr' and `cfgPtr'.
 */

/*
 * Run global configure routine f on the global shutdown list.  On
 * failure: log, run the global shutdown list, undo the configureCount
 * increment, release configureMutex (which the caller must hold) and
 * return rc.
 */
#define DO_INIT_CONFIGURE(f) { \
	rc = f (&globalShutdown); \
	if (rc) { \
		RF_ERRORMSG2("RAIDFRAME: failed %s with %d\n", RF_STRING(f), rc); \
		rf_ShutdownList(&globalShutdown); \
		configureCount--; \
		RF_UNLOCK_MUTEX(configureMutex); \
		return(rc); \
	} \
}

/*
 * Per-array failure cleanup: run the array's shutdown list and drop the
 * array's reference on the global configuration.  Does not return.
 */
#define DO_RAID_FAIL() { \
	rf_ShutdownList(&raidPtr->shutdownList); \
	rf_UnconfigureArray(); \
}

/* Run per-array configure routine f; on failure clean up and return rc. */
#define DO_RAID_INIT_CONFIGURE(f) { \
	rc = f (&raidPtr->shutdownList, raidPtr, cfgPtr); \
	if (rc) { \
		RF_ERRORMSG2("RAIDFRAME: failed %s with %d\n", RF_STRING(f), rc); \
		DO_RAID_FAIL(); \
		return(rc); \
	} \
}

/*
 * Create mutex _m_ on the array's shutdown list; on failure clean up
 * and return rc.
 */
#define DO_RAID_MUTEX(_m_) { \
	rc = rf_create_managed_mutex(&raidPtr->shutdownList, (_m_)); \
	if (rc) { \
		RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", \
			__FILE__, __LINE__, rc); \
		DO_RAID_FAIL(); \
		return(rc); \
	} \
}

/*
 * Create condition variable _c_ on the array's shutdown list; on failure
 * clean up and return rc.
 */
#define DO_RAID_COND(_c_) { \
	rc = rf_create_managed_cond(&raidPtr->shutdownList, (_c_)); \
	if (rc) { \
		RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", \
			__FILE__, __LINE__, rc); \
		DO_RAID_FAIL(); \
		return(rc); \
	} \
}
    369 
    370 int
    371 rf_Configure(raidPtr, cfgPtr)
    372 	RF_Raid_t *raidPtr;
    373 	RF_Config_t *cfgPtr;
    374 {
    375 	RF_RowCol_t row, col;
    376 	int     i, rc;
    377 	int     unit;
    378 	struct proc *p;
    379 
    380 	if (raidPtr->valid) {
    381 		RF_ERRORMSG("RAIDframe configuration not shut down.  Aborting configure.\n");
    382 		return (EINVAL);
    383 	}
    384 	RF_LOCK_MUTEX(configureMutex);
    385 	configureCount++;
    386 	if (isconfigged == 0) {
    387 		rc = rf_create_managed_mutex(&globalShutdown, &rf_printf_mutex);
    388 		if (rc) {
    389 			RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
    390 			    __LINE__, rc);
    391 			rf_ShutdownList(&globalShutdown);
    392 			return (rc);
    393 		}
    394 		/* initialize globals */
    395 		printf("RAIDFRAME: protectedSectors is %ld\n", rf_protectedSectors);
    396 
    397 		rf_clear_debug_print_buffer();
    398 
    399 		DO_INIT_CONFIGURE(rf_ConfigureAllocList);
    400 		DO_INIT_CONFIGURE(rf_ConfigureEtimer);
    401 		/*
    402 	         * Yes, this does make debugging general to the whole system instead
    403 	         * of being array specific. Bummer, drag.
    404 	         */
    405 		rf_ConfigureDebug(cfgPtr);
    406 		DO_INIT_CONFIGURE(rf_ConfigureDebugMem);
    407 		DO_INIT_CONFIGURE(rf_ConfigureAccessTrace);
    408 		DO_INIT_CONFIGURE(rf_ConfigureMapModule);
    409 		DO_INIT_CONFIGURE(rf_ConfigureReconEvent);
    410 		DO_INIT_CONFIGURE(rf_ConfigureCallback);
    411 		DO_INIT_CONFIGURE(rf_ConfigureMemChunk);
    412 		DO_INIT_CONFIGURE(rf_ConfigureRDFreeList);
    413 		DO_INIT_CONFIGURE(rf_ConfigureNWayXor);
    414 		DO_INIT_CONFIGURE(rf_ConfigureStripeLockFreeList);
    415 		DO_INIT_CONFIGURE(rf_ConfigureMCPair);
    416 #if !defined(__NetBSD__)
    417 		DO_INIT_CONFIGURE(rf_ConfigureCamLayer);
    418 #endif
    419 		DO_INIT_CONFIGURE(rf_ConfigureDAGs);
    420 		DO_INIT_CONFIGURE(rf_ConfigureDAGFuncs);
    421 		DO_INIT_CONFIGURE(rf_ConfigureDebugPrint);
    422 		DO_INIT_CONFIGURE(rf_ConfigureReconstruction);
    423 		DO_INIT_CONFIGURE(rf_ConfigureCopyback);
    424 		DO_INIT_CONFIGURE(rf_ConfigureDiskQueueSystem);
    425 		DO_INIT_CONFIGURE(rf_ConfigureCpuMonitor);
    426 		isconfigged = 1;
    427 	}
    428 	RF_UNLOCK_MUTEX(configureMutex);
    429 
    430 	/*
    431          * Null out the entire raid descriptor to avoid problems when we reconfig.
    432          * This also clears the valid bit.
    433          */
    434 	/* XXX this clearing should be moved UP to outside of here.... that,
    435 	 * or rf_Configure() needs to take more arguments... XXX */
    436 	unit = raidPtr->raidid;
    437 	p = raidPtr->proc;	/* XXX save these... */
    438 	bzero((char *) raidPtr, sizeof(RF_Raid_t));
    439 	raidPtr->raidid = unit;
    440 	raidPtr->proc = p;	/* XXX and then recover them.. */
    441 	DO_RAID_MUTEX(&raidPtr->mutex);
    442 	/* set up the cleanup list.  Do this after ConfigureDebug so that
    443 	 * value of memDebug will be set */
    444 
    445 	rf_MakeAllocList(raidPtr->cleanupList);
    446 	if (raidPtr->cleanupList == NULL) {
    447 		DO_RAID_FAIL();
    448 		return (ENOMEM);
    449 	}
    450 	rc = rf_ShutdownCreate(&raidPtr->shutdownList,
    451 	    (void (*) (void *)) rf_FreeAllocList,
    452 	    raidPtr->cleanupList);
    453 	if (rc) {
    454 		RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
    455 		    __FILE__, __LINE__, rc);
    456 		DO_RAID_FAIL();
    457 		return (rc);
    458 	}
    459 	raidPtr->numRow = cfgPtr->numRow;
    460 	raidPtr->numCol = cfgPtr->numCol;
    461 	raidPtr->numSpare = cfgPtr->numSpare;
    462 
    463 	/* XXX we don't even pretend to support more than one row in the
    464 	 * kernel... */
    465 	if (raidPtr->numRow != 1) {
    466 		RF_ERRORMSG("Only one row supported in kernel.\n");
    467 		DO_RAID_FAIL();
    468 		return (EINVAL);
    469 	}
    470 	RF_CallocAndAdd(raidPtr->status, raidPtr->numRow, sizeof(RF_RowStatus_t),
    471 	    (RF_RowStatus_t *), raidPtr->cleanupList);
    472 	if (raidPtr->status == NULL) {
    473 		DO_RAID_FAIL();
    474 		return (ENOMEM);
    475 	}
    476 	RF_CallocAndAdd(raidPtr->reconControl, raidPtr->numRow,
    477 	    sizeof(RF_ReconCtrl_t *), (RF_ReconCtrl_t **), raidPtr->cleanupList);
    478 	if (raidPtr->reconControl == NULL) {
    479 		DO_RAID_FAIL();
    480 		return (ENOMEM);
    481 	}
    482 	for (i = 0; i < raidPtr->numRow; i++) {
    483 		raidPtr->status[i] = rf_rs_optimal;
    484 		raidPtr->reconControl[i] = NULL;
    485 	}
    486 
    487 	DO_RAID_INIT_CONFIGURE(rf_ConfigureEngine);
    488 	DO_RAID_INIT_CONFIGURE(rf_ConfigureStripeLocks);
    489 
    490 	DO_RAID_COND(&raidPtr->outstandingCond);
    491 
    492 	raidPtr->nAccOutstanding = 0;
    493 	raidPtr->waitShutdown = 0;
    494 
    495 	DO_RAID_MUTEX(&raidPtr->access_suspend_mutex);
    496 	DO_RAID_COND(&raidPtr->quiescent_cond);
    497 
    498 	DO_RAID_COND(&raidPtr->waitForReconCond);
    499 
    500 	DO_RAID_MUTEX(&raidPtr->recon_done_proc_mutex);
    501 	DO_RAID_INIT_CONFIGURE(rf_ConfigureDisks);
    502 	DO_RAID_INIT_CONFIGURE(rf_ConfigureSpareDisks);
    503 	/* do this after ConfigureDisks & ConfigureSpareDisks to be sure dev
    504 	 * no. is set */
    505 	DO_RAID_INIT_CONFIGURE(rf_ConfigureDiskQueues);
    506 
    507 	DO_RAID_INIT_CONFIGURE(rf_ConfigureLayout);
    508 
    509 	DO_RAID_INIT_CONFIGURE(rf_ConfigurePSStatus);
    510 
    511 	for (row = 0; row < raidPtr->numRow; row++) {
    512 		for (col = 0; col < raidPtr->numCol; col++) {
    513 			/*
    514 		         * XXX better distribution
    515 		         */
    516 			raidPtr->hist_diskreq[row][col] = 0;
    517 		}
    518 	}
    519 
    520 	if (rf_keepAccTotals) {
    521 		raidPtr->keep_acc_totals = 1;
    522 	}
    523 	rf_StartUserStats(raidPtr);
    524 
    525 	raidPtr->valid = 1;
    526 	return (0);
    527 }
    528 
    529 static int
    530 init_rad(desc)
    531 	RF_RaidAccessDesc_t *desc;
    532 {
    533 	int     rc;
    534 
    535 	rc = rf_mutex_init(&desc->mutex);
    536 	if (rc) {
    537 		RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
    538 		    __LINE__, rc);
    539 		return (rc);
    540 	}
    541 	rc = rf_cond_init(&desc->cond);
    542 	if (rc) {
    543 		RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__,
    544 		    __LINE__, rc);
    545 		rf_mutex_destroy(&desc->mutex);
    546 		return (rc);
    547 	}
    548 	return (0);
    549 }
    550 
    551 static void
    552 clean_rad(desc)
    553 	RF_RaidAccessDesc_t *desc;
    554 {
    555 	rf_mutex_destroy(&desc->mutex);
    556 	rf_cond_destroy(&desc->cond);
    557 }
    558 
    559 static void
    560 rf_ShutdownRDFreeList(ignored)
    561 	void   *ignored;
    562 {
    563 	RF_FREELIST_DESTROY_CLEAN(rf_rad_freelist, next, (RF_RaidAccessDesc_t *), clean_rad);
    564 }
    565 
    566 static int
    567 rf_ConfigureRDFreeList(listp)
    568 	RF_ShutdownList_t **listp;
    569 {
    570 	int     rc;
    571 
    572 	RF_FREELIST_CREATE(rf_rad_freelist, RF_MAX_FREE_RAD,
    573 	    RF_RAD_INC, sizeof(RF_RaidAccessDesc_t));
    574 	if (rf_rad_freelist == NULL) {
    575 		return (ENOMEM);
    576 	}
    577 	rc = rf_ShutdownCreate(listp, rf_ShutdownRDFreeList, NULL);
    578 	if (rc) {
    579 		RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__,
    580 		    __LINE__, rc);
    581 		rf_ShutdownRDFreeList(NULL);
    582 		return (rc);
    583 	}
    584 	RF_FREELIST_PRIME_INIT(rf_rad_freelist, RF_RAD_INITIAL, next,
    585 	    (RF_RaidAccessDesc_t *), init_rad);
    586 	return (0);
    587 }
    588 
    589 RF_RaidAccessDesc_t *
    590 rf_AllocRaidAccDesc(
    591     RF_Raid_t * raidPtr,
    592     RF_IoType_t type,
    593     RF_RaidAddr_t raidAddress,
    594     RF_SectorCount_t numBlocks,
    595     caddr_t bufPtr,
    596     void *bp,
    597     RF_DagHeader_t ** paramDAG,
    598     RF_AccessStripeMapHeader_t ** paramASM,
    599     RF_RaidAccessFlags_t flags,
    600     void (*cbF) (struct buf *),
    601     void *cbA,
    602     RF_AccessState_t * states)
    603 {
    604 	RF_RaidAccessDesc_t *desc;
    605 
    606 	RF_FREELIST_GET_INIT_NOUNLOCK(rf_rad_freelist, desc, next, (RF_RaidAccessDesc_t *), init_rad);
    607 	if (raidPtr->waitShutdown) {
    608 		/*
    609 	         * Actually, we're shutting the array down. Free the desc
    610 	         * and return NULL.
    611 	         */
    612 		RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
    613 		RF_FREELIST_FREE_CLEAN(rf_rad_freelist, desc, next, clean_rad);
    614 		return (NULL);
    615 	}
    616 	raidPtr->nAccOutstanding++;
    617 	RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
    618 
    619 	desc->raidPtr = (void *) raidPtr;
    620 	desc->type = type;
    621 	desc->raidAddress = raidAddress;
    622 	desc->numBlocks = numBlocks;
    623 	desc->bufPtr = bufPtr;
    624 	desc->bp = bp;
    625 	desc->paramDAG = paramDAG;
    626 	desc->paramASM = paramASM;
    627 	desc->flags = flags;
    628 	desc->states = states;
    629 	desc->state = 0;
    630 
    631 	desc->status = 0;
    632 	bzero((char *) &desc->tracerec, sizeof(RF_AccTraceEntry_t));
    633 	desc->callbackFunc = (void (*) (RF_CBParam_t)) cbF;	/* XXX */
    634 	desc->callbackArg = cbA;
    635 	desc->next = NULL;
    636 	desc->head = desc;
    637 	desc->numPending = 0;
    638 	desc->cleanupList = NULL;
    639 	rf_MakeAllocList(desc->cleanupList);
    640 	rf_get_threadid(desc->tid);
    641 	return (desc);
    642 }
    643 
    644 void
    645 rf_FreeRaidAccDesc(RF_RaidAccessDesc_t * desc)
    646 {
    647 	RF_Raid_t *raidPtr = desc->raidPtr;
    648 
    649 	RF_ASSERT(desc);
    650 
    651 	rf_FreeAllocList(desc->cleanupList);
    652 	RF_FREELIST_FREE_CLEAN_NOUNLOCK(rf_rad_freelist, desc, next, clean_rad);
    653 	raidPtr->nAccOutstanding--;
    654 	if (raidPtr->waitShutdown) {
    655 		RF_SIGNAL_COND(raidPtr->outstandingCond);
    656 	}
    657 	RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
    658 }
    659 /*********************************************************************
    660  * Main routine for performing an access.
    661  * Accesses are retried until a DAG can not be selected.  This occurs
    662  * when either the DAG library is incomplete or there are too many
    663  * failures in a parity group.
    664  ********************************************************************/
    665 int
    666 rf_DoAccess(
    667     RF_Raid_t * raidPtr,
    668     RF_IoType_t type,
    669     int async_flag,
    670     RF_RaidAddr_t raidAddress,
    671     RF_SectorCount_t numBlocks,
    672     caddr_t bufPtr,
    673     void *bp_in,
    674     RF_DagHeader_t ** paramDAG,
    675     RF_AccessStripeMapHeader_t ** paramASM,
    676     RF_RaidAccessFlags_t flags,
    677     RF_RaidAccessDesc_t ** paramDesc,
    678     void (*cbF) (struct buf *),
    679     void *cbA)
    680 /*
    681 type should be read or write
    682 async_flag should be RF_TRUE or RF_FALSE
    683 bp_in is a buf pointer.  void * to facilitate ignoring it outside the kernel
    684 */
    685 {
    686 	int     tid;
    687 	RF_RaidAccessDesc_t *desc;
    688 	caddr_t lbufPtr = bufPtr;
    689 	struct buf *bp = (struct buf *) bp_in;
    690 #if DFSTRACE > 0
    691 	struct {
    692 		RF_uint64 raidAddr;
    693 		int     numBlocks;
    694 		char    type;
    695 	}       dfsrecord;
    696 #endif				/* DFSTRACE > 0 */
    697 
    698 	raidAddress += rf_raidSectorOffset;
    699 
    700 	if (!raidPtr->valid) {
    701 		RF_ERRORMSG("RAIDframe driver not successfully configured.  Rejecting access.\n");
    702 		IO_BUF_ERR(bp, EINVAL, raidPtr->raidid);
    703 		return (EINVAL);
    704 	}
    705 #if defined(KERNEL) && DFSTRACE > 0
    706 	if (rf_DFSTraceAccesses) {
    707 		dfsrecord.raidAddr = raidAddress;
    708 		dfsrecord.numBlocks = numBlocks;
    709 		dfsrecord.type = type;
    710 		dfs_log(DFS_NOTE, (char *) &dfsrecord, sizeof(dfsrecord), 0);
    711 	}
    712 #endif				/* KERNEL && DFSTRACE > 0 */
    713 
    714 	rf_get_threadid(tid);
    715 	if (rf_accessDebug) {
    716 
    717 		printf("logBytes is: %d %d %d\n", raidPtr->raidid,
    718 		    raidPtr->logBytesPerSector,
    719 		    (int) rf_RaidAddressToByte(raidPtr, numBlocks));
    720 		printf("[%d] %s raidAddr %d (stripeid %d-%d) numBlocks %d (%d bytes) buf 0x%lx\n", tid,
    721 		    (type == RF_IO_TYPE_READ) ? "READ" : "WRITE", (int) raidAddress,
    722 		    (int) rf_RaidAddressToStripeID(&raidPtr->Layout, raidAddress),
    723 		    (int) rf_RaidAddressToStripeID(&raidPtr->Layout, raidAddress + numBlocks - 1),
    724 		    (int) numBlocks,
    725 		    (int) rf_RaidAddressToByte(raidPtr, numBlocks),
    726 		    (long) bufPtr);
    727 	}
    728 	if (raidAddress + numBlocks > raidPtr->totalSectors) {
    729 
    730 		printf("DoAccess: raid addr %lu too large to access %lu sectors.  Max legal addr is %lu\n",
    731 		    (u_long) raidAddress, (u_long) numBlocks, (u_long) raidPtr->totalSectors);
    732 
    733 		if (type == RF_IO_TYPE_READ) {
    734 			IO_BUF_ERR(bp, ENOSPC, raidPtr->raidid);
    735 			return (ENOSPC);
    736 		} else {
    737 			IO_BUF_ERR(bp, ENOSPC, raidPtr->raidid);
    738 			return (ENOSPC);
    739 		}
    740 	}
    741 	desc = rf_AllocRaidAccDesc(raidPtr, type, raidAddress,
    742 	    numBlocks, lbufPtr, bp, paramDAG, paramASM,
    743 	    flags, cbF, cbA, raidPtr->Layout.map->states);
    744 
    745 	if (desc == NULL) {
    746 		return (ENOMEM);
    747 	}
    748 	RF_ETIMER_START(desc->tracerec.tot_timer);
    749 
    750 	desc->async_flag = async_flag;
    751 
    752 	rf_ContinueRaidAccess(desc);
    753 
    754 	return (0);
    755 }
    756 /* force the array into reconfigured mode without doing reconstruction */
    757 int
    758 rf_SetReconfiguredMode(raidPtr, row, col)
    759 	RF_Raid_t *raidPtr;
    760 	int     row;
    761 	int     col;
    762 {
    763 	if (!(raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
    764 		printf("Can't set reconfigured mode in dedicated-spare array\n");
    765 		RF_PANIC();
    766 	}
    767 	RF_LOCK_MUTEX(raidPtr->mutex);
    768 	raidPtr->numFailures++;
    769 	raidPtr->Disks[row][col].status = rf_ds_dist_spared;
    770 	raidPtr->status[row] = rf_rs_reconfigured;
    771 	/* install spare table only if declustering + distributed sparing
    772 	 * architecture. */
    773 	if (raidPtr->Layout.map->flags & RF_BD_DECLUSTERED)
    774 		rf_InstallSpareTable(raidPtr, row, col);
    775 	RF_UNLOCK_MUTEX(raidPtr->mutex);
    776 	return (0);
    777 }
    778 
/*
 * Variables defined outside this file.  None of them is referenced in
 * the part of the file reviewed here.
 * NOTE(review): presumably fault-injection / delayed-reconstruction
 * knobs -- confirm where they are defined and used.
 */
extern int fail_row, fail_col, fail_time;
extern int delayed_recon;
    781 
    782 int
    783 rf_FailDisk(
    784     RF_Raid_t * raidPtr,
    785     int frow,
    786     int fcol,
    787     int initRecon)
    788 {
    789 	int     tid;
    790 
    791 	rf_get_threadid(tid);
    792 	printf("[%d] Failing disk r%d c%d\n", tid, frow, fcol);
    793 	RF_LOCK_MUTEX(raidPtr->mutex);
    794 	raidPtr->numFailures++;
    795 	raidPtr->Disks[frow][fcol].status = rf_ds_failed;
    796 	raidPtr->status[frow] = rf_rs_degraded;
    797 	RF_UNLOCK_MUTEX(raidPtr->mutex);
    798 	if (initRecon)
    799 		rf_ReconstructFailedDisk(raidPtr, frow, fcol);
    800 	return (0);
    801 }
    802 /* releases a thread that is waiting for the array to become quiesced.
    803  * access_suspend_mutex should be locked upon calling this
    804  */
    805 void
    806 rf_SignalQuiescenceLock(raidPtr, reconDesc)
    807 	RF_Raid_t *raidPtr;
    808 	RF_RaidReconDesc_t *reconDesc;
    809 {
    810 	int     tid;
    811 
    812 	if (rf_quiesceDebug) {
    813 		rf_get_threadid(tid);
    814 		printf("[%d] Signalling quiescence lock\n", tid);
    815 	}
    816 	raidPtr->access_suspend_release = 1;
    817 
    818 	if (raidPtr->waiting_for_quiescence) {
    819 		SIGNAL_QUIESCENT_COND(raidPtr);
    820 	}
    821 }
    822 /* suspends all new requests to the array.  No effect on accesses that are in flight.  */
    823 int
    824 rf_SuspendNewRequestsAndWait(raidPtr)
    825 	RF_Raid_t *raidPtr;
    826 {
    827 	if (rf_quiesceDebug)
    828 		printf("Suspending new reqs\n");
    829 
    830 	RF_LOCK_MUTEX(raidPtr->access_suspend_mutex);
    831 	raidPtr->accesses_suspended++;
    832 	raidPtr->waiting_for_quiescence = (raidPtr->accs_in_flight == 0) ? 0 : 1;
    833 
    834 	if (raidPtr->waiting_for_quiescence) {
    835 		raidPtr->access_suspend_release = 0;
    836 		while (!raidPtr->access_suspend_release) {
    837 			printf("Suspending: Waiting for Quiesence\n");
    838 			WAIT_FOR_QUIESCENCE(raidPtr);
    839 			raidPtr->waiting_for_quiescence = 0;
    840 		}
    841 	}
    842 	printf("Quiesence reached..\n");
    843 
    844 	RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex);
    845 	return (raidPtr->waiting_for_quiescence);
    846 }
    847 /* wake up everyone waiting for quiescence to be released */
    848 void
    849 rf_ResumeNewRequests(raidPtr)
    850 	RF_Raid_t *raidPtr;
    851 {
    852 	RF_CallbackDesc_t *t, *cb;
    853 
    854 	if (rf_quiesceDebug)
    855 		printf("Resuming new reqs\n");
    856 
    857 	RF_LOCK_MUTEX(raidPtr->access_suspend_mutex);
    858 	raidPtr->accesses_suspended--;
    859 	if (raidPtr->accesses_suspended == 0)
    860 		cb = raidPtr->quiesce_wait_list;
    861 	else
    862 		cb = NULL;
    863 	raidPtr->quiesce_wait_list = NULL;
    864 	RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex);
    865 
    866 	while (cb) {
    867 		t = cb;
    868 		cb = cb->next;
    869 		(t->callbackFunc) (t->callbackArg);
    870 		rf_FreeCallbackDesc(t);
    871 	}
    872 }
    873 /*****************************************************************************************
    874  *
    875  * debug routines
    876  *
    877  ****************************************************************************************/
    878 
    879 static void
    880 set_debug_option(name, val)
    881 	char   *name;
    882 	long    val;
    883 {
    884 	RF_DebugName_t *p;
    885 
    886 	for (p = rf_debugNames; p->name; p++) {
    887 		if (!strcmp(p->name, name)) {
    888 			*(p->ptr) = val;
    889 			printf("[Set debug variable %s to %ld]\n", name, val);
    890 			return;
    891 		}
    892 	}
    893 	RF_ERRORMSG1("Unknown debug string \"%s\"\n", name);
    894 }
    895 
    896 
    897 /* would like to use sscanf here, but apparently not available in kernel */
    898 /*ARGSUSED*/
    899 static void
    900 rf_ConfigureDebug(cfgPtr)
    901 	RF_Config_t *cfgPtr;
    902 {
    903 	char   *val_p, *name_p, *white_p;
    904 	long    val;
    905 	int     i;
    906 
    907 	rf_ResetDebugOptions();
    908 	for (i = 0; cfgPtr->debugVars[i][0] && i < RF_MAXDBGV; i++) {
    909 		name_p = rf_find_non_white(&cfgPtr->debugVars[i][0]);
    910 		white_p = rf_find_white(name_p);	/* skip to start of 2nd
    911 							 * word */
    912 		val_p = rf_find_non_white(white_p);
    913 		if (*val_p == '0' && *(val_p + 1) == 'x')
    914 			val = rf_htoi(val_p + 2);
    915 		else
    916 			val = rf_atoi(val_p);
    917 		*white_p = '\0';
    918 		set_debug_option(name_p, val);
    919 	}
    920 }
    921 /* performance monitoring stuff */
    922 
/*
 * Convert a struct-timeval-like lvalue (anything with tv_sec and tv_usec
 * members) to a count of microseconds, as a long.  The argument is
 * parenthesized so that expressions such as *p or arr[i] expand correctly.
 * Note that the argument is evaluated twice.
 */
#define TIMEVAL_TO_US(t) (((long) (t).tv_sec) * 1000000L + (long) (t).tv_usec)
    924 
    925 #if !defined(_KERNEL) && !defined(SIMULATE)
    926 
    927 /*
    928  * Throughput stats currently only used in user-level RAIDframe
    929  */
    930 
    931 static int
    932 rf_InitThroughputStats(
    933     RF_ShutdownList_t ** listp,
    934     RF_Raid_t * raidPtr,
    935     RF_Config_t * cfgPtr)
    936 {
    937 	int     rc;
    938 
    939 	/* these used by user-level raidframe only */
    940 	rc = rf_create_managed_mutex(listp, &raidPtr->throughputstats.mutex);
    941 	if (rc) {
    942 		RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
    943 		    __LINE__, rc);
    944 		return (rc);
    945 	}
    946 	raidPtr->throughputstats.sum_io_us = 0;
    947 	raidPtr->throughputstats.num_ios = 0;
    948 	raidPtr->throughputstats.num_out_ios = 0;
    949 	return (0);
    950 }
    951 
    952 void
    953 rf_StartThroughputStats(RF_Raid_t * raidPtr)
    954 {
    955 	RF_LOCK_MUTEX(raidPtr->throughputstats.mutex);
    956 	raidPtr->throughputstats.num_ios++;
    957 	raidPtr->throughputstats.num_out_ios++;
    958 	if (raidPtr->throughputstats.num_out_ios == 1)
    959 		RF_GETTIME(raidPtr->throughputstats.start);
    960 	RF_UNLOCK_MUTEX(raidPtr->throughputstats.mutex);
    961 }
    962 
    963 static void
    964 rf_StopThroughputStats(RF_Raid_t * raidPtr)
    965 {
    966 	struct timeval diff;
    967 
    968 	RF_LOCK_MUTEX(raidPtr->throughputstats.mutex);
    969 	raidPtr->throughputstats.num_out_ios--;
    970 	if (raidPtr->throughputstats.num_out_ios == 0) {
    971 		RF_GETTIME(raidPtr->throughputstats.stop);
    972 		RF_TIMEVAL_DIFF(&raidPtr->throughputstats.start, &raidPtr->throughputstats.stop, &diff);
    973 		raidPtr->throughputstats.sum_io_us += TIMEVAL_TO_US(diff);
    974 	}
    975 	RF_UNLOCK_MUTEX(raidPtr->throughputstats.mutex);
    976 }
    977 
    978 static void
    979 rf_PrintThroughputStats(RF_Raid_t * raidPtr)
    980 {
    981 	RF_ASSERT(raidPtr->throughputstats.num_out_ios == 0);
    982 	if (raidPtr->throughputstats.sum_io_us != 0) {
    983 		printf("[Througphut: %8.2f IOs/second]\n", raidPtr->throughputstats.num_ios
    984 		    / (raidPtr->throughputstats.sum_io_us / 1000000.0));
    985 	}
    986 }
    987 #endif				/* !KERNEL && !SIMULATE */
    988 
    989 void
    990 rf_StartUserStats(RF_Raid_t * raidPtr)
    991 {
    992 	RF_GETTIME(raidPtr->userstats.start);
    993 	raidPtr->userstats.sum_io_us = 0;
    994 	raidPtr->userstats.num_ios = 0;
    995 	raidPtr->userstats.num_sect_moved = 0;
    996 }
    997 
    998 void
    999 rf_StopUserStats(RF_Raid_t * raidPtr)
   1000 {
   1001 	RF_GETTIME(raidPtr->userstats.stop);
   1002 }
   1003 
   1004 void
   1005 rf_UpdateUserStats(raidPtr, rt, numsect)
   1006 	RF_Raid_t *raidPtr;
   1007 	int     rt;		/* resp time in us */
   1008 	int     numsect;	/* number of sectors for this access */
   1009 {
   1010 	raidPtr->userstats.sum_io_us += rt;
   1011 	raidPtr->userstats.num_ios++;
   1012 	raidPtr->userstats.num_sect_moved += numsect;
   1013 }
   1014 
   1015 void
   1016 rf_PrintUserStats(RF_Raid_t * raidPtr)
   1017 {
   1018 	long    elapsed_us, mbs, mbs_frac;
   1019 	struct timeval diff;
   1020 
   1021 	RF_TIMEVAL_DIFF(&raidPtr->userstats.start, &raidPtr->userstats.stop, &diff);
   1022 	elapsed_us = TIMEVAL_TO_US(diff);
   1023 
   1024 	/* 2000 sectors per megabyte, 10000000 microseconds per second */
   1025 	if (elapsed_us)
   1026 		mbs = (raidPtr->userstats.num_sect_moved / 2000) / (elapsed_us / 1000000);
   1027 	else
   1028 		mbs = 0;
   1029 
   1030 	/* this computes only the first digit of the fractional mb/s moved */
   1031 	if (elapsed_us) {
   1032 		mbs_frac = ((raidPtr->userstats.num_sect_moved / 200) / (elapsed_us / 1000000))
   1033 		    - (mbs * 10);
   1034 	} else {
   1035 		mbs_frac = 0;
   1036 	}
   1037 
   1038 	printf("Number of I/Os:             %ld\n", raidPtr->userstats.num_ios);
   1039 	printf("Elapsed time (us):          %ld\n", elapsed_us);
   1040 	printf("User I/Os per second:       %ld\n", RF_DB0_CHECK(raidPtr->userstats.num_ios, (elapsed_us / 1000000)));
   1041 	printf("Average user response time: %ld us\n", RF_DB0_CHECK(raidPtr->userstats.sum_io_us, raidPtr->userstats.num_ios));
   1042 	printf("Total sectors moved:        %ld\n", raidPtr->userstats.num_sect_moved);
   1043 	printf("Average access size (sect): %ld\n", RF_DB0_CHECK(raidPtr->userstats.num_sect_moved, raidPtr->userstats.num_ios));
   1044 	printf("Achieved data rate:         %ld.%ld MB/sec\n", mbs, mbs_frac);
   1045 }
   1046