Home | History | Annotate | Line # | Download | only in raidframe
rf_reconstruct.c revision 1.105.4.6.2.1
      1 /*	$NetBSD: rf_reconstruct.c,v 1.105.4.6.2.1 2014/11/20 12:25:10 sborrill Exp $	*/
      2 /*
      3  * Copyright (c) 1995 Carnegie-Mellon University.
      4  * All rights reserved.
      5  *
      6  * Author: Mark Holland
      7  *
      8  * Permission to use, copy, modify and distribute this software and
      9  * its documentation is hereby granted, provided that both the copyright
     10  * notice and this permission notice appear in all copies of the
     11  * software, derivative works or modified versions, and any portions
     12  * thereof, and that both notices appear in supporting documentation.
     13  *
     14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     17  *
     18  * Carnegie Mellon requests users of this software to return to
     19  *
     20  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     21  *  School of Computer Science
     22  *  Carnegie Mellon University
     23  *  Pittsburgh PA 15213-3890
     24  *
     25  * any improvements or extensions that they make and grant Carnegie the
     26  * rights to redistribute these changes.
     27  */
     28 
     29 /************************************************************
     30  *
     31  * rf_reconstruct.c -- code to perform on-line reconstruction
     32  *
     33  ************************************************************/
     34 
     35 #include <sys/cdefs.h>
     36 __KERNEL_RCSID(0, "$NetBSD: rf_reconstruct.c,v 1.105.4.6.2.1 2014/11/20 12:25:10 sborrill Exp $");
     37 
     38 #include <sys/param.h>
     39 #include <sys/time.h>
     40 #include <sys/buf.h>
     41 #include <sys/errno.h>
     42 #include <sys/systm.h>
     43 #include <sys/proc.h>
     44 #include <sys/ioctl.h>
     45 #include <sys/fcntl.h>
     46 #include <sys/vnode.h>
     47 #include <dev/raidframe/raidframevar.h>
     48 
     49 #include "rf_raid.h"
     50 #include "rf_reconutil.h"
     51 #include "rf_revent.h"
     52 #include "rf_reconbuffer.h"
     53 #include "rf_acctrace.h"
     54 #include "rf_etimer.h"
     55 #include "rf_dag.h"
     56 #include "rf_desc.h"
     57 #include "rf_debugprint.h"
     58 #include "rf_general.h"
     59 #include "rf_driver.h"
     60 #include "rf_utils.h"
     61 #include "rf_shutdown.h"
     62 
     63 #include "rf_kintf.h"
     64 
     65 /* setting these to -1 causes them to be set to their default values if not set by debug options */
     66 
     67 #if RF_DEBUG_RECON
     68 #define Dprintf(s)         if (rf_reconDebug) rf_debug_printf(s,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL)
     69 #define Dprintf1(s,a)         if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
     70 #define Dprintf2(s,a,b)       if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
     71 #define Dprintf3(s,a,b,c)     if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
     72 #define Dprintf4(s,a,b,c,d)   if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),NULL,NULL,NULL,NULL)
     73 #define Dprintf5(s,a,b,c,d,e) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),NULL,NULL,NULL)
     74 #define Dprintf6(s,a,b,c,d,e,f) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),NULL,NULL)
     75 #define Dprintf7(s,a,b,c,d,e,f,g) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),NULL)
     76 
     77 #define DDprintf1(s,a)         if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
     78 #define DDprintf2(s,a,b)       if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
     79 
     80 #else /* RF_DEBUG_RECON */
     81 
     82 #define Dprintf(s) {}
     83 #define Dprintf1(s,a) {}
     84 #define Dprintf2(s,a,b) {}
     85 #define Dprintf3(s,a,b,c) {}
     86 #define Dprintf4(s,a,b,c,d) {}
     87 #define Dprintf5(s,a,b,c,d,e) {}
     88 #define Dprintf6(s,a,b,c,d,e,f) {}
     89 #define Dprintf7(s,a,b,c,d,e,f,g) {}
     90 
     91 #define DDprintf1(s,a) {}
     92 #define DDprintf2(s,a,b) {}
     93 
     94 #endif /* RF_DEBUG_RECON */
     95 
     96 #define RF_RECON_DONE_READS   1
     97 #define RF_RECON_READ_ERROR   2
     98 #define RF_RECON_WRITE_ERROR  3
     99 #define RF_RECON_READ_STOPPED 4
    100 #define RF_RECON_WRITE_DONE   5
    101 
    102 #define RF_MAX_FREE_RECONBUFFER 32
    103 #define RF_MIN_FREE_RECONBUFFER 16
    104 
    105 static RF_RaidReconDesc_t *AllocRaidReconDesc(RF_Raid_t *, RF_RowCol_t,
    106 					      RF_RaidDisk_t *, int, RF_RowCol_t);
    107 static void FreeReconDesc(RF_RaidReconDesc_t *);
    108 static int ProcessReconEvent(RF_Raid_t *, RF_ReconEvent_t *);
    109 static int IssueNextReadRequest(RF_Raid_t *, RF_RowCol_t);
    110 static int TryToRead(RF_Raid_t *, RF_RowCol_t);
    111 static int ComputePSDiskOffsets(RF_Raid_t *, RF_StripeNum_t, RF_RowCol_t,
    112 				RF_SectorNum_t *, RF_SectorNum_t *, RF_RowCol_t *,
    113 				RF_SectorNum_t *);
    114 static int IssueNextWriteRequest(RF_Raid_t *);
    115 static int ReconReadDoneProc(void *, int);
    116 static int ReconWriteDoneProc(void *, int);
    117 static void CheckForNewMinHeadSep(RF_Raid_t *, RF_HeadSepLimit_t);
    118 static int CheckHeadSeparation(RF_Raid_t *, RF_PerDiskReconCtrl_t *,
    119 			       RF_RowCol_t, RF_HeadSepLimit_t,
    120 			       RF_ReconUnitNum_t);
    121 static int CheckForcedOrBlockedReconstruction(RF_Raid_t *,
    122 					      RF_ReconParityStripeStatus_t *,
    123 					      RF_PerDiskReconCtrl_t *,
    124 					      RF_RowCol_t, RF_StripeNum_t,
    125 					      RF_ReconUnitNum_t);
    126 static void ForceReconReadDoneProc(void *, int);
    127 static void rf_ShutdownReconstruction(void *);
    128 
/*
 * Node in a singly-linked list of callbacks; presumably these are invoked
 * when a reconstruction completes (registration/dispatch code is not in
 * this file section — verify against rf_reconutil / rf_driver).
 */
struct RF_ReconDoneProc_s {
	void    (*proc) (RF_Raid_t *, void *);	/* callback to invoke */
	void   *arg;				/* opaque argument passed to proc */
	RF_ReconDoneProc_t *next;		/* next node in the list */
};
    134 
    135 /**************************************************************************
    136  *
    137  * sets up the parameters that will be used by the reconstruction process
    138  * currently there are none, except for those that the layout-specific
    139  * configuration (e.g. rf_ConfigureDeclustered) routine sets up.
    140  *
    141  * in the kernel, we fire off the recon thread.
    142  *
    143  **************************************************************************/
/*
 * Shutdown hook registered by rf_ConfigureReconstruction(): tear down
 * the reconstruction-buffer pool.  "ignored" is the unused hook argument.
 */
static void
rf_ShutdownReconstruction(void *ignored)
{
	pool_destroy(&rf_pools.reconbuffer);
}
    149 
/*
 * Initialize the reconstruction subsystem: create the pool of
 * RF_ReconBuffer_t structures and register a shutdown hook (on *listp)
 * to destroy it at unconfigure time.  Always returns 0.
 */
int
rf_ConfigureReconstruction(RF_ShutdownList_t **listp)
{

	rf_pool_init(&rf_pools.reconbuffer, sizeof(RF_ReconBuffer_t),
		     "rf_reconbuffer_pl", RF_MIN_FREE_RECONBUFFER, RF_MAX_FREE_RECONBUFFER);
	rf_ShutdownCreate(listp, rf_ShutdownReconstruction, NULL);

	return (0);
}
    160 
    161 static RF_RaidReconDesc_t *
    162 AllocRaidReconDesc(RF_Raid_t *raidPtr, RF_RowCol_t col,
    163 		   RF_RaidDisk_t *spareDiskPtr, int numDisksDone,
    164 		   RF_RowCol_t scol)
    165 {
    166 
    167 	RF_RaidReconDesc_t *reconDesc;
    168 
    169 	RF_Malloc(reconDesc, sizeof(RF_RaidReconDesc_t),
    170 		  (RF_RaidReconDesc_t *));
    171 	reconDesc->raidPtr = raidPtr;
    172 	reconDesc->col = col;
    173 	reconDesc->spareDiskPtr = spareDiskPtr;
    174 	reconDesc->numDisksDone = numDisksDone;
    175 	reconDesc->scol = scol;
    176 	reconDesc->next = NULL;
    177 
    178 	return (reconDesc);
    179 }
    180 
/*
 * Print the statistics accumulated in a reconstruction descriptor and
 * release the descriptor itself (allocated by AllocRaidReconDesc()).
 */
static void
FreeReconDesc(RF_RaidReconDesc_t *reconDesc)
{
#if RF_RECON_STATS > 0
	/* event-wait/delay counters only exist when stats are compiled in */
	printf("raid%d: %lu recon event waits, %lu recon delays\n",
	       reconDesc->raidPtr->raidid,
	       (long) reconDesc->numReconEventWaits,
	       (long) reconDesc->numReconExecDelays);
#endif				/* RF_RECON_STATS > 0 */
	printf("raid%d: %lu max exec ticks\n",
	       reconDesc->raidPtr->raidid,
	       (long) reconDesc->maxReconExecTicks);
	RF_Free(reconDesc, sizeof(RF_RaidReconDesc_t));
}
    195 
    196 
    197 /*****************************************************************************
    198  *
    199  * primary routine to reconstruct a failed disk.  This should be called from
    200  * within its own thread.  It won't return until reconstruction completes,
    201  * fails, or is aborted.
    202  *****************************************************************************/
    203 int
    204 rf_ReconstructFailedDisk(RF_Raid_t *raidPtr, RF_RowCol_t col)
    205 {
    206 	const RF_LayoutSW_t *lp;
    207 	int     rc;
    208 
    209 	lp = raidPtr->Layout.map;
    210 	if (lp->SubmitReconBuffer) {
    211 		/*
    212 	         * The current infrastructure only supports reconstructing one
    213 	         * disk at a time for each array.
    214 	         */
    215 		RF_LOCK_MUTEX(raidPtr->mutex);
    216 		while (raidPtr->reconInProgress) {
    217 			RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
    218 		}
    219 		raidPtr->reconInProgress++;
    220 		RF_UNLOCK_MUTEX(raidPtr->mutex);
    221 		rc = rf_ReconstructFailedDiskBasic(raidPtr, col);
    222 		RF_LOCK_MUTEX(raidPtr->mutex);
    223 		raidPtr->reconInProgress--;
    224 		RF_UNLOCK_MUTEX(raidPtr->mutex);
    225 	} else {
    226 		RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
    227 		    lp->parityConfig);
    228 		rc = EIO;
    229 	}
    230 	RF_SIGNAL_COND(raidPtr->waitForReconCond);
    231 	return (rc);
    232 }
    233 
/*
 * Reconstruct the failed disk at "col" onto a hot spare.  For layouts
 * with distributed sparing (RF_DISTRIBUTE_SPARE) no physical spare is
 * selected (scol = -1); otherwise the first disk with status rf_ds_spare
 * is claimed.  On success the spare's component label is rewritten and
 * parity is marked clean; on failure both disks revert to their prior
 * status.  Returns 0, EINVAL (bad array state), ENOSPC (no spare), or
 * the error from rf_ContinueReconstructFailedDisk().
 */
int
rf_ReconstructFailedDiskBasic(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *c_label;
	RF_RaidDisk_t *spareDiskPtr = NULL;
	RF_RaidReconDesc_t *reconDesc;
	RF_RowCol_t scol;
	int     numDisksDone = 0, rc;

	/* first look for a spare drive onto which to reconstruct the data */
	/* spare disk descriptors are stored in row 0.  This may have to
	 * change eventually */

	RF_LOCK_MUTEX(raidPtr->mutex);
	RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed);
#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
		/* distributed sparing: spare space lives on the surviving
		   disks, so no spare column is chosen (scol stays -1) */
		if (raidPtr->status != rf_rs_degraded) {
			RF_ERRORMSG1("Unable to reconstruct disk at col %d because status not degraded\n", col);
			RF_UNLOCK_MUTEX(raidPtr->mutex);
			return (EINVAL);
		}
		scol = (-1);
	} else {
#endif
		/* spare columns follow the data columns in Disks[] */
		for (scol = raidPtr->numCol; scol < raidPtr->numCol + raidPtr->numSpare; scol++) {
			if (raidPtr->Disks[scol].status == rf_ds_spare) {
				spareDiskPtr = &raidPtr->Disks[scol];
				spareDiskPtr->status = rf_ds_rebuilding_spare;
				break;
			}
		}
		if (!spareDiskPtr) {
			RF_ERRORMSG1("Unable to reconstruct disk at col %d because no spares are available\n", col);
			RF_UNLOCK_MUTEX(raidPtr->mutex);
			return (ENOSPC);
		}
		printf("RECON: initiating reconstruction on col %d -> spare at col %d\n", col, scol);
#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
	}
#endif
	RF_UNLOCK_MUTEX(raidPtr->mutex);

	/* NOTE(review): in the RF_DISTRIBUTE_SPARE path above, spareDiskPtr
	   remains NULL and scol is -1, yet both success and failure paths
	   below dereference spareDiskPtr (and the success path passes scol
	   to raidget_component_label).  Verify that path cannot reach this
	   code, or that it is handled elsewhere. */

	reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr, numDisksDone, scol);
	raidPtr->reconDesc = (void *) reconDesc;
#if RF_RECON_STATS > 0
	reconDesc->hsStallCount = 0;
	reconDesc->numReconExecDelays = 0;
	reconDesc->numReconEventWaits = 0;
#endif				/* RF_RECON_STATS > 0 */
	reconDesc->reconExecTimerRunning = 0;
	reconDesc->reconExecTicks = 0;
	reconDesc->maxReconExecTicks = 0;
	rc = rf_ContinueReconstructFailedDisk(reconDesc);

	if (!rc) {
		/* fix up the component label */
		/* Don't actually need the read here.. */
		c_label = raidget_component_label(raidPtr, scol);

		raid_init_component_label(raidPtr, c_label);
		c_label->row = 0;
		c_label->column = col;
		c_label->clean = RF_RAID_DIRTY;
		c_label->status = rf_ds_optimal;
		rf_component_label_set_partitionsize(c_label,
		    raidPtr->Disks[scol].partitionSize);

		/* We've just done a rebuild based on all the other
		   disks, so at this point the parity is known to be
		   clean, even if it wasn't before. */

		/* XXX doesn't hold for RAID 6!!*/

		RF_LOCK_MUTEX(raidPtr->mutex);
		/* The failed disk has already been marked as rf_ds_spared
		   (or rf_ds_dist_spared) in
		   rf_ContinueReconstructFailedDisk()
		   so we just update the spare disk as being a used spare
		*/

		spareDiskPtr->status = rf_ds_used_spare;
		raidPtr->parity_good = RF_RAID_CLEAN;
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		/* XXXX MORE NEEDED HERE */

		raidflush_component_label(raidPtr, scol);
	} else {
		/* Reconstruct failed. */

		RF_LOCK_MUTEX(raidPtr->mutex);
		/* Failed disk goes back to "failed" status */
		raidPtr->Disks[col].status = rf_ds_failed;

		/* Spare disk goes back to "spare" status. */
		spareDiskPtr->status = rf_ds_spare;
		RF_UNLOCK_MUTEX(raidPtr->mutex);

	}
	rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
	return (rc);
}
    337 
    338 /*
    339 
    340    Allow reconstructing a disk in-place -- i.e. component /dev/sd2e goes AWOL,
    341    and you don't get a spare until the next Monday.  With this function
    342    (and hot-swappable drives) you can now put your new disk containing
    343    /dev/sd2e on the bus, scsictl it alive, and then use raidctl(8) to
    344    rebuild the data "on the spot".
    345 
    346 */
    347 
/*
 * Rebuild column "col" in place onto a replacement device with the same
 * name: close any previously-open vnode for the component, re-open the
 * device, re-read its geometry, and then run the normal reconstruction
 * with the component itself acting as its own "spare".  Serializes with
 * any reconstruction already in progress on this array.  Returns 0 on
 * success or an errno from the lookup/attr/ioctl steps or from the
 * reconstruction itself.
 */
int
rf_ReconstructInPlace(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_RaidDisk_t *spareDiskPtr = NULL;
	RF_RaidReconDesc_t *reconDesc;
	const RF_LayoutSW_t *lp;
	RF_ComponentLabel_t *c_label;
	int     numDisksDone = 0, rc;
	struct partinfo dpart;
	struct vnode *vp;
	struct vattr va;
	int retcode;
	int ac;

	lp = raidPtr->Layout.map;
	if (!lp->SubmitReconBuffer) {
		RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
			     lp->parityConfig);
		/* wakeup anyone who might be waiting to do a reconstruct */
		RF_SIGNAL_COND(raidPtr->waitForReconCond);
		return(EIO);
	}

	/*
	 * The current infrastructure only supports reconstructing one
	 * disk at a time for each array.
	 */
	RF_LOCK_MUTEX(raidPtr->mutex);

	if (raidPtr->Disks[col].status != rf_ds_failed) {
		/* "It's gone..." */
		/* force the component into failed/degraded state before the
		   rebuild, dropping the mutex around the label update */
		raidPtr->numFailures++;
		raidPtr->Disks[col].status = rf_ds_failed;
		raidPtr->status = rf_rs_degraded;
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		RF_LOCK_MUTEX(raidPtr->mutex);
	}

	while (raidPtr->reconInProgress) {
		RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
	}

	raidPtr->reconInProgress++;

	/* first look for a spare drive onto which to reconstruct the
	   data.  spare disk descriptors are stored in row 0.  This
	   may have to change eventually */

	/* Actually, we don't care if it's failed or not...  On a RAID
	   set with correct parity, this function should be callable
	   on any component without ill effects. */
	/* RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed); */

#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
		/* in-place rebuild makes no sense with distributed sparing */
		RF_ERRORMSG1("Unable to reconstruct to disk at col %d: operation not supported for RF_DISTRIBUTE_SPARE\n", col);

		raidPtr->reconInProgress--;
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		RF_SIGNAL_COND(raidPtr->waitForReconCond);
		return (EINVAL);
	}
#endif

	/* This device may have been opened successfully the
	   first time. Close it before trying to open it again.. */

	if (raidPtr->raid_cinfo[col].ci_vp != NULL) {
#if 0
		printf("Closed the open device: %s\n",
		       raidPtr->Disks[col].devname);
#endif
		/* drop the mutex while closing -- rf_close_component may sleep */
		vp = raidPtr->raid_cinfo[col].ci_vp;
		ac = raidPtr->Disks[col].auto_configured;
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		rf_close_component(raidPtr, vp, ac);
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->raid_cinfo[col].ci_vp = NULL;
	}
	/* note that this disk was *not* auto_configured (any longer)*/
	raidPtr->Disks[col].auto_configured = 0;

#if 0
	printf("About to (re-)open the device for rebuilding: %s\n",
	       raidPtr->Disks[col].devname);
#endif
	RF_UNLOCK_MUTEX(raidPtr->mutex);
	retcode = dk_lookup(raidPtr->Disks[col].devname, curlwp, &vp, UIO_SYSSPACE);

	if (retcode) {
		printf("raid%d: rebuilding: dk_lookup on device: %s failed: %d!\n",raidPtr->raidid,
		       raidPtr->Disks[col].devname, retcode);

		/* the component isn't responding properly...
		   must be still dead :-( */
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->reconInProgress--;
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		RF_SIGNAL_COND(raidPtr->waitForReconCond);
		return(retcode);
	}

	/* Ok, so we can at least do a lookup...
	   How about actually getting a vp for it? */

	/* NOTE(review): on the two error returns below, vp obtained from
	   dk_lookup() does not appear to be released before returning --
	   verify against how other callers of dk_lookup() clean up. */

	if ((retcode = VOP_GETATTR(vp, &va, curlwp->l_cred)) != 0) {
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->reconInProgress--;
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		RF_SIGNAL_COND(raidPtr->waitForReconCond);
		return(retcode);
	}

	retcode = VOP_IOCTL(vp, DIOCGPART, &dpart, FREAD, curlwp->l_cred);
	if (retcode) {
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->reconInProgress--;
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		RF_SIGNAL_COND(raidPtr->waitForReconCond);
		return(retcode);
	}
	RF_LOCK_MUTEX(raidPtr->mutex);
	/* record the replacement device's geometry and identity */
	raidPtr->Disks[col].blockSize =	dpart.disklab->d_secsize;

	raidPtr->Disks[col].numBlocks = dpart.part->p_size -
		rf_protectedSectors;

	raidPtr->raid_cinfo[col].ci_vp = vp;
	raidPtr->raid_cinfo[col].ci_dev = va.va_rdev;

	raidPtr->Disks[col].dev = va.va_rdev;

	/* we allow the user to specify that only a fraction
	   of the disks should be used this is just for debug:
	   it speeds up * the parity scan */
	raidPtr->Disks[col].numBlocks = raidPtr->Disks[col].numBlocks *
		rf_sizePercentage / 100;
	RF_UNLOCK_MUTEX(raidPtr->mutex);

	/* the component being rebuilt serves as its own "spare" */
	spareDiskPtr = &raidPtr->Disks[col];
	spareDiskPtr->status = rf_ds_rebuilding_spare;

	printf("raid%d: initiating in-place reconstruction on column %d\n",
	       raidPtr->raidid, col);

	reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr,
				       numDisksDone, col);
	raidPtr->reconDesc = (void *) reconDesc;
#if RF_RECON_STATS > 0
	reconDesc->hsStallCount = 0;
	reconDesc->numReconExecDelays = 0;
	reconDesc->numReconEventWaits = 0;
#endif				/* RF_RECON_STATS > 0 */
	reconDesc->reconExecTimerRunning = 0;
	reconDesc->reconExecTicks = 0;
	reconDesc->maxReconExecTicks = 0;
	rc = rf_ContinueReconstructFailedDisk(reconDesc);

	if (!rc) {
		RF_LOCK_MUTEX(raidPtr->mutex);
		/* Need to set these here, as at this point it'll be claiming
		   that the disk is in rf_ds_spared!  But we know better :-) */

		raidPtr->Disks[col].status = rf_ds_optimal;
		raidPtr->status = rf_rs_optimal;
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		/* fix up the component label */
		/* Don't actually need the read here.. */
		c_label = raidget_component_label(raidPtr, col);

		RF_LOCK_MUTEX(raidPtr->mutex);
		raid_init_component_label(raidPtr, c_label);

		c_label->row = 0;
		c_label->column = col;

		/* We've just done a rebuild based on all the other
		   disks, so at this point the parity is known to be
		   clean, even if it wasn't before. */

		/* XXX doesn't hold for RAID 6!!*/

		raidPtr->parity_good = RF_RAID_CLEAN;
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		raidflush_component_label(raidPtr, col);
	} else {
		/* Reconstruct-in-place failed.  Disk goes back to
		   "failed" status, regardless of what it was before.  */
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->Disks[col].status = rf_ds_failed;
		RF_UNLOCK_MUTEX(raidPtr->mutex);
	}

	rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);

	RF_LOCK_MUTEX(raidPtr->mutex);
	raidPtr->reconInProgress--;
	RF_UNLOCK_MUTEX(raidPtr->mutex);

	RF_SIGNAL_COND(raidPtr->waitForReconCond);
	return (rc);
}
    554 
    555 
    556 int
    557 rf_ContinueReconstructFailedDisk(RF_RaidReconDesc_t *reconDesc)
    558 {
    559 	RF_Raid_t *raidPtr = reconDesc->raidPtr;
    560 	RF_RowCol_t col = reconDesc->col;
    561 	RF_RowCol_t scol = reconDesc->scol;
    562 	RF_ReconMap_t *mapPtr;
    563 	RF_ReconCtrl_t *tmp_reconctrl;
    564 	RF_ReconEvent_t *event;
    565 	RF_StripeCount_t incPSID,lastPSID,num_writes,pending_writes,prev;
    566 #if RF_INCLUDE_RAID5_RS > 0
    567 	RF_StripeCount_t startPSID,endPSID,aPSID,bPSID,offPSID;
    568 #endif
    569 	RF_ReconUnitCount_t RUsPerPU;
    570 	struct timeval etime, elpsd;
    571 	unsigned long xor_s, xor_resid_us;
    572 	int     i, ds;
    573 	int status, done;
    574 	int recon_error, write_error;
    575 
    576 	raidPtr->accumXorTimeUs = 0;
    577 #if RF_ACC_TRACE > 0
    578 	/* create one trace record per physical disk */
    579 	RF_Malloc(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
    580 #endif
    581 
    582 	/* quiesce the array prior to starting recon.  this is needed
    583 	 * to assure no nasty interactions with pending user writes.
    584 	 * We need to do this before we change the disk or row status. */
    585 
    586 	Dprintf("RECON: begin request suspend\n");
    587 	rf_SuspendNewRequestsAndWait(raidPtr);
    588 	Dprintf("RECON: end request suspend\n");
    589 
    590 	/* allocate our RF_ReconCTRL_t before we protect raidPtr->reconControl[row] */
    591 	tmp_reconctrl = rf_MakeReconControl(reconDesc, col, scol);
    592 
    593 	RF_LOCK_MUTEX(raidPtr->mutex);
    594 
    595 	/* create the reconstruction control pointer and install it in
    596 	 * the right slot */
    597 	raidPtr->reconControl = tmp_reconctrl;
    598 	mapPtr = raidPtr->reconControl->reconMap;
    599 	raidPtr->reconControl->numRUsTotal = mapPtr->totalRUs;
    600 	raidPtr->reconControl->numRUsComplete =	0;
    601 	raidPtr->status = rf_rs_reconstructing;
    602 	raidPtr->Disks[col].status = rf_ds_reconstructing;
    603 	raidPtr->Disks[col].spareCol = scol;
    604 
    605 	RF_UNLOCK_MUTEX(raidPtr->mutex);
    606 
    607 	RF_GETTIME(raidPtr->reconControl->starttime);
    608 
    609 	Dprintf("RECON: resume requests\n");
    610 	rf_ResumeNewRequests(raidPtr);
    611 
    612 
    613 	mapPtr = raidPtr->reconControl->reconMap;
    614 
    615 	incPSID = RF_RECONMAP_SIZE;
    616 	lastPSID = raidPtr->Layout.numStripe / raidPtr->Layout.SUsPerPU;
    617 	RUsPerPU = raidPtr->Layout.SUsPerPU / raidPtr->Layout.SUsPerRU;
    618 	recon_error = 0;
    619 	write_error = 0;
    620 	pending_writes = incPSID;
    621 	raidPtr->reconControl->lastPSID = incPSID - 1;
    622 
    623 	/* bounds check raidPtr->reconControl->lastPSID and
    624 	   pending_writes so that we don't attempt to wait for more IO
    625 	   than can possibly happen */
    626 
    627 	if (raidPtr->reconControl->lastPSID > lastPSID)
    628 		raidPtr->reconControl->lastPSID = lastPSID;
    629 
    630 	if (pending_writes > lastPSID)
    631 		pending_writes = lastPSID;
    632 
    633 	/* start the actual reconstruction */
    634 
    635 	done = 0;
    636 	while (!done) {
    637 
    638 		if (raidPtr->waitShutdown) {
    639 			/* someone is unconfiguring this array... bail on the reconstruct.. */
    640 			recon_error = 1;
    641 			break;
    642 		}
    643 
    644 		num_writes = 0;
    645 
    646 #if RF_INCLUDE_RAID5_RS > 0
    647 		/* For RAID5 with Rotated Spares we will be 'short'
    648 		   some number of writes since no writes will get
    649 		   issued for stripes where the spare is on the
    650 		   component being rebuilt.  Account for the shortage
    651 		   here so that we don't hang indefinitely below
    652 		   waiting for writes to complete that were never
    653 		   scheduled.
    654 
    655 		   XXX: Should be fixed for PARITY_DECLUSTERING and
    656 		   others too!
    657 
    658 		*/
    659 
    660 		if (raidPtr->Layout.numDataCol <
    661 		    raidPtr->numCol - raidPtr->Layout.numParityCol) {
    662 			/* numDataCol is at least 2 less than numCol, so
    663 			   should be RAID 5 with Rotated Spares */
    664 
    665 			/* XXX need to update for RAID 6 */
    666 
    667 			startPSID = raidPtr->reconControl->lastPSID - pending_writes + 1;
    668 			endPSID = raidPtr->reconControl->lastPSID;
    669 
    670 			offPSID = raidPtr->numCol - col - 1;
    671 
    672 			aPSID = startPSID - startPSID % raidPtr->numCol + offPSID;
    673 			if (aPSID < startPSID) {
    674 				aPSID += raidPtr->numCol;
    675 			}
    676 
    677 			bPSID = endPSID - ((endPSID - offPSID) % raidPtr->numCol);
    678 
    679 			if (aPSID < endPSID) {
    680 				num_writes = ((bPSID - aPSID) / raidPtr->numCol) + 1;
    681 			}
    682 
    683 			if ((aPSID == endPSID) && (bPSID == endPSID)) {
    684 				num_writes++;
    685 			}
    686 		}
    687 #endif
    688 
    689 		/* issue a read for each surviving disk */
    690 
    691 		reconDesc->numDisksDone = 0;
    692 		for (i = 0; i < raidPtr->numCol; i++) {
    693 			if (i != col) {
    694 				/* find and issue the next I/O on the
    695 				 * indicated disk */
    696 				if (IssueNextReadRequest(raidPtr, i)) {
    697 					Dprintf1("RECON: done issuing for c%d\n", i);
    698 					reconDesc->numDisksDone++;
    699 				}
    700 			}
    701 		}
    702 
    703 		/* process reconstruction events until all disks report that
    704 		 * they've completed all work */
    705 
    706 		while (reconDesc->numDisksDone < raidPtr->numCol - 1) {
    707 
    708 			event = rf_GetNextReconEvent(reconDesc);
    709 			status = ProcessReconEvent(raidPtr, event);
    710 
    711 			/* the normal case is that a read completes, and all is well. */
    712 			if (status == RF_RECON_DONE_READS) {
    713 				reconDesc->numDisksDone++;
    714 			} else if ((status == RF_RECON_READ_ERROR) ||
    715 				   (status == RF_RECON_WRITE_ERROR)) {
    716 				/* an error was encountered while reconstructing...
    717 				   Pretend we've finished this disk.
    718 				*/
    719 				recon_error = 1;
    720 				raidPtr->reconControl->error = 1;
    721 
    722 				/* bump the numDisksDone count for reads,
    723 				   but not for writes */
    724 				if (status == RF_RECON_READ_ERROR)
    725 					reconDesc->numDisksDone++;
    726 
    727 				/* write errors are special -- when we are
    728 				   done dealing with the reads that are
    729 				   finished, we don't want to wait for any
    730 				   writes */
    731 				if (status == RF_RECON_WRITE_ERROR) {
    732 					write_error = 1;
    733 					num_writes++;
    734 				}
    735 
    736 			} else if (status == RF_RECON_READ_STOPPED) {
    737 				/* count this component as being "done" */
    738 				reconDesc->numDisksDone++;
    739 			} else if (status == RF_RECON_WRITE_DONE) {
    740 				num_writes++;
    741 			}
    742 
    743 			if (recon_error) {
    744 				/* make sure any stragglers are woken up so that
    745 				   their theads will complete, and we can get out
    746 				   of here with all IO processed */
    747 
    748 				rf_WakeupHeadSepCBWaiters(raidPtr);
    749 			}
    750 
    751 			raidPtr->reconControl->numRUsTotal =
    752 				mapPtr->totalRUs;
    753 			raidPtr->reconControl->numRUsComplete =
    754 				mapPtr->totalRUs -
    755 				rf_UnitsLeftToReconstruct(mapPtr);
    756 
    757 #if RF_DEBUG_RECON
    758 			raidPtr->reconControl->percentComplete =
    759 				(raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
    760 			if (rf_prReconSched) {
    761 				rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
    762 			}
    763 #endif
    764 		}
    765 
    766 		/* reads done, wakeup any waiters, and then wait for writes */
    767 
    768 		rf_WakeupHeadSepCBWaiters(raidPtr);
    769 
    770 		while (!recon_error && (num_writes < pending_writes)) {
    771 			event = rf_GetNextReconEvent(reconDesc);
    772 			status = ProcessReconEvent(raidPtr, event);
    773 
    774 			if (status == RF_RECON_WRITE_ERROR) {
    775 				num_writes++;
    776 				recon_error = 1;
    777 				raidPtr->reconControl->error = 1;
    778 				/* an error was encountered at the very end... bail */
    779 			} else if (status == RF_RECON_WRITE_DONE) {
    780 				num_writes++;
    781 			} /* else it's something else, and we don't care */
    782 		}
    783 		if (recon_error ||
    784 		    (raidPtr->reconControl->lastPSID == lastPSID)) {
    785 			done = 1;
    786 			break;
    787 		}
    788 
    789 		prev = raidPtr->reconControl->lastPSID;
    790 		raidPtr->reconControl->lastPSID += incPSID;
    791 
    792 		if (raidPtr->reconControl->lastPSID > lastPSID) {
    793 			pending_writes = lastPSID - prev;
    794 			raidPtr->reconControl->lastPSID = lastPSID;
    795 		}
    796 
    797 		/* back down curPSID to get ready for the next round... */
    798 		for (i = 0; i < raidPtr->numCol; i++) {
    799 			if (i != col) {
    800 				raidPtr->reconControl->perDiskInfo[i].curPSID--;
    801 				raidPtr->reconControl->perDiskInfo[i].ru_count = RUsPerPU - 1;
    802 			}
    803 		}
    804 	}
    805 
    806 	mapPtr = raidPtr->reconControl->reconMap;
    807 	if (rf_reconDebug) {
    808 		printf("RECON: all reads completed\n");
    809 	}
    810 	/* at this point all the reads have completed.  We now wait
    811 	 * for any pending writes to complete, and then we're done */
    812 
    813 	while (!recon_error && rf_UnitsLeftToReconstruct(raidPtr->reconControl->reconMap) > 0) {
    814 
    815 		event = rf_GetNextReconEvent(reconDesc);
    816 		status = ProcessReconEvent(raidPtr, event);
    817 
    818 		if (status == RF_RECON_WRITE_ERROR) {
    819 			recon_error = 1;
    820 			raidPtr->reconControl->error = 1;
    821 			/* an error was encountered at the very end... bail */
    822 		} else {
    823 #if RF_DEBUG_RECON
    824 			raidPtr->reconControl->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs);
    825 			if (rf_prReconSched) {
    826 				rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
    827 			}
    828 #endif
    829 		}
    830 	}
    831 
    832 	if (recon_error) {
    833 		/* we've encountered an error in reconstructing. */
    834 		printf("raid%d: reconstruction failed.\n", raidPtr->raidid);
    835 
    836 		/* we start by blocking IO to the RAID set. */
    837 		rf_SuspendNewRequestsAndWait(raidPtr);
    838 
    839 		RF_LOCK_MUTEX(raidPtr->mutex);
    840 		/* mark set as being degraded, rather than
    841 		   rf_rs_reconstructing as we were before the problem.
    842 		   After this is done we can update status of the
    843 		   component disks without worrying about someone
    844 		   trying to read from a failed component.
    845 		*/
    846 		raidPtr->status = rf_rs_degraded;
    847 		RF_UNLOCK_MUTEX(raidPtr->mutex);
    848 
    849 		/* resume IO */
    850 		rf_ResumeNewRequests(raidPtr);
    851 
    852 		/* At this point there are two cases:
    853 		   1) If we've experienced a read error, then we've
    854 		   already waited for all the reads we're going to get,
    855 		   and we just need to wait for the writes.
    856 
    857 		   2) If we've experienced a write error, we've also
    858 		   already waited for all the reads to complete,
    859 		   but there is little point in waiting for the writes --
    860 		   when they do complete, they will just be ignored.
    861 
    862 		   So we just wait for writes to complete if we didn't have a
    863 		   write error.
    864 		*/
    865 
    866 		if (!write_error) {
    867 			/* wait for writes to complete */
    868 			while (raidPtr->reconControl->pending_writes > 0) {
    869 
    870 				event = rf_GetNextReconEvent(reconDesc);
    871 				status = ProcessReconEvent(raidPtr, event);
    872 
    873 				if (status == RF_RECON_WRITE_ERROR) {
    874 					raidPtr->reconControl->error = 1;
    875 					/* an error was encountered at the very end... bail.
    876 					   This will be very bad news for the user, since
    877 					   at this point there will have been a read error
    878 					   on one component, and a write error on another!
    879 					*/
    880 					break;
    881 				}
    882 			}
    883 		}
    884 
    885 
    886 		/* cleanup */
    887 
    888 		/* drain the event queue - after waiting for the writes above,
    889 		   there shouldn't be much (if anything!) left in the queue. */
    890 
    891 		rf_DrainReconEventQueue(reconDesc);
    892 
    893 		/* XXX  As much as we'd like to free the recon control structure
    894 		   and the reconDesc, we have no way of knowing if/when those will
    895 		   be touched by IO that has yet to occur.  It is rather poor to be
    896 		   basically causing a 'memory leak' here, but there doesn't seem to be
    897 		   a cleaner alternative at this time.  Perhaps when the reconstruct code
    898 		   gets a makeover this problem will go away.
    899 		*/
    900 #if 0
    901 		rf_FreeReconControl(raidPtr);
    902 #endif
    903 
    904 #if RF_ACC_TRACE > 0
    905 		RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
    906 #endif
    907 		/* XXX see comment above */
    908 #if 0
    909 		FreeReconDesc(reconDesc);
    910 #endif
    911 
    912 		return (1);
    913 	}
    914 
    915 	/* Success:  mark the dead disk as reconstructed.  We quiesce
    916 	 * the array here to assure no nasty interactions with pending
    917 	 * user accesses when we free up the psstatus structure as
    918 	 * part of FreeReconControl() */
    919 
    920 	rf_SuspendNewRequestsAndWait(raidPtr);
    921 
    922 	RF_LOCK_MUTEX(raidPtr->mutex);
    923 	raidPtr->numFailures--;
    924 	ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE);
    925 	raidPtr->Disks[col].status = (ds) ? rf_ds_dist_spared : rf_ds_spared;
    926 	raidPtr->status = (ds) ? rf_rs_reconfigured : rf_rs_optimal;
    927 	RF_UNLOCK_MUTEX(raidPtr->mutex);
    928 	RF_GETTIME(etime);
    929 	RF_TIMEVAL_DIFF(&(raidPtr->reconControl->starttime), &etime, &elpsd);
    930 
    931 	rf_ResumeNewRequests(raidPtr);
    932 
    933 	printf("raid%d: Reconstruction of disk at col %d completed\n",
    934 	       raidPtr->raidid, col);
    935 	xor_s = raidPtr->accumXorTimeUs / 1000000;
    936 	xor_resid_us = raidPtr->accumXorTimeUs % 1000000;
    937 	printf("raid%d: Recon time was %d.%06d seconds, accumulated XOR time was %ld us (%ld.%06ld)\n",
    938 	       raidPtr->raidid,
    939 	       (int) elpsd.tv_sec, (int) elpsd.tv_usec,
    940 	       raidPtr->accumXorTimeUs, xor_s, xor_resid_us);
    941 	printf("raid%d:  (start time %d sec %d usec, end time %d sec %d usec)\n",
    942 	       raidPtr->raidid,
    943 	       (int) raidPtr->reconControl->starttime.tv_sec,
    944 	       (int) raidPtr->reconControl->starttime.tv_usec,
    945 	       (int) etime.tv_sec, (int) etime.tv_usec);
    946 #if RF_RECON_STATS > 0
    947 	printf("raid%d: Total head-sep stall count was %d\n",
    948 	       raidPtr->raidid, (int) reconDesc->hsStallCount);
    949 #endif				/* RF_RECON_STATS > 0 */
    950 	rf_FreeReconControl(raidPtr);
    951 #if RF_ACC_TRACE > 0
    952 	RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
    953 #endif
    954 	FreeReconDesc(reconDesc);
    955 
    956 	return (0);
    957 
    958 }
    959 /*****************************************************************************
    960  * do the right thing upon each reconstruction event.
    961  *****************************************************************************/
    962 static int
    963 ProcessReconEvent(RF_Raid_t *raidPtr, RF_ReconEvent_t *event)
    964 {
    965 	int     retcode = 0, submitblocked;
    966 	RF_ReconBuffer_t *rbuf;
    967 	RF_SectorCount_t sectorsPerRU;
    968 
    969 	retcode = RF_RECON_READ_STOPPED;
    970 
    971 	Dprintf1("RECON: ProcessReconEvent type %d\n", event->type);
    972 
    973 	switch (event->type) {
    974 
    975 		/* a read I/O has completed */
    976 	case RF_REVENT_READDONE:
    977 		rbuf = raidPtr->reconControl->perDiskInfo[event->col].rbuf;
    978 		Dprintf2("RECON: READDONE EVENT: col %d psid %ld\n",
    979 		    event->col, rbuf->parityStripeID);
    980 		Dprintf7("RECON: done read  psid %ld buf %lx  %02x %02x %02x %02x %02x\n",
    981 		    rbuf->parityStripeID, rbuf->buffer, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
    982 		    rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);
    983 		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
    984 		if (!raidPtr->reconControl->error) {
    985 			submitblocked = rf_SubmitReconBuffer(rbuf, 0, 0);
    986 			Dprintf1("RECON: submitblocked=%d\n", submitblocked);
    987 			if (!submitblocked)
    988 				retcode = IssueNextReadRequest(raidPtr, event->col);
    989 			else
    990 				retcode = 0;
    991 		}
    992 		break;
    993 
    994 		/* a write I/O has completed */
    995 	case RF_REVENT_WRITEDONE:
    996 #if RF_DEBUG_RECON
    997 		if (rf_floatingRbufDebug) {
    998 			rf_CheckFloatingRbufCount(raidPtr, 1);
    999 		}
   1000 #endif
   1001 		sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
   1002 		rbuf = (RF_ReconBuffer_t *) event->arg;
   1003 		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
   1004 		Dprintf3("RECON: WRITEDONE EVENT: psid %d ru %d (%d %% complete)\n",
   1005 		    rbuf->parityStripeID, rbuf->which_ru, raidPtr->reconControl->percentComplete);
   1006 		rf_ReconMapUpdate(raidPtr, raidPtr->reconControl->reconMap,
   1007 		    rbuf->failedDiskSectorOffset, rbuf->failedDiskSectorOffset + sectorsPerRU - 1);
   1008 		rf_RemoveFromActiveReconTable(raidPtr, rbuf->parityStripeID, rbuf->which_ru);
   1009 
   1010 		RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
   1011 		raidPtr->reconControl->pending_writes--;
   1012 		RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
   1013 
   1014 		if (rbuf->type == RF_RBUF_TYPE_FLOATING) {
   1015 			RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
   1016 			while(raidPtr->reconControl->rb_lock) {
   1017 				ltsleep(&raidPtr->reconControl->rb_lock, PRIBIO, "reconctrlpre1", 0,
   1018 					&raidPtr->reconControl->rb_mutex);
   1019 			}
   1020 			raidPtr->reconControl->rb_lock = 1;
   1021 			RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
   1022 
   1023 			raidPtr->numFullReconBuffers--;
   1024 			rf_ReleaseFloatingReconBuffer(raidPtr, rbuf);
   1025 
   1026 			RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
   1027 			raidPtr->reconControl->rb_lock = 0;
   1028 			wakeup(&raidPtr->reconControl->rb_lock);
   1029 			RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
   1030 		} else
   1031 			if (rbuf->type == RF_RBUF_TYPE_FORCED)
   1032 				rf_FreeReconBuffer(rbuf);
   1033 			else
   1034 				RF_ASSERT(0);
   1035 		retcode = RF_RECON_WRITE_DONE;
   1036 		break;
   1037 
   1038 	case RF_REVENT_BUFCLEAR:	/* A buffer-stall condition has been
   1039 					 * cleared */
   1040 		Dprintf1("RECON: BUFCLEAR EVENT: col %d\n", event->col);
   1041 		if (!raidPtr->reconControl->error) {
   1042 			submitblocked = rf_SubmitReconBuffer(raidPtr->reconControl->perDiskInfo[event->col].rbuf,
   1043 							     0, (int) (long) event->arg);
   1044 			RF_ASSERT(!submitblocked);	/* we wouldn't have gotten the
   1045 							 * BUFCLEAR event if we
   1046 							 * couldn't submit */
   1047 			retcode = IssueNextReadRequest(raidPtr, event->col);
   1048 		}
   1049 		break;
   1050 
   1051 	case RF_REVENT_BLOCKCLEAR:	/* A user-write reconstruction
   1052 					 * blockage has been cleared */
   1053 		DDprintf1("RECON: BLOCKCLEAR EVENT: col %d\n", event->col);
   1054 		if (!raidPtr->reconControl->error) {
   1055 			retcode = TryToRead(raidPtr, event->col);
   1056 		}
   1057 		break;
   1058 
   1059 	case RF_REVENT_HEADSEPCLEAR:	/* A max-head-separation
   1060 					 * reconstruction blockage has been
   1061 					 * cleared */
   1062 		Dprintf1("RECON: HEADSEPCLEAR EVENT: col %d\n", event->col);
   1063 		if (!raidPtr->reconControl->error) {
   1064 			retcode = TryToRead(raidPtr, event->col);
   1065 		}
   1066 		break;
   1067 
   1068 		/* a buffer has become ready to write */
   1069 	case RF_REVENT_BUFREADY:
   1070 		Dprintf1("RECON: BUFREADY EVENT: col %d\n", event->col);
   1071 		if (!raidPtr->reconControl->error) {
   1072 			retcode = IssueNextWriteRequest(raidPtr);
   1073 #if RF_DEBUG_RECON
   1074 			if (rf_floatingRbufDebug) {
   1075 				rf_CheckFloatingRbufCount(raidPtr, 1);
   1076 			}
   1077 #endif
   1078 		}
   1079 		break;
   1080 
   1081 		/* we need to skip the current RU entirely because it got
   1082 		 * recon'd while we were waiting for something else to happen */
   1083 	case RF_REVENT_SKIP:
   1084 		DDprintf1("RECON: SKIP EVENT: col %d\n", event->col);
   1085 		if (!raidPtr->reconControl->error) {
   1086 			retcode = IssueNextReadRequest(raidPtr, event->col);
   1087 		}
   1088 		break;
   1089 
   1090 		/* a forced-reconstruction read access has completed.  Just
   1091 		 * submit the buffer */
   1092 	case RF_REVENT_FORCEDREADDONE:
   1093 		rbuf = (RF_ReconBuffer_t *) event->arg;
   1094 		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
   1095 		DDprintf1("RECON: FORCEDREADDONE EVENT: col %d\n", event->col);
   1096 		if (!raidPtr->reconControl->error) {
   1097 			submitblocked = rf_SubmitReconBuffer(rbuf, 1, 0);
   1098 			RF_ASSERT(!submitblocked);
   1099 			retcode = 0;
   1100 		}
   1101 		break;
   1102 
   1103 		/* A read I/O failed to complete */
   1104 	case RF_REVENT_READ_FAILED:
   1105 		retcode = RF_RECON_READ_ERROR;
   1106 		break;
   1107 
   1108 		/* A write I/O failed to complete */
   1109 	case RF_REVENT_WRITE_FAILED:
   1110 		retcode = RF_RECON_WRITE_ERROR;
   1111 
   1112 		/* This is an error, but it was a pending write.
   1113 		   Account for it. */
   1114 		RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
   1115 		raidPtr->reconControl->pending_writes--;
   1116 		RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
   1117 
   1118 		rbuf = (RF_ReconBuffer_t *) event->arg;
   1119 
   1120 		/* cleanup the disk queue data */
   1121 		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
   1122 
   1123 		/* At this point we're erroring out, badly, and floatingRbufs
   1124 		   may not even be valid.  Rather than putting this back onto
   1125 		   the floatingRbufs list, just arrange for its immediate
   1126 		   destruction.
   1127 		*/
   1128 		rf_FreeReconBuffer(rbuf);
   1129 		break;
   1130 
   1131 		/* a forced read I/O failed to complete */
   1132 	case RF_REVENT_FORCEDREAD_FAILED:
   1133 		retcode = RF_RECON_READ_ERROR;
   1134 		break;
   1135 
   1136 	default:
   1137 		RF_PANIC();
   1138 	}
   1139 	rf_FreeReconEventDesc(event);
   1140 	return (retcode);
   1141 }
   1142 /*****************************************************************************
   1143  *
   1144  * find the next thing that's needed on the indicated disk, and issue
   1145  * a read request for it.  We assume that the reconstruction buffer
   1146  * associated with this process is free to receive the data.  If
   1147  * reconstruction is blocked on the indicated RU, we issue a
   1148  * blockage-release request instead of a physical disk read request.
   1149  * If the current disk gets too far ahead of the others, we issue a
   1150  * head-separation wait request and return.
   1151  *
   1152  * ctrl->{ru_count, curPSID, diskOffset} and
   1153  * rbuf->failedDiskSectorOffset are maintained to point to the unit
   1154  * we're currently accessing.  Note that this deviates from the
   1155  * standard C idiom of having counters point to the next thing to be
   1156  * accessed.  This allows us to easily retry when we're blocked by
   1157  * head separation or reconstruction-blockage events.
   1158  *
   1159  *****************************************************************************/
   1160 static int
   1161 IssueNextReadRequest(RF_Raid_t *raidPtr, RF_RowCol_t col)
   1162 {
   1163 	RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col];
   1164 	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
   1165 	RF_ReconBuffer_t *rbuf = ctrl->rbuf;
   1166 	RF_ReconUnitCount_t RUsPerPU = layoutPtr->SUsPerPU / layoutPtr->SUsPerRU;
   1167 	RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
   1168 	int     do_new_check = 0, retcode = 0, status;
   1169 
   1170 	/* if we are currently the slowest disk, mark that we have to do a new
   1171 	 * check */
   1172 	if (ctrl->headSepCounter <= raidPtr->reconControl->minHeadSepCounter)
   1173 		do_new_check = 1;
   1174 
   1175 	while (1) {
   1176 
   1177 		ctrl->ru_count++;
   1178 		if (ctrl->ru_count < RUsPerPU) {
   1179 			ctrl->diskOffset += sectorsPerRU;
   1180 			rbuf->failedDiskSectorOffset += sectorsPerRU;
   1181 		} else {
   1182 			ctrl->curPSID++;
   1183 			ctrl->ru_count = 0;
   1184 			/* code left over from when head-sep was based on
   1185 			 * parity stripe id */
   1186 			if (ctrl->curPSID > raidPtr->reconControl->lastPSID) {
   1187 				CheckForNewMinHeadSep(raidPtr, ++(ctrl->headSepCounter));
   1188 				return (RF_RECON_DONE_READS);	/* finito! */
   1189 			}
   1190 			/* find the disk offsets of the start of the parity
   1191 			 * stripe on both the current disk and the failed
   1192 			 * disk. skip this entire parity stripe if either disk
   1193 			 * does not appear in the indicated PS */
   1194 			status = ComputePSDiskOffsets(raidPtr, ctrl->curPSID, col, &ctrl->diskOffset, &rbuf->failedDiskSectorOffset,
   1195 			    &rbuf->spCol, &rbuf->spOffset);
   1196 			if (status) {
   1197 				ctrl->ru_count = RUsPerPU - 1;
   1198 				continue;
   1199 			}
   1200 		}
   1201 		rbuf->which_ru = ctrl->ru_count;
   1202 
   1203 		/* skip this RU if it's already been reconstructed */
   1204 		if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, rbuf->failedDiskSectorOffset)) {
   1205 			Dprintf2("Skipping psid %ld ru %d: already reconstructed\n", ctrl->curPSID, ctrl->ru_count);
   1206 			continue;
   1207 		}
   1208 		break;
   1209 	}
   1210 	ctrl->headSepCounter++;
   1211 	if (do_new_check)
   1212 		CheckForNewMinHeadSep(raidPtr, ctrl->headSepCounter);	/* update min if needed */
   1213 
   1214 
   1215 	/* at this point, we have definitely decided what to do, and we have
   1216 	 * only to see if we can actually do it now */
   1217 	rbuf->parityStripeID = ctrl->curPSID;
   1218 	rbuf->which_ru = ctrl->ru_count;
   1219 #if RF_ACC_TRACE > 0
   1220 	memset((char *) &raidPtr->recon_tracerecs[col], 0,
   1221 	    sizeof(raidPtr->recon_tracerecs[col]));
   1222 	raidPtr->recon_tracerecs[col].reconacc = 1;
   1223 	RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
   1224 #endif
   1225 	retcode = TryToRead(raidPtr, col);
   1226 	return (retcode);
   1227 }
   1228 
   1229 /*
   1230  * tries to issue the next read on the indicated disk.  We may be
   1231  * blocked by (a) the heads being too far apart, or (b) recon on the
   1232  * indicated RU being blocked due to a write by a user thread.  In
   1233  * this case, we issue a head-sep or blockage wait request, which will
   1234  * cause this same routine to be invoked again later when the blockage
   1235  * has cleared.
   1236  */
   1237 
   1238 static int
   1239 TryToRead(RF_Raid_t *raidPtr, RF_RowCol_t col)
   1240 {
   1241 	RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col];
   1242 	RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
   1243 	RF_StripeNum_t psid = ctrl->curPSID;
   1244 	RF_ReconUnitNum_t which_ru = ctrl->ru_count;
   1245 	RF_DiskQueueData_t *req;
   1246 	int     status;
   1247 	RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr;
   1248 
   1249 	/* if the current disk is too far ahead of the others, issue a
   1250 	 * head-separation wait and return */
   1251 	if (CheckHeadSeparation(raidPtr, ctrl, col, ctrl->headSepCounter, which_ru))
   1252 		return (0);
   1253 
   1254 	/* allocate a new PSS in case we need it */
   1255 	newpssPtr = rf_AllocPSStatus(raidPtr);
   1256 
   1257 	RF_LOCK_PSS_MUTEX(raidPtr, psid);
   1258 	pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE, newpssPtr);
   1259 
   1260 	if (pssPtr != newpssPtr) {
   1261 		rf_FreePSStatus(raidPtr, newpssPtr);
   1262 	}
   1263 
   1264 	/* if recon is blocked on the indicated parity stripe, issue a
   1265 	 * block-wait request and return. this also must mark the indicated RU
   1266 	 * in the stripe as under reconstruction if not blocked. */
   1267 	status = CheckForcedOrBlockedReconstruction(raidPtr, pssPtr, ctrl, col, psid, which_ru);
   1268 	if (status == RF_PSS_RECON_BLOCKED) {
   1269 		Dprintf2("RECON: Stalling psid %ld ru %d: recon blocked\n", psid, which_ru);
   1270 		goto out;
   1271 	} else
   1272 		if (status == RF_PSS_FORCED_ON_WRITE) {
   1273 			rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP);
   1274 			goto out;
   1275 		}
   1276 	/* make one last check to be sure that the indicated RU didn't get
   1277 	 * reconstructed while we were waiting for something else to happen.
   1278 	 * This is unfortunate in that it causes us to make this check twice
   1279 	 * in the normal case.  Might want to make some attempt to re-work
   1280 	 * this so that we only do this check if we've definitely blocked on
   1281 	 * one of the above checks.  When this condition is detected, we may
   1282 	 * have just created a bogus status entry, which we need to delete. */
   1283 	if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, ctrl->rbuf->failedDiskSectorOffset)) {
   1284 		Dprintf2("RECON: Skipping psid %ld ru %d: prior recon after stall\n", psid, which_ru);
   1285 		if (pssPtr == newpssPtr)
   1286 			rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr);
   1287 		rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP);
   1288 		goto out;
   1289 	}
   1290 	/* found something to read.  issue the I/O */
   1291 	Dprintf4("RECON: Read for psid %ld on col %d offset %ld buf %lx\n",
   1292 	    psid, col, ctrl->diskOffset, ctrl->rbuf->buffer);
   1293 #if RF_ACC_TRACE > 0
   1294 	RF_ETIMER_STOP(raidPtr->recon_tracerecs[col].recon_timer);
   1295 	RF_ETIMER_EVAL(raidPtr->recon_tracerecs[col].recon_timer);
   1296 	raidPtr->recon_tracerecs[col].specific.recon.recon_start_to_fetch_us =
   1297 	    RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[col].recon_timer);
   1298 	RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
   1299 #endif
   1300 	/* should be ok to use a NULL proc pointer here, all the bufs we use
   1301 	 * should be in kernel space */
   1302 	req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, ctrl->diskOffset, sectorsPerRU, ctrl->rbuf->buffer, psid, which_ru,
   1303 	    ReconReadDoneProc, (void *) ctrl,
   1304 #if RF_ACC_TRACE > 0
   1305 				     &raidPtr->recon_tracerecs[col],
   1306 #else
   1307 				     NULL,
   1308 #endif
   1309 				     (void *) raidPtr, 0, NULL, PR_WAITOK);
   1310 
   1311 	ctrl->rbuf->arg = (void *) req;
   1312 	rf_DiskIOEnqueue(&raidPtr->Queues[col], req, RF_IO_RECON_PRIORITY);
   1313 	pssPtr->issued[col] = 1;
   1314 
   1315 out:
   1316 	RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
   1317 	return (0);
   1318 }
   1319 
   1320 
   1321 /*
   1322  * given a parity stripe ID, we want to find out whether both the
   1323  * current disk and the failed disk exist in that parity stripe.  If
   1324  * not, we want to skip this whole PS.  If so, we want to find the
   1325  * disk offset of the start of the PS on both the current disk and the
   1326  * failed disk.
   1327  *
   1328  * this works by getting a list of disks comprising the indicated
   1329  * parity stripe, and searching the list for the current and failed
   1330  * disks.  Once we've decided they both exist in the parity stripe, we
   1331  * need to decide whether each is data or parity, so that we'll know
   1332  * which mapping function to call to get the corresponding disk
   1333  * offsets.
   1334  *
   1335  * this is kind of unpleasant, but doing it this way allows the
   1336  * reconstruction code to use parity stripe IDs rather than physical
   1337  * disks address to march through the failed disk, which greatly
   1338  * simplifies a lot of code, as well as eliminating the need for a
   1339  * reverse-mapping function.  I also think it will execute faster,
   1340  * since the calls to the mapping module are kept to a minimum.
   1341  *
   1342  * ASSUMES THAT THE STRIPE IDENTIFIER IDENTIFIES THE DISKS COMPRISING
   1343  * THE STRIPE IN THE CORRECT ORDER
   1344  *
   1345  * raidPtr          - raid descriptor
   1346  * psid             - parity stripe identifier
   1347  * col              - column of disk to find the offsets for
   1348  * spCol            - out: col of spare unit for failed unit
   1349  * spOffset         - out: offset into disk containing spare unit
   1350  *
   1351  */
   1352 
   1353 
   1354 static int
   1355 ComputePSDiskOffsets(RF_Raid_t *raidPtr, RF_StripeNum_t psid,
   1356 		     RF_RowCol_t col, RF_SectorNum_t *outDiskOffset,
   1357 		     RF_SectorNum_t *outFailedDiskSectorOffset,
   1358 		     RF_RowCol_t *spCol, RF_SectorNum_t *spOffset)
   1359 {
   1360 	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
   1361 	RF_RowCol_t fcol = raidPtr->reconControl->fcol;
   1362 	RF_RaidAddr_t sosRaidAddress;	/* start-of-stripe */
   1363 	RF_RowCol_t *diskids;
   1364 	u_int   i, j, k, i_offset, j_offset;
   1365 	RF_RowCol_t pcol;
   1366 	int     testcol;
   1367 	RF_SectorNum_t poffset;
   1368 	char    i_is_parity = 0, j_is_parity = 0;
   1369 	RF_RowCol_t stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol;
   1370 
   1371 	/* get a listing of the disks comprising that stripe */
   1372 	sosRaidAddress = rf_ParityStripeIDToRaidAddress(layoutPtr, psid);
   1373 	(layoutPtr->map->IdentifyStripe) (raidPtr, sosRaidAddress, &diskids);
   1374 	RF_ASSERT(diskids);
   1375 
   1376 	/* reject this entire parity stripe if it does not contain the
   1377 	 * indicated disk or it does not contain the failed disk */
   1378 
   1379 	for (i = 0; i < stripeWidth; i++) {
   1380 		if (col == diskids[i])
   1381 			break;
   1382 	}
   1383 	if (i == stripeWidth)
   1384 		goto skipit;
   1385 	for (j = 0; j < stripeWidth; j++) {
   1386 		if (fcol == diskids[j])
   1387 			break;
   1388 	}
   1389 	if (j == stripeWidth) {
   1390 		goto skipit;
   1391 	}
   1392 	/* find out which disk the parity is on */
   1393 	(layoutPtr->map->MapParity) (raidPtr, sosRaidAddress, &pcol, &poffset, RF_DONT_REMAP);
   1394 
   1395 	/* find out if either the current RU or the failed RU is parity */
   1396 	/* also, if the parity occurs in this stripe prior to the data and/or
   1397 	 * failed col, we need to decrement i and/or j */
   1398 	for (k = 0; k < stripeWidth; k++)
   1399 		if (diskids[k] == pcol)
   1400 			break;
   1401 	RF_ASSERT(k < stripeWidth);
   1402 	i_offset = i;
   1403 	j_offset = j;
   1404 	if (k < i)
   1405 		i_offset--;
   1406 	else
   1407 		if (k == i) {
   1408 			i_is_parity = 1;
   1409 			i_offset = 0;
   1410 		}		/* set offsets to zero to disable multiply
   1411 				 * below */
   1412 	if (k < j)
   1413 		j_offset--;
   1414 	else
   1415 		if (k == j) {
   1416 			j_is_parity = 1;
   1417 			j_offset = 0;
   1418 		}
   1419 	/* at this point, [ij]_is_parity tells us whether the [current,failed]
   1420 	 * disk is parity at the start of this RU, and, if data, "[ij]_offset"
   1421 	 * tells us how far into the stripe the [current,failed] disk is. */
   1422 
   1423 	/* call the mapping routine to get the offset into the current disk,
   1424 	 * repeat for failed disk. */
   1425 	if (i_is_parity)
   1426 		layoutPtr->map->MapParity(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP);
   1427 	else
   1428 		layoutPtr->map->MapSector(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP);
   1429 
   1430 	RF_ASSERT(col == testcol);
   1431 
   1432 	if (j_is_parity)
   1433 		layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP);
   1434 	else
   1435 		layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP);
   1436 	RF_ASSERT(fcol == testcol);
   1437 
   1438 	/* now locate the spare unit for the failed unit */
   1439 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
   1440 	if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
   1441 		if (j_is_parity)
   1442 			layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP);
   1443 		else
   1444 			layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP);
   1445 	} else {
   1446 #endif
   1447 		*spCol = raidPtr->reconControl->spareCol;
   1448 		*spOffset = *outFailedDiskSectorOffset;
   1449 #if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
   1450 	}
   1451 #endif
   1452 	return (0);
   1453 
   1454 skipit:
   1455 	Dprintf2("RECON: Skipping psid %ld: nothing needed from c%d\n",
   1456 	    psid, col);
   1457 	return (1);
   1458 }
   1459 /* this is called when a buffer has become ready to write to the replacement disk */
   1460 static int
   1461 IssueNextWriteRequest(RF_Raid_t *raidPtr)
   1462 {
   1463 	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
   1464 	RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
   1465 #if RF_ACC_TRACE > 0
   1466 	RF_RowCol_t fcol = raidPtr->reconControl->fcol;
   1467 #endif
   1468 	RF_ReconBuffer_t *rbuf;
   1469 	RF_DiskQueueData_t *req;
   1470 
   1471 	rbuf = rf_GetFullReconBuffer(raidPtr->reconControl);
   1472 	RF_ASSERT(rbuf);	/* there must be one available, or we wouldn't
   1473 				 * have gotten the event that sent us here */
   1474 	RF_ASSERT(rbuf->pssPtr);
   1475 
   1476 	rbuf->pssPtr->writeRbuf = rbuf;
   1477 	rbuf->pssPtr = NULL;
   1478 
   1479 	Dprintf6("RECON: New write (c %d offs %d) for psid %ld ru %d (failed disk offset %ld) buf %lx\n",
   1480 	    rbuf->spCol, rbuf->spOffset, rbuf->parityStripeID,
   1481 	    rbuf->which_ru, rbuf->failedDiskSectorOffset, rbuf->buffer);
   1482 	Dprintf6("RECON: new write psid %ld   %02x %02x %02x %02x %02x\n",
   1483 	    rbuf->parityStripeID, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
   1484 	    rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);
   1485 
   1486 	/* should be ok to use a NULL b_proc here b/c all addrs should be in
   1487 	 * kernel space */
   1488 	req = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, rbuf->spOffset,
   1489 	    sectorsPerRU, rbuf->buffer,
   1490 	    rbuf->parityStripeID, rbuf->which_ru,
   1491 	    ReconWriteDoneProc, (void *) rbuf,
   1492 #if RF_ACC_TRACE > 0
   1493 	    &raidPtr->recon_tracerecs[fcol],
   1494 #else
   1495 				     NULL,
   1496 #endif
   1497 	    (void *) raidPtr, 0, NULL, PR_WAITOK);
   1498 
   1499 	rbuf->arg = (void *) req;
   1500 	RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
   1501 	raidPtr->reconControl->pending_writes++;
   1502 	RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
   1503 	rf_DiskIOEnqueue(&raidPtr->Queues[rbuf->spCol], req, RF_IO_RECON_PRIORITY);
   1504 
   1505 	return (0);
   1506 }
   1507 
   1508 /*
   1509  * this gets called upon the completion of a reconstruction read
   1510  * operation the arg is a pointer to the per-disk reconstruction
   1511  * control structure for the process that just finished a read.
   1512  *
   1513  * called at interrupt context in the kernel, so don't do anything
   1514  * illegal here.
   1515  */
   1516 static int
   1517 ReconReadDoneProc(void *arg, int status)
   1518 {
   1519 	RF_PerDiskReconCtrl_t *ctrl = (RF_PerDiskReconCtrl_t *) arg;
   1520 	RF_Raid_t *raidPtr;
   1521 
   1522 	/* Detect that reconCtrl is no longer valid, and if that
   1523 	   is the case, bail without calling rf_CauseReconEvent().
   1524 	   There won't be anyone listening for this event anyway */
   1525 
   1526 	if (ctrl->reconCtrl == NULL)
   1527 		return(0);
   1528 
   1529 	raidPtr = ctrl->reconCtrl->reconDesc->raidPtr;
   1530 
   1531 	if (status) {
   1532 		printf("raid%d: Recon read failed: %d\n", raidPtr->raidid, status);
   1533 		rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READ_FAILED);
   1534 		return(0);
   1535 	}
   1536 #if RF_ACC_TRACE > 0
   1537 	RF_ETIMER_STOP(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
   1538 	RF_ETIMER_EVAL(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
   1539 	raidPtr->recon_tracerecs[ctrl->col].specific.recon.recon_fetch_to_return_us =
   1540 	    RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
   1541 	RF_ETIMER_START(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
   1542 #endif
   1543 	rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READDONE);
   1544 	return (0);
   1545 }
   1546 /* this gets called upon the completion of a reconstruction write operation.
   1547  * the arg is a pointer to the rbuf that was just written
   1548  *
   1549  * called at interrupt context in the kernel, so don't do anything illegal here.
   1550  */
   1551 static int
   1552 ReconWriteDoneProc(void *arg, int status)
   1553 {
   1554 	RF_ReconBuffer_t *rbuf = (RF_ReconBuffer_t *) arg;
   1555 
   1556 	/* Detect that reconControl is no longer valid, and if that
   1557 	   is the case, bail without calling rf_CauseReconEvent().
   1558 	   There won't be anyone listening for this event anyway */
   1559 
   1560 	if (rbuf->raidPtr->reconControl == NULL)
   1561 		return(0);
   1562 
   1563 	Dprintf2("Reconstruction completed on psid %ld ru %d\n", rbuf->parityStripeID, rbuf->which_ru);
   1564 	if (status) {
   1565 		printf("raid%d: Recon write failed!\n", rbuf->raidPtr->raidid);
   1566 		rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITE_FAILED);
   1567 		return(0);
   1568 	}
   1569 	rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITEDONE);
   1570 	return (0);
   1571 }
   1572 
   1573 
   1574 /*
   1575  * computes a new minimum head sep, and wakes up anyone who needs to
   1576  * be woken as a result
   1577  */
   1578 static void
   1579 CheckForNewMinHeadSep(RF_Raid_t *raidPtr, RF_HeadSepLimit_t hsCtr)
   1580 {
   1581 	RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl;
   1582 	RF_HeadSepLimit_t new_min;
   1583 	RF_RowCol_t i;
   1584 	RF_CallbackDesc_t *p;
   1585 	RF_ASSERT(hsCtr >= reconCtrlPtr->minHeadSepCounter);	/* from the definition
   1586 								 * of a minimum */
   1587 
   1588 
   1589 	RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
   1590 	while(reconCtrlPtr->rb_lock) {
   1591 		ltsleep(&reconCtrlPtr->rb_lock, PRIBIO, "reconctlcnmhs", 0, &reconCtrlPtr->rb_mutex);
   1592 	}
   1593 	reconCtrlPtr->rb_lock = 1;
   1594 	RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
   1595 
   1596 	new_min = ~(1L << (8 * sizeof(long) - 1));	/* 0x7FFF....FFF */
   1597 	for (i = 0; i < raidPtr->numCol; i++)
   1598 		if (i != reconCtrlPtr->fcol) {
   1599 			if (reconCtrlPtr->perDiskInfo[i].headSepCounter < new_min)
   1600 				new_min = reconCtrlPtr->perDiskInfo[i].headSepCounter;
   1601 		}
   1602 	/* set the new minimum and wake up anyone who can now run again */
   1603 	if (new_min != reconCtrlPtr->minHeadSepCounter) {
   1604 		reconCtrlPtr->minHeadSepCounter = new_min;
   1605 		Dprintf1("RECON:  new min head pos counter val is %ld\n", new_min);
   1606 		while (reconCtrlPtr->headSepCBList) {
   1607 			if (reconCtrlPtr->headSepCBList->callbackArg.v > new_min)
   1608 				break;
   1609 			p = reconCtrlPtr->headSepCBList;
   1610 			reconCtrlPtr->headSepCBList = p->next;
   1611 			p->next = NULL;
   1612 			rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR);
   1613 			rf_FreeCallbackDesc(p);
   1614 		}
   1615 
   1616 	}
   1617 	RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
   1618 	reconCtrlPtr->rb_lock = 0;
   1619 	wakeup(&reconCtrlPtr->rb_lock);
   1620 	RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
   1621 }
   1622 
   1623 /*
   1624  * checks to see that the maximum head separation will not be violated
   1625  * if we initiate a reconstruction I/O on the indicated disk.
   1626  * Limiting the maximum head separation between two disks eliminates
   1627  * the nasty buffer-stall conditions that occur when one disk races
   1628  * ahead of the others and consumes all of the floating recon buffers.
   1629  * This code is complex and unpleasant but it's necessary to avoid
   1630  * some very nasty, albeit fairly rare, reconstruction behavior.
   1631  *
   1632  * returns non-zero if and only if we have to stop working on the
   1633  * indicated disk due to a head-separation delay.
   1634  */
static int
CheckHeadSeparation(RF_Raid_t *raidPtr, RF_PerDiskReconCtrl_t *ctrl,
		    RF_RowCol_t col, RF_HeadSepLimit_t hsCtr,
		    RF_ReconUnitNum_t which_ru)
{
	RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl;
	RF_CallbackDesc_t *cb, *p, *pt;
	int     retval = 0;

	/* if we're too far ahead of the slowest disk, stop working on this
	 * disk until the slower ones catch up.  We do this by scheduling a
	 * wakeup callback for the time when the slowest disk has caught up.
	 * We define "caught up" with 20% hysteresis, i.e. the head separation
	 * must have fallen to at most 80% of the max allowable head
	 * separation before we'll wake up.
	 *
	 */
	/* acquire the recon-buffer lock: sleep until rb_lock is clear,
	 * then set it while holding rb_mutex */
	RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
	while(reconCtrlPtr->rb_lock) {
		ltsleep(&reconCtrlPtr->rb_lock, PRIBIO, "reconctlchs", 0, &reconCtrlPtr->rb_mutex);
	}
	reconCtrlPtr->rb_lock = 1;
	RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
	/* a negative headSepLimit disables the head-separation check
	 * entirely; otherwise stall if this disk has raced more than
	 * headSepLimit ahead of the slowest one */
	if ((raidPtr->headSepLimit >= 0) &&
	    ((ctrl->headSepCounter - reconCtrlPtr->minHeadSepCounter) > raidPtr->headSepLimit)) {
		Dprintf5("raid%d: RECON: head sep stall: col %d hsCtr %ld minHSCtr %ld limit %ld\n",
			 raidPtr->raidid, col, ctrl->headSepCounter,
			 reconCtrlPtr->minHeadSepCounter,
			 raidPtr->headSepLimit);
		cb = rf_AllocCallbackDesc();
		/* the minHeadSepCounter value we have to get to before we'll
		 * wake up.  build in 20% hysteresis. */
		cb->callbackArg.v = (ctrl->headSepCounter - raidPtr->headSepLimit + raidPtr->headSepLimit / 5);
		cb->col = col;
		cb->next = NULL;

		/* insert this callback descriptor into the sorted list of
		 * pending head-sep callbacks */
		/* list is kept sorted ascending by callbackArg.v so that
		 * CheckForNewMinHeadSep can release waiters front-to-back;
		 * equal keys keep arrival order (strict '<' comparisons) */
		p = reconCtrlPtr->headSepCBList;
		if (!p)
			reconCtrlPtr->headSepCBList = cb;
		else
			if (cb->callbackArg.v < p->callbackArg.v) {
				cb->next = reconCtrlPtr->headSepCBList;
				reconCtrlPtr->headSepCBList = cb;
			} else {
				for (pt = p, p = p->next; p && (p->callbackArg.v < cb->callbackArg.v); pt = p, p = p->next);
				cb->next = p;
				pt->next = cb;
			}
		retval = 1;
#if RF_RECON_STATS > 0
		ctrl->reconCtrl->reconDesc->hsStallCount++;
#endif				/* RF_RECON_STATS > 0 */
	}
	/* release the rb_lock and wake anyone sleeping on it */
	RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
	reconCtrlPtr->rb_lock = 0;
	wakeup(&reconCtrlPtr->rb_lock);
	RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);

	return (retval);
}
   1697 /*
   1698  * checks to see if reconstruction has been either forced or blocked
   1699  * by a user operation.  if forced, we skip this RU entirely.  else if
   1700  * blocked, put ourselves on the wait list.  else return 0.
   1701  *
   1702  * ASSUMES THE PSS MUTEX IS LOCKED UPON ENTRY
   1703  */
   1704 static int
   1705 CheckForcedOrBlockedReconstruction(RF_Raid_t *raidPtr,
   1706 				   RF_ReconParityStripeStatus_t *pssPtr,
   1707 				   RF_PerDiskReconCtrl_t *ctrl,
   1708 				   RF_RowCol_t col,
   1709 				   RF_StripeNum_t psid,
   1710 				   RF_ReconUnitNum_t which_ru)
   1711 {
   1712 	RF_CallbackDesc_t *cb;
   1713 	int     retcode = 0;
   1714 
   1715 	if ((pssPtr->flags & RF_PSS_FORCED_ON_READ) || (pssPtr->flags & RF_PSS_FORCED_ON_WRITE))
   1716 		retcode = RF_PSS_FORCED_ON_WRITE;
   1717 	else
   1718 		if (pssPtr->flags & RF_PSS_RECON_BLOCKED) {
   1719 			Dprintf3("RECON: col %d blocked at psid %ld ru %d\n", col, psid, which_ru);
   1720 			cb = rf_AllocCallbackDesc();	/* append ourselves to
   1721 							 * the blockage-wait
   1722 							 * list */
   1723 			cb->col = col;
   1724 			cb->next = pssPtr->blockWaitList;
   1725 			pssPtr->blockWaitList = cb;
   1726 			retcode = RF_PSS_RECON_BLOCKED;
   1727 		}
   1728 	if (!retcode)
   1729 		pssPtr->flags |= RF_PSS_UNDER_RECON;	/* mark this RU as under
   1730 							 * reconstruction */
   1731 
   1732 	return (retcode);
   1733 }
   1734 /*
   1735  * if reconstruction is currently ongoing for the indicated stripeID,
   1736  * reconstruction is forced to completion and we return non-zero to
   1737  * indicate that the caller must wait.  If not, then reconstruction is
   1738  * blocked on the indicated stripe and the routine returns zero.  If
   1739  * and only if we return non-zero, we'll cause the cbFunc to get
   1740  * invoked with the cbArg when the reconstruction has completed.
   1741  */
int
rf_ForceOrBlockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
		     void (*cbFunc)(RF_Raid_t *, void *), void *cbArg)
{
	RF_StripeNum_t stripeID = asmap->stripeID;	/* the stripe ID we're
							 * forcing recon on */
	RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;	/* num sects in one RU */
	RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr;	/* a pointer to the parity
						 * stripe status structure */
	RF_StripeNum_t psid;	/* parity stripe id */
	RF_SectorNum_t offset, fd_offset;	/* disk offset, failed-disk
						 * offset */
	RF_RowCol_t *diskids;
	RF_ReconUnitNum_t which_ru;	/* RU within parity stripe */
	RF_RowCol_t fcol, diskno, i;
	RF_ReconBuffer_t *new_rbuf;	/* ptr to newly allocated rbufs */
	RF_DiskQueueData_t *req;/* disk I/O req to be enqueued */
	RF_CallbackDesc_t *cb;
	int     nPromoted;

	psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru);

	/* allocate a new PSS in case we need it */
	/* allocated before taking the PSS mutex so we never sleep in an
	 * allocator while holding it */
        newpssPtr = rf_AllocPSStatus(raidPtr);

	RF_LOCK_PSS_MUTEX(raidPtr, psid);

	/* look up (or create, pre-marked RECON_BLOCKED) the status entry
	 * for this parity stripe / RU */
	pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE | RF_PSS_RECON_BLOCKED, newpssPtr);

	/* lookup returned an existing entry, so our pre-allocated one is
	 * not needed */
        if (pssPtr != newpssPtr) {
                rf_FreePSStatus(raidPtr, newpssPtr);
        }

	/* if recon is not ongoing on this PS, just return */
	if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
		RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
		return (0);
	}
	/* otherwise, we have to wait for reconstruction to complete on this
	 * RU. */
	/* In order to avoid waiting for a potentially large number of
	 * low-priority accesses to complete, we force a normal-priority (i.e.
	 * not low-priority) reconstruction on this RU. */
	if (!(pssPtr->flags & RF_PSS_FORCED_ON_WRITE) && !(pssPtr->flags & RF_PSS_FORCED_ON_READ)) {
		DDprintf1("Forcing recon on psid %ld\n", psid);
		pssPtr->flags |= RF_PSS_FORCED_ON_WRITE;	/* mark this RU as under
								 * forced recon */
		pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;	/* clear the blockage
							 * that we just set */
		fcol = raidPtr->reconControl->fcol;

		/* get a listing of the disks comprising the indicated stripe */
		(raidPtr->Layout.map->IdentifyStripe) (raidPtr, asmap->raidAddress, &diskids);

		/* For previously issued reads, elevate them to normal
		 * priority.  If the I/O has already completed, it won't be
		 * found in the queue, and hence this will be a no-op. For
		 * unissued reads, allocate buffers and issue new reads.  The
		 * fact that we've set the FORCED bit means that the regular
		 * recon procs will not re-issue these reqs */
		for (i = 0; i < raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol; i++)
			if ((diskno = diskids[i]) != fcol) {
				if (pssPtr->issued[diskno]) {
					nPromoted = rf_DiskIOPromote(&raidPtr->Queues[diskno], psid, which_ru);
					if (rf_reconDebug && nPromoted)
						printf("raid%d: promoted read from col %d\n", raidPtr->raidid, diskno);
				} else {
					new_rbuf = rf_MakeReconBuffer(raidPtr, diskno, RF_RBUF_TYPE_FORCED);	/* create new buf */
					ComputePSDiskOffsets(raidPtr, psid, diskno, &offset, &fd_offset,
					    &new_rbuf->spCol, &new_rbuf->spOffset);	/* find offsets & spare
													 * location */
					new_rbuf->parityStripeID = psid;	/* fill in the buffer */
					new_rbuf->which_ru = which_ru;
					new_rbuf->failedDiskSectorOffset = fd_offset;
					new_rbuf->priority = RF_IO_NORMAL_PRIORITY;

					/* use NULL b_proc b/c all addrs
					 * should be in kernel space */
					req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, offset + which_ru * sectorsPerRU, sectorsPerRU, new_rbuf->buffer,
					    psid, which_ru, (int (*) (void *, int)) ForceReconReadDoneProc, (void *) new_rbuf,
					    NULL, (void *) raidPtr, 0, NULL, PR_WAITOK);

					new_rbuf->arg = req;
					rf_DiskIOEnqueue(&raidPtr->Queues[diskno], req, RF_IO_NORMAL_PRIORITY);	/* enqueue the I/O */
					Dprintf2("raid%d: Issued new read req on col %d\n", raidPtr->raidid, diskno);
				}
			}
		/* if the write is sitting in the disk queue, elevate its
		 * priority */
		if (rf_DiskIOPromote(&raidPtr->Queues[fcol], psid, which_ru))
			if (rf_reconDebug)
				printf("raid%d: promoted write to col %d\n",
				       raidPtr->raidid, fcol);
	}
	/* install a callback descriptor to be invoked when recon completes on
	 * this parity stripe. */
	cb = rf_AllocCallbackDesc();
	/* XXX the following is bogus.. These functions don't really match!!
	 * GO */
	cb->callbackFunc = (void (*) (RF_CBParam_t)) cbFunc;
	cb->callbackArg.p = (void *) cbArg;
	cb->next = pssPtr->procWaitList;
	pssPtr->procWaitList = cb;
	DDprintf2("raid%d: Waiting for forced recon on psid %ld\n",
		  raidPtr->raidid, psid);

	RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
	/* non-zero: the caller must wait; cbFunc(cbArg) will fire when the
	 * forced reconstruction of this parity stripe completes */
	return (1);
}
   1851 /* called upon the completion of a forced reconstruction read.
   1852  * all we do is schedule the FORCEDREADONE event.
   1853  * called at interrupt context in the kernel, so don't do anything illegal here.
   1854  */
   1855 static void
   1856 ForceReconReadDoneProc(void *arg, int status)
   1857 {
   1858 	RF_ReconBuffer_t *rbuf = arg;
   1859 
   1860 	/* Detect that reconControl is no longer valid, and if that
   1861 	   is the case, bail without calling rf_CauseReconEvent().
   1862 	   There won't be anyone listening for this event anyway */
   1863 
   1864 	if (rbuf->raidPtr->reconControl == NULL)
   1865 		return;
   1866 
   1867 	if (status) {
   1868 		printf("raid%d: Forced recon read failed!\n", rbuf->raidPtr->raidid);
   1869 		rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREAD_FAILED);
   1870 		return;
   1871 	}
   1872 	rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREADDONE);
   1873 }
   1874 /* releases a block on the reconstruction of the indicated stripe */
int
rf_UnblockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap)
{
	RF_StripeNum_t stripeID = asmap->stripeID;
	RF_ReconParityStripeStatus_t *pssPtr;
	RF_ReconUnitNum_t which_ru;
	RF_StripeNum_t psid;
	RF_CallbackDesc_t *cb;

	psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru);
	RF_LOCK_PSS_MUTEX(raidPtr, psid);
	/* look up without creating: RF_PSS_NONE means a missing entry
	 * yields NULL rather than a fresh descriptor */
	pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_NONE, NULL);

	/* When recon is forced, the pss desc can get deleted before we get
	 * back to unblock recon. But, this can _only_ happen when recon is
	 * forced. It would be good to put some kind of sanity check here, but
	 * how to decide if recon was just forced or not? */
	if (!pssPtr) {
		/* printf("Warning: no pss descriptor upon unblock on psid %ld
		 * RU %d\n",psid,which_ru); */
#if (RF_DEBUG_RECON > 0) || (RF_DEBUG_PSS > 0)
		if (rf_reconDebug || rf_pssDebug)
			printf("Warning: no pss descriptor upon unblock on psid %ld RU %d\n", (long) psid, which_ru);
#endif
		goto out;
	}
	pssPtr->blockCount--;
	Dprintf3("raid%d: unblocking recon on psid %ld: blockcount is %d\n",
		 raidPtr->raidid, psid, pssPtr->blockCount);
	if (pssPtr->blockCount == 0) {	/* if recon blockage has been released */

		/* unblock recon before calling CauseReconEvent in case
		 * CauseReconEvent causes us to try to issue a new read before
		 * returning here. */
		pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;


		while (pssPtr->blockWaitList) {
			/* spin through the block-wait list and
			   release all the waiters */
			cb = pssPtr->blockWaitList;
			pssPtr->blockWaitList = cb->next;
			cb->next = NULL;
			rf_CauseReconEvent(raidPtr, cb->col, NULL, RF_REVENT_BLOCKCLEAR);
			rf_FreeCallbackDesc(cb);
		}
		if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
			/* if no recon was requested while recon was blocked */
			rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr);
		}
	}
out:
	RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
	return (0);
}
   1930 
   1931 void
   1932 rf_WakeupHeadSepCBWaiters(RF_Raid_t *raidPtr)
   1933 {
   1934 	RF_CallbackDesc_t *p;
   1935 
   1936 	RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
   1937 	while(raidPtr->reconControl->rb_lock) {
   1938 		ltsleep(&raidPtr->reconControl->rb_lock, PRIBIO,
   1939 			"rf_wakeuphscbw", 0, &raidPtr->reconControl->rb_mutex);
   1940 	}
   1941 
   1942 	raidPtr->reconControl->rb_lock = 1;
   1943 	RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
   1944 
   1945 	while (raidPtr->reconControl->headSepCBList) {
   1946 		p = raidPtr->reconControl->headSepCBList;
   1947 		raidPtr->reconControl->headSepCBList = p->next;
   1948 		p->next = NULL;
   1949 		rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR);
   1950 		rf_FreeCallbackDesc(p);
   1951 	}
   1952 	RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
   1953 	raidPtr->reconControl->rb_lock = 0;
   1954 	wakeup(&raidPtr->reconControl->rb_lock);
   1955 	RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
   1956 
   1957 }
   1958 
   1959