      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.298.2.1 2012/10/17 01:36:13 tls Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
      88  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.298.2.1 2012/10/17 01:36:13 tls Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #include "raid.h"
    110 #endif
    111 
    112 #include <sys/param.h>
    113 #include <sys/errno.h>
    114 #include <sys/pool.h>
    115 #include <sys/proc.h>
    116 #include <sys/queue.h>
    117 #include <sys/disk.h>
    118 #include <sys/device.h>
    119 #include <sys/stat.h>
    120 #include <sys/ioctl.h>
    121 #include <sys/fcntl.h>
    122 #include <sys/systm.h>
    123 #include <sys/vnode.h>
    124 #include <sys/disklabel.h>
    125 #include <sys/conf.h>
    126 #include <sys/buf.h>
    127 #include <sys/bufq.h>
    128 #include <sys/reboot.h>
    129 #include <sys/kauth.h>
    130 
    131 #include <prop/proplib.h>
    132 
    133 #include <dev/raidframe/raidframevar.h>
    134 #include <dev/raidframe/raidframeio.h>
    135 #include <dev/raidframe/rf_paritymap.h>
    136 
    137 #include "rf_raid.h"
    138 #include "rf_copyback.h"
    139 #include "rf_dag.h"
    140 #include "rf_dagflags.h"
    141 #include "rf_desc.h"
    142 #include "rf_diskqueue.h"
    143 #include "rf_etimer.h"
    144 #include "rf_general.h"
    145 #include "rf_kintf.h"
    146 #include "rf_options.h"
    147 #include "rf_driver.h"
    148 #include "rf_parityscan.h"
    149 #include "rf_threadstuff.h"
    150 
    151 #ifdef COMPAT_50
    152 #include "rf_compat50.h"
    153 #endif
    154 
    155 #ifdef DEBUG
    156 int     rf_kdebug_level = 0;
    157 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    158 #else				/* DEBUG */
    159 #define db1_printf(a) { }
    160 #endif				/* DEBUG */
    161 
    162 static RF_Raid_t **raidPtrs;	/* global raid device descriptors */
    163 
    164 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    165 static rf_declare_mutex2(rf_sparet_wait_mutex);
    166 static rf_declare_cond2(rf_sparet_wait_cv);
    167 static rf_declare_cond2(rf_sparet_resp_cv);
    168 
    169 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    170 						 * spare table */
    171 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    172 						 * installation process */
    173 #endif
    174 
    175 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    176 
    177 /* prototypes */
    178 static void KernelWakeupFunc(struct buf *);
    179 static void InitBP(struct buf *, struct vnode *, unsigned,
    180     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    181     void *, int, struct proc *);
    182 static void raidinit(RF_Raid_t *);
    183 
    184 void raidattach(int);
    185 static int raid_match(device_t, cfdata_t, void *);
    186 static void raid_attach(device_t, device_t, void *);
    187 static int raid_detach(device_t, int);
    188 
    189 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t);
    191 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    192     daddr_t, daddr_t, int);
    193 
    194 static int raidwrite_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 static int raidread_component_label(unsigned,
    197     dev_t, struct vnode *, RF_ComponentLabel_t *);
    198 
    199 
    200 dev_type_open(raidopen);
    201 dev_type_close(raidclose);
    202 dev_type_read(raidread);
    203 dev_type_write(raidwrite);
    204 dev_type_ioctl(raidioctl);
    205 dev_type_strategy(raidstrategy);
    206 dev_type_dump(raiddump);
    207 dev_type_size(raidsize);
    208 
    209 const struct bdevsw raid_bdevsw = {
    210 	raidopen, raidclose, raidstrategy, raidioctl,
    211 	raiddump, raidsize, D_DISK
    212 };
    213 
    214 const struct cdevsw raid_cdevsw = {
    215 	raidopen, raidclose, raidread, raidwrite, raidioctl,
    216 	nostop, notty, nopoll, nommap, nokqfilter, D_DISK
    217 };
    218 
    219 static void	raidminphys(struct buf *);
    220 
    221 static struct dkdriver rf_dkdriver = { raidstrategy, raidminphys };
    222 
    223 /* XXX Not sure if the following should be replacing the raidPtrs above,
    224    or if it should be used in conjunction with that...
    225 */
    226 
    227 struct raid_softc {
    228 	device_t sc_dev;
    229 	int     sc_flags;	/* flags */
    230 	int     sc_cflags;	/* configuration flags */
    231 	uint64_t sc_size;	/* size of the raid device */
    232 	char    sc_xname[20];	/* XXX external name */
    233 	struct disk sc_dkdev;	/* generic disk device info */
    234 	struct bufq_state *buf_queue;	/* used for the device queue */
    235 };
    236 /* sc_flags */
    237 #define RAIDF_INITED	0x01	/* unit has been initialized */
    238 #define RAIDF_WLABEL	0x02	/* label area is writable */
    239 #define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
    240 #define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
    241 #define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
    242 #define RAIDF_LOCKED	0x80	/* unit is locked */
    243 
    244 #define	raidunit(x)	DISKUNIT(x)
    245 int numraid = 0;
    246 
    247 extern struct cfdriver raid_cd;
    248 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    249     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    250     DVF_DETACH_SHUTDOWN);
    251 
    252 /*
    253  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    254  * Be aware that large numbers can allow the driver to consume a lot of
    255  * kernel memory, especially on writes, and in degraded mode reads.
    256  *
    257  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    258  * a single 64K write will typically require 64K for the old data,
    259  * 64K for the old parity, and 64K for the new parity, for a total
    260  * of 192K (if the parity buffer is not re-used immediately).
     261  * Even if it is used immediately, that's still 128K, which when multiplied
    262  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    263  *
    264  * Now in degraded mode, for example, a 64K read on the above setup may
    265  * require data reconstruction, which will require *all* of the 4 remaining
    266  * disks to participate -- 4 * 32K/disk == 128K again.
    267  */
    268 
    269 #ifndef RAIDOUTSTANDING
    270 #define RAIDOUTSTANDING   6
    271 #endif
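
         /*
          * A rough back-of-the-envelope bound, restating the example above
          * (a sketch only; nothing in the driver enforces this figure):
          *
          *   worst case ~= RAIDOUTSTANDING * (old data + old parity +
          *                 new parity + incoming data)
          *              ~= RAIDOUTSTANDING * 4 * (request size)
          *
          * e.g. 6 outstanding 64K writes is on the order of 1.5MB of
          * kernel memory in flight.
          */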
    272 
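         /*
          * Raw-partition device node for the raid unit of "dev"; the label
          * code uses this when reading or writing that unit's disklabel.
          */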
    273 #define RAIDLABELDEV(dev)	\
    274 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    275 
    276 /* declared here, and made public, for the benefit of KVM stuff.. */
    277 struct raid_softc *raid_softc;
    278 
    279 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
    280 				     struct disklabel *);
    281 static void raidgetdisklabel(dev_t);
    282 static void raidmakedisklabel(struct raid_softc *);
    283 
    284 static int raidlock(struct raid_softc *);
    285 static void raidunlock(struct raid_softc *);
    286 
    287 static int raid_detach_unlocked(struct raid_softc *);
    288 
    289 static void rf_markalldirty(RF_Raid_t *);
    290 static void rf_set_properties(struct raid_softc *, RF_Raid_t *);
    291 
    292 void rf_ReconThread(struct rf_recon_req *);
    293 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    294 void rf_CopybackThread(RF_Raid_t *raidPtr);
    295 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    296 int rf_autoconfig(device_t);
    297 void rf_buildroothack(RF_ConfigSet_t *);
    298 
    299 RF_AutoConfig_t *rf_find_raid_components(void);
    300 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    301 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    302 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    303 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    304 int rf_set_autoconfig(RF_Raid_t *, int);
    305 int rf_set_rootpartition(RF_Raid_t *, int);
    306 void rf_release_all_vps(RF_ConfigSet_t *);
    307 void rf_cleanup_config_set(RF_ConfigSet_t *);
    308 int rf_have_enough_components(RF_ConfigSet_t *);
    309 int rf_auto_config_set(RF_ConfigSet_t *, int *);
    310 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    311 
    312 /*
    313  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    314  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    315  * in the kernel config file.
    316  */
    317 #ifdef RAID_AUTOCONFIG
    318 int raidautoconfig = 1;
    319 #else
    320 int raidautoconfig = 0;
    321 #endif
    322 static bool raidautoconfigdone = false;
    323 
    324 struct RF_Pools_s rf_pools;
    325 
    326 void
    327 raidattach(int num)
    328 {
    329 	int raidID;
    330 	int i, rc;
    331 
    332 	aprint_debug("raidattach: Asked for %d units\n", num);
    333 
    334 	if (num <= 0) {
    335 #ifdef DIAGNOSTIC
    336 		panic("raidattach: count <= 0");
    337 #endif
    338 		return;
    339 	}
    340 	/* This is where all the initialization stuff gets done. */
    341 
    342 	numraid = num;
    343 
     344 	/* Make some space for the requested number of units... */
    345 
    346 	RF_Malloc(raidPtrs, num * sizeof(RF_Raid_t *), (RF_Raid_t **));
    347 	if (raidPtrs == NULL) {
    348 		panic("raidPtrs is NULL!!");
    349 	}
    350 
    351 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    352 	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
    353 	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
    354 	rf_init_cond2(rf_sparet_resp_cv, "rfgst");
    355 
    356 	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
    357 #endif
    358 
    359 	for (i = 0; i < num; i++)
    360 		raidPtrs[i] = NULL;
    361 	rc = rf_BootRaidframe();
    362 	if (rc == 0)
    363 		aprint_verbose("Kernelized RAIDframe activated\n");
    364 	else
    365 		panic("Serious error booting RAID!!");
    366 
     367 	/* put together some data structures like the CCD device does.  This
    368 	 * lets us lock the device and what-not when it gets opened. */
    369 
    370 	raid_softc = (struct raid_softc *)
    371 		malloc(num * sizeof(struct raid_softc),
    372 		       M_RAIDFRAME, M_NOWAIT);
    373 	if (raid_softc == NULL) {
    374 		aprint_error("WARNING: no memory for RAIDframe driver\n");
    375 		return;
    376 	}
    377 
    378 	memset(raid_softc, 0, num * sizeof(struct raid_softc));
    379 
    380 	for (raidID = 0; raidID < num; raidID++) {
    381 		bufq_alloc(&raid_softc[raidID].buf_queue, BUFQ_DISK_DEFAULT_STRAT, BUFQ_SORT_RAWBLOCK);
    382 
    383 		RF_Malloc(raidPtrs[raidID], sizeof(RF_Raid_t),
    384 			  (RF_Raid_t *));
    385 		if (raidPtrs[raidID] == NULL) {
    386 			aprint_error("WARNING: raidPtrs[%d] is NULL\n", raidID);
    387 			numraid = raidID;
    388 			return;
    389 		}
    390 	}
    391 
    392 	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
    393 		aprint_error("raidattach: config_cfattach_attach failed?\n");
    394 	}
    395 
    396 	raidautoconfigdone = false;
    397 
    398 	/*
    399 	 * Register a finalizer which will be used to auto-config RAID
    400 	 * sets once all real hardware devices have been found.
    401 	 */
    402 	if (config_finalize_register(NULL, rf_autoconfig) != 0)
    403 		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
    404 }
    405 
    406 int
    407 rf_autoconfig(device_t self)
    408 {
    409 	RF_AutoConfig_t *ac_list;
    410 	RF_ConfigSet_t *config_sets;
    411 
    412 	if (!raidautoconfig || raidautoconfigdone == true)
    413 		return (0);
    414 
    415 	/* XXX This code can only be run once. */
    416 	raidautoconfigdone = true;
    417 
    418 	/* 1. locate all RAID components on the system */
    419 	aprint_debug("Searching for RAID components...\n");
    420 	ac_list = rf_find_raid_components();
    421 
    422 	/* 2. Sort them into their respective sets. */
    423 	config_sets = rf_create_auto_sets(ac_list);
    424 
    425 	/*
     426 	 * 3. Evaluate each set and configure the valid ones.
    427 	 * This gets done in rf_buildroothack().
    428 	 */
    429 	rf_buildroothack(config_sets);
    430 
    431 	return 1;
    432 }
    433 
    434 void
    435 rf_buildroothack(RF_ConfigSet_t *config_sets)
    436 {
    437 	RF_ConfigSet_t *cset;
    438 	RF_ConfigSet_t *next_cset;
    439 	int retcode;
    440 	int raidID;
    441 	int rootID;
    442 	int col;
    443 	int num_root;
    444 	char *devname;
    445 
    446 	rootID = 0;
    447 	num_root = 0;
    448 	cset = config_sets;
    449 	while (cset != NULL) {
    450 		next_cset = cset->next;
    451 		if (rf_have_enough_components(cset) &&
    452 		    cset->ac->clabel->autoconfigure==1) {
    453 			retcode = rf_auto_config_set(cset,&raidID);
    454 			if (!retcode) {
    455 				aprint_debug("raid%d: configured ok\n", raidID);
    456 				if (cset->rootable) {
    457 					rootID = raidID;
    458 					num_root++;
    459 				}
    460 			} else {
    461 				/* The autoconfig didn't work :( */
    462 				aprint_debug("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
    463 				rf_release_all_vps(cset);
    464 			}
    465 		} else {
    466 			/* we're not autoconfiguring this set...
    467 			   release the associated resources */
    468 			rf_release_all_vps(cset);
    469 		}
    470 		/* cleanup */
    471 		rf_cleanup_config_set(cset);
    472 		cset = next_cset;
    473 	}
    474 
    475 	/* if the user has specified what the root device should be
    476 	   then we don't touch booted_device or boothowto... */
    477 
    478 	if (rootspec != NULL)
    479 		return;
    480 
    481 	/* we found something bootable... */
    482 
    483 	if (num_root == 1) {
    484 		if (raid_softc[rootID].sc_dkdev.dk_nwedges != 0) {
    485 			/* XXX: How do we find the real root partition? */
    486 			char cname[sizeof(cset->ac->devname)];
    487 			snprintf(cname, sizeof(cname), "%s%c",
    488 			    device_xname(raid_softc[rootID].sc_dev), 'a');
    489 			booted_device = dkwedge_find_by_wname(cname);
    490 		} else
    491 			booted_device = raid_softc[rootID].sc_dev;
    492 	} else if (num_root > 1) {
    493 
    494 		/*
    495 		 * Maybe the MD code can help. If it cannot, then
    496 		 * setroot() will discover that we have no
    497 		 * booted_device and will ask the user if nothing was
    498 		 * hardwired in the kernel config file
    499 		 */
    500 
    501 		if (booted_device == NULL)
    502 			cpu_rootconf();
    503 		if (booted_device == NULL)
    504 			return;
    505 
    506 		num_root = 0;
    507 		for (raidID = 0; raidID < numraid; raidID++) {
    508 			if (raidPtrs[raidID]->valid == 0)
    509 				continue;
    510 
    511 			if (raidPtrs[raidID]->root_partition == 0)
    512 				continue;
    513 
    514 			for (col = 0; col < raidPtrs[raidID]->numCol; col++) {
    515 				devname = raidPtrs[raidID]->Disks[col].devname;
    516 				devname += sizeof("/dev/") - 1;
    517 				if (strncmp(devname, device_xname(booted_device),
    518 					    strlen(device_xname(booted_device))) != 0)
    519 					continue;
    520 				aprint_debug("raid%d includes boot device %s\n",
    521 				       raidID, devname);
    522 				num_root++;
    523 				rootID = raidID;
    524 			}
    525 		}
    526 
    527 		if (num_root == 1) {
    528 			booted_device = raid_softc[rootID].sc_dev;
    529 		} else {
    530 			/* we can't guess.. require the user to answer... */
    531 			boothowto |= RB_ASKNAME;
    532 		}
    533 	}
    534 }
    535 
    536 
    537 int
    538 raidsize(dev_t dev)
    539 {
    540 	struct raid_softc *rs;
    541 	struct disklabel *lp;
    542 	int     part, unit, omask, size;
    543 
    544 	unit = raidunit(dev);
    545 	if (unit >= numraid)
    546 		return (-1);
    547 	rs = &raid_softc[unit];
    548 
    549 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    550 		return (-1);
    551 
    552 	part = DISKPART(dev);
    553 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
    554 	lp = rs->sc_dkdev.dk_label;
    555 
    556 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
    557 		return (-1);
    558 
    559 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
    560 		size = -1;
    561 	else
    562 		size = lp->d_partitions[part].p_size *
    563 		    (lp->d_secsize / DEV_BSIZE);
    564 
    565 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
    566 		return (-1);
    567 
    568 	return (size);
    569 
    570 }
    571 
    572 int
    573 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    574 {
    575 	int     unit = raidunit(dev);
    576 	struct raid_softc *rs;
    577 	const struct bdevsw *bdev;
    578 	struct disklabel *lp;
    579 	RF_Raid_t *raidPtr;
    580 	daddr_t offset;
    581 	int     part, c, sparecol, j, scol, dumpto;
    582 	int     error = 0;
    583 
    584 	if (unit >= numraid)
    585 		return (ENXIO);
    586 
    587 	rs = &raid_softc[unit];
    588 	raidPtr = raidPtrs[unit];
    589 
    590 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    591 		return ENXIO;
    592 
    593 	/* we only support dumping to RAID 1 sets */
    594 	if (raidPtr->Layout.numDataCol != 1 ||
    595 	    raidPtr->Layout.numParityCol != 1)
    596 		return EINVAL;
    597 
    598 
    599 	if ((error = raidlock(rs)) != 0)
    600 		return error;
    601 
    602 	if (size % DEV_BSIZE != 0) {
    603 		error = EINVAL;
    604 		goto out;
    605 	}
    606 
    607 	if (blkno + size / DEV_BSIZE > rs->sc_size) {
    608 		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
    609 		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
    610 		    size / DEV_BSIZE, rs->sc_size);
    611 		error = EINVAL;
    612 		goto out;
    613 	}
    614 
    615 	part = DISKPART(dev);
    616 	lp = rs->sc_dkdev.dk_label;
    617 	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;
    618 
    619 	/* figure out what device is alive.. */
    620 
    621 	/*
    622 	   Look for a component to dump to.  The preference for the
    623 	   component to dump to is as follows:
    624 	   1) the master
    625 	   2) a used_spare of the master
    626 	   3) the slave
    627 	   4) a used_spare of the slave
    628 	*/
    629 
    630 	dumpto = -1;
    631 	for (c = 0; c < raidPtr->numCol; c++) {
    632 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
    633 			/* this might be the one */
    634 			dumpto = c;
    635 			break;
    636 		}
    637 	}
    638 
    639 	/*
    640 	   At this point we have possibly selected a live master or a
     641 	   live slave.  If we didn't find a live master or a live
     642 	   slave, we now check to see whether there is a spared
     643 	   master (or a spared slave).
    644 	*/
    645 
    646 	for (c = 0; c < raidPtr->numSpare; c++) {
    647 		sparecol = raidPtr->numCol + c;
    648 		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
    649 			/* How about this one? */
    650 			scol = -1;
    651 			for(j=0;j<raidPtr->numCol;j++) {
    652 				if (raidPtr->Disks[j].spareCol == sparecol) {
    653 					scol = j;
    654 					break;
    655 				}
    656 			}
    657 			if (scol == 0) {
    658 				/*
    659 				   We must have found a spared master!
    660 				   We'll take that over anything else
    661 				   found so far.  (We couldn't have
    662 				   found a real master before, since
    663 				   this is a used spare, and it's
    664 				   saying that it's replacing the
    665 				   master.)  On reboot (with
    666 				   autoconfiguration turned on)
    667 				   sparecol will become the 1st
    668 				   component (component0) of this set.
    669 				*/
    670 				dumpto = sparecol;
    671 				break;
    672 			} else if (scol != -1) {
    673 				/*
    674 				   Must be a spared slave.  We'll dump
     675 				   to that if we haven't found anything
    676 				   else so far.
    677 				*/
    678 				if (dumpto == -1)
    679 					dumpto = sparecol;
    680 			}
    681 		}
    682 	}
    683 
    684 	if (dumpto == -1) {
    685 		/* we couldn't find any live components to dump to!?!?
    686 		 */
    687 		error = EINVAL;
    688 		goto out;
    689 	}
    690 
    691 	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
    692 
    693 	/*
    694 	   Note that blkno is relative to this particular partition.
    695 	   By adding the offset of this partition in the RAID
    696 	   set, and also adding RF_PROTECTED_SECTORS, we get a
    697 	   value that is relative to the partition used for the
    698 	   underlying component.
    699 	*/
    700 
    701 	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
    702 				blkno + offset, va, size);
    703 
    704 out:
    705 	raidunlock(rs);
    706 
    707 	return error;
    708 }
    709 /* ARGSUSED */
    710 int
    711 raidopen(dev_t dev, int flags, int fmt,
    712     struct lwp *l)
    713 {
    714 	int     unit = raidunit(dev);
    715 	struct raid_softc *rs;
    716 	struct disklabel *lp;
    717 	int     part, pmask;
    718 	int     error = 0;
    719 
    720 	if (unit >= numraid)
    721 		return (ENXIO);
    722 	rs = &raid_softc[unit];
    723 
    724 	if ((error = raidlock(rs)) != 0)
    725 		return (error);
    726 
    727 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    728 		error = EBUSY;
    729 		goto bad;
    730 	}
    731 
    732 	lp = rs->sc_dkdev.dk_label;
    733 
    734 	part = DISKPART(dev);
    735 
    736 	/*
    737 	 * If there are wedges, and this is not RAW_PART, then we
    738 	 * need to fail.
    739 	 */
    740 	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
    741 		error = EBUSY;
    742 		goto bad;
    743 	}
    744 	pmask = (1 << part);
    745 
    746 	if ((rs->sc_flags & RAIDF_INITED) &&
    747 	    (rs->sc_dkdev.dk_openmask == 0))
    748 		raidgetdisklabel(dev);
    749 
    750 	/* make sure that this partition exists */
    751 
    752 	if (part != RAW_PART) {
    753 		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
    754 		    ((part >= lp->d_npartitions) ||
    755 			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
    756 			error = ENXIO;
    757 			goto bad;
    758 		}
    759 	}
    760 	/* Prevent this unit from being unconfigured while open. */
    761 	switch (fmt) {
    762 	case S_IFCHR:
    763 		rs->sc_dkdev.dk_copenmask |= pmask;
    764 		break;
    765 
    766 	case S_IFBLK:
    767 		rs->sc_dkdev.dk_bopenmask |= pmask;
    768 		break;
    769 	}
    770 
    771 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    772 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    773 		/* First one... mark things as dirty... Note that we *MUST*
    774 		 have done a configure before this.  I DO NOT WANT TO BE
    775 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    776 		 THAT THEY BELONG TOGETHER!!!!! */
    777 		/* XXX should check to see if we're only open for reading
    778 		   here... If so, we needn't do this, but then need some
    779 		   other way of keeping track of what's happened.. */
    780 
    781 		rf_markalldirty(raidPtrs[unit]);
    782 	}
    783 
    784 
    785 	rs->sc_dkdev.dk_openmask =
    786 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    787 
    788 bad:
    789 	raidunlock(rs);
    790 
    791 	return (error);
    792 
    793 
    794 }
    795 /* ARGSUSED */
    796 int
    797 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    798 {
    799 	int     unit = raidunit(dev);
    800 	struct raid_softc *rs;
    801 	int     error = 0;
    802 	int     part;
    803 
    804 	if (unit >= numraid)
    805 		return (ENXIO);
    806 	rs = &raid_softc[unit];
    807 
    808 	if ((error = raidlock(rs)) != 0)
    809 		return (error);
    810 
    811 	part = DISKPART(dev);
    812 
    813 	/* ...that much closer to allowing unconfiguration... */
    814 	switch (fmt) {
    815 	case S_IFCHR:
    816 		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
    817 		break;
    818 
    819 	case S_IFBLK:
    820 		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
    821 		break;
    822 	}
    823 	rs->sc_dkdev.dk_openmask =
    824 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    825 
    826 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    827 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
     828 		/* Last one... device is not unconfigured yet.
     829 		   Device shutdown has taken care of setting the
     830 		   clean bits if RAIDF_INITED is not set; here we
     831 		   mark things as clean... */
    832 
    833 		rf_update_component_labels(raidPtrs[unit],
    834 						 RF_FINAL_COMPONENT_UPDATE);
    835 
    836 		/* If the kernel is shutting down, it will detach
    837 		 * this RAID set soon enough.
    838 		 */
    839 	}
    840 
    841 	raidunlock(rs);
    842 	return (0);
    843 
    844 }
    845 
    846 void
    847 raidstrategy(struct buf *bp)
    848 {
    849 	unsigned int raidID = raidunit(bp->b_dev);
    850 	RF_Raid_t *raidPtr;
    851 	struct raid_softc *rs = &raid_softc[raidID];
    852 	int     wlabel;
    853 
     854 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    855 		bp->b_error = ENXIO;
    856 		goto done;
    857 	}
    858 	if (raidID >= numraid || !raidPtrs[raidID]) {
    859 		bp->b_error = ENODEV;
    860 		goto done;
    861 	}
    862 	raidPtr = raidPtrs[raidID];
    863 	if (!raidPtr->valid) {
    864 		bp->b_error = ENODEV;
    865 		goto done;
    866 	}
    867 	if (bp->b_bcount == 0) {
    868 		db1_printf(("b_bcount is zero..\n"));
    869 		goto done;
    870 	}
    871 
    872 	/*
    873 	 * Do bounds checking and adjust transfer.  If there's an
    874 	 * error, the bounds check will flag that for us.
    875 	 */
    876 
    877 	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
    878 	if (DISKPART(bp->b_dev) == RAW_PART) {
    879 		uint64_t size; /* device size in DEV_BSIZE unit */
    880 
    881 		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
    882 			size = raidPtr->totalSectors <<
    883 			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
    884 		} else {
    885 			size = raidPtr->totalSectors >>
    886 			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
    887 		}
    888 		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
    889 			goto done;
    890 		}
    891 	} else {
    892 		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
    893 			db1_printf(("Bounds check failed!!:%d %d\n",
    894 				(int) bp->b_blkno, (int) wlabel));
    895 			goto done;
    896 		}
    897 	}
    898 
    899 	rf_lock_mutex2(raidPtr->iodone_lock);
    900 
    901 	bp->b_resid = 0;
    902 
    903 	/* stuff it onto our queue */
    904 	bufq_put(rs->buf_queue, bp);
    905 
     906 	/* schedule the I/O to happen at the next convenient time */
    907 	rf_signal_cond2(raidPtr->iodone_cv);
    908 	rf_unlock_mutex2(raidPtr->iodone_lock);
    909 
    910 	return;
    911 
    912 done:
    913 	bp->b_resid = bp->b_bcount;
    914 	biodone(bp);
    915 }
    916 /* ARGSUSED */
    917 int
    918 raidread(dev_t dev, struct uio *uio, int flags)
    919 {
    920 	int     unit = raidunit(dev);
    921 	struct raid_softc *rs;
    922 
    923 	if (unit >= numraid)
    924 		return (ENXIO);
    925 	rs = &raid_softc[unit];
    926 
    927 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    928 		return (ENXIO);
    929 
    930 	return (physio(raidstrategy, NULL, dev, B_READ, raidminphys, uio));
    931 
    932 }
    933 /* ARGSUSED */
    934 int
    935 raidwrite(dev_t dev, struct uio *uio, int flags)
    936 {
    937 	int     unit = raidunit(dev);
    938 	struct raid_softc *rs;
    939 
    940 	if (unit >= numraid)
    941 		return (ENXIO);
    942 	rs = &raid_softc[unit];
    943 
    944 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    945 		return (ENXIO);
    946 
    947 	return (physio(raidstrategy, NULL, dev, B_WRITE, raidminphys, uio));
    948 
    949 }
    950 
    951 static int
    952 raid_detach_unlocked(struct raid_softc *rs)
    953 {
    954 	int error;
    955 	RF_Raid_t *raidPtr;
    956 
    957 	raidPtr = raidPtrs[device_unit(rs->sc_dev)];
    958 
    959 	/*
    960 	 * If somebody has a partition mounted, we shouldn't
     961 	 * shut down.
    962 	 */
    963 	if (rs->sc_dkdev.dk_openmask != 0)
    964 		return EBUSY;
    965 
    966 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    967 		;	/* not initialized: nothing to do */
    968 	else if ((error = rf_Shutdown(raidPtr)) != 0)
    969 		return error;
    970 	else
    971 		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
    972 
    973 	/* Detach the disk. */
    974 	dkwedge_delall(&rs->sc_dkdev);
    975 	disk_detach(&rs->sc_dkdev);
    976 	disk_destroy(&rs->sc_dkdev);
    977 
    978 	aprint_normal_dev(rs->sc_dev, "detached\n");
    979 
    980 	return 0;
    981 }
    982 
    983 int
    984 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
    985 {
    986 	int     unit = raidunit(dev);
    987 	int     error = 0;
    988 	int     part, pmask, s;
    989 	cfdata_t cf;
    990 	struct raid_softc *rs;
    991 	RF_Config_t *k_cfg, *u_cfg;
    992 	RF_Raid_t *raidPtr;
    993 	RF_RaidDisk_t *diskPtr;
    994 	RF_AccTotals_t *totals;
    995 	RF_DeviceConfig_t *d_cfg, **ucfgp;
    996 	u_char *specific_buf;
    997 	int retcode = 0;
    998 	int column;
    999 /*	int raidid; */
   1000 	struct rf_recon_req *rrcopy, *rr;
   1001 	RF_ComponentLabel_t *clabel;
   1002 	RF_ComponentLabel_t *ci_label;
   1003 	RF_ComponentLabel_t **clabel_ptr;
   1004 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1005 	RF_SingleComponent_t component;
   1006 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
   1007 	int i, j, d;
   1008 #ifdef __HAVE_OLD_DISKLABEL
   1009 	struct disklabel newlabel;
   1010 #endif
   1011 	struct dkwedge_info *dkw;
   1012 
   1013 	if (unit >= numraid)
   1014 		return (ENXIO);
   1015 	rs = &raid_softc[unit];
   1016 	raidPtr = raidPtrs[unit];
   1017 
   1018 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1019 		(int) DISKPART(dev), (int) unit, cmd));
   1020 
   1021 	/* Must be open for writes for these commands... */
   1022 	switch (cmd) {
   1023 #ifdef DIOCGSECTORSIZE
   1024 	case DIOCGSECTORSIZE:
   1025 		*(u_int *)data = raidPtr->bytesPerSector;
   1026 		return 0;
   1027 	case DIOCGMEDIASIZE:
   1028 		*(off_t *)data =
   1029 		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
   1030 		return 0;
   1031 #endif
   1032 	case DIOCSDINFO:
   1033 	case DIOCWDINFO:
   1034 #ifdef __HAVE_OLD_DISKLABEL
   1035 	case ODIOCWDINFO:
   1036 	case ODIOCSDINFO:
   1037 #endif
   1038 	case DIOCWLABEL:
   1039 	case DIOCAWEDGE:
   1040 	case DIOCDWEDGE:
   1041 	case DIOCSSTRATEGY:
   1042 		if ((flag & FWRITE) == 0)
   1043 			return (EBADF);
   1044 	}
   1045 
   1046 	/* Must be initialized for these... */
   1047 	switch (cmd) {
   1048 	case DIOCGDINFO:
   1049 	case DIOCSDINFO:
   1050 	case DIOCWDINFO:
   1051 #ifdef __HAVE_OLD_DISKLABEL
   1052 	case ODIOCGDINFO:
   1053 	case ODIOCWDINFO:
   1054 	case ODIOCSDINFO:
   1055 	case ODIOCGDEFLABEL:
   1056 #endif
   1057 	case DIOCGPART:
   1058 	case DIOCWLABEL:
   1059 	case DIOCGDEFLABEL:
   1060 	case DIOCAWEDGE:
   1061 	case DIOCDWEDGE:
   1062 	case DIOCLWEDGES:
   1063 	case DIOCCACHESYNC:
   1064 	case RAIDFRAME_SHUTDOWN:
   1065 	case RAIDFRAME_REWRITEPARITY:
   1066 	case RAIDFRAME_GET_INFO:
   1067 	case RAIDFRAME_RESET_ACCTOTALS:
   1068 	case RAIDFRAME_GET_ACCTOTALS:
   1069 	case RAIDFRAME_KEEP_ACCTOTALS:
   1070 	case RAIDFRAME_GET_SIZE:
   1071 	case RAIDFRAME_FAIL_DISK:
   1072 	case RAIDFRAME_COPYBACK:
   1073 	case RAIDFRAME_CHECK_RECON_STATUS:
   1074 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1075 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1076 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1077 	case RAIDFRAME_ADD_HOT_SPARE:
   1078 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1079 	case RAIDFRAME_INIT_LABELS:
   1080 	case RAIDFRAME_REBUILD_IN_PLACE:
   1081 	case RAIDFRAME_CHECK_PARITY:
   1082 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1083 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1084 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1085 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1086 	case RAIDFRAME_SET_AUTOCONFIG:
   1087 	case RAIDFRAME_SET_ROOT:
   1088 	case RAIDFRAME_DELETE_COMPONENT:
   1089 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1090 	case RAIDFRAME_PARITYMAP_STATUS:
   1091 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1092 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1093 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1094 	case DIOCGSTRATEGY:
   1095 	case DIOCSSTRATEGY:
   1096 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1097 			return (ENXIO);
   1098 	}
   1099 
   1100 	switch (cmd) {
   1101 #ifdef COMPAT_50
   1102 	case RAIDFRAME_GET_INFO50:
   1103 		return rf_get_info50(raidPtr, data);
   1104 
   1105 	case RAIDFRAME_CONFIGURE50:
   1106 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1107 			return retcode;
   1108 		goto config;
   1109 #endif
   1110 		/* configure the system */
   1111 	case RAIDFRAME_CONFIGURE:
   1112 
   1113 		if (raidPtr->valid) {
   1114 			/* There is a valid RAID set running on this unit! */
   1115 			printf("raid%d: Device already configured!\n",unit);
   1116 			return(EINVAL);
   1117 		}
   1118 
   1119 		/* copy-in the configuration information */
   1120 		/* data points to a pointer to the configuration structure */
   1121 
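         		/*
         		 * A minimal userland sketch of this calling convention
         		 * (hypothetical descriptor/variable names):
         		 *
         		 *	RF_Config_t cfg, *cfgp = &cfg;
         		 *	(... fill in cfg ...)
         		 *	ioctl(raidfd, RAIDFRAME_CONFIGURE, &cfgp);
         		 *
         		 * i.e. the ioctl argument is a pointer to a pointer to
         		 * the config, which is why "data" is dereferenced once
         		 * before the copyin below.
         		 */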
   1122 		u_cfg = *((RF_Config_t **) data);
   1123 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1124 		if (k_cfg == NULL) {
   1125 			return (ENOMEM);
   1126 		}
   1127 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1128 		if (retcode) {
   1129 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1130 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1131 				retcode));
   1132 			return (retcode);
   1133 		}
   1134 		goto config;
   1135 	config:
   1136 		/* allocate a buffer for the layout-specific data, and copy it
   1137 		 * in */
   1138 		if (k_cfg->layoutSpecificSize) {
   1139 			if (k_cfg->layoutSpecificSize > 10000) {
   1140 				/* sanity check */
   1141 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1142 				return (EINVAL);
   1143 			}
   1144 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1145 			    (u_char *));
   1146 			if (specific_buf == NULL) {
   1147 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1148 				return (ENOMEM);
   1149 			}
   1150 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1151 			    k_cfg->layoutSpecificSize);
   1152 			if (retcode) {
   1153 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1154 				RF_Free(specific_buf,
   1155 					k_cfg->layoutSpecificSize);
   1156 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1157 					retcode));
   1158 				return (retcode);
   1159 			}
   1160 		} else
   1161 			specific_buf = NULL;
   1162 		k_cfg->layoutSpecific = specific_buf;
   1163 
   1164 		/* should do some kind of sanity check on the configuration.
   1165 		 * Store the sum of all the bytes in the last byte? */
   1166 
   1167 		/* configure the system */
   1168 
   1169 		/*
   1170 		 * Clear the entire RAID descriptor, just to make sure
   1171 		 *  there is no stale data left in the case of a
   1172 		 *  reconfiguration
   1173 		 */
   1174 		memset(raidPtr, 0, sizeof(*raidPtr));
   1175 		raidPtr->raidid = unit;
   1176 
   1177 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1178 
   1179 		if (retcode == 0) {
   1180 
   1181 			/* allow this many simultaneous IO's to
   1182 			   this RAID device */
   1183 			raidPtr->openings = RAIDOUTSTANDING;
   1184 
   1185 			raidinit(raidPtr);
   1186 			rf_markalldirty(raidPtr);
   1187 		}
   1188 		/* free the buffers.  No return code here. */
   1189 		if (k_cfg->layoutSpecificSize) {
   1190 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1191 		}
   1192 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1193 
   1194 		return (retcode);
   1195 
   1196 		/* shutdown the system */
   1197 	case RAIDFRAME_SHUTDOWN:
   1198 
   1199 		part = DISKPART(dev);
   1200 		pmask = (1 << part);
   1201 
   1202 		if ((error = raidlock(rs)) != 0)
   1203 			return (error);
   1204 
   1205 		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
   1206 		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
   1207 			(rs->sc_dkdev.dk_copenmask & pmask)))
   1208 			retcode = EBUSY;
   1209 		else {
   1210 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1211 			rs->sc_dkdev.dk_copenmask &= ~pmask;
   1212 			rs->sc_dkdev.dk_bopenmask &= ~pmask;
   1213 			rs->sc_dkdev.dk_openmask &= ~pmask;
   1214 			retcode = 0;
   1215 		}
   1216 
   1217 		raidunlock(rs);
   1218 
   1219 		if (retcode != 0)
   1220 			return retcode;
   1221 
   1222 		/* free the pseudo device attach bits */
   1223 
   1224 		cf = device_cfdata(rs->sc_dev);
   1225 		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
   1226 			free(cf, M_RAIDFRAME);
   1227 
   1228 		return (retcode);
   1229 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1230 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1231 		/* need to read the component label for the disk indicated
   1232 		   by row,column in clabel */
   1233 
   1234 		/*
   1235 		 * Perhaps there should be an option to skip the in-core
   1236 		 * copy and hit the disk, as with disklabel(8).
   1237 		 */
   1238 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1239 
   1240 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1241 
   1242 		if (retcode) {
   1243 			RF_Free(clabel, sizeof(*clabel));
   1244 			return retcode;
   1245 		}
   1246 
   1247 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1248 
   1249 		column = clabel->column;
   1250 
   1251 		if ((column < 0) || (column >= raidPtr->numCol +
   1252 		    raidPtr->numSpare)) {
   1253 			RF_Free(clabel, sizeof(*clabel));
   1254 			return EINVAL;
   1255 		}
   1256 
   1257 		RF_Free(clabel, sizeof(*clabel));
   1258 
   1259 		clabel = raidget_component_label(raidPtr, column);
   1260 
   1261 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1262 
   1263 #if 0
   1264 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1265 		clabel = (RF_ComponentLabel_t *) data;
   1266 
   1267 		/* XXX check the label for valid stuff... */
   1268 		/* Note that some things *should not* get modified --
   1269 		   the user should be re-initing the labels instead of
   1270 		   trying to patch things.
   1271 		   */
   1272 
   1273 		raidid = raidPtr->raidid;
   1274 #ifdef DEBUG
   1275 		printf("raid%d: Got component label:\n", raidid);
   1276 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1277 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1278 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1279 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1280 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1281 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1282 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1283 #endif
   1284 		clabel->row = 0;
   1285 		column = clabel->column;
   1286 
   1287 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1288 			return(EINVAL);
   1289 		}
   1290 
   1291 		/* XXX this isn't allowed to do anything for now :-) */
   1292 
   1293 		/* XXX and before it is, we need to fill in the rest
   1294 		   of the fields!?!?!?! */
   1295 		memcpy(raidget_component_label(raidPtr, column),
   1296 		    clabel, sizeof(*clabel));
   1297 		raidflush_component_label(raidPtr, column);
   1298 		return (0);
   1299 #endif
   1300 
   1301 	case RAIDFRAME_INIT_LABELS:
   1302 		clabel = (RF_ComponentLabel_t *) data;
   1303 		/*
   1304 		   we only want the serial number from
   1305 		   the above.  We get all the rest of the information
   1306 		   from the config that was used to create this RAID
   1307 		   set.
   1308 		   */
   1309 
   1310 		raidPtr->serial_number = clabel->serial_number;
   1311 
   1312 		for(column=0;column<raidPtr->numCol;column++) {
   1313 			diskPtr = &raidPtr->Disks[column];
   1314 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1315 				ci_label = raidget_component_label(raidPtr,
   1316 				    column);
   1317 				/* Zeroing this is important. */
   1318 				memset(ci_label, 0, sizeof(*ci_label));
   1319 				raid_init_component_label(raidPtr, ci_label);
   1320 				ci_label->serial_number =
   1321 				    raidPtr->serial_number;
    1322 				ci_label->row = 0; /* we don't pretend to support more */
   1323 				rf_component_label_set_partitionsize(ci_label,
   1324 				    diskPtr->partitionSize);
   1325 				ci_label->column = column;
   1326 				raidflush_component_label(raidPtr, column);
   1327 			}
   1328 			/* XXXjld what about the spares? */
   1329 		}
   1330 
   1331 		return (retcode);
   1332 	case RAIDFRAME_SET_AUTOCONFIG:
   1333 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1334 		printf("raid%d: New autoconfig value is: %d\n",
   1335 		       raidPtr->raidid, d);
   1336 		*(int *) data = d;
   1337 		return (retcode);
   1338 
   1339 	case RAIDFRAME_SET_ROOT:
   1340 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1341 		printf("raid%d: New rootpartition value is: %d\n",
   1342 		       raidPtr->raidid, d);
   1343 		*(int *) data = d;
   1344 		return (retcode);
   1345 
   1346 		/* initialize all parity */
   1347 	case RAIDFRAME_REWRITEPARITY:
   1348 
   1349 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1350 			/* Parity for RAID 0 is trivially correct */
   1351 			raidPtr->parity_good = RF_RAID_CLEAN;
   1352 			return(0);
   1353 		}
   1354 
   1355 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1356 			/* Re-write is already in progress! */
   1357 			return(EINVAL);
   1358 		}
   1359 
   1360 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1361 					   rf_RewriteParityThread,
   1362 					   raidPtr,"raid_parity");
   1363 		return (retcode);
   1364 
   1365 
   1366 	case RAIDFRAME_ADD_HOT_SPARE:
   1367 		sparePtr = (RF_SingleComponent_t *) data;
   1368 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1369 		retcode = rf_add_hot_spare(raidPtr, &component);
   1370 		return(retcode);
   1371 
   1372 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1373 		return(retcode);
   1374 
   1375 	case RAIDFRAME_DELETE_COMPONENT:
   1376 		componentPtr = (RF_SingleComponent_t *)data;
   1377 		memcpy( &component, componentPtr,
   1378 			sizeof(RF_SingleComponent_t));
   1379 		retcode = rf_delete_component(raidPtr, &component);
   1380 		return(retcode);
   1381 
   1382 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1383 		componentPtr = (RF_SingleComponent_t *)data;
   1384 		memcpy( &component, componentPtr,
   1385 			sizeof(RF_SingleComponent_t));
   1386 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1387 		return(retcode);
   1388 
   1389 	case RAIDFRAME_REBUILD_IN_PLACE:
   1390 
   1391 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1392 			/* Can't do this on a RAID 0!! */
   1393 			return(EINVAL);
   1394 		}
   1395 
   1396 		if (raidPtr->recon_in_progress == 1) {
   1397 			/* a reconstruct is already in progress! */
   1398 			return(EINVAL);
   1399 		}
   1400 
   1401 		componentPtr = (RF_SingleComponent_t *) data;
   1402 		memcpy( &component, componentPtr,
   1403 			sizeof(RF_SingleComponent_t));
   1404 		component.row = 0; /* we don't support any more */
   1405 		column = component.column;
   1406 
   1407 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1408 			return(EINVAL);
   1409 		}
   1410 
   1411 		rf_lock_mutex2(raidPtr->mutex);
   1412 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1413 		    (raidPtr->numFailures > 0)) {
   1414 			/* XXX 0 above shouldn't be constant!!! */
   1415 			/* some component other than this has failed.
   1416 			   Let's not make things worse than they already
   1417 			   are... */
   1418 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1419 			       raidPtr->raidid);
   1420 			printf("raid%d:     Col: %d   Too many failures.\n",
   1421 			       raidPtr->raidid, column);
   1422 			rf_unlock_mutex2(raidPtr->mutex);
   1423 			return (EINVAL);
   1424 		}
   1425 		if (raidPtr->Disks[column].status ==
   1426 		    rf_ds_reconstructing) {
   1427 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1428 			       raidPtr->raidid);
    1429 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1430 
   1431 			rf_unlock_mutex2(raidPtr->mutex);
   1432 			return (EINVAL);
   1433 		}
   1434 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1435 			rf_unlock_mutex2(raidPtr->mutex);
   1436 			return (EINVAL);
   1437 		}
   1438 		rf_unlock_mutex2(raidPtr->mutex);
   1439 
   1440 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1441 		if (rrcopy == NULL)
   1442 			return(ENOMEM);
   1443 
   1444 		rrcopy->raidPtr = (void *) raidPtr;
   1445 		rrcopy->col = column;
   1446 
   1447 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1448 					   rf_ReconstructInPlaceThread,
   1449 					   rrcopy,"raid_reconip");
   1450 		return(retcode);
   1451 
   1452 	case RAIDFRAME_GET_INFO:
   1453 		if (!raidPtr->valid)
   1454 			return (ENODEV);
   1455 		ucfgp = (RF_DeviceConfig_t **) data;
   1456 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1457 			  (RF_DeviceConfig_t *));
   1458 		if (d_cfg == NULL)
   1459 			return (ENOMEM);
   1460 		d_cfg->rows = 1; /* there is only 1 row now */
   1461 		d_cfg->cols = raidPtr->numCol;
   1462 		d_cfg->ndevs = raidPtr->numCol;
   1463 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1464 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1465 			return (ENOMEM);
   1466 		}
   1467 		d_cfg->nspares = raidPtr->numSpare;
   1468 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1469 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1470 			return (ENOMEM);
   1471 		}
   1472 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1473 		d = 0;
   1474 		for (j = 0; j < d_cfg->cols; j++) {
   1475 			d_cfg->devs[d] = raidPtr->Disks[j];
   1476 			d++;
   1477 		}
   1478 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1479 			d_cfg->spares[i] = raidPtr->Disks[j];
   1480 		}
   1481 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1482 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1483 
   1484 		return (retcode);
   1485 
   1486 	case RAIDFRAME_CHECK_PARITY:
   1487 		*(int *) data = raidPtr->parity_good;
   1488 		return (0);
   1489 
   1490 	case RAIDFRAME_PARITYMAP_STATUS:
   1491 		if (rf_paritymap_ineligible(raidPtr))
   1492 			return EINVAL;
   1493 		rf_paritymap_status(raidPtr->parity_map,
   1494 		    (struct rf_pmstat *)data);
   1495 		return 0;
   1496 
   1497 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1498 		if (rf_paritymap_ineligible(raidPtr))
   1499 			return EINVAL;
   1500 		if (raidPtr->parity_map == NULL)
   1501 			return ENOENT; /* ??? */
   1502 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1503 			(struct rf_pmparams *)data, 1))
   1504 			return EINVAL;
   1505 		return 0;
   1506 
   1507 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1508 		if (rf_paritymap_ineligible(raidPtr))
   1509 			return EINVAL;
   1510 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1511 		return 0;
   1512 
   1513 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1514 		if (rf_paritymap_ineligible(raidPtr))
   1515 			return EINVAL;
   1516 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1517 		/* XXX should errors be passed up? */
   1518 		return 0;
   1519 
   1520 	case RAIDFRAME_RESET_ACCTOTALS:
   1521 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1522 		return (0);
   1523 
   1524 	case RAIDFRAME_GET_ACCTOTALS:
   1525 		totals = (RF_AccTotals_t *) data;
   1526 		*totals = raidPtr->acc_totals;
   1527 		return (0);
   1528 
   1529 	case RAIDFRAME_KEEP_ACCTOTALS:
   1530 		raidPtr->keep_acc_totals = *(int *)data;
   1531 		return (0);
   1532 
   1533 	case RAIDFRAME_GET_SIZE:
   1534 		*(int *) data = raidPtr->totalSectors;
   1535 		return (0);
   1536 
   1537 		/* fail a disk & optionally start reconstruction */
   1538 	case RAIDFRAME_FAIL_DISK:
   1539 
   1540 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1541 			/* Can't do this on a RAID 0!! */
   1542 			return(EINVAL);
   1543 		}
   1544 
   1545 		rr = (struct rf_recon_req *) data;
   1546 		rr->row = 0;
   1547 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1548 			return (EINVAL);
   1549 
   1550 
   1551 		rf_lock_mutex2(raidPtr->mutex);
   1552 		if (raidPtr->status == rf_rs_reconstructing) {
   1553 			/* you can't fail a disk while we're reconstructing! */
   1554 			/* XXX wrong for RAID6 */
   1555 			rf_unlock_mutex2(raidPtr->mutex);
   1556 			return (EINVAL);
   1557 		}
   1558 		if ((raidPtr->Disks[rr->col].status ==
   1559 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1560 			/* some other component has failed.  Let's not make
   1561 			   things worse. XXX wrong for RAID6 */
   1562 			rf_unlock_mutex2(raidPtr->mutex);
   1563 			return (EINVAL);
   1564 		}
   1565 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1566 			/* Can't fail a spared disk! */
   1567 			rf_unlock_mutex2(raidPtr->mutex);
   1568 			return (EINVAL);
   1569 		}
   1570 		rf_unlock_mutex2(raidPtr->mutex);
   1571 
   1572 		/* make a copy of the recon request so that we don't rely on
   1573 		 * the user's buffer */
   1574 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1575 		if (rrcopy == NULL)
   1576 			return(ENOMEM);
   1577 		memcpy(rrcopy, rr, sizeof(*rr));
   1578 		rrcopy->raidPtr = (void *) raidPtr;
   1579 
   1580 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1581 					   rf_ReconThread,
   1582 					   rrcopy,"raid_recon");
   1583 		return (0);
   1584 
   1585 		/* invoke a copyback operation after recon on whatever disk
   1586 		 * needs it, if any */
   1587 	case RAIDFRAME_COPYBACK:
   1588 
   1589 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1590 			/* This makes no sense on a RAID 0!! */
   1591 			return(EINVAL);
   1592 		}
   1593 
   1594 		if (raidPtr->copyback_in_progress == 1) {
   1595 			/* Copyback is already in progress! */
   1596 			return(EINVAL);
   1597 		}
   1598 
   1599 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1600 					   rf_CopybackThread,
   1601 					   raidPtr,"raid_copyback");
   1602 		return (retcode);
   1603 
   1604 		/* return the percentage completion of reconstruction */
   1605 	case RAIDFRAME_CHECK_RECON_STATUS:
   1606 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1607 			/* This makes no sense on a RAID 0, so tell the
   1608 			   user it's done. */
   1609 			*(int *) data = 100;
   1610 			return(0);
   1611 		}
   1612 		if (raidPtr->status != rf_rs_reconstructing)
   1613 			*(int *) data = 100;
   1614 		else {
   1615 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1616 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1617 			} else {
   1618 				*(int *) data = 0;
   1619 			}
   1620 		}
   1621 		return (0);
   1622 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1623 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1624 		if (raidPtr->status != rf_rs_reconstructing) {
   1625 			progressInfo.remaining = 0;
   1626 			progressInfo.completed = 100;
   1627 			progressInfo.total = 100;
   1628 		} else {
   1629 			progressInfo.total =
   1630 				raidPtr->reconControl->numRUsTotal;
   1631 			progressInfo.completed =
   1632 				raidPtr->reconControl->numRUsComplete;
   1633 			progressInfo.remaining = progressInfo.total -
   1634 				progressInfo.completed;
   1635 		}
   1636 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1637 				  sizeof(RF_ProgressInfo_t));
   1638 		return (retcode);
   1639 
   1640 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1641 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1642 			/* This makes no sense on a RAID 0, so tell the
   1643 			   user it's done. */
   1644 			*(int *) data = 100;
   1645 			return(0);
   1646 		}
   1647 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1648 			*(int *) data = 100 *
   1649 				raidPtr->parity_rewrite_stripes_done /
   1650 				raidPtr->Layout.numStripe;
   1651 		} else {
   1652 			*(int *) data = 100;
   1653 		}
   1654 		return (0);
   1655 
   1656 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1657 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1658 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1659 			progressInfo.total = raidPtr->Layout.numStripe;
   1660 			progressInfo.completed =
   1661 				raidPtr->parity_rewrite_stripes_done;
   1662 			progressInfo.remaining = progressInfo.total -
   1663 				progressInfo.completed;
   1664 		} else {
   1665 			progressInfo.remaining = 0;
   1666 			progressInfo.completed = 100;
   1667 			progressInfo.total = 100;
   1668 		}
   1669 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1670 				  sizeof(RF_ProgressInfo_t));
   1671 		return (retcode);
   1672 
   1673 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1674 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1675 			/* This makes no sense on a RAID 0 */
   1676 			*(int *) data = 100;
   1677 			return(0);
   1678 		}
   1679 		if (raidPtr->copyback_in_progress == 1) {
   1680 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1681 				raidPtr->Layout.numStripe;
   1682 		} else {
   1683 			*(int *) data = 100;
   1684 		}
   1685 		return (0);
   1686 
   1687 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1688 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1689 		if (raidPtr->copyback_in_progress == 1) {
   1690 			progressInfo.total = raidPtr->Layout.numStripe;
   1691 			progressInfo.completed =
   1692 				raidPtr->copyback_stripes_done;
   1693 			progressInfo.remaining = progressInfo.total -
   1694 				progressInfo.completed;
   1695 		} else {
   1696 			progressInfo.remaining = 0;
   1697 			progressInfo.completed = 100;
   1698 			progressInfo.total = 100;
   1699 		}
   1700 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1701 				  sizeof(RF_ProgressInfo_t));
   1702 		return (retcode);
   1703 
   1704 		/* the sparetable daemon calls this to wait for the kernel to
   1705 		 * need a spare table. this ioctl does not return until a
   1706 		 * spare table is needed. XXX -- calling mpsleep here in the
   1707 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1708 		 * -- I should either compute the spare table in the kernel,
   1709 		 * or have a different -- XXX XXX -- interface (a different
   1710 		 * character device) for delivering the table     -- XXX */
   1711 #if 0
   1712 	case RAIDFRAME_SPARET_WAIT:
   1713 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1714 		while (!rf_sparet_wait_queue)
   1715 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1716 		waitreq = rf_sparet_wait_queue;
   1717 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1718 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1719 
   1720 		/* structure assignment */
   1721 		*((RF_SparetWait_t *) data) = *waitreq;
   1722 
   1723 		RF_Free(waitreq, sizeof(*waitreq));
   1724 		return (0);
   1725 
   1726 		/* wakes up a process waiting on SPARET_WAIT and puts an error
    1727 		 * code in it that will cause the daemon to exit */
   1728 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1729 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1730 		waitreq->fcol = -1;
   1731 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1732 		waitreq->next = rf_sparet_wait_queue;
   1733 		rf_sparet_wait_queue = waitreq;
    1734 		rf_broadcast_cond2(rf_sparet_wait_cv);
   1735 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1736 		return (0);
   1737 
   1738 		/* used by the spare table daemon to deliver a spare table
   1739 		 * into the kernel */
   1740 	case RAIDFRAME_SEND_SPARET:
   1741 
   1742 		/* install the spare table */
   1743 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1744 
   1745 		/* respond to the requestor.  the return status of the spare
   1746 		 * table installation is passed in the "fcol" field */
   1747 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1748 		waitreq->fcol = retcode;
   1749 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1750 		waitreq->next = rf_sparet_resp_queue;
   1751 		rf_sparet_resp_queue = waitreq;
   1752 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1753 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1754 
   1755 		return (retcode);
   1756 #endif
   1757 
   1758 	default:
   1759 		break; /* fall through to the os-specific code below */
   1760 
   1761 	}
   1762 
   1763 	if (!raidPtr->valid)
   1764 		return (EINVAL);
   1765 
   1766 	/*
   1767 	 * Add support for "regular" device ioctls here.
   1768 	 */
   1769 
   1770 	error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
   1771 	if (error != EPASSTHROUGH)
   1772 		return (error);
   1773 
   1774 	switch (cmd) {
   1775 	case DIOCGDINFO:
   1776 		*(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
   1777 		break;
   1778 #ifdef __HAVE_OLD_DISKLABEL
   1779 	case ODIOCGDINFO:
   1780 		newlabel = *(rs->sc_dkdev.dk_label);
   1781 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1782 			return ENOTTY;
   1783 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1784 		break;
   1785 #endif
   1786 
   1787 	case DIOCGPART:
   1788 		((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
   1789 		((struct partinfo *) data)->part =
   1790 		    &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
   1791 		break;
   1792 
   1793 	case DIOCWDINFO:
   1794 	case DIOCSDINFO:
   1795 #ifdef __HAVE_OLD_DISKLABEL
   1796 	case ODIOCWDINFO:
   1797 	case ODIOCSDINFO:
   1798 #endif
   1799 	{
   1800 		struct disklabel *lp;
   1801 #ifdef __HAVE_OLD_DISKLABEL
   1802 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
   1803 			memset(&newlabel, 0, sizeof newlabel);
   1804 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
   1805 			lp = &newlabel;
   1806 		} else
   1807 #endif
   1808 		lp = (struct disklabel *)data;
   1809 
   1810 		if ((error = raidlock(rs)) != 0)
   1811 			return (error);
   1812 
   1813 		rs->sc_flags |= RAIDF_LABELLING;
   1814 
   1815 		error = setdisklabel(rs->sc_dkdev.dk_label,
   1816 		    lp, 0, rs->sc_dkdev.dk_cpulabel);
   1817 		if (error == 0) {
   1818 			if (cmd == DIOCWDINFO
   1819 #ifdef __HAVE_OLD_DISKLABEL
   1820 			    || cmd == ODIOCWDINFO
   1821 #endif
   1822 			   )
   1823 				error = writedisklabel(RAIDLABELDEV(dev),
   1824 				    raidstrategy, rs->sc_dkdev.dk_label,
   1825 				    rs->sc_dkdev.dk_cpulabel);
   1826 		}
   1827 		rs->sc_flags &= ~RAIDF_LABELLING;
   1828 
   1829 		raidunlock(rs);
   1830 
   1831 		if (error)
   1832 			return (error);
   1833 		break;
   1834 	}
   1835 
   1836 	case DIOCWLABEL:
   1837 		if (*(int *) data != 0)
   1838 			rs->sc_flags |= RAIDF_WLABEL;
   1839 		else
   1840 			rs->sc_flags &= ~RAIDF_WLABEL;
   1841 		break;
   1842 
   1843 	case DIOCGDEFLABEL:
   1844 		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
   1845 		break;
   1846 
   1847 #ifdef __HAVE_OLD_DISKLABEL
   1848 	case ODIOCGDEFLABEL:
   1849 		raidgetdefaultlabel(raidPtr, rs, &newlabel);
   1850 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1851 			return ENOTTY;
   1852 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1853 		break;
   1854 #endif
   1855 
   1856 	case DIOCAWEDGE:
   1857 	case DIOCDWEDGE:
   1858 	    	dkw = (void *)data;
   1859 
   1860 		/* If the ioctl happens here, the parent is us. */
   1861 		(void)strcpy(dkw->dkw_parent, rs->sc_xname);
   1862 		return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
   1863 
   1864 	case DIOCLWEDGES:
   1865 		return dkwedge_list(&rs->sc_dkdev,
   1866 		    (struct dkwedge_list *)data, l);
   1867 	case DIOCCACHESYNC:
   1868 		return rf_sync_component_caches(raidPtr);
   1869 
   1870 	case DIOCGSTRATEGY:
   1871 	    {
   1872 		struct disk_strategy *dks = (void *)data;
   1873 
   1874 		s = splbio();
   1875 		strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
   1876 		    sizeof(dks->dks_name));
   1877 		splx(s);
   1878 		dks->dks_paramlen = 0;
   1879 
   1880 		return 0;
   1881 	    }
   1882 
   1883 	case DIOCSSTRATEGY:
   1884 	    {
   1885 		struct disk_strategy *dks = (void *)data;
   1886 		struct bufq_state *new;
   1887 		struct bufq_state *old;
   1888 
   1889 		if (dks->dks_param != NULL) {
   1890 			return EINVAL;
   1891 		}
   1892 		dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
   1893 		error = bufq_alloc(&new, dks->dks_name,
   1894 		    BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
   1895 		if (error) {
   1896 			return error;
   1897 		}
   1898 		s = splbio();
   1899 		old = rs->buf_queue;
   1900 		bufq_move(new, old);
   1901 		rs->buf_queue = new;
   1902 		splx(s);
   1903 		bufq_free(old);
   1904 
   1905 		return 0;
   1906 	    }
   1907 
   1908 	default:
   1909 		retcode = ENOTTY;
   1910 	}
   1911 	return (retcode);
   1912 
   1913 }
   1914 
   1915 
   1916 /* raidinit -- complete the rest of the initialization for the
   1917    RAIDframe device.  */
   1918 
   1919 
   1920 static void
   1921 raidinit(RF_Raid_t *raidPtr)
   1922 {
   1923 	cfdata_t cf;
   1924 	struct raid_softc *rs;
   1925 	int     unit;
   1926 
   1927 	unit = raidPtr->raidid;
   1928 
   1929 	rs = &raid_softc[unit];
   1930 
   1931 	/* XXX should check return code first... */
   1932 	rs->sc_flags |= RAIDF_INITED;
   1933 
   1934 	/* XXX doesn't check bounds. */
   1935 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
   1936 
   1937 	/* attach the pseudo device */
   1938 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
   1939 	cf->cf_name = raid_cd.cd_name;
   1940 	cf->cf_atname = raid_cd.cd_name;
   1941 	cf->cf_unit = unit;
   1942 	cf->cf_fstate = FSTATE_STAR;
   1943 
   1944 	rs->sc_dev = config_attach_pseudo(cf);
   1945 
   1946 	if (rs->sc_dev == NULL) {
   1947 		printf("raid%d: config_attach_pseudo failed\n",
   1948 		    raidPtr->raidid);
   1949 		rs->sc_flags &= ~RAIDF_INITED;
   1950 		free(cf, M_RAIDFRAME);
   1951 		return;
   1952 	}
   1953 
   1954 	/* disk_attach actually creates space for the CPU disklabel, among
   1955 	 * other things, so it's critical to call this *BEFORE* we try putzing
   1956 	 * with disklabels. */
   1957 
   1958 	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
   1959 	disk_attach(&rs->sc_dkdev);
   1960 	disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);
   1961 
   1962 	/* XXX There may be a weird interaction here between this, and
   1963 	 * protectedSectors, as used in RAIDframe.  */
   1964 
   1965 	rs->sc_size = raidPtr->totalSectors;
   1966 
   1967 	dkwedge_discover(&rs->sc_dkdev);
   1968 
   1969 	rf_set_properties(rs, raidPtr);
   1970 
   1971 }
   1972 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1973 /* wake up the daemon & tell it to get us a spare table
   1974  * XXX
   1975  * the entries in the queues should be tagged with the raidPtr
   1976  * so that in the extremely rare case that two recons happen at once,
   1977  * we know for which device were requesting a spare table
   1978  * XXX
   1979  *
   1980  * XXX This code is not currently used. GO
   1981  */
   1982 int
   1983 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
   1984 {
   1985 	int     retcode;
   1986 
   1987 	rf_lock_mutex2(rf_sparet_wait_mutex);
   1988 	req->next = rf_sparet_wait_queue;
   1989 	rf_sparet_wait_queue = req;
   1990 	rf_broadcast_cond2(rf_sparet_wait_cv);
   1991 
    1992 	/* rf_wait_cond2() releases the mutex while we sleep */
   1993 	while (!rf_sparet_resp_queue) {
   1994 		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
   1995 	}
   1996 	req = rf_sparet_resp_queue;
   1997 	rf_sparet_resp_queue = req->next;
   1998 	rf_unlock_mutex2(rf_sparet_wait_mutex);
   1999 
   2000 	retcode = req->fcol;
   2001 	RF_Free(req, sizeof(*req));	/* this is not the same req as we
   2002 					 * alloc'd */
   2003 	return (retcode);
   2004 }
   2005 #endif
   2006 
   2007 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   2008  * bp & passes it down.
    2009  * any calls originating in the kernel must use non-blocking I/O.
    2010  * Do some extra sanity checking to return "appropriate" error values for
    2011  * certain conditions (to make some standard utilities work).
   2012  *
   2013  * Formerly known as: rf_DoAccessKernel
   2014  */
   2015 void
   2016 raidstart(RF_Raid_t *raidPtr)
   2017 {
   2018 	RF_SectorCount_t num_blocks, pb, sum;
   2019 	RF_RaidAddr_t raid_addr;
   2020 	struct partition *pp;
   2021 	daddr_t blocknum;
   2022 	int     unit;
   2023 	struct raid_softc *rs;
   2024 	int     do_async;
   2025 	struct buf *bp;
   2026 	int rc;
   2027 
   2028 	unit = raidPtr->raidid;
   2029 	rs = &raid_softc[unit];
   2030 
   2031 	/* quick check to see if anything has died recently */
   2032 	rf_lock_mutex2(raidPtr->mutex);
   2033 	if (raidPtr->numNewFailures > 0) {
   2034 		rf_unlock_mutex2(raidPtr->mutex);
   2035 		rf_update_component_labels(raidPtr,
   2036 					   RF_NORMAL_COMPONENT_UPDATE);
   2037 		rf_lock_mutex2(raidPtr->mutex);
   2038 		raidPtr->numNewFailures--;
   2039 	}
   2040 
   2041 	/* Check to see if we're at the limit... */
   2042 	while (raidPtr->openings > 0) {
   2043 		rf_unlock_mutex2(raidPtr->mutex);
   2044 
   2045 		/* get the next item, if any, from the queue */
   2046 		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
   2047 			/* nothing more to do */
   2048 			return;
   2049 		}
   2050 
   2051 		/* Ok, for the bp we have here, bp->b_blkno is relative to the
   2052 		 * partition.. Need to make it absolute to the underlying
   2053 		 * device.. */
   2054 
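         		/* bp->b_blkno is in DEV_BSIZE units; shift up to bytes and
         		   back down by logBytesPerSector to get a sector number in
         		   this array's sector size */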
   2055 		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
   2056 		if (DISKPART(bp->b_dev) != RAW_PART) {
   2057 			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
   2058 			blocknum += pp->p_offset;
   2059 		}
   2060 
   2061 		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   2062 			    (int) blocknum));
   2063 
   2064 		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   2065 		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   2066 
   2067 		/* *THIS* is where we adjust what block we're going to...
   2068 		 * but DO NOT TOUCH bp->b_blkno!!! */
   2069 		raid_addr = blocknum;
   2070 
   2071 		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   2072 		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   2073 		sum = raid_addr + num_blocks + pb;
   2074 		if (1 || rf_debugKernelAccess) {
   2075 			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   2076 				    (int) raid_addr, (int) sum, (int) num_blocks,
   2077 				    (int) pb, (int) bp->b_resid));
   2078 		}
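         		/* reject any access that would run past the end of the
         		   array; the "sum < ..." comparisons also catch arithmetic
         		   wrap-around in the sum */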
   2079 		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   2080 		    || (sum < num_blocks) || (sum < pb)) {
   2081 			bp->b_error = ENOSPC;
   2082 			bp->b_resid = bp->b_bcount;
   2083 			biodone(bp);
   2084 			rf_lock_mutex2(raidPtr->mutex);
   2085 			continue;
   2086 		}
   2087 		/*
   2088 		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   2089 		 */
   2090 
   2091 		if (bp->b_bcount & raidPtr->sectorMask) {
   2092 			bp->b_error = EINVAL;
   2093 			bp->b_resid = bp->b_bcount;
   2094 			biodone(bp);
   2095 			rf_lock_mutex2(raidPtr->mutex);
   2096 			continue;
   2097 
   2098 		}
   2099 		db1_printf(("Calling DoAccess..\n"));
   2100 
   2101 
   2102 		rf_lock_mutex2(raidPtr->mutex);
   2103 		raidPtr->openings--;
   2104 		rf_unlock_mutex2(raidPtr->mutex);
   2105 
   2106 		/*
   2107 		 * Everything is async.
   2108 		 */
   2109 		do_async = 1;
   2110 
   2111 		disk_busy(&rs->sc_dkdev);
   2112 
   2113 		/* XXX we're still at splbio() here... do we *really*
   2114 		   need to be? */
   2115 
   2116 		/* don't ever condition on bp->b_flags & B_WRITE.
   2117 		 * always condition on B_READ instead */
   2118 
   2119 		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   2120 				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   2121 				 do_async, raid_addr, num_blocks,
   2122 				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   2123 
   2124 		if (rc) {
   2125 			bp->b_error = rc;
   2126 			bp->b_resid = bp->b_bcount;
   2127 			biodone(bp);
   2128 			/* continue loop */
   2129 		}
   2130 
   2131 		rf_lock_mutex2(raidPtr->mutex);
   2132 	}
   2133 	rf_unlock_mutex2(raidPtr->mutex);
   2134 }
   2135 
   2136 
   2137 
   2138 
   2139 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2140 
   2141 int
   2142 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2143 {
   2144 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2145 	struct buf *bp;
   2146 
   2147 	req->queue = queue;
   2148 	bp = req->bp;
   2149 
   2150 	switch (req->type) {
   2151 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2152 		/* XXX need to do something extra here.. */
   2153 		/* I'm leaving this in, as I've never actually seen it used,
   2154 		 * and I'd like folks to report it... GO */
   2155 		printf(("WAKEUP CALLED\n"));
   2156 		queue->numOutstanding++;
   2157 
   2158 		bp->b_flags = 0;
   2159 		bp->b_private = req;
   2160 
   2161 		KernelWakeupFunc(bp);
   2162 		break;
   2163 
   2164 	case RF_IO_TYPE_READ:
   2165 	case RF_IO_TYPE_WRITE:
   2166 #if RF_ACC_TRACE > 0
   2167 		if (req->tracerec) {
   2168 			RF_ETIMER_START(req->tracerec->timer);
   2169 		}
   2170 #endif
   2171 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2172 		    op, queue->rf_cinfo->ci_dev,
   2173 		    req->sectorOffset, req->numSector,
   2174 		    req->buf, KernelWakeupFunc, (void *) req,
   2175 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2176 
   2177 		if (rf_debugKernelAccess) {
   2178 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2179 				(long) bp->b_blkno));
   2180 		}
   2181 		queue->numOutstanding++;
   2182 		queue->last_deq_sector = req->sectorOffset;
   2183 		/* acc wouldn't have been let in if there were any pending
   2184 		 * reqs at any other priority */
   2185 		queue->curPriority = req->priority;
   2186 
   2187 		db1_printf(("Going for %c to unit %d col %d\n",
   2188 			    req->type, queue->raidPtr->raidid,
   2189 			    queue->col));
   2190 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2191 			(int) req->sectorOffset, (int) req->numSector,
   2192 			(int) (req->numSector <<
   2193 			    queue->raidPtr->logBytesPerSector),
   2194 			(int) queue->raidPtr->logBytesPerSector));
   2195 
   2196 		/*
   2197 		 * XXX: drop lock here since this can block at
   2198 		 * least with backing SCSI devices.  Retake it
   2199 		 * to minimize fuss with calling interfaces.
   2200 		 */
   2201 
   2202 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2203 		bdev_strategy(bp);
   2204 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2205 		break;
   2206 
   2207 	default:
   2208 		panic("bad req->type in rf_DispatchKernelIO");
   2209 	}
   2210 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2211 
   2212 	return (0);
   2213 }
    2214 /* this is the callback function associated with an I/O invoked from
   2215    kernel code.
   2216  */
   2217 static void
   2218 KernelWakeupFunc(struct buf *bp)
   2219 {
   2220 	RF_DiskQueueData_t *req = NULL;
   2221 	RF_DiskQueue_t *queue;
   2222 
   2223 	db1_printf(("recovering the request queue:\n"));
   2224 
   2225 	req = bp->b_private;
   2226 
   2227 	queue = (RF_DiskQueue_t *) req->queue;
   2228 
   2229 	rf_lock_mutex2(queue->raidPtr->iodone_lock);
   2230 
   2231 #if RF_ACC_TRACE > 0
   2232 	if (req->tracerec) {
   2233 		RF_ETIMER_STOP(req->tracerec->timer);
   2234 		RF_ETIMER_EVAL(req->tracerec->timer);
   2235 		rf_lock_mutex2(rf_tracing_mutex);
   2236 		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2237 		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2238 		req->tracerec->num_phys_ios++;
   2239 		rf_unlock_mutex2(rf_tracing_mutex);
   2240 	}
   2241 #endif
   2242 
   2243 	/* XXX Ok, let's get aggressive... If b_error is set, let's go
   2244 	 * ballistic, and mark the component as hosed... */
   2245 
   2246 	if (bp->b_error != 0) {
   2247 		/* Mark the disk as dead */
   2248 		/* but only mark it once... */
   2249 		/* and only if it wouldn't leave this RAID set
   2250 		   completely broken */
   2251 		if (((queue->raidPtr->Disks[queue->col].status ==
   2252 		      rf_ds_optimal) ||
   2253 		     (queue->raidPtr->Disks[queue->col].status ==
   2254 		      rf_ds_used_spare)) &&
   2255 		     (queue->raidPtr->numFailures <
   2256 		      queue->raidPtr->Layout.map->faultsTolerated)) {
   2257 			printf("raid%d: IO Error.  Marking %s as failed.\n",
   2258 			       queue->raidPtr->raidid,
   2259 			       queue->raidPtr->Disks[queue->col].devname);
   2260 			queue->raidPtr->Disks[queue->col].status =
   2261 			    rf_ds_failed;
   2262 			queue->raidPtr->status = rf_rs_degraded;
   2263 			queue->raidPtr->numFailures++;
   2264 			queue->raidPtr->numNewFailures++;
   2265 		} else {	/* Disk is already dead... */
   2266 			/* printf("Disk already marked as dead!\n"); */
   2267 		}
   2268 
   2269 	}
   2270 
   2271 	/* Fill in the error value */
   2272 	req->error = bp->b_error;
   2273 
   2274 	/* Drop this one on the "finished" queue... */
   2275 	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
   2276 
   2277 	/* Let the raidio thread know there is work to be done. */
   2278 	rf_signal_cond2(queue->raidPtr->iodone_cv);
   2279 
   2280 	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
   2281 }
   2282 
   2283 
   2284 /*
   2285  * initialize a buf structure for doing an I/O in the kernel.
   2286  */
   2287 static void
   2288 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2289        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2290        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2291        struct proc *b_proc)
   2292 {
   2293 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2294 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2295 	bp->b_oflags = 0;
   2296 	bp->b_cflags = 0;
   2297 	bp->b_bcount = numSect << logBytesPerSector;
   2298 	bp->b_bufsize = bp->b_bcount;
   2299 	bp->b_error = 0;
   2300 	bp->b_dev = dev;
   2301 	bp->b_data = bf;
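         	/* convert the RAID sector number back into DEV_BSIZE units for
         	   the underlying block device */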
   2302 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2303 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2304 	if (bp->b_bcount == 0) {
   2305 		panic("bp->b_bcount is zero in InitBP!!");
   2306 	}
   2307 	bp->b_proc = b_proc;
   2308 	bp->b_iodone = cbFunc;
   2309 	bp->b_private = cbArg;
   2310 }
   2311 
   2312 static void
   2313 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
   2314 		    struct disklabel *lp)
   2315 {
   2316 	memset(lp, 0, sizeof(*lp));
   2317 
   2318 	/* fabricate a label... */
   2319 	lp->d_secperunit = raidPtr->totalSectors;
   2320 	lp->d_secsize = raidPtr->bytesPerSector;
   2321 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   2322 	lp->d_ntracks = 4 * raidPtr->numCol;
   2323 	lp->d_ncylinders = raidPtr->totalSectors /
   2324 		(lp->d_nsectors * lp->d_ntracks);
   2325 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
   2326 
   2327 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
   2328 	lp->d_type = DTYPE_RAID;
   2329 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
   2330 	lp->d_rpm = 3600;
   2331 	lp->d_interleave = 1;
   2332 	lp->d_flags = 0;
   2333 
   2334 	lp->d_partitions[RAW_PART].p_offset = 0;
   2335 	lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
   2336 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
   2337 	lp->d_npartitions = RAW_PART + 1;
   2338 
   2339 	lp->d_magic = DISKMAGIC;
   2340 	lp->d_magic2 = DISKMAGIC;
    2341 	lp->d_checksum = dkcksum(lp);
   2342 
   2343 }
   2344 /*
   2345  * Read the disklabel from the raid device.  If one is not present, fake one
   2346  * up.
   2347  */
   2348 static void
   2349 raidgetdisklabel(dev_t dev)
   2350 {
   2351 	int     unit = raidunit(dev);
   2352 	struct raid_softc *rs = &raid_softc[unit];
   2353 	const char   *errstring;
   2354 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2355 	struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
   2356 	RF_Raid_t *raidPtr;
   2357 
   2358 	db1_printf(("Getting the disklabel...\n"));
   2359 
   2360 	memset(clp, 0, sizeof(*clp));
   2361 
   2362 	raidPtr = raidPtrs[unit];
   2363 
   2364 	raidgetdefaultlabel(raidPtr, rs, lp);
   2365 
   2366 	/*
   2367 	 * Call the generic disklabel extraction routine.
   2368 	 */
   2369 	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
   2370 	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
   2371 	if (errstring)
   2372 		raidmakedisklabel(rs);
   2373 	else {
   2374 		int     i;
   2375 		struct partition *pp;
   2376 
   2377 		/*
   2378 		 * Sanity check whether the found disklabel is valid.
   2379 		 *
   2380 		 * This is necessary since total size of the raid device
   2381 		 * may vary when an interleave is changed even though exactly
    2382 		 * same components are used, and an old disklabel may be used
    2383 		 * if one is found.
   2384 		 */
   2385 		if (lp->d_secperunit != rs->sc_size)
   2386 			printf("raid%d: WARNING: %s: "
   2387 			    "total sector size in disklabel (%" PRIu32 ") != "
   2388 			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
   2389 			    lp->d_secperunit, rs->sc_size);
   2390 		for (i = 0; i < lp->d_npartitions; i++) {
   2391 			pp = &lp->d_partitions[i];
   2392 			if (pp->p_offset + pp->p_size > rs->sc_size)
   2393 				printf("raid%d: WARNING: %s: end of partition `%c' "
   2394 				       "exceeds the size of raid (%" PRIu64 ")\n",
   2395 				       unit, rs->sc_xname, 'a' + i, rs->sc_size);
   2396 		}
   2397 	}
   2398 
   2399 }
   2400 /*
   2401  * Take care of things one might want to take care of in the event
   2402  * that a disklabel isn't present.
   2403  */
   2404 static void
   2405 raidmakedisklabel(struct raid_softc *rs)
   2406 {
   2407 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2408 	db1_printf(("Making a label..\n"));
   2409 
   2410 	/*
   2411 	 * For historical reasons, if there's no disklabel present
   2412 	 * the raw partition must be marked FS_BSDFFS.
   2413 	 */
   2414 
   2415 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
   2416 
   2417 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
   2418 
   2419 	lp->d_checksum = dkcksum(lp);
   2420 }
   2421 /*
   2422  * Wait interruptibly for an exclusive lock.
   2423  *
   2424  * XXX
   2425  * Several drivers do this; it should be abstracted and made MP-safe.
   2426  * (Hmm... where have we seen this warning before :->  GO )
   2427  */
   2428 static int
   2429 raidlock(struct raid_softc *rs)
   2430 {
   2431 	int     error;
   2432 
   2433 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2434 		rs->sc_flags |= RAIDF_WANTED;
   2435 		if ((error =
   2436 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
   2437 			return (error);
   2438 	}
   2439 	rs->sc_flags |= RAIDF_LOCKED;
   2440 	return (0);
   2441 }
   2442 /*
   2443  * Unlock and wake up any waiters.
   2444  */
   2445 static void
   2446 raidunlock(struct raid_softc *rs)
   2447 {
   2448 
   2449 	rs->sc_flags &= ~RAIDF_LOCKED;
   2450 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2451 		rs->sc_flags &= ~RAIDF_WANTED;
   2452 		wakeup(rs);
   2453 	}
   2454 }
   2455 
   2456 
   2457 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2458 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2459 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
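         /*
          * On-disk metadata layout for each component: the component label
          * starts RF_COMPONENT_INFO_OFFSET bytes into the component and the
          * parity map follows it.  Each area is at least its compiled-in size,
          * rounded up to a whole sector when the sector size is larger (see
          * the helpers below).
          */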
   2460 
   2461 static daddr_t
   2462 rf_component_info_offset(void)
   2463 {
   2464 
   2465 	return RF_COMPONENT_INFO_OFFSET;
   2466 }
   2467 
   2468 static daddr_t
   2469 rf_component_info_size(unsigned secsize)
   2470 {
   2471 	daddr_t info_size;
   2472 
   2473 	KASSERT(secsize);
   2474 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2475 		info_size = secsize;
   2476 	else
   2477 		info_size = RF_COMPONENT_INFO_SIZE;
   2478 
   2479 	return info_size;
   2480 }
   2481 
   2482 static daddr_t
   2483 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2484 {
   2485 	daddr_t map_offset;
   2486 
   2487 	KASSERT(raidPtr->bytesPerSector);
   2488 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2489 		map_offset = raidPtr->bytesPerSector;
   2490 	else
   2491 		map_offset = RF_COMPONENT_INFO_SIZE;
   2492 	map_offset += rf_component_info_offset();
   2493 
   2494 	return map_offset;
   2495 }
   2496 
   2497 static daddr_t
   2498 rf_parity_map_size(RF_Raid_t *raidPtr)
   2499 {
   2500 	daddr_t map_size;
   2501 
   2502 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2503 		map_size = raidPtr->bytesPerSector;
   2504 	else
   2505 		map_size = RF_PARITY_MAP_SIZE;
   2506 
   2507 	return map_size;
   2508 }
   2509 
   2510 int
   2511 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2512 {
   2513 	RF_ComponentLabel_t *clabel;
   2514 
   2515 	clabel = raidget_component_label(raidPtr, col);
   2516 	clabel->clean = RF_RAID_CLEAN;
   2517 	raidflush_component_label(raidPtr, col);
   2518 	return(0);
   2519 }
   2520 
   2521 
   2522 int
   2523 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2524 {
   2525 	RF_ComponentLabel_t *clabel;
   2526 
   2527 	clabel = raidget_component_label(raidPtr, col);
   2528 	clabel->clean = RF_RAID_DIRTY;
   2529 	raidflush_component_label(raidPtr, col);
   2530 	return(0);
   2531 }
   2532 
   2533 int
   2534 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2535 {
   2536 	KASSERT(raidPtr->bytesPerSector);
   2537 	return raidread_component_label(raidPtr->bytesPerSector,
   2538 	    raidPtr->Disks[col].dev,
   2539 	    raidPtr->raid_cinfo[col].ci_vp,
   2540 	    &raidPtr->raid_cinfo[col].ci_label);
   2541 }
   2542 
   2543 RF_ComponentLabel_t *
   2544 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2545 {
   2546 	return &raidPtr->raid_cinfo[col].ci_label;
   2547 }
   2548 
   2549 int
   2550 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2551 {
   2552 	RF_ComponentLabel_t *label;
   2553 
   2554 	label = &raidPtr->raid_cinfo[col].ci_label;
   2555 	label->mod_counter = raidPtr->mod_counter;
   2556 #ifndef RF_NO_PARITY_MAP
   2557 	label->parity_map_modcount = label->mod_counter;
   2558 #endif
   2559 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2560 	    raidPtr->Disks[col].dev,
   2561 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2562 }
   2563 
   2564 
   2565 static int
   2566 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2567     RF_ComponentLabel_t *clabel)
   2568 {
   2569 	return raidread_component_area(dev, b_vp, clabel,
   2570 	    sizeof(RF_ComponentLabel_t),
   2571 	    rf_component_info_offset(),
   2572 	    rf_component_info_size(secsize));
   2573 }
   2574 
   2575 /* ARGSUSED */
   2576 static int
   2577 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2578     size_t msize, daddr_t offset, daddr_t dsize)
   2579 {
   2580 	struct buf *bp;
   2581 	const struct bdevsw *bdev;
   2582 	int error;
   2583 
   2584 	/* XXX should probably ensure that we don't try to do this if
   2585 	   someone has changed rf_protected_sectors. */
   2586 
   2587 	if (b_vp == NULL) {
   2588 		/* For whatever reason, this component is not valid.
   2589 		   Don't try to read a component label from it. */
   2590 		return(EINVAL);
   2591 	}
   2592 
   2593 	/* get a block of the appropriate size... */
   2594 	bp = geteblk((int)dsize);
   2595 	bp->b_dev = dev;
   2596 
   2597 	/* get our ducks in a row for the read */
   2598 	bp->b_blkno = offset / DEV_BSIZE;
   2599 	bp->b_bcount = dsize;
   2600 	bp->b_flags |= B_READ;
   2601  	bp->b_resid = dsize;
   2602 
   2603 	bdev = bdevsw_lookup(bp->b_dev);
    2604 	if (bdev == NULL) {
         		brelse(bp, 0);
         		return (ENXIO);
         	}
   2606 	(*bdev->d_strategy)(bp);
   2607 
   2608 	error = biowait(bp);
   2609 
   2610 	if (!error) {
   2611 		memcpy(data, bp->b_data, msize);
   2612 	}
   2613 
   2614 	brelse(bp, 0);
   2615 	return(error);
   2616 }
   2617 
   2618 
   2619 static int
   2620 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2621     RF_ComponentLabel_t *clabel)
   2622 {
   2623 	return raidwrite_component_area(dev, b_vp, clabel,
   2624 	    sizeof(RF_ComponentLabel_t),
   2625 	    rf_component_info_offset(),
   2626 	    rf_component_info_size(secsize), 0);
   2627 }
   2628 
   2629 /* ARGSUSED */
   2630 static int
   2631 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2632     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2633 {
   2634 	struct buf *bp;
   2635 	const struct bdevsw *bdev;
   2636 	int error;
   2637 
   2638 	/* get a block of the appropriate size... */
   2639 	bp = geteblk((int)dsize);
   2640 	bp->b_dev = dev;
   2641 
   2642 	/* get our ducks in a row for the write */
   2643 	bp->b_blkno = offset / DEV_BSIZE;
   2644 	bp->b_bcount = dsize;
   2645 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2646  	bp->b_resid = dsize;
   2647 
   2648 	memset(bp->b_data, 0, dsize);
   2649 	memcpy(bp->b_data, data, msize);
   2650 
   2651 	bdev = bdevsw_lookup(bp->b_dev);
    2652 	if (bdev == NULL) {
         		brelse(bp, 0);
         		return (ENXIO);
         	}
   2654 	(*bdev->d_strategy)(bp);
   2655 	if (asyncp)
   2656 		return 0;
   2657 	error = biowait(bp);
   2658 	brelse(bp, 0);
   2659 	if (error) {
   2660 #if 1
   2661 		printf("Failed to write RAID component info!\n");
   2662 #endif
   2663 	}
   2664 
   2665 	return(error);
   2666 }
   2667 
   2668 void
   2669 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2670 {
   2671 	int c;
   2672 
   2673 	for (c = 0; c < raidPtr->numCol; c++) {
   2674 		/* Skip dead disks. */
   2675 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2676 			continue;
   2677 		/* XXXjld: what if an error occurs here? */
   2678 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2679 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2680 		    RF_PARITYMAP_NBYTE,
   2681 		    rf_parity_map_offset(raidPtr),
   2682 		    rf_parity_map_size(raidPtr), 0);
   2683 	}
   2684 }
   2685 
   2686 void
   2687 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2688 {
   2689 	struct rf_paritymap_ondisk tmp;
   2690 	int c,first;
   2691 
   2692 	first=1;
   2693 	for (c = 0; c < raidPtr->numCol; c++) {
   2694 		/* Skip dead disks. */
   2695 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2696 			continue;
   2697 		raidread_component_area(raidPtr->Disks[c].dev,
   2698 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2699 		    RF_PARITYMAP_NBYTE,
   2700 		    rf_parity_map_offset(raidPtr),
   2701 		    rf_parity_map_size(raidPtr));
   2702 		if (first) {
   2703 			memcpy(map, &tmp, sizeof(*map));
   2704 			first = 0;
   2705 		} else {
   2706 			rf_paritymap_merge(map, &tmp);
   2707 		}
   2708 	}
   2709 }
   2710 
   2711 void
   2712 rf_markalldirty(RF_Raid_t *raidPtr)
   2713 {
   2714 	RF_ComponentLabel_t *clabel;
   2715 	int sparecol;
   2716 	int c;
   2717 	int j;
   2718 	int scol = -1;
   2719 
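         	/* bump mod_counter so that the labels written below are seen as
         	   the most recent copies during autoconfiguration */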
   2720 	raidPtr->mod_counter++;
   2721 	for (c = 0; c < raidPtr->numCol; c++) {
   2722 		/* we don't want to touch (at all) a disk that has
   2723 		   failed */
   2724 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
   2725 			clabel = raidget_component_label(raidPtr, c);
   2726 			if (clabel->status == rf_ds_spared) {
   2727 				/* XXX do something special...
   2728 				   but whatever you do, don't
   2729 				   try to access it!! */
   2730 			} else {
   2731 				raidmarkdirty(raidPtr, c);
   2732 			}
   2733 		}
   2734 	}
   2735 
   2736 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2737 		sparecol = raidPtr->numCol + c;
   2738 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2739 			/*
   2740 
   2741 			   we claim this disk is "optimal" if it's
   2742 			   rf_ds_used_spare, as that means it should be
   2743 			   directly substitutable for the disk it replaced.
   2744 			   We note that too...
   2745 
   2746 			 */
   2747 
   2748 			for(j=0;j<raidPtr->numCol;j++) {
   2749 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2750 					scol = j;
   2751 					break;
   2752 				}
   2753 			}
   2754 
   2755 			clabel = raidget_component_label(raidPtr, sparecol);
   2756 			/* make sure status is noted */
   2757 
   2758 			raid_init_component_label(raidPtr, clabel);
   2759 
   2760 			clabel->row = 0;
   2761 			clabel->column = scol;
   2762 			/* Note: we *don't* change status from rf_ds_used_spare
   2763 			   to rf_ds_optimal */
   2764 			/* clabel.status = rf_ds_optimal; */
   2765 
   2766 			raidmarkdirty(raidPtr, sparecol);
   2767 		}
   2768 	}
   2769 }
   2770 
   2771 
   2772 void
   2773 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
   2774 {
   2775 	RF_ComponentLabel_t *clabel;
   2776 	int sparecol;
   2777 	int c;
   2778 	int j;
   2779 	int scol;
   2780 
   2781 	scol = -1;
   2782 
   2783 	/* XXX should do extra checks to make sure things really are clean,
   2784 	   rather than blindly setting the clean bit... */
   2785 
   2786 	raidPtr->mod_counter++;
   2787 
   2788 	for (c = 0; c < raidPtr->numCol; c++) {
   2789 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   2790 			clabel = raidget_component_label(raidPtr, c);
   2791 			/* make sure status is noted */
   2792 			clabel->status = rf_ds_optimal;
   2793 
   2794 			/* note what unit we are configured as */
   2795 			clabel->last_unit = raidPtr->raidid;
   2796 
   2797 			raidflush_component_label(raidPtr, c);
   2798 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2799 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2800 					raidmarkclean(raidPtr, c);
   2801 				}
   2802 			}
   2803 		}
   2804 		/* else we don't touch it.. */
   2805 	}
   2806 
   2807 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2808 		sparecol = raidPtr->numCol + c;
   2809 		/* Need to ensure that the reconstruct actually completed! */
   2810 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2811 			/*
   2812 
   2813 			   we claim this disk is "optimal" if it's
   2814 			   rf_ds_used_spare, as that means it should be
   2815 			   directly substitutable for the disk it replaced.
   2816 			   We note that too...
   2817 
   2818 			 */
   2819 
   2820 			for(j=0;j<raidPtr->numCol;j++) {
   2821 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2822 					scol = j;
   2823 					break;
   2824 				}
   2825 			}
   2826 
   2827 			/* XXX shouldn't *really* need this... */
   2828 			clabel = raidget_component_label(raidPtr, sparecol);
   2829 			/* make sure status is noted */
   2830 
   2831 			raid_init_component_label(raidPtr, clabel);
   2832 
   2833 			clabel->column = scol;
   2834 			clabel->status = rf_ds_optimal;
   2835 			clabel->last_unit = raidPtr->raidid;
   2836 
   2837 			raidflush_component_label(raidPtr, sparecol);
   2838 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2839 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2840 					raidmarkclean(raidPtr, sparecol);
   2841 				}
   2842 			}
   2843 		}
   2844 	}
   2845 }
   2846 
   2847 void
   2848 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2849 {
   2850 
   2851 	if (vp != NULL) {
   2852 		if (auto_configured == 1) {
   2853 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2854 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2855 			vput(vp);
   2856 
   2857 		} else {
   2858 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2859 		}
   2860 	}
   2861 }
   2862 
   2863 
   2864 void
   2865 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2866 {
   2867 	int r,c;
   2868 	struct vnode *vp;
   2869 	int acd;
   2870 
   2871 
   2872 	/* We take this opportunity to close the vnodes like we should.. */
   2873 
   2874 	for (c = 0; c < raidPtr->numCol; c++) {
   2875 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2876 		acd = raidPtr->Disks[c].auto_configured;
   2877 		rf_close_component(raidPtr, vp, acd);
   2878 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2879 		raidPtr->Disks[c].auto_configured = 0;
   2880 	}
   2881 
   2882 	for (r = 0; r < raidPtr->numSpare; r++) {
   2883 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2884 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2885 		rf_close_component(raidPtr, vp, acd);
   2886 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2887 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2888 	}
   2889 }
   2890 
   2891 
   2892 void
   2893 rf_ReconThread(struct rf_recon_req *req)
   2894 {
   2895 	int     s;
   2896 	RF_Raid_t *raidPtr;
   2897 
   2898 	s = splbio();
   2899 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2900 	raidPtr->recon_in_progress = 1;
   2901 
   2902 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2903 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2904 
   2905 	RF_Free(req, sizeof(*req));
   2906 
   2907 	raidPtr->recon_in_progress = 0;
   2908 	splx(s);
   2909 
   2910 	/* That's all... */
   2911 	kthread_exit(0);	/* does not return */
   2912 }
   2913 
   2914 void
   2915 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2916 {
   2917 	int retcode;
   2918 	int s;
   2919 
   2920 	raidPtr->parity_rewrite_stripes_done = 0;
   2921 	raidPtr->parity_rewrite_in_progress = 1;
   2922 	s = splbio();
   2923 	retcode = rf_RewriteParity(raidPtr);
   2924 	splx(s);
   2925 	if (retcode) {
   2926 		printf("raid%d: Error re-writing parity (%d)!\n",
   2927 		    raidPtr->raidid, retcode);
   2928 	} else {
   2929 		/* set the clean bit!  If we shutdown correctly,
   2930 		   the clean bit on each component label will get
   2931 		   set */
   2932 		raidPtr->parity_good = RF_RAID_CLEAN;
   2933 	}
   2934 	raidPtr->parity_rewrite_in_progress = 0;
   2935 
   2936 	/* Anyone waiting for us to stop?  If so, inform them... */
   2937 	if (raidPtr->waitShutdown) {
   2938 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2939 	}
   2940 
   2941 	/* That's all... */
   2942 	kthread_exit(0);	/* does not return */
   2943 }
   2944 
   2945 
   2946 void
   2947 rf_CopybackThread(RF_Raid_t *raidPtr)
   2948 {
   2949 	int s;
   2950 
   2951 	raidPtr->copyback_in_progress = 1;
   2952 	s = splbio();
   2953 	rf_CopybackReconstructedData(raidPtr);
   2954 	splx(s);
   2955 	raidPtr->copyback_in_progress = 0;
   2956 
   2957 	/* That's all... */
   2958 	kthread_exit(0);	/* does not return */
   2959 }
   2960 
   2961 
   2962 void
   2963 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
   2964 {
   2965 	int s;
   2966 	RF_Raid_t *raidPtr;
   2967 
   2968 	s = splbio();
   2969 	raidPtr = req->raidPtr;
   2970 	raidPtr->recon_in_progress = 1;
   2971 	rf_ReconstructInPlace(raidPtr, req->col);
   2972 	RF_Free(req, sizeof(*req));
   2973 	raidPtr->recon_in_progress = 0;
   2974 	splx(s);
   2975 
   2976 	/* That's all... */
   2977 	kthread_exit(0);	/* does not return */
   2978 }
   2979 
   2980 static RF_AutoConfig_t *
   2981 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
   2982     const char *cname, RF_SectorCount_t size, uint64_t numsecs,
   2983     unsigned secsize)
   2984 {
   2985 	int good_one = 0;
   2986 	RF_ComponentLabel_t *clabel;
   2987 	RF_AutoConfig_t *ac;
   2988 
   2989 	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
   2990 	if (clabel == NULL) {
   2991 oomem:
   2992 		    while(ac_list) {
   2993 			    ac = ac_list;
   2994 			    if (ac->clabel)
   2995 				    free(ac->clabel, M_RAIDFRAME);
   2996 			    ac_list = ac_list->next;
   2997 			    free(ac, M_RAIDFRAME);
   2998 		    }
   2999 		    printf("RAID auto config: out of memory!\n");
   3000 		    return NULL; /* XXX probably should panic? */
   3001 	}
   3002 
   3003 	if (!raidread_component_label(secsize, dev, vp, clabel)) {
   3004 		/* Got the label.  Does it look reasonable? */
   3005 		if (rf_reasonable_label(clabel, numsecs) &&
   3006 		    (rf_component_label_partitionsize(clabel) <= size)) {
   3007 #ifdef DEBUG
   3008 			printf("Component on: %s: %llu\n",
   3009 				cname, (unsigned long long)size);
   3010 			rf_print_component_label(clabel);
   3011 #endif
   3012 			/* if it's reasonable, add it, else ignore it. */
   3013 			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
   3014 				M_NOWAIT);
   3015 			if (ac == NULL) {
   3016 				free(clabel, M_RAIDFRAME);
   3017 				goto oomem;
   3018 			}
   3019 			strlcpy(ac->devname, cname, sizeof(ac->devname));
   3020 			ac->dev = dev;
   3021 			ac->vp = vp;
   3022 			ac->clabel = clabel;
   3023 			ac->next = ac_list;
   3024 			ac_list = ac;
   3025 			good_one = 1;
   3026 		}
   3027 	}
   3028 	if (!good_one) {
   3029 		/* cleanup */
   3030 		free(clabel, M_RAIDFRAME);
   3031 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3032 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3033 		vput(vp);
   3034 	}
   3035 	return ac_list;
   3036 }
   3037 
   3038 RF_AutoConfig_t *
   3039 rf_find_raid_components(void)
   3040 {
   3041 	struct vnode *vp;
   3042 	struct disklabel label;
   3043 	device_t dv;
   3044 	deviter_t di;
   3045 	dev_t dev;
   3046 	int bmajor, bminor, wedge, rf_part_found;
   3047 	int error;
   3048 	int i;
   3049 	RF_AutoConfig_t *ac_list;
   3050 	uint64_t numsecs;
   3051 	unsigned secsize;
   3052 
   3053 	/* initialize the AutoConfig list */
   3054 	ac_list = NULL;
   3055 
   3056 	/* we begin by trolling through *all* the devices on the system */
   3057 
   3058 	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
   3059 	     dv = deviter_next(&di)) {
   3060 
   3061 		/* we are only interested in disks... */
   3062 		if (device_class(dv) != DV_DISK)
   3063 			continue;
   3064 
   3065 		/* we don't care about floppies... */
   3066 		if (device_is_a(dv, "fd")) {
   3067 			continue;
   3068 		}
   3069 
   3070 		/* we don't care about CD's... */
   3071 		if (device_is_a(dv, "cd")) {
   3072 			continue;
   3073 		}
   3074 
   3075 		/* we don't care about md's... */
   3076 		if (device_is_a(dv, "md")) {
   3077 			continue;
   3078 		}
   3079 
   3080 		/* hdfd is the Atari/Hades floppy driver */
   3081 		if (device_is_a(dv, "hdfd")) {
   3082 			continue;
   3083 		}
   3084 
   3085 		/* fdisa is the Atari/Milan floppy driver */
   3086 		if (device_is_a(dv, "fdisa")) {
   3087 			continue;
   3088 		}
   3089 
   3090 		/* need to find the device_name_to_block_device_major stuff */
   3091 		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
   3092 
   3093 		rf_part_found = 0; /*No raid partition as yet*/
   3094 
   3095 		/* get a vnode for the raw partition of this disk */
   3096 
   3097 		wedge = device_is_a(dv, "dk");
   3098 		bminor = minor(device_unit(dv));
   3099 		dev = wedge ? makedev(bmajor, bminor) :
   3100 		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
   3101 		if (bdevvp(dev, &vp))
   3102 			panic("RAID can't alloc vnode");
   3103 
   3104 		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
   3105 
   3106 		if (error) {
   3107 			/* "Who cares."  Continue looking
   3108 			   for something that exists*/
   3109 			vput(vp);
   3110 			continue;
   3111 		}
   3112 
   3113 		error = getdisksize(vp, &numsecs, &secsize);
   3114 		if (error) {
   3115 			vput(vp);
   3116 			continue;
   3117 		}
   3118 		if (wedge) {
   3119 			struct dkwedge_info dkw;
   3120 			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
   3121 			    NOCRED);
   3122 			if (error) {
   3123 				printf("RAIDframe: can't get wedge info for "
   3124 				    "dev %s (%d)\n", device_xname(dv), error);
   3125 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3126 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3127 				vput(vp);
   3128 				continue;
   3129 			}
   3130 
   3131 			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
   3132 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3133 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3134 				vput(vp);
   3135 				continue;
   3136 			}
   3137 
   3138 			ac_list = rf_get_component(ac_list, dev, vp,
   3139 			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
   3140 			rf_part_found = 1; /*There is a raid component on this disk*/
   3141 			continue;
   3142 		}
   3143 
   3144 		/* Ok, the disk exists.  Go get the disklabel. */
   3145 		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
   3146 		if (error) {
   3147 			/*
   3148 			 * XXX can't happen - open() would
   3149 			 * have errored out (or faked up one)
   3150 			 */
   3151 			if (error != ENOTTY)
   3152 				printf("RAIDframe: can't get label for dev "
   3153 				    "%s (%d)\n", device_xname(dv), error);
   3154 		}
   3155 
   3156 		/* don't need this any more.  We'll allocate it again
   3157 		   a little later if we really do... */
   3158 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3159 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3160 		vput(vp);
   3161 
   3162 		if (error)
   3163 			continue;
   3164 
   3165 		rf_part_found = 0; /*No raid partitions yet*/
   3166 		for (i = 0; i < label.d_npartitions; i++) {
   3167 			char cname[sizeof(ac_list->devname)];
   3168 
   3169 			/* We only support partitions marked as RAID */
   3170 			if (label.d_partitions[i].p_fstype != FS_RAID)
   3171 				continue;
   3172 
   3173 			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
   3174 			if (bdevvp(dev, &vp))
   3175 				panic("RAID can't alloc vnode");
   3176 
   3177 			error = VOP_OPEN(vp, FREAD, NOCRED);
   3178 			if (error) {
   3179 				/* Whatever... */
   3180 				vput(vp);
   3181 				continue;
   3182 			}
   3183 			snprintf(cname, sizeof(cname), "%s%c",
   3184 			    device_xname(dv), 'a' + i);
   3185 			ac_list = rf_get_component(ac_list, dev, vp, cname,
   3186 				label.d_partitions[i].p_size, numsecs, secsize);
    3187 			rf_part_found = 1; /*There is at least one raid partition on this disk*/
   3188 		}
   3189 
   3190 		/*
    3191 		 * If there is no raid component on this disk, either in a
    3192 		 * disklabel or inside a wedge, check the raw partition as well,
    3193 		 * as it is possible to configure raid components on raw disk
    3194 		 * devices.
   3195 		 */
   3196 
   3197 		if (!rf_part_found) {
   3198 			char cname[sizeof(ac_list->devname)];
   3199 
   3200 			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
   3201 			if (bdevvp(dev, &vp))
   3202 				panic("RAID can't alloc vnode");
   3203 
   3204 			error = VOP_OPEN(vp, FREAD, NOCRED);
   3205 			if (error) {
   3206 				/* Whatever... */
   3207 				vput(vp);
   3208 				continue;
   3209 			}
   3210 			snprintf(cname, sizeof(cname), "%s%c",
   3211 			    device_xname(dv), 'a' + RAW_PART);
   3212 			ac_list = rf_get_component(ac_list, dev, vp, cname,
   3213 				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
   3214 		}
   3215 	}
   3216 	deviter_release(&di);
   3217 	return ac_list;
   3218 }
   3219 
   3220 
   3221 int
   3222 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3223 {
   3224 
   3225 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3226 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3227 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3228 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3229 	    clabel->row >=0 &&
   3230 	    clabel->column >= 0 &&
   3231 	    clabel->num_rows > 0 &&
   3232 	    clabel->num_columns > 0 &&
   3233 	    clabel->row < clabel->num_rows &&
   3234 	    clabel->column < clabel->num_columns &&
   3235 	    clabel->blockSize > 0 &&
   3236 	    /*
   3237 	     * numBlocksHi may contain garbage, but it is ok since
   3238 	     * the type is unsigned.  If it is really garbage,
   3239 	     * rf_fix_old_label_size() will fix it.
   3240 	     */
   3241 	    rf_component_label_numblocks(clabel) > 0) {
   3242 		/*
   3243 		 * label looks reasonable enough...
   3244 		 * let's make sure it has no old garbage.
   3245 		 */
   3246 		if (numsecs)
   3247 			rf_fix_old_label_size(clabel, numsecs);
   3248 		return(1);
   3249 	}
   3250 	return(0);
   3251 }
   3252 
   3253 
   3254 /*
   3255  * For reasons yet unknown, some old component labels have garbage in
   3256  * the newer numBlocksHi region, and this causes lossage.  Since those
   3257  * disks will also have numsecs set to less than 32 bits of sectors,
    3258  * we can determine when this corruption has occurred, and fix it.
   3259  *
   3260  * The exact same problem, with the same unknown reason, happens to
   3261  * the partitionSizeHi member as well.
   3262  */
   3263 static void
   3264 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3265 {
   3266 
   3267 	if (numsecs < ((uint64_t)1 << 32)) {
   3268 		if (clabel->numBlocksHi) {
   3269 			printf("WARNING: total sectors < 32 bits, yet "
   3270 			       "numBlocksHi set\n"
   3271 			       "WARNING: resetting numBlocksHi to zero.\n");
   3272 			clabel->numBlocksHi = 0;
   3273 		}
   3274 
   3275 		if (clabel->partitionSizeHi) {
   3276 			printf("WARNING: total sectors < 32 bits, yet "
   3277 			       "partitionSizeHi set\n"
   3278 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3279 			clabel->partitionSizeHi = 0;
   3280 		}
   3281 	}
   3282 }
   3283 
   3284 
   3285 #ifdef DEBUG
   3286 void
   3287 rf_print_component_label(RF_ComponentLabel_t *clabel)
   3288 {
   3289 	uint64_t numBlocks;
   3290 
   3291 	numBlocks = rf_component_label_numblocks(clabel);
   3292 
   3293 	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
   3294 	       clabel->row, clabel->column,
   3295 	       clabel->num_rows, clabel->num_columns);
   3296 	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
   3297 	       clabel->version, clabel->serial_number,
   3298 	       clabel->mod_counter);
   3299 	printf("   Clean: %s Status: %d\n",
   3300 	       clabel->clean ? "Yes" : "No", clabel->status);
   3301 	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
   3302 	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
   3303 	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
   3304 	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
   3305 	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
   3306 	printf("   Contains root partition: %s\n",
   3307 	       clabel->root_partition ? "Yes" : "No");
   3308 	printf("   Last configured as: raid%d\n", clabel->last_unit);
   3309 #if 0
   3310 	   printf("   Config order: %d\n", clabel->config_order);
   3311 #endif
   3312 
   3313 }
   3314 #endif
   3315 
   3316 RF_ConfigSet_t *
   3317 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3318 {
   3319 	RF_AutoConfig_t *ac;
   3320 	RF_ConfigSet_t *config_sets;
   3321 	RF_ConfigSet_t *cset;
   3322 	RF_AutoConfig_t *ac_next;
   3323 
   3324 
   3325 	config_sets = NULL;
   3326 
   3327 	/* Go through the AutoConfig list, and figure out which components
   3328 	   belong to what sets.  */
   3329 	ac = ac_list;
   3330 	while(ac!=NULL) {
   3331 		/* we're going to putz with ac->next, so save it here
   3332 		   for use at the end of the loop */
   3333 		ac_next = ac->next;
   3334 
   3335 		if (config_sets == NULL) {
   3336 			/* will need at least this one... */
   3337 			config_sets = (RF_ConfigSet_t *)
   3338 				malloc(sizeof(RF_ConfigSet_t),
   3339 				       M_RAIDFRAME, M_NOWAIT);
   3340 			if (config_sets == NULL) {
   3341 				panic("rf_create_auto_sets: No memory!");
   3342 			}
   3343 			/* this one is easy :) */
   3344 			config_sets->ac = ac;
   3345 			config_sets->next = NULL;
   3346 			config_sets->rootable = 0;
   3347 			ac->next = NULL;
   3348 		} else {
   3349 			/* which set does this component fit into? */
   3350 			cset = config_sets;
   3351 			while(cset!=NULL) {
   3352 				if (rf_does_it_fit(cset, ac)) {
   3353 					/* looks like it matches... */
   3354 					ac->next = cset->ac;
   3355 					cset->ac = ac;
   3356 					break;
   3357 				}
   3358 				cset = cset->next;
   3359 			}
   3360 			if (cset==NULL) {
   3361 				/* didn't find a match above... new set..*/
   3362 				cset = (RF_ConfigSet_t *)
   3363 					malloc(sizeof(RF_ConfigSet_t),
   3364 					       M_RAIDFRAME, M_NOWAIT);
   3365 				if (cset == NULL) {
   3366 					panic("rf_create_auto_sets: No memory!");
   3367 				}
   3368 				cset->ac = ac;
   3369 				ac->next = NULL;
   3370 				cset->next = config_sets;
   3371 				cset->rootable = 0;
   3372 				config_sets = cset;
   3373 			}
   3374 		}
   3375 		ac = ac_next;
   3376 	}
   3377 
   3378 
   3379 	return(config_sets);
   3380 }
   3381 
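         /*
          * Decide whether the component described by 'ac' belongs to the
          * existing configuration set 'cset', by comparing its component
          * label against the label of the first member already in the set.
          */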
   3382 static int
   3383 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3384 {
   3385 	RF_ComponentLabel_t *clabel1, *clabel2;
   3386 
   3387 	/* If this one matches the *first* one in the set, that's good
   3388 	   enough, since the other members of the set would have been
   3389 	   through here too... */
   3390 	/* note that we are not checking partitionSize here..
   3391 
   3392 	   Note that we are also not checking the mod_counters here.
    3393 	   If everything else matches except the mod_counter, that's
   3394 	   good enough for this test.  We will deal with the mod_counters
   3395 	   a little later in the autoconfiguration process.
   3396 
   3397 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3398 
   3399 	   The reason we don't check for this is that failed disks
   3400 	   will have lower modification counts.  If those disks are
   3401 	   not added to the set they used to belong to, then they will
   3402 	   form their own set, which may result in 2 different sets,
   3403 	   for example, competing to be configured at raid0, and
   3404 	   perhaps competing to be the root filesystem set.  If the
   3405 	   wrong ones get configured, or both attempt to become /,
    3406 	   weird behaviour and/or serious lossage will occur.  Thus we
   3407 	   need to bring them into the fold here, and kick them out at
   3408 	   a later point.
   3409 
   3410 	*/
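         	/*
         	 * Concrete example: a component that failed while the set
         	 * was at mod_counter 210 keeps that value even though the
         	 * surviving members have since advanced to, say, 215.  It is
         	 * still grouped with its original set here; because only
         	 * components carrying the newest mod_counter are counted in
         	 * rf_have_enough_components(), the stale one is weeded out
         	 * later.
         	 */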
   3411 
   3412 	clabel1 = cset->ac->clabel;
   3413 	clabel2 = ac->clabel;
   3414 	if ((clabel1->version == clabel2->version) &&
   3415 	    (clabel1->serial_number == clabel2->serial_number) &&
   3416 	    (clabel1->num_rows == clabel2->num_rows) &&
   3417 	    (clabel1->num_columns == clabel2->num_columns) &&
   3418 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3419 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3420 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3421 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3422 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3423 	    (clabel1->blockSize == clabel2->blockSize) &&
   3424 	    rf_component_label_numblocks(clabel1) ==
   3425 	    rf_component_label_numblocks(clabel2) &&
   3426 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3427 	    (clabel1->root_partition == clabel2->root_partition) &&
   3428 	    (clabel1->last_unit == clabel2->last_unit) &&
   3429 	    (clabel1->config_order == clabel2->config_order)) {
    3430 		/* if it gets here, it almost *has* to be a match */
   3431 	} else {
   3432 		/* it's not consistent with somebody in the set..
   3433 		   punt */
   3434 		return(0);
   3435 	}
   3436 	/* all was fine.. it must fit... */
   3437 	return(1);
   3438 }
   3439 
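         /*
          * Decide whether a configuration set has enough working components
          * to be configured.  Per the checks below: RAID 0 tolerates no
          * missing components, RAID 4 and RAID 5 tolerate at most one, and
          * RAID 1 survives as long as no even/odd component pair has failed
          * in its entirety.  Only components carrying the newest mod_counter
          * are counted as present.
          */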
   3440 int
   3441 rf_have_enough_components(RF_ConfigSet_t *cset)
   3442 {
   3443 	RF_AutoConfig_t *ac;
   3444 	RF_AutoConfig_t *auto_config;
   3445 	RF_ComponentLabel_t *clabel;
   3446 	int c;
   3447 	int num_cols;
   3448 	int num_missing;
   3449 	int mod_counter;
   3450 	int mod_counter_found;
   3451 	int even_pair_failed;
   3452 	char parity_type;
   3453 
   3454 
   3455 	/* check to see that we have enough 'live' components
   3456 	   of this set.  If so, we can configure it if necessary */
   3457 
   3458 	num_cols = cset->ac->clabel->num_columns;
   3459 	parity_type = cset->ac->clabel->parityConfig;
   3460 
   3461 	/* XXX Check for duplicate components!?!?!? */
   3462 
   3463 	/* Determine what the mod_counter is supposed to be for this set. */
   3464 
   3465 	mod_counter_found = 0;
   3466 	mod_counter = 0;
   3467 	ac = cset->ac;
   3468 	while(ac!=NULL) {
   3469 		if (mod_counter_found==0) {
   3470 			mod_counter = ac->clabel->mod_counter;
   3471 			mod_counter_found = 1;
   3472 		} else {
   3473 			if (ac->clabel->mod_counter > mod_counter) {
   3474 				mod_counter = ac->clabel->mod_counter;
   3475 			}
   3476 		}
   3477 		ac = ac->next;
   3478 	}
   3479 
   3480 	num_missing = 0;
   3481 	auto_config = cset->ac;
   3482 
   3483 	even_pair_failed = 0;
   3484 	for(c=0; c<num_cols; c++) {
   3485 		ac = auto_config;
   3486 		while(ac!=NULL) {
   3487 			if ((ac->clabel->column == c) &&
   3488 			    (ac->clabel->mod_counter == mod_counter)) {
   3489 				/* it's this one... */
   3490 #ifdef DEBUG
   3491 				printf("Found: %s at %d\n",
   3492 				       ac->devname,c);
   3493 #endif
   3494 				break;
   3495 			}
   3496 			ac=ac->next;
   3497 		}
   3498 		if (ac==NULL) {
   3499 				/* Didn't find one here! */
   3500 				/* special case for RAID 1, especially
   3501 				   where there are more than 2
   3502 				   components (where RAIDframe treats
   3503 				   things a little differently :( ) */
   3504 			if (parity_type == '1') {
   3505 				if (c%2 == 0) { /* even component */
   3506 					even_pair_failed = 1;
   3507 				} else { /* odd component.  If
   3508 					    we're failed, and
   3509 					    so is the even
   3510 					    component, it's
   3511 					    "Good Night, Charlie" */
   3512 					if (even_pair_failed == 1) {
   3513 						return(0);
   3514 					}
   3515 				}
   3516 			} else {
   3517 				/* normal accounting */
   3518 				num_missing++;
   3519 			}
   3520 		}
   3521 		if ((parity_type == '1') && (c%2 == 1)) {
    3522 				/* Just did the odd component of a pair, and
    3523 				   we didn't bail.. reset the even_pair_failed
    3524 				   flag, and go on to the next component.... */
   3525 			even_pair_failed = 0;
   3526 		}
   3527 	}
   3528 
   3529 	clabel = cset->ac->clabel;
   3530 
   3531 	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
   3532 	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
   3533 	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
   3534 		/* XXX this needs to be made *much* more general */
   3535 		/* Too many failures */
   3536 		return(0);
   3537 	}
   3538 	/* otherwise, all is well, and we've got enough to take a kick
   3539 	   at autoconfiguring this set */
   3540 	return(1);
   3541 }
   3542 
   3543 void
   3544 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3545 			RF_Raid_t *raidPtr)
   3546 {
   3547 	RF_ComponentLabel_t *clabel;
   3548 	int i;
   3549 
   3550 	clabel = ac->clabel;
   3551 
   3552 	/* 1. Fill in the common stuff */
   3553 	config->numRow = clabel->num_rows = 1;
   3554 	config->numCol = clabel->num_columns;
   3555 	config->numSpare = 0; /* XXX should this be set here? */
   3556 	config->sectPerSU = clabel->sectPerSU;
   3557 	config->SUsPerPU = clabel->SUsPerPU;
   3558 	config->SUsPerRU = clabel->SUsPerRU;
   3559 	config->parityConfig = clabel->parityConfig;
   3560 	/* XXX... */
   3561 	strcpy(config->diskQueueType,"fifo");
   3562 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3563 	config->layoutSpecificSize = 0; /* XXX ?? */
   3564 
   3565 	while(ac!=NULL) {
   3566 		/* row/col values will be in range due to the checks
    3567 		   in rf_reasonable_label() */
   3568 		strcpy(config->devnames[0][ac->clabel->column],
   3569 		       ac->devname);
   3570 		ac = ac->next;
   3571 	}
   3572 
   3573 	for(i=0;i<RF_MAXDBGV;i++) {
   3574 		config->debugVars[i][0] = 0;
   3575 	}
   3576 }
   3577 
   3578 int
   3579 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3580 {
   3581 	RF_ComponentLabel_t *clabel;
   3582 	int column;
   3583 	int sparecol;
   3584 
   3585 	raidPtr->autoconfigure = new_value;
   3586 
   3587 	for(column=0; column<raidPtr->numCol; column++) {
   3588 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3589 			clabel = raidget_component_label(raidPtr, column);
   3590 			clabel->autoconfigure = new_value;
   3591 			raidflush_component_label(raidPtr, column);
   3592 		}
   3593 	}
   3594 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3595 		sparecol = raidPtr->numCol + column;
   3596 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3597 			clabel = raidget_component_label(raidPtr, sparecol);
   3598 			clabel->autoconfigure = new_value;
   3599 			raidflush_component_label(raidPtr, sparecol);
   3600 		}
   3601 	}
   3602 	return(new_value);
   3603 }
   3604 
   3605 int
   3606 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3607 {
   3608 	RF_ComponentLabel_t *clabel;
   3609 	int column;
   3610 	int sparecol;
   3611 
   3612 	raidPtr->root_partition = new_value;
   3613 	for(column=0; column<raidPtr->numCol; column++) {
   3614 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3615 			clabel = raidget_component_label(raidPtr, column);
   3616 			clabel->root_partition = new_value;
   3617 			raidflush_component_label(raidPtr, column);
   3618 		}
   3619 	}
   3620 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3621 		sparecol = raidPtr->numCol + column;
   3622 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3623 			clabel = raidget_component_label(raidPtr, sparecol);
   3624 			clabel->root_partition = new_value;
   3625 			raidflush_component_label(raidPtr, sparecol);
   3626 		}
   3627 	}
   3628 	return(new_value);
   3629 }
   3630 
   3631 void
   3632 rf_release_all_vps(RF_ConfigSet_t *cset)
   3633 {
   3634 	RF_AutoConfig_t *ac;
   3635 
   3636 	ac = cset->ac;
   3637 	while(ac!=NULL) {
   3638 		/* Close the vp, and give it back */
   3639 		if (ac->vp) {
   3640 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3641 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
   3642 			vput(ac->vp);
   3643 			ac->vp = NULL;
   3644 		}
   3645 		ac = ac->next;
   3646 	}
   3647 }
   3648 
   3649 
   3650 void
   3651 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3652 {
   3653 	RF_AutoConfig_t *ac;
   3654 	RF_AutoConfig_t *next_ac;
   3655 
   3656 	ac = cset->ac;
   3657 	while(ac!=NULL) {
   3658 		next_ac = ac->next;
   3659 		/* nuke the label */
   3660 		free(ac->clabel, M_RAIDFRAME);
   3661 		/* cleanup the config structure */
   3662 		free(ac, M_RAIDFRAME);
   3663 		/* "next.." */
   3664 		ac = next_ac;
   3665 	}
   3666 	/* and, finally, nuke the config set */
   3667 	free(cset, M_RAIDFRAME);
   3668 }
   3669 
   3670 
   3671 void
   3672 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   3673 {
   3674 	/* current version number */
   3675 	clabel->version = RF_COMPONENT_LABEL_VERSION;
   3676 	clabel->serial_number = raidPtr->serial_number;
   3677 	clabel->mod_counter = raidPtr->mod_counter;
   3678 
   3679 	clabel->num_rows = 1;
   3680 	clabel->num_columns = raidPtr->numCol;
   3681 	clabel->clean = RF_RAID_DIRTY; /* not clean */
   3682 	clabel->status = rf_ds_optimal; /* "It's good!" */
   3683 
   3684 	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
   3685 	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
   3686 	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
   3687 
   3688 	clabel->blockSize = raidPtr->bytesPerSector;
   3689 	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
   3690 
   3691 	/* XXX not portable */
   3692 	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
   3693 	clabel->maxOutstanding = raidPtr->maxOutstanding;
   3694 	clabel->autoconfigure = raidPtr->autoconfigure;
   3695 	clabel->root_partition = raidPtr->root_partition;
   3696 	clabel->last_unit = raidPtr->raidid;
   3697 	clabel->config_order = raidPtr->config_order;
   3698 
   3699 #ifndef RF_NO_PARITY_MAP
   3700 	rf_paritymap_init_label(raidPtr->parity_map, clabel);
   3701 #endif
   3702 }
   3703 
   3704 int
   3705 rf_auto_config_set(RF_ConfigSet_t *cset, int *unit)
   3706 {
   3707 	RF_Raid_t *raidPtr;
   3708 	RF_Config_t *config;
   3709 	int raidID;
   3710 	int retcode;
   3711 
   3712 #ifdef DEBUG
   3713 	printf("RAID autoconfigure\n");
   3714 #endif
   3715 
   3716 	retcode = 0;
   3717 	*unit = -1;
   3718 
   3719 	/* 1. Create a config structure */
   3720 
   3721 	config = (RF_Config_t *)malloc(sizeof(RF_Config_t),
   3722 				       M_RAIDFRAME,
   3723 				       M_NOWAIT);
   3724 	if (config==NULL) {
   3725 		printf("Out of mem!?!?\n");
   3726 				/* XXX do something more intelligent here. */
   3727 		return(1);
   3728 	}
   3729 
   3730 	memset(config, 0, sizeof(RF_Config_t));
   3731 
   3732 	/*
   3733 	   2. Figure out what RAID ID this one is supposed to live at
   3734 	   See if we can get the same RAID dev that it was configured
   3735 	   on last time..
   3736 	*/
   3737 
   3738 	raidID = cset->ac->clabel->last_unit;
   3739 	if ((raidID < 0) || (raidID >= numraid)) {
   3740 		/* let's not wander off into lala land. */
   3741 		raidID = numraid - 1;
   3742 	}
   3743 	if (raidPtrs[raidID]->valid != 0) {
   3744 
   3745 		/*
   3746 		   Nope... Go looking for an alternative...
   3747 		   Start high so we don't immediately use raid0 if that's
   3748 		   not taken.
   3749 		*/
   3750 
   3751 		for(raidID = numraid - 1; raidID >= 0; raidID--) {
   3752 			if (raidPtrs[raidID]->valid == 0) {
   3753 				/* can use this one! */
   3754 				break;
   3755 			}
   3756 		}
   3757 	}
   3758 
   3759 	if (raidID < 0) {
   3760 		/* punt... */
   3761 		printf("Unable to auto configure this set!\n");
   3762 		printf("(Out of RAID devs!)\n");
   3763 		free(config, M_RAIDFRAME);
   3764 		return(1);
   3765 	}
   3766 
   3767 #ifdef DEBUG
   3768 	printf("Configuring raid%d:\n",raidID);
   3769 #endif
   3770 
   3771 	raidPtr = raidPtrs[raidID];
   3772 
   3773 	/* XXX all this stuff should be done SOMEWHERE ELSE! */
   3774 	raidPtr->raidid = raidID;
   3775 	raidPtr->openings = RAIDOUTSTANDING;
   3776 
   3777 	/* 3. Build the configuration structure */
   3778 	rf_create_configuration(cset->ac, config, raidPtr);
   3779 
   3780 	/* 4. Do the configuration */
   3781 	retcode = rf_Configure(raidPtr, config, cset->ac);
   3782 
   3783 	if (retcode == 0) {
   3784 
   3785 		raidinit(raidPtrs[raidID]);
   3786 
   3787 		rf_markalldirty(raidPtrs[raidID]);
   3788 		raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */
   3789 		if (cset->ac->clabel->root_partition==1) {
   3790 			/* everything configured just fine.  Make a note
   3791 			   that this set is eligible to be root. */
   3792 			cset->rootable = 1;
   3793 			/* XXX do this here? */
   3794 			raidPtrs[raidID]->root_partition = 1;
   3795 		}
   3796 	}
   3797 
   3798 	/* 5. Cleanup */
   3799 	free(config, M_RAIDFRAME);
   3800 
   3801 	*unit = raidID;
   3802 	return(retcode);
   3803 }
   3804 
   3805 void
   3806 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
   3807 {
   3808 	struct buf *bp;
   3809 
   3810 	bp = (struct buf *)desc->bp;
   3811 	disk_unbusy(&raid_softc[desc->raidPtr->raidid].sc_dkdev,
   3812 	    (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ));
   3813 }
   3814 
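         /*
          * Convenience wrapper around pool_init(9): create the pool at
          * IPL_BIO, pre-allocate xmin items (pool_prime), keep at least
          * xmin items around as the low-water mark, and cache at most xmax
          * idle items as the high-water mark.
          */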
   3815 void
   3816 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3817 	     size_t xmin, size_t xmax)
   3818 {
   3819 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3820 	pool_sethiwat(p, xmax);
   3821 	pool_prime(p, xmin);
   3822 	pool_setlowat(p, xmin);
   3823 }
   3824 
   3825 /*
   3826  * rf_buf_queue_check(int raidid) -- looks into the buf_queue to see
   3827  * if there is IO pending and if that IO could possibly be done for a
   3828  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3829  * otherwise.
   3830  *
   3831  */
   3832 
   3833 int
   3834 rf_buf_queue_check(int raidid)
   3835 {
   3836 	if ((bufq_peek(raid_softc[raidid].buf_queue) != NULL) &&
   3837 	    raidPtrs[raidid]->openings > 0) {
   3838 		/* there is work to do */
   3839 		return 0;
   3840 	}
   3841 	/* default is nothing to do */
   3842 	return 1;
   3843 }
   3844 
   3845 int
   3846 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3847 {
   3848 	uint64_t numsecs;
   3849 	unsigned secsize;
   3850 	int error;
   3851 
   3852 	error = getdisksize(vp, &numsecs, &secsize);
   3853 	if (error == 0) {
   3854 		diskPtr->blockSize = secsize;
   3855 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3856 		diskPtr->partitionSize = numsecs;
   3857 		return 0;
   3858 	}
   3859 	return error;
   3860 }
   3861 
   3862 static int
   3863 raid_match(device_t self, cfdata_t cfdata, void *aux)
   3864 {
   3865 	return 1;
   3866 }
   3867 
   3868 static void
   3869 raid_attach(device_t parent, device_t self, void *aux)
   3870 {
   3871 
   3872 }
   3873 
   3874 
   3875 static int
   3876 raid_detach(device_t self, int flags)
   3877 {
   3878 	int error;
   3879 	struct raid_softc *rs = &raid_softc[device_unit(self)];
   3880 
   3881 	if ((error = raidlock(rs)) != 0)
   3882 		return (error);
   3883 
   3884 	error = raid_detach_unlocked(rs);
   3885 
   3886 	raidunlock(rs);
   3887 
   3888 	return error;
   3889 }
   3890 
   3891 static void
   3892 rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3893 {
   3894 	prop_dictionary_t disk_info, odisk_info, geom;
   3895 	disk_info = prop_dictionary_create();
   3896 	geom = prop_dictionary_create();
   3897 	prop_dictionary_set_uint64(geom, "sectors-per-unit",
   3898 				   raidPtr->totalSectors);
   3899 	prop_dictionary_set_uint32(geom, "sector-size",
   3900 				   raidPtr->bytesPerSector);
   3901 
   3902 	prop_dictionary_set_uint16(geom, "sectors-per-track",
   3903 				   raidPtr->Layout.dataSectorsPerStripe);
   3904 	prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
   3905 				   4 * raidPtr->numCol);
   3906 
   3907 	prop_dictionary_set_uint64(geom, "cylinders-per-unit",
   3908 	   raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe *
   3909 	   (4 * raidPtr->numCol)));
   3910 
   3911 	prop_dictionary_set(disk_info, "geometry", geom);
   3912 	prop_object_release(geom);
   3913 	prop_dictionary_set(device_properties(rs->sc_dev),
   3914 			    "disk-info", disk_info);
   3915 	odisk_info = rs->sc_dkdev.dk_info;
   3916 	rs->sc_dkdev.dk_info = disk_info;
   3917 	if (odisk_info)
   3918 		prop_object_release(odisk_info);
   3919 }
   3920 
   3921 /*
   3922  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3923  * We end up returning whatever error was returned by the first cache flush
   3924  * that fails.
   3925  */
   3926 
   3927 int
   3928 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3929 {
   3930 	int c, sparecol;
   3931 	int e,error;
   3932 	int force = 1;
   3933 
   3934 	error = 0;
   3935 	for (c = 0; c < raidPtr->numCol; c++) {
   3936 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3937 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3938 					  &force, FWRITE, NOCRED);
   3939 			if (e) {
   3940 				if (e != ENODEV)
   3941 					printf("raid%d: cache flush to component %s failed.\n",
   3942 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3943 				if (error == 0) {
   3944 					error = e;
   3945 				}
   3946 			}
   3947 		}
   3948 	}
   3949 
   3950 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3951 		sparecol = raidPtr->numCol + c;
   3952 		/* Need to ensure that the reconstruct actually completed! */
   3953 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3954 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3955 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3956 			if (e) {
   3957 				if (e != ENODEV)
   3958 					printf("raid%d: cache flush to component %s failed.\n",
   3959 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3960 				if (error == 0) {
   3961 					error = e;
   3962 				}
   3963 			}
   3964 		}
   3965 	}
   3966 	return error;
   3967 }
   3968 
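         /*
          * minphys routine for the RAID device: clamp a transfer to at most
          * numDataCol * MAXPHYS bytes, i.e. roughly one MAXPHYS-sized chunk
          * per data column, which is about what the underlying components
          * can handle in a single array access.
          */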
   3969 static void
   3970 raidminphys(struct buf *bp)
   3971 {
   3972 	dev_t dev;
   3973 	int unit;
   3974 	struct raid_softc *rs;
   3975 	RF_Raid_t *raidPtr;
   3976 	long xmax;
   3977 
   3978 	dev = bp->b_dev;
   3979 	unit = raidunit(dev);
   3980 	rs = &raid_softc[unit];
   3981 	raidPtr = raidPtrs[unit];
   3982 
   3983 	xmax = raidPtr->Layout.numDataCol * MAXPHYS;
   3984 
   3985 	if (bp->b_bcount > xmax) {
   3986 		bp->b_bcount = xmax;
   3987 	}
   3988 }
   3989