Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.307
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.307 2014/04/03 15:30:52 christos Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.307 2014/04/03 15:30:52 christos Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 
    130 #include <prop/proplib.h>
    131 
    132 #include <dev/raidframe/raidframevar.h>
    133 #include <dev/raidframe/raidframeio.h>
    134 #include <dev/raidframe/rf_paritymap.h>
    135 
    136 #include "rf_raid.h"
    137 #include "rf_copyback.h"
    138 #include "rf_dag.h"
    139 #include "rf_dagflags.h"
    140 #include "rf_desc.h"
    141 #include "rf_diskqueue.h"
    142 #include "rf_etimer.h"
    143 #include "rf_general.h"
    144 #include "rf_kintf.h"
    145 #include "rf_options.h"
    146 #include "rf_driver.h"
    147 #include "rf_parityscan.h"
    148 #include "rf_threadstuff.h"
    149 
    150 #ifdef COMPAT_50
    151 #include "rf_compat50.h"
    152 #endif
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    162 static rf_declare_mutex2(rf_sparet_wait_mutex);
    163 static rf_declare_cond2(rf_sparet_wait_cv);
    164 static rf_declare_cond2(rf_sparet_resp_cv);
    165 
    166 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    167 						 * spare table */
    168 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    169 						 * installation process */
    170 #endif
    171 
    172 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    173 
    174 /* prototypes */
    175 static void KernelWakeupFunc(struct buf *);
    176 static void InitBP(struct buf *, struct vnode *, unsigned,
    177     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    178     void *, int, struct proc *);
    179 struct raid_softc;
    180 static void raidinit(struct raid_softc *);
    181 
    182 void raidattach(int);
    183 static int raid_match(device_t, cfdata_t, void *);
    184 static void raid_attach(device_t, device_t, void *);
    185 static int raid_detach(device_t, int);
    186 
    187 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    188     daddr_t, daddr_t);
    189 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t, int);
    191 
    192 static int raidwrite_component_label(unsigned,
    193     dev_t, struct vnode *, RF_ComponentLabel_t *);
    194 static int raidread_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 
    197 
    198 dev_type_open(raidopen);
    199 dev_type_close(raidclose);
    200 dev_type_read(raidread);
    201 dev_type_write(raidwrite);
    202 dev_type_ioctl(raidioctl);
    203 dev_type_strategy(raidstrategy);
    204 dev_type_dump(raiddump);
    205 dev_type_size(raidsize);
    206 
    207 const struct bdevsw raid_bdevsw = {
    208 	.d_open = raidopen,
    209 	.d_close = raidclose,
    210 	.d_strategy = raidstrategy,
    211 	.d_ioctl = raidioctl,
    212 	.d_dump = raiddump,
    213 	.d_psize = raidsize,
    214 	.d_flag = D_DISK
    215 };
    216 
    217 const struct cdevsw raid_cdevsw = {
    218 	.d_open = raidopen,
    219 	.d_close = raidclose,
    220 	.d_read = raidread,
    221 	.d_write = raidwrite,
    222 	.d_ioctl = raidioctl,
    223 	.d_stop = nostop,
    224 	.d_tty = notty,
    225 	.d_poll = nopoll,
    226 	.d_mmap = nommap,
    227 	.d_kqfilter = nokqfilter,
    228 	.d_flag = D_DISK
    229 };
    230 
    231 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
    232 
/*
 * Per-unit software state for a raid(4) device.  Instances are
 * allocated by raidcreate() and kept on the global "raids" list;
 * list linkage is protected by raid_lock, and per-unit operations
 * are serialized through raidlock()/raidunlock() (RAIDF_LOCKED).
 */
struct raid_softc {
	device_t sc_dev;	/* autoconf(9) device handle */
	int	sc_unit;	/* unit number of this raid<N> device */
	int     sc_flags;	/* flags (RAIDF_* below) */
	int     sc_cflags;	/* configuration flags */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	struct disk sc_dkdev;	/* generic disk device info */
	struct bufq_state *buf_queue;	/* used for the device queue */
	RF_Raid_t sc_r;		/* the RAIDframe state proper */
	LIST_ENTRY(raid_softc) sc_link;	/* entry on the global "raids" list */
};
    245 /* sc_flags */
    246 #define RAIDF_INITED	0x01	/* unit has been initialized */
    247 #define RAIDF_WLABEL	0x02	/* label area is writable */
    248 #define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
    249 #define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
    250 #define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
    251 #define RAIDF_LOCKED	0x80	/* unit is locked */
    252 
    253 #define	raidunit(x)	DISKUNIT(x)
    254 
    255 extern struct cfdriver raid_cd;
    256 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    257     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    258     DVF_DETACH_SHUTDOWN);
    259 
    260 /*
    261  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    262  * Be aware that large numbers can allow the driver to consume a lot of
    263  * kernel memory, especially on writes, and in degraded mode reads.
    264  *
    265  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    266  * a single 64K write will typically require 64K for the old data,
    267  * 64K for the old parity, and 64K for the new parity, for a total
    268  * of 192K (if the parity buffer is not re-used immediately).
     269  * Even if it is used immediately, that's still 128K, which when multiplied
    270  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    271  *
    272  * Now in degraded mode, for example, a 64K read on the above setup may
    273  * require data reconstruction, which will require *all* of the 4 remaining
    274  * disks to participate -- 4 * 32K/disk == 128K again.
    275  */
    276 
    277 #ifndef RAIDOUTSTANDING
    278 #define RAIDOUTSTANDING   6
    279 #endif
    280 
    281 #define RAIDLABELDEV(dev)	\
    282 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    283 
    284 /* declared here, and made public, for the benefit of KVM stuff.. */
    285 
    286 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
    287 				     struct disklabel *);
    288 static void raidgetdisklabel(dev_t);
    289 static void raidmakedisklabel(struct raid_softc *);
    290 
    291 static int raidlock(struct raid_softc *);
    292 static void raidunlock(struct raid_softc *);
    293 
    294 static int raid_detach_unlocked(struct raid_softc *);
    295 
    296 static void rf_markalldirty(RF_Raid_t *);
    297 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    298 
    299 void rf_ReconThread(struct rf_recon_req *);
    300 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    301 void rf_CopybackThread(RF_Raid_t *raidPtr);
    302 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    303 int rf_autoconfig(device_t);
    304 void rf_buildroothack(RF_ConfigSet_t *);
    305 
    306 RF_AutoConfig_t *rf_find_raid_components(void);
    307 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    308 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    309 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    310 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    311 int rf_set_autoconfig(RF_Raid_t *, int);
    312 int rf_set_rootpartition(RF_Raid_t *, int);
    313 void rf_release_all_vps(RF_ConfigSet_t *);
    314 void rf_cleanup_config_set(RF_ConfigSet_t *);
    315 int rf_have_enough_components(RF_ConfigSet_t *);
    316 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    317 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    318 
    319 /*
    320  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    321  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    322  * in the kernel config file.
    323  */
    324 #ifdef RAID_AUTOCONFIG
    325 int raidautoconfig = 1;
    326 #else
    327 int raidautoconfig = 0;
    328 #endif
    329 static bool raidautoconfigdone = false;
    330 
    331 struct RF_Pools_s rf_pools;
    332 
    333 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    334 static kmutex_t raid_lock;
    335 
    336 static struct raid_softc *
    337 raidcreate(int unit) {
    338 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    339 	if (sc == NULL) {
    340 #ifdef DIAGNOSTIC
    341 		printf("%s: out of memory\n", __func__);
    342 #endif
    343 		return NULL;
    344 	}
    345 	sc->sc_unit = unit;
    346 	bufq_alloc(&sc->buf_queue, BUFQ_DISK_DEFAULT_STRAT, BUFQ_SORT_RAWBLOCK);
    347 	return sc;
    348 }
    349 
/*
 * Release all resources held by a softc allocated with raidcreate().
 * The softc must already be off the global "raids" list (raidput()
 * handles that) and must have no pending I/O on its buffer queue.
 * The queue must be freed before the enclosing softc memory.
 */
static void
raiddestroy(struct raid_softc *sc) {
	bufq_free(sc->buf_queue);
	kmem_free(sc, sizeof(*sc));
}
    355 
    356 static struct raid_softc *
    357 raidget(int unit) {
    358 	struct raid_softc *sc;
    359 	if (unit < 0) {
    360 #ifdef DIAGNOSTIC
    361 		panic("%s: unit %d!", __func__, unit);
    362 #endif
    363 		return NULL;
    364 	}
    365 	mutex_enter(&raid_lock);
    366 	LIST_FOREACH(sc, &raids, sc_link) {
    367 		if (sc->sc_unit == unit) {
    368 			mutex_exit(&raid_lock);
    369 			return sc;
    370 		}
    371 	}
    372 	mutex_exit(&raid_lock);
    373 	if ((sc = raidcreate(unit)) == NULL)
    374 		return NULL;
    375 	mutex_enter(&raid_lock);
    376 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    377 	mutex_exit(&raid_lock);
    378 	return sc;
    379 }
    380 
/*
 * Unlink a softc from the global "raids" list (under raid_lock) and
 * free it.  Counterpart to raidget() for units that never finished
 * configuration or have been detached.
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    388 
/*
 * One-time driver attach, called from autoconf(9) with the (unused)
 * count of configured devices.  Initializes the global lock and the
 * spare-table wait machinery, boots the RAIDframe core, registers the
 * cfattach, and installs a config finalizer so that autoconfiguration
 * of RAID sets runs only after all real hardware has attached.
 */
void
raidattach(int num)
{
	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	/* This is where all the initialization stuff gets done. */

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Spare-table install requests are handed to a userland helper;
	   these queues/condvars coordinate that handshake. */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* Bring up the RAIDframe core; failure here is unrecoverable. */
	if (rf_BootRaidframe() == 0)
		aprint_verbose("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		aprint_error("raidattach: config_cfattach_attach failed?\n");
	}

	raidautoconfigdone = false;

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
}
    421 
    422 int
    423 rf_autoconfig(device_t self)
    424 {
    425 	RF_AutoConfig_t *ac_list;
    426 	RF_ConfigSet_t *config_sets;
    427 
    428 	if (!raidautoconfig || raidautoconfigdone == true)
    429 		return (0);
    430 
    431 	/* XXX This code can only be run once. */
    432 	raidautoconfigdone = true;
    433 
    434 #ifdef __HAVE_CPU_BOOTCONF
    435 	/*
    436 	 * 0. find the boot device if needed first so we can use it later
    437 	 * this needs to be done before we autoconfigure any raid sets,
    438 	 * because if we use wedges we are not going to be able to open
    439 	 * the boot device later
    440 	 */
    441 	if (booted_device == NULL)
    442 		cpu_bootconf();
    443 #endif
    444 	/* 1. locate all RAID components on the system */
    445 	aprint_debug("Searching for RAID components...\n");
    446 	ac_list = rf_find_raid_components();
    447 
    448 	/* 2. Sort them into their respective sets. */
    449 	config_sets = rf_create_auto_sets(ac_list);
    450 
    451 	/*
    452 	 * 3. Evaluate each set and configure the valid ones.
    453 	 * This gets done in rf_buildroothack().
    454 	 */
    455 	rf_buildroothack(config_sets);
    456 
    457 	return 1;
    458 }
    459 
    460 static int
    461 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    462 	const char *bootname = device_xname(bdv);
    463 	size_t len = strlen(bootname);
    464 
    465 	for (int col = 0; col < r->numCol; col++) {
    466 		const char *devname = r->Disks[col].devname;
    467 		devname += sizeof("/dev/") - 1;
    468 		if (strncmp(devname, "dk", 2) == 0) {
    469 			const char *parent =
    470 			    dkwedge_get_parent_name(r->Disks[col].dev);
    471 			if (parent != NULL)
    472 				devname = parent;
    473 		}
    474 		if (strncmp(devname, bootname, len) == 0) {
    475 			struct raid_softc *sc = r->softc;
    476 			aprint_debug("raid%d includes boot device %s\n",
    477 			    sc->sc_unit, devname);
    478 			return 1;
    479 		}
    480 	}
    481 	return 0;
    482 }
    483 
/*
 * Walk the list of autoconfig sets produced by rf_create_auto_sets():
 * configure each set that has enough components and is marked for
 * autoconfiguration, releasing the resources of the rest.  Afterwards,
 * unless the user hardwired a root device (rootspec), try to decide
 * whether one of the configured, root-eligible RAID sets should become
 * booted_device.  With several candidates, fall back to matching the
 * set that contains the firmware's boot device, and failing that ask
 * the user (RB_ASKNAME).
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	struct raid_softc *sc, *rsc;

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		/* rf_cleanup_config_set() frees cset, so grab the link now. */
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok\n",
				    sc->sc_unit);
				if (cset->rootable) {
					/* remember the last root-eligible set */
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	if (num_root == 1) {
		device_t candidate_root;
		if (rsc->sc_dkdev.dk_nwedges != 0) {
			/* XXX: How do we find the real root partition? */
			/* NOTE: cset is NULL here, but sizeof() does not
			   evaluate its operand, so this is safe. */
			char cname[sizeof(cset->ac->devname)];
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(rsc->sc_dev), 'a');
			/* NOTE(review): dkwedge_find_by_wname() may return
			   NULL if no such wedge exists — confirm callers of
			   booted_device tolerate that. */
			candidate_root = dkwedge_find_by_wname(cname);
		} else
			candidate_root = rsc->sc_dev;
#ifndef RAIDFRAME_FORCE_ROOT
		/* Only override booted_device if we have none, or if the
		   boot device is itself a component of this set. */
		if (booted_device == NULL
		    || rf_containsboot(&rsc->sc_r, booted_device))
#endif
		booted_device = candidate_root;
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* Count, among the valid root-eligible sets, those that
		   actually contain the boot device. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = rsc->sc_dev;
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    581 
    582 
    583 int
    584 raidsize(dev_t dev)
    585 {
    586 	struct raid_softc *rs;
    587 	struct disklabel *lp;
    588 	int     part, unit, omask, size;
    589 
    590 	unit = raidunit(dev);
    591 	if ((rs = raidget(unit)) == NULL)
    592 		return -1;
    593 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    594 		return (-1);
    595 
    596 	part = DISKPART(dev);
    597 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
    598 	lp = rs->sc_dkdev.dk_label;
    599 
    600 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
    601 		return (-1);
    602 
    603 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
    604 		size = -1;
    605 	else
    606 		size = lp->d_partitions[part].p_size *
    607 		    (lp->d_secsize / DEV_BSIZE);
    608 
    609 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
    610 		return (-1);
    611 
    612 	return (size);
    613 
    614 }
    615 
    616 int
    617 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    618 {
    619 	int     unit = raidunit(dev);
    620 	struct raid_softc *rs;
    621 	const struct bdevsw *bdev;
    622 	struct disklabel *lp;
    623 	RF_Raid_t *raidPtr;
    624 	daddr_t offset;
    625 	int     part, c, sparecol, j, scol, dumpto;
    626 	int     error = 0;
    627 
    628 	if ((rs = raidget(unit)) == NULL)
    629 		return ENXIO;
    630 
    631 	raidPtr = &rs->sc_r;
    632 
    633 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    634 		return ENXIO;
    635 
    636 	/* we only support dumping to RAID 1 sets */
    637 	if (raidPtr->Layout.numDataCol != 1 ||
    638 	    raidPtr->Layout.numParityCol != 1)
    639 		return EINVAL;
    640 
    641 
    642 	if ((error = raidlock(rs)) != 0)
    643 		return error;
    644 
    645 	if (size % DEV_BSIZE != 0) {
    646 		error = EINVAL;
    647 		goto out;
    648 	}
    649 
    650 	if (blkno + size / DEV_BSIZE > rs->sc_size) {
    651 		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
    652 		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
    653 		    size / DEV_BSIZE, rs->sc_size);
    654 		error = EINVAL;
    655 		goto out;
    656 	}
    657 
    658 	part = DISKPART(dev);
    659 	lp = rs->sc_dkdev.dk_label;
    660 	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;
    661 
    662 	/* figure out what device is alive.. */
    663 
    664 	/*
    665 	   Look for a component to dump to.  The preference for the
    666 	   component to dump to is as follows:
    667 	   1) the master
    668 	   2) a used_spare of the master
    669 	   3) the slave
    670 	   4) a used_spare of the slave
    671 	*/
    672 
    673 	dumpto = -1;
    674 	for (c = 0; c < raidPtr->numCol; c++) {
    675 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
    676 			/* this might be the one */
    677 			dumpto = c;
    678 			break;
    679 		}
    680 	}
    681 
    682 	/*
    683 	   At this point we have possibly selected a live master or a
    684 	   live slave.  We now check to see if there is a spared
    685 	   master (or a spared slave), if we didn't find a live master
    686 	   or a live slave.
    687 	*/
    688 
    689 	for (c = 0; c < raidPtr->numSpare; c++) {
    690 		sparecol = raidPtr->numCol + c;
    691 		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
    692 			/* How about this one? */
    693 			scol = -1;
    694 			for(j=0;j<raidPtr->numCol;j++) {
    695 				if (raidPtr->Disks[j].spareCol == sparecol) {
    696 					scol = j;
    697 					break;
    698 				}
    699 			}
    700 			if (scol == 0) {
    701 				/*
    702 				   We must have found a spared master!
    703 				   We'll take that over anything else
    704 				   found so far.  (We couldn't have
    705 				   found a real master before, since
    706 				   this is a used spare, and it's
    707 				   saying that it's replacing the
    708 				   master.)  On reboot (with
    709 				   autoconfiguration turned on)
    710 				   sparecol will become the 1st
    711 				   component (component0) of this set.
    712 				*/
    713 				dumpto = sparecol;
    714 				break;
    715 			} else if (scol != -1) {
    716 				/*
    717 				   Must be a spared slave.  We'll dump
    718 				   to that if we havn't found anything
    719 				   else so far.
    720 				*/
    721 				if (dumpto == -1)
    722 					dumpto = sparecol;
    723 			}
    724 		}
    725 	}
    726 
    727 	if (dumpto == -1) {
    728 		/* we couldn't find any live components to dump to!?!?
    729 		 */
    730 		error = EINVAL;
    731 		goto out;
    732 	}
    733 
    734 	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
    735 
    736 	/*
    737 	   Note that blkno is relative to this particular partition.
    738 	   By adding the offset of this partition in the RAID
    739 	   set, and also adding RF_PROTECTED_SECTORS, we get a
    740 	   value that is relative to the partition used for the
    741 	   underlying component.
    742 	*/
    743 
    744 	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
    745 				blkno + offset, va, size);
    746 
    747 out:
    748 	raidunlock(rs);
    749 
    750 	return error;
    751 }
/* ARGSUSED */
/*
 * Open entry point (both block and character devices).  Validates the
 * requested partition, reads the disklabel on first open of an inited
 * unit, records the open in the per-format open masks, and marks all
 * components dirty on the first open so that an unclean shutdown is
 * detectable.  Fails with EBUSY while the unit is shutting down or
 * when wedges exist and a non-raw partition is requested.
 */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int     part, pmask;
	int     error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	/* Refuse new opens while the unit is being torn down. */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* First open of a configured unit: (re)read the disklabel. */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}


	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

	/* NB: the success path falls through here too; "bad" only
	   releases the lock and returns error (0 on success). */
bad:
	raidunlock(rs);

	return (error);


}
    836 /* ARGSUSED */
    837 int
    838 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    839 {
    840 	int     unit = raidunit(dev);
    841 	struct raid_softc *rs;
    842 	int     error = 0;
    843 	int     part;
    844 
    845 	if ((rs = raidget(unit)) == NULL)
    846 		return ENXIO;
    847 
    848 	if ((error = raidlock(rs)) != 0)
    849 		return (error);
    850 
    851 	part = DISKPART(dev);
    852 
    853 	/* ...that much closer to allowing unconfiguration... */
    854 	switch (fmt) {
    855 	case S_IFCHR:
    856 		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
    857 		break;
    858 
    859 	case S_IFBLK:
    860 		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
    861 		break;
    862 	}
    863 	rs->sc_dkdev.dk_openmask =
    864 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    865 
    866 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    867 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    868 		/* Last one... device is not unconfigured yet.
    869 		   Device shutdown has taken care of setting the
    870 		   clean bits if RAIDF_INITED is not set
    871 		   mark things as clean... */
    872 
    873 		rf_update_component_labels(&rs->sc_r,
    874 						 RF_FINAL_COMPONENT_UPDATE);
    875 
    876 		/* If the kernel is shutting down, it will detach
    877 		 * this RAID set soon enough.
    878 		 */
    879 	}
    880 
    881 	raidunlock(rs);
    882 	return (0);
    883 
    884 }
    885 
/*
 * Block-device strategy routine.  Validates the request, bounds-checks
 * it against the raw device size (for RAW_PART) or the disklabel
 * (other partitions), then queues the buffer for the RAIDframe I/O
 * thread and wakes it.  On any error, sets bp->b_error and completes
 * the buffer immediately via biodone().
 */
void
raidstrategy(struct buf *bp)
{
	unsigned int unit = raidunit(bp->b_dev);
	RF_Raid_t *raidPtr;
	int     wlabel;
	struct raid_softc *rs;

	if ((rs = raidget(unit)) == NULL) {
		bp->b_error = ENXIO;
		goto done;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto done;
	}
	raidPtr = &rs->sc_r;
	if (!raidPtr->valid) {
		bp->b_error = ENODEV;
		goto done;
	}
	/* Zero-length transfers complete immediately with no error. */
	if (bp->b_bcount == 0) {
		db1_printf(("b_bcount is zero..\n"));
		goto done;
	}

	/*
	 * Do bounds checking and adjust transfer.  If there's an
	 * error, the bounds check will flag that for us.
	 */

	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
	if (DISKPART(bp->b_dev) == RAW_PART) {
		uint64_t size; /* device size in DEV_BSIZE unit */

		/* Convert totalSectors (native sector size, given by
		   logBytesPerSector) into DEV_BSIZE units. */
		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
			size = raidPtr->totalSectors <<
			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
		} else {
			size = raidPtr->totalSectors >>
			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
		}
		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
			goto done;
		}
	} else {
		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
			db1_printf(("Bounds check failed!!:%d %d\n",
				(int) bp->b_blkno, (int) wlabel));
			goto done;
		}
	}

	/* Hand the buffer to the I/O thread under iodone_lock. */
	rf_lock_mutex2(raidPtr->iodone_lock);

	bp->b_resid = 0;

	/* stuff it onto our queue */
	bufq_put(rs->buf_queue, bp);

	/* scheduled the IO to happen at the next convenient time */
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);

	return;

done:
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}
    956 /* ARGSUSED */
    957 int
    958 raidread(dev_t dev, struct uio *uio, int flags)
    959 {
    960 	int     unit = raidunit(dev);
    961 	struct raid_softc *rs;
    962 
    963 	if ((rs = raidget(unit)) == NULL)
    964 		return ENXIO;
    965 
    966 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    967 		return (ENXIO);
    968 
    969 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    970 
    971 }
    972 /* ARGSUSED */
    973 int
    974 raidwrite(dev_t dev, struct uio *uio, int flags)
    975 {
    976 	int     unit = raidunit(dev);
    977 	struct raid_softc *rs;
    978 
    979 	if ((rs = raidget(unit)) == NULL)
    980 		return ENXIO;
    981 
    982 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    983 		return (ENXIO);
    984 
    985 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    986 
    987 }
    988 
    989 static int
    990 raid_detach_unlocked(struct raid_softc *rs)
    991 {
    992 	int error;
    993 	RF_Raid_t *raidPtr;
    994 
    995 	raidPtr = &rs->sc_r;
    996 
    997 	/*
    998 	 * If somebody has a partition mounted, we shouldn't
    999 	 * shutdown.
   1000 	 */
   1001 	if (rs->sc_dkdev.dk_openmask != 0)
   1002 		return EBUSY;
   1003 
   1004 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1005 		;	/* not initialized: nothing to do */
   1006 	else if ((error = rf_Shutdown(raidPtr)) != 0)
   1007 		return error;
   1008 	else
   1009 		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
   1010 
   1011 	/* Detach the disk. */
   1012 	dkwedge_delall(&rs->sc_dkdev);
   1013 	disk_detach(&rs->sc_dkdev);
   1014 	disk_destroy(&rs->sc_dkdev);
   1015 
   1016 	aprint_normal_dev(rs->sc_dev, "detached\n");
   1017 
   1018 	return 0;
   1019 }
   1020 
   1021 int
   1022 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1023 {
   1024 	int     unit = raidunit(dev);
   1025 	int     error = 0;
   1026 	int     part, pmask, s;
   1027 	cfdata_t cf;
   1028 	struct raid_softc *rs;
   1029 	RF_Config_t *k_cfg, *u_cfg;
   1030 	RF_Raid_t *raidPtr;
   1031 	RF_RaidDisk_t *diskPtr;
   1032 	RF_AccTotals_t *totals;
   1033 	RF_DeviceConfig_t *d_cfg, **ucfgp;
   1034 	u_char *specific_buf;
   1035 	int retcode = 0;
   1036 	int column;
   1037 /*	int raidid; */
   1038 	struct rf_recon_req *rrcopy, *rr;
   1039 	RF_ComponentLabel_t *clabel;
   1040 	RF_ComponentLabel_t *ci_label;
   1041 	RF_ComponentLabel_t **clabel_ptr;
   1042 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1043 	RF_SingleComponent_t component;
   1044 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
   1045 	int i, j, d;
   1046 #ifdef __HAVE_OLD_DISKLABEL
   1047 	struct disklabel newlabel;
   1048 #endif
   1049 	struct dkwedge_info *dkw;
   1050 
   1051 	if ((rs = raidget(unit)) == NULL)
   1052 		return ENXIO;
   1053 	raidPtr = &rs->sc_r;
   1054 
   1055 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1056 		(int) DISKPART(dev), (int) unit, cmd));
   1057 
   1058 	/* Must be open for writes for these commands... */
   1059 	switch (cmd) {
   1060 #ifdef DIOCGSECTORSIZE
   1061 	case DIOCGSECTORSIZE:
   1062 		*(u_int *)data = raidPtr->bytesPerSector;
   1063 		return 0;
   1064 	case DIOCGMEDIASIZE:
   1065 		*(off_t *)data =
   1066 		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
   1067 		return 0;
   1068 #endif
   1069 	case DIOCSDINFO:
   1070 	case DIOCWDINFO:
   1071 #ifdef __HAVE_OLD_DISKLABEL
   1072 	case ODIOCWDINFO:
   1073 	case ODIOCSDINFO:
   1074 #endif
   1075 	case DIOCWLABEL:
   1076 	case DIOCAWEDGE:
   1077 	case DIOCDWEDGE:
   1078 	case DIOCSSTRATEGY:
   1079 		if ((flag & FWRITE) == 0)
   1080 			return (EBADF);
   1081 	}
   1082 
   1083 	/* Must be initialized for these... */
   1084 	switch (cmd) {
   1085 	case DIOCGDINFO:
   1086 	case DIOCSDINFO:
   1087 	case DIOCWDINFO:
   1088 #ifdef __HAVE_OLD_DISKLABEL
   1089 	case ODIOCGDINFO:
   1090 	case ODIOCWDINFO:
   1091 	case ODIOCSDINFO:
   1092 	case ODIOCGDEFLABEL:
   1093 #endif
   1094 	case DIOCGPART:
   1095 	case DIOCWLABEL:
   1096 	case DIOCGDEFLABEL:
   1097 	case DIOCAWEDGE:
   1098 	case DIOCDWEDGE:
   1099 	case DIOCLWEDGES:
   1100 	case DIOCCACHESYNC:
   1101 	case RAIDFRAME_SHUTDOWN:
   1102 	case RAIDFRAME_REWRITEPARITY:
   1103 	case RAIDFRAME_GET_INFO:
   1104 	case RAIDFRAME_RESET_ACCTOTALS:
   1105 	case RAIDFRAME_GET_ACCTOTALS:
   1106 	case RAIDFRAME_KEEP_ACCTOTALS:
   1107 	case RAIDFRAME_GET_SIZE:
   1108 	case RAIDFRAME_FAIL_DISK:
   1109 	case RAIDFRAME_COPYBACK:
   1110 	case RAIDFRAME_CHECK_RECON_STATUS:
   1111 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1112 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1113 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1114 	case RAIDFRAME_ADD_HOT_SPARE:
   1115 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1116 	case RAIDFRAME_INIT_LABELS:
   1117 	case RAIDFRAME_REBUILD_IN_PLACE:
   1118 	case RAIDFRAME_CHECK_PARITY:
   1119 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1120 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1121 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1122 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1123 	case RAIDFRAME_SET_AUTOCONFIG:
   1124 	case RAIDFRAME_SET_ROOT:
   1125 	case RAIDFRAME_DELETE_COMPONENT:
   1126 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1127 	case RAIDFRAME_PARITYMAP_STATUS:
   1128 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1129 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1130 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1131 	case DIOCGSTRATEGY:
   1132 	case DIOCSSTRATEGY:
   1133 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1134 			return (ENXIO);
   1135 	}
   1136 
   1137 	switch (cmd) {
   1138 #ifdef COMPAT_50
   1139 	case RAIDFRAME_GET_INFO50:
   1140 		return rf_get_info50(raidPtr, data);
   1141 
   1142 	case RAIDFRAME_CONFIGURE50:
   1143 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1144 			return retcode;
   1145 		goto config;
   1146 #endif
   1147 		/* configure the system */
   1148 	case RAIDFRAME_CONFIGURE:
   1149 
   1150 		if (raidPtr->valid) {
   1151 			/* There is a valid RAID set running on this unit! */
   1152 			printf("raid%d: Device already configured!\n",unit);
   1153 			return(EINVAL);
   1154 		}
   1155 
   1156 		/* copy-in the configuration information */
   1157 		/* data points to a pointer to the configuration structure */
   1158 
   1159 		u_cfg = *((RF_Config_t **) data);
   1160 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1161 		if (k_cfg == NULL) {
   1162 			return (ENOMEM);
   1163 		}
   1164 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1165 		if (retcode) {
   1166 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1167 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1168 				retcode));
   1169 			return (retcode);
   1170 		}
   1171 		goto config;
   1172 	config:
   1173 		/* allocate a buffer for the layout-specific data, and copy it
   1174 		 * in */
   1175 		if (k_cfg->layoutSpecificSize) {
   1176 			if (k_cfg->layoutSpecificSize > 10000) {
   1177 				/* sanity check */
   1178 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1179 				return (EINVAL);
   1180 			}
   1181 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1182 			    (u_char *));
   1183 			if (specific_buf == NULL) {
   1184 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1185 				return (ENOMEM);
   1186 			}
   1187 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1188 			    k_cfg->layoutSpecificSize);
   1189 			if (retcode) {
   1190 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1191 				RF_Free(specific_buf,
   1192 					k_cfg->layoutSpecificSize);
   1193 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1194 					retcode));
   1195 				return (retcode);
   1196 			}
   1197 		} else
   1198 			specific_buf = NULL;
   1199 		k_cfg->layoutSpecific = specific_buf;
   1200 
   1201 		/* should do some kind of sanity check on the configuration.
   1202 		 * Store the sum of all the bytes in the last byte? */
   1203 
   1204 		/* configure the system */
   1205 
   1206 		/*
   1207 		 * Clear the entire RAID descriptor, just to make sure
   1208 		 *  there is no stale data left in the case of a
   1209 		 *  reconfiguration
   1210 		 */
   1211 		memset(raidPtr, 0, sizeof(*raidPtr));
   1212 		raidPtr->softc = rs;
   1213 		raidPtr->raidid = unit;
   1214 
   1215 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1216 
   1217 		if (retcode == 0) {
   1218 
   1219 			/* allow this many simultaneous IO's to
   1220 			   this RAID device */
   1221 			raidPtr->openings = RAIDOUTSTANDING;
   1222 
   1223 			raidinit(rs);
   1224 			rf_markalldirty(raidPtr);
   1225 		}
   1226 		/* free the buffers.  No return code here. */
   1227 		if (k_cfg->layoutSpecificSize) {
   1228 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1229 		}
   1230 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1231 
   1232 		return (retcode);
   1233 
   1234 		/* shutdown the system */
   1235 	case RAIDFRAME_SHUTDOWN:
   1236 
   1237 		part = DISKPART(dev);
   1238 		pmask = (1 << part);
   1239 
   1240 		if ((error = raidlock(rs)) != 0)
   1241 			return (error);
   1242 
   1243 		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
   1244 		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
   1245 			(rs->sc_dkdev.dk_copenmask & pmask)))
   1246 			retcode = EBUSY;
   1247 		else {
   1248 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1249 			rs->sc_dkdev.dk_copenmask &= ~pmask;
   1250 			rs->sc_dkdev.dk_bopenmask &= ~pmask;
   1251 			rs->sc_dkdev.dk_openmask &= ~pmask;
   1252 			retcode = 0;
   1253 		}
   1254 
   1255 		raidunlock(rs);
   1256 
   1257 		if (retcode != 0)
   1258 			return retcode;
   1259 
   1260 		/* free the pseudo device attach bits */
   1261 
   1262 		cf = device_cfdata(rs->sc_dev);
   1263 		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
   1264 			free(cf, M_RAIDFRAME);
   1265 
   1266 		return (retcode);
   1267 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1268 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1269 		/* need to read the component label for the disk indicated
   1270 		   by row,column in clabel */
   1271 
   1272 		/*
   1273 		 * Perhaps there should be an option to skip the in-core
   1274 		 * copy and hit the disk, as with disklabel(8).
   1275 		 */
   1276 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1277 
   1278 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1279 
   1280 		if (retcode) {
   1281 			RF_Free(clabel, sizeof(*clabel));
   1282 			return retcode;
   1283 		}
   1284 
   1285 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1286 
   1287 		column = clabel->column;
   1288 
   1289 		if ((column < 0) || (column >= raidPtr->numCol +
   1290 		    raidPtr->numSpare)) {
   1291 			RF_Free(clabel, sizeof(*clabel));
   1292 			return EINVAL;
   1293 		}
   1294 
   1295 		RF_Free(clabel, sizeof(*clabel));
   1296 
   1297 		clabel = raidget_component_label(raidPtr, column);
   1298 
   1299 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1300 
   1301 #if 0
   1302 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1303 		clabel = (RF_ComponentLabel_t *) data;
   1304 
   1305 		/* XXX check the label for valid stuff... */
   1306 		/* Note that some things *should not* get modified --
   1307 		   the user should be re-initing the labels instead of
   1308 		   trying to patch things.
   1309 		   */
   1310 
   1311 		raidid = raidPtr->raidid;
   1312 #ifdef DEBUG
   1313 		printf("raid%d: Got component label:\n", raidid);
   1314 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1315 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1316 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1317 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1318 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1319 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1320 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1321 #endif
   1322 		clabel->row = 0;
   1323 		column = clabel->column;
   1324 
   1325 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1326 			return(EINVAL);
   1327 		}
   1328 
   1329 		/* XXX this isn't allowed to do anything for now :-) */
   1330 
   1331 		/* XXX and before it is, we need to fill in the rest
   1332 		   of the fields!?!?!?! */
   1333 		memcpy(raidget_component_label(raidPtr, column),
   1334 		    clabel, sizeof(*clabel));
   1335 		raidflush_component_label(raidPtr, column);
   1336 		return (0);
   1337 #endif
   1338 
   1339 	case RAIDFRAME_INIT_LABELS:
   1340 		clabel = (RF_ComponentLabel_t *) data;
   1341 		/*
   1342 		   we only want the serial number from
   1343 		   the above.  We get all the rest of the information
   1344 		   from the config that was used to create this RAID
   1345 		   set.
   1346 		   */
   1347 
   1348 		raidPtr->serial_number = clabel->serial_number;
   1349 
   1350 		for(column=0;column<raidPtr->numCol;column++) {
   1351 			diskPtr = &raidPtr->Disks[column];
   1352 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1353 				ci_label = raidget_component_label(raidPtr,
   1354 				    column);
   1355 				/* Zeroing this is important. */
   1356 				memset(ci_label, 0, sizeof(*ci_label));
   1357 				raid_init_component_label(raidPtr, ci_label);
   1358 				ci_label->serial_number =
   1359 				    raidPtr->serial_number;
   1360 				ci_label->row = 0; /* we dont' pretend to support more */
   1361 				rf_component_label_set_partitionsize(ci_label,
   1362 				    diskPtr->partitionSize);
   1363 				ci_label->column = column;
   1364 				raidflush_component_label(raidPtr, column);
   1365 			}
   1366 			/* XXXjld what about the spares? */
   1367 		}
   1368 
   1369 		return (retcode);
   1370 	case RAIDFRAME_SET_AUTOCONFIG:
   1371 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1372 		printf("raid%d: New autoconfig value is: %d\n",
   1373 		       raidPtr->raidid, d);
   1374 		*(int *) data = d;
   1375 		return (retcode);
   1376 
   1377 	case RAIDFRAME_SET_ROOT:
   1378 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1379 		printf("raid%d: New rootpartition value is: %d\n",
   1380 		       raidPtr->raidid, d);
   1381 		*(int *) data = d;
   1382 		return (retcode);
   1383 
   1384 		/* initialize all parity */
   1385 	case RAIDFRAME_REWRITEPARITY:
   1386 
   1387 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1388 			/* Parity for RAID 0 is trivially correct */
   1389 			raidPtr->parity_good = RF_RAID_CLEAN;
   1390 			return(0);
   1391 		}
   1392 
   1393 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1394 			/* Re-write is already in progress! */
   1395 			return(EINVAL);
   1396 		}
   1397 
   1398 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1399 					   rf_RewriteParityThread,
   1400 					   raidPtr,"raid_parity");
   1401 		return (retcode);
   1402 
   1403 
   1404 	case RAIDFRAME_ADD_HOT_SPARE:
   1405 		sparePtr = (RF_SingleComponent_t *) data;
   1406 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1407 		retcode = rf_add_hot_spare(raidPtr, &component);
   1408 		return(retcode);
   1409 
   1410 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1411 		return(retcode);
   1412 
   1413 	case RAIDFRAME_DELETE_COMPONENT:
   1414 		componentPtr = (RF_SingleComponent_t *)data;
   1415 		memcpy( &component, componentPtr,
   1416 			sizeof(RF_SingleComponent_t));
   1417 		retcode = rf_delete_component(raidPtr, &component);
   1418 		return(retcode);
   1419 
   1420 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1421 		componentPtr = (RF_SingleComponent_t *)data;
   1422 		memcpy( &component, componentPtr,
   1423 			sizeof(RF_SingleComponent_t));
   1424 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1425 		return(retcode);
   1426 
   1427 	case RAIDFRAME_REBUILD_IN_PLACE:
   1428 
   1429 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1430 			/* Can't do this on a RAID 0!! */
   1431 			return(EINVAL);
   1432 		}
   1433 
   1434 		if (raidPtr->recon_in_progress == 1) {
   1435 			/* a reconstruct is already in progress! */
   1436 			return(EINVAL);
   1437 		}
   1438 
   1439 		componentPtr = (RF_SingleComponent_t *) data;
   1440 		memcpy( &component, componentPtr,
   1441 			sizeof(RF_SingleComponent_t));
   1442 		component.row = 0; /* we don't support any more */
   1443 		column = component.column;
   1444 
   1445 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1446 			return(EINVAL);
   1447 		}
   1448 
   1449 		rf_lock_mutex2(raidPtr->mutex);
   1450 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1451 		    (raidPtr->numFailures > 0)) {
   1452 			/* XXX 0 above shouldn't be constant!!! */
   1453 			/* some component other than this has failed.
   1454 			   Let's not make things worse than they already
   1455 			   are... */
   1456 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1457 			       raidPtr->raidid);
   1458 			printf("raid%d:     Col: %d   Too many failures.\n",
   1459 			       raidPtr->raidid, column);
   1460 			rf_unlock_mutex2(raidPtr->mutex);
   1461 			return (EINVAL);
   1462 		}
   1463 		if (raidPtr->Disks[column].status ==
   1464 		    rf_ds_reconstructing) {
   1465 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1466 			       raidPtr->raidid);
   1467 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1468 
   1469 			rf_unlock_mutex2(raidPtr->mutex);
   1470 			return (EINVAL);
   1471 		}
   1472 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1473 			rf_unlock_mutex2(raidPtr->mutex);
   1474 			return (EINVAL);
   1475 		}
   1476 		rf_unlock_mutex2(raidPtr->mutex);
   1477 
   1478 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1479 		if (rrcopy == NULL)
   1480 			return(ENOMEM);
   1481 
   1482 		rrcopy->raidPtr = (void *) raidPtr;
   1483 		rrcopy->col = column;
   1484 
   1485 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1486 					   rf_ReconstructInPlaceThread,
   1487 					   rrcopy,"raid_reconip");
   1488 		return(retcode);
   1489 
   1490 	case RAIDFRAME_GET_INFO:
   1491 		if (!raidPtr->valid)
   1492 			return (ENODEV);
   1493 		ucfgp = (RF_DeviceConfig_t **) data;
   1494 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1495 			  (RF_DeviceConfig_t *));
   1496 		if (d_cfg == NULL)
   1497 			return (ENOMEM);
   1498 		d_cfg->rows = 1; /* there is only 1 row now */
   1499 		d_cfg->cols = raidPtr->numCol;
   1500 		d_cfg->ndevs = raidPtr->numCol;
   1501 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1502 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1503 			return (ENOMEM);
   1504 		}
   1505 		d_cfg->nspares = raidPtr->numSpare;
   1506 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1507 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1508 			return (ENOMEM);
   1509 		}
   1510 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1511 		d = 0;
   1512 		for (j = 0; j < d_cfg->cols; j++) {
   1513 			d_cfg->devs[d] = raidPtr->Disks[j];
   1514 			d++;
   1515 		}
   1516 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1517 			d_cfg->spares[i] = raidPtr->Disks[j];
   1518 		}
   1519 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1520 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1521 
   1522 		return (retcode);
   1523 
   1524 	case RAIDFRAME_CHECK_PARITY:
   1525 		*(int *) data = raidPtr->parity_good;
   1526 		return (0);
   1527 
   1528 	case RAIDFRAME_PARITYMAP_STATUS:
   1529 		if (rf_paritymap_ineligible(raidPtr))
   1530 			return EINVAL;
   1531 		rf_paritymap_status(raidPtr->parity_map,
   1532 		    (struct rf_pmstat *)data);
   1533 		return 0;
   1534 
   1535 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1536 		if (rf_paritymap_ineligible(raidPtr))
   1537 			return EINVAL;
   1538 		if (raidPtr->parity_map == NULL)
   1539 			return ENOENT; /* ??? */
   1540 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1541 			(struct rf_pmparams *)data, 1))
   1542 			return EINVAL;
   1543 		return 0;
   1544 
   1545 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1546 		if (rf_paritymap_ineligible(raidPtr))
   1547 			return EINVAL;
   1548 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1549 		return 0;
   1550 
   1551 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1552 		if (rf_paritymap_ineligible(raidPtr))
   1553 			return EINVAL;
   1554 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1555 		/* XXX should errors be passed up? */
   1556 		return 0;
   1557 
   1558 	case RAIDFRAME_RESET_ACCTOTALS:
   1559 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1560 		return (0);
   1561 
   1562 	case RAIDFRAME_GET_ACCTOTALS:
   1563 		totals = (RF_AccTotals_t *) data;
   1564 		*totals = raidPtr->acc_totals;
   1565 		return (0);
   1566 
   1567 	case RAIDFRAME_KEEP_ACCTOTALS:
   1568 		raidPtr->keep_acc_totals = *(int *)data;
   1569 		return (0);
   1570 
   1571 	case RAIDFRAME_GET_SIZE:
   1572 		*(int *) data = raidPtr->totalSectors;
   1573 		return (0);
   1574 
   1575 		/* fail a disk & optionally start reconstruction */
   1576 	case RAIDFRAME_FAIL_DISK:
   1577 
   1578 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1579 			/* Can't do this on a RAID 0!! */
   1580 			return(EINVAL);
   1581 		}
   1582 
   1583 		rr = (struct rf_recon_req *) data;
   1584 		rr->row = 0;
   1585 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1586 			return (EINVAL);
   1587 
   1588 
   1589 		rf_lock_mutex2(raidPtr->mutex);
   1590 		if (raidPtr->status == rf_rs_reconstructing) {
   1591 			/* you can't fail a disk while we're reconstructing! */
   1592 			/* XXX wrong for RAID6 */
   1593 			rf_unlock_mutex2(raidPtr->mutex);
   1594 			return (EINVAL);
   1595 		}
   1596 		if ((raidPtr->Disks[rr->col].status ==
   1597 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1598 			/* some other component has failed.  Let's not make
   1599 			   things worse. XXX wrong for RAID6 */
   1600 			rf_unlock_mutex2(raidPtr->mutex);
   1601 			return (EINVAL);
   1602 		}
   1603 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1604 			/* Can't fail a spared disk! */
   1605 			rf_unlock_mutex2(raidPtr->mutex);
   1606 			return (EINVAL);
   1607 		}
   1608 		rf_unlock_mutex2(raidPtr->mutex);
   1609 
   1610 		/* make a copy of the recon request so that we don't rely on
   1611 		 * the user's buffer */
   1612 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1613 		if (rrcopy == NULL)
   1614 			return(ENOMEM);
   1615 		memcpy(rrcopy, rr, sizeof(*rr));
   1616 		rrcopy->raidPtr = (void *) raidPtr;
   1617 
   1618 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1619 					   rf_ReconThread,
   1620 					   rrcopy,"raid_recon");
   1621 		return (0);
   1622 
   1623 		/* invoke a copyback operation after recon on whatever disk
   1624 		 * needs it, if any */
   1625 	case RAIDFRAME_COPYBACK:
   1626 
   1627 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1628 			/* This makes no sense on a RAID 0!! */
   1629 			return(EINVAL);
   1630 		}
   1631 
   1632 		if (raidPtr->copyback_in_progress == 1) {
   1633 			/* Copyback is already in progress! */
   1634 			return(EINVAL);
   1635 		}
   1636 
   1637 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1638 					   rf_CopybackThread,
   1639 					   raidPtr,"raid_copyback");
   1640 		return (retcode);
   1641 
   1642 		/* return the percentage completion of reconstruction */
   1643 	case RAIDFRAME_CHECK_RECON_STATUS:
   1644 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1645 			/* This makes no sense on a RAID 0, so tell the
   1646 			   user it's done. */
   1647 			*(int *) data = 100;
   1648 			return(0);
   1649 		}
   1650 		if (raidPtr->status != rf_rs_reconstructing)
   1651 			*(int *) data = 100;
   1652 		else {
   1653 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1654 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1655 			} else {
   1656 				*(int *) data = 0;
   1657 			}
   1658 		}
   1659 		return (0);
   1660 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1661 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1662 		if (raidPtr->status != rf_rs_reconstructing) {
   1663 			progressInfo.remaining = 0;
   1664 			progressInfo.completed = 100;
   1665 			progressInfo.total = 100;
   1666 		} else {
   1667 			progressInfo.total =
   1668 				raidPtr->reconControl->numRUsTotal;
   1669 			progressInfo.completed =
   1670 				raidPtr->reconControl->numRUsComplete;
   1671 			progressInfo.remaining = progressInfo.total -
   1672 				progressInfo.completed;
   1673 		}
   1674 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1675 				  sizeof(RF_ProgressInfo_t));
   1676 		return (retcode);
   1677 
   1678 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1679 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1680 			/* This makes no sense on a RAID 0, so tell the
   1681 			   user it's done. */
   1682 			*(int *) data = 100;
   1683 			return(0);
   1684 		}
   1685 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1686 			*(int *) data = 100 *
   1687 				raidPtr->parity_rewrite_stripes_done /
   1688 				raidPtr->Layout.numStripe;
   1689 		} else {
   1690 			*(int *) data = 100;
   1691 		}
   1692 		return (0);
   1693 
   1694 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1695 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1696 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1697 			progressInfo.total = raidPtr->Layout.numStripe;
   1698 			progressInfo.completed =
   1699 				raidPtr->parity_rewrite_stripes_done;
   1700 			progressInfo.remaining = progressInfo.total -
   1701 				progressInfo.completed;
   1702 		} else {
   1703 			progressInfo.remaining = 0;
   1704 			progressInfo.completed = 100;
   1705 			progressInfo.total = 100;
   1706 		}
   1707 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1708 				  sizeof(RF_ProgressInfo_t));
   1709 		return (retcode);
   1710 
   1711 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1712 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1713 			/* This makes no sense on a RAID 0 */
   1714 			*(int *) data = 100;
   1715 			return(0);
   1716 		}
   1717 		if (raidPtr->copyback_in_progress == 1) {
   1718 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1719 				raidPtr->Layout.numStripe;
   1720 		} else {
   1721 			*(int *) data = 100;
   1722 		}
   1723 		return (0);
   1724 
   1725 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1726 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1727 		if (raidPtr->copyback_in_progress == 1) {
   1728 			progressInfo.total = raidPtr->Layout.numStripe;
   1729 			progressInfo.completed =
   1730 				raidPtr->copyback_stripes_done;
   1731 			progressInfo.remaining = progressInfo.total -
   1732 				progressInfo.completed;
   1733 		} else {
   1734 			progressInfo.remaining = 0;
   1735 			progressInfo.completed = 100;
   1736 			progressInfo.total = 100;
   1737 		}
   1738 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1739 				  sizeof(RF_ProgressInfo_t));
   1740 		return (retcode);
   1741 
   1742 		/* the sparetable daemon calls this to wait for the kernel to
   1743 		 * need a spare table. this ioctl does not return until a
   1744 		 * spare table is needed. XXX -- calling mpsleep here in the
   1745 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1746 		 * -- I should either compute the spare table in the kernel,
   1747 		 * or have a different -- XXX XXX -- interface (a different
   1748 		 * character device) for delivering the table     -- XXX */
   1749 #if 0
   1750 	case RAIDFRAME_SPARET_WAIT:
   1751 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1752 		while (!rf_sparet_wait_queue)
   1753 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1754 		waitreq = rf_sparet_wait_queue;
   1755 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1756 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1757 
   1758 		/* structure assignment */
   1759 		*((RF_SparetWait_t *) data) = *waitreq;
   1760 
   1761 		RF_Free(waitreq, sizeof(*waitreq));
   1762 		return (0);
   1763 
   1764 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1765 		 * code in it that will cause the dameon to exit */
   1766 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1767 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1768 		waitreq->fcol = -1;
   1769 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1770 		waitreq->next = rf_sparet_wait_queue;
   1771 		rf_sparet_wait_queue = waitreq;
   1772 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1773 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1774 		return (0);
   1775 
   1776 		/* used by the spare table daemon to deliver a spare table
   1777 		 * into the kernel */
   1778 	case RAIDFRAME_SEND_SPARET:
   1779 
   1780 		/* install the spare table */
   1781 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1782 
   1783 		/* respond to the requestor.  the return status of the spare
   1784 		 * table installation is passed in the "fcol" field */
   1785 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1786 		waitreq->fcol = retcode;
   1787 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1788 		waitreq->next = rf_sparet_resp_queue;
   1789 		rf_sparet_resp_queue = waitreq;
   1790 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1791 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1792 
   1793 		return (retcode);
   1794 #endif
   1795 
   1796 	default:
   1797 		break; /* fall through to the os-specific code below */
   1798 
   1799 	}
   1800 
   1801 	if (!raidPtr->valid)
   1802 		return (EINVAL);
   1803 
   1804 	/*
   1805 	 * Add support for "regular" device ioctls here.
   1806 	 */
   1807 
   1808 	error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
   1809 	if (error != EPASSTHROUGH)
   1810 		return (error);
   1811 
   1812 	switch (cmd) {
   1813 	case DIOCGDINFO:
   1814 		*(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
   1815 		break;
   1816 #ifdef __HAVE_OLD_DISKLABEL
   1817 	case ODIOCGDINFO:
   1818 		newlabel = *(rs->sc_dkdev.dk_label);
   1819 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1820 			return ENOTTY;
   1821 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1822 		break;
   1823 #endif
   1824 
   1825 	case DIOCGPART:
   1826 		((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
   1827 		((struct partinfo *) data)->part =
   1828 		    &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
   1829 		break;
   1830 
   1831 	case DIOCWDINFO:
   1832 	case DIOCSDINFO:
   1833 #ifdef __HAVE_OLD_DISKLABEL
   1834 	case ODIOCWDINFO:
   1835 	case ODIOCSDINFO:
   1836 #endif
   1837 	{
   1838 		struct disklabel *lp;
   1839 #ifdef __HAVE_OLD_DISKLABEL
   1840 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
   1841 			memset(&newlabel, 0, sizeof newlabel);
   1842 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
   1843 			lp = &newlabel;
   1844 		} else
   1845 #endif
   1846 		lp = (struct disklabel *)data;
   1847 
   1848 		if ((error = raidlock(rs)) != 0)
   1849 			return (error);
   1850 
   1851 		rs->sc_flags |= RAIDF_LABELLING;
   1852 
   1853 		error = setdisklabel(rs->sc_dkdev.dk_label,
   1854 		    lp, 0, rs->sc_dkdev.dk_cpulabel);
   1855 		if (error == 0) {
   1856 			if (cmd == DIOCWDINFO
   1857 #ifdef __HAVE_OLD_DISKLABEL
   1858 			    || cmd == ODIOCWDINFO
   1859 #endif
   1860 			   )
   1861 				error = writedisklabel(RAIDLABELDEV(dev),
   1862 				    raidstrategy, rs->sc_dkdev.dk_label,
   1863 				    rs->sc_dkdev.dk_cpulabel);
   1864 		}
   1865 		rs->sc_flags &= ~RAIDF_LABELLING;
   1866 
   1867 		raidunlock(rs);
   1868 
   1869 		if (error)
   1870 			return (error);
   1871 		break;
   1872 	}
   1873 
   1874 	case DIOCWLABEL:
   1875 		if (*(int *) data != 0)
   1876 			rs->sc_flags |= RAIDF_WLABEL;
   1877 		else
   1878 			rs->sc_flags &= ~RAIDF_WLABEL;
   1879 		break;
   1880 
   1881 	case DIOCGDEFLABEL:
   1882 		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
   1883 		break;
   1884 
   1885 #ifdef __HAVE_OLD_DISKLABEL
   1886 	case ODIOCGDEFLABEL:
   1887 		raidgetdefaultlabel(raidPtr, rs, &newlabel);
   1888 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1889 			return ENOTTY;
   1890 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1891 		break;
   1892 #endif
   1893 
   1894 	case DIOCAWEDGE:
   1895 	case DIOCDWEDGE:
   1896 	    	dkw = (void *)data;
   1897 
   1898 		/* If the ioctl happens here, the parent is us. */
   1899 		(void)strcpy(dkw->dkw_parent, rs->sc_xname);
   1900 		return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
   1901 
   1902 	case DIOCLWEDGES:
   1903 		return dkwedge_list(&rs->sc_dkdev,
   1904 		    (struct dkwedge_list *)data, l);
   1905 	case DIOCCACHESYNC:
   1906 		return rf_sync_component_caches(raidPtr);
   1907 
   1908 	case DIOCGSTRATEGY:
   1909 	    {
   1910 		struct disk_strategy *dks = (void *)data;
   1911 
   1912 		s = splbio();
   1913 		strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
   1914 		    sizeof(dks->dks_name));
   1915 		splx(s);
   1916 		dks->dks_paramlen = 0;
   1917 
   1918 		return 0;
   1919 	    }
   1920 
   1921 	case DIOCSSTRATEGY:
   1922 	    {
   1923 		struct disk_strategy *dks = (void *)data;
   1924 		struct bufq_state *new;
   1925 		struct bufq_state *old;
   1926 
   1927 		if (dks->dks_param != NULL) {
   1928 			return EINVAL;
   1929 		}
   1930 		dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
   1931 		error = bufq_alloc(&new, dks->dks_name,
   1932 		    BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
   1933 		if (error) {
   1934 			return error;
   1935 		}
   1936 		s = splbio();
   1937 		old = rs->buf_queue;
   1938 		bufq_move(new, old);
   1939 		rs->buf_queue = new;
   1940 		splx(s);
   1941 		bufq_free(old);
   1942 
   1943 		return 0;
   1944 	    }
   1945 
   1946 	default:
   1947 		retcode = ENOTTY;
   1948 	}
   1949 	return (retcode);
   1950 
   1951 }
   1952 
   1953 
   1954 /* raidinit -- complete the rest of the initialization for the
   1955    RAIDframe device.  */
   1956 
   1957 
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	int     unit;
	RF_Raid_t *raidPtr = &rs->sc_r;

	unit = raidPtr->raidid;


	/* XXX should check return code first... */
	rs->sc_flags |= RAIDF_INITED;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	rs->sc_dev = config_attach_pseudo(cf);

	if (rs->sc_dev == NULL) {
		/* Attach failed: undo the INITED flag and release the
		 * cfdata we allocated above. */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		rs->sc_flags &= ~RAIDF_INITED;
		free(cf, M_RAIDFRAME);
		return;
	}

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */

	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
	disk_attach(&rs->sc_dkdev);
	disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Probe for wedges now that the disk is attached and sized. */
	dkwedge_discover(&rs->sc_dkdev);

	rf_set_geometry(rs, raidPtr);

}
   2009 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   2010 /* wake up the daemon & tell it to get us a spare table
   2011  * XXX
   2012  * the entries in the queues should be tagged with the raidPtr
   2013  * so that in the extremely rare case that two recons happen at once,
   2014  * we know for which device were requesting a spare table
   2015  * XXX
   2016  *
   2017  * XXX This code is not currently used. GO
   2018  */
   2019 int
   2020 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
   2021 {
   2022 	int     retcode;
   2023 
   2024 	rf_lock_mutex2(rf_sparet_wait_mutex);
   2025 	req->next = rf_sparet_wait_queue;
   2026 	rf_sparet_wait_queue = req;
   2027 	rf_broadcast_cond2(rf_sparet_wait_cv);
   2028 
   2029 	/* mpsleep unlocks the mutex */
   2030 	while (!rf_sparet_resp_queue) {
   2031 		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
   2032 	}
   2033 	req = rf_sparet_resp_queue;
   2034 	rf_sparet_resp_queue = req->next;
   2035 	rf_unlock_mutex2(rf_sparet_wait_mutex);
   2036 
   2037 	retcode = req->fcol;
   2038 	RF_Free(req, sizeof(*req));	/* this is not the same req as we
   2039 					 * alloc'd */
   2040 	return (retcode);
   2041 }
   2042 #endif
   2043 
   2044 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   2045  * bp & passes it down.
   2046  * any calls originating in the kernel must use non-blocking I/O
   2047  * do some extra sanity checking to return "appropriate" error values for
   2048  * certain conditions (to make some standard utilities work)
   2049  *
   2050  * Formerly known as: rf_DoAccessKernel
   2051  */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	struct raid_softc *rs;
	int     do_async;
	struct buf *bp;
	int rc;

	rs = raidPtr->softc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* drop the mutex: label updates must not hold it */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	/* Loop invariant: raidPtr->mutex is held at the top of each
	 * iteration and released while the request is processed. */
	while (raidPtr->openings > 0) {
		rf_unlock_mutex2(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
			/* nothing more to do */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		/* pb is 1 if the request ends in a partial sector */
		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		/* NOTE: the "1 ||" forces this debug block on. */
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* The "sum < ..." comparisons catch arithmetic wraparound
		 * as well as requests past the end of the set. */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* reject transfers that aren't a sector-size multiple */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->openings--;
		rf_unlock_mutex2(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			/* NOTE(review): this error path calls biodone()
			 * after disk_busy() without an obvious
			 * disk_unbusy() here — confirm the completion
			 * path accounts for it. */
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		rf_lock_mutex2(raidPtr->mutex);
	}
	rf_unlock_mutex2(raidPtr->mutex);
}
   2169 
   2170 
   2171 
   2172 
   2173 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2174 
   2175 int
   2176 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2177 {
   2178 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2179 	struct buf *bp;
   2180 
   2181 	req->queue = queue;
   2182 	bp = req->bp;
   2183 
   2184 	switch (req->type) {
   2185 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2186 		/* XXX need to do something extra here.. */
   2187 		/* I'm leaving this in, as I've never actually seen it used,
   2188 		 * and I'd like folks to report it... GO */
   2189 		printf(("WAKEUP CALLED\n"));
   2190 		queue->numOutstanding++;
   2191 
   2192 		bp->b_flags = 0;
   2193 		bp->b_private = req;
   2194 
   2195 		KernelWakeupFunc(bp);
   2196 		break;
   2197 
   2198 	case RF_IO_TYPE_READ:
   2199 	case RF_IO_TYPE_WRITE:
   2200 #if RF_ACC_TRACE > 0
   2201 		if (req->tracerec) {
   2202 			RF_ETIMER_START(req->tracerec->timer);
   2203 		}
   2204 #endif
   2205 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2206 		    op, queue->rf_cinfo->ci_dev,
   2207 		    req->sectorOffset, req->numSector,
   2208 		    req->buf, KernelWakeupFunc, (void *) req,
   2209 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2210 
   2211 		if (rf_debugKernelAccess) {
   2212 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2213 				(long) bp->b_blkno));
   2214 		}
   2215 		queue->numOutstanding++;
   2216 		queue->last_deq_sector = req->sectorOffset;
   2217 		/* acc wouldn't have been let in if there were any pending
   2218 		 * reqs at any other priority */
   2219 		queue->curPriority = req->priority;
   2220 
   2221 		db1_printf(("Going for %c to unit %d col %d\n",
   2222 			    req->type, queue->raidPtr->raidid,
   2223 			    queue->col));
   2224 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2225 			(int) req->sectorOffset, (int) req->numSector,
   2226 			(int) (req->numSector <<
   2227 			    queue->raidPtr->logBytesPerSector),
   2228 			(int) queue->raidPtr->logBytesPerSector));
   2229 
   2230 		/*
   2231 		 * XXX: drop lock here since this can block at
   2232 		 * least with backing SCSI devices.  Retake it
   2233 		 * to minimize fuss with calling interfaces.
   2234 		 */
   2235 
   2236 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2237 		bdev_strategy(bp);
   2238 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2239 		break;
   2240 
   2241 	default:
   2242 		panic("bad req->type in rf_DispatchKernelIO");
   2243 	}
   2244 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2245 
   2246 	return (0);
   2247 }
   2248 /* this is the callback function associated with a I/O invoked from
   2249    kernel code.
   2250  */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* the request was stashed in b_private by InitBP() */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error.  Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures is what triggers the label
			 * update in raidstart() */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2316 
   2317 
   2318 /*
   2319  * initialize a buf structure for doing an I/O in the kernel.
   2320  */
   2321 static void
   2322 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2323        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2324        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2325        struct proc *b_proc)
   2326 {
   2327 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2328 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2329 	bp->b_oflags = 0;
   2330 	bp->b_cflags = 0;
   2331 	bp->b_bcount = numSect << logBytesPerSector;
   2332 	bp->b_bufsize = bp->b_bcount;
   2333 	bp->b_error = 0;
   2334 	bp->b_dev = dev;
   2335 	bp->b_data = bf;
   2336 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2337 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2338 	if (bp->b_bcount == 0) {
   2339 		panic("bp->b_bcount is zero in InitBP!!");
   2340 	}
   2341 	bp->b_proc = b_proc;
   2342 	bp->b_iodone = cbFunc;
   2343 	bp->b_private = cbArg;
   2344 }
   2345 
   2346 static void
   2347 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
   2348 		    struct disklabel *lp)
   2349 {
   2350 	memset(lp, 0, sizeof(*lp));
   2351 
   2352 	/* fabricate a label... */
   2353 	lp->d_secperunit = raidPtr->totalSectors;
   2354 	lp->d_secsize = raidPtr->bytesPerSector;
   2355 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   2356 	lp->d_ntracks = 4 * raidPtr->numCol;
   2357 	lp->d_ncylinders = raidPtr->totalSectors /
   2358 		(lp->d_nsectors * lp->d_ntracks);
   2359 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
   2360 
   2361 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
   2362 	lp->d_type = DTYPE_RAID;
   2363 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
   2364 	lp->d_rpm = 3600;
   2365 	lp->d_interleave = 1;
   2366 	lp->d_flags = 0;
   2367 
   2368 	lp->d_partitions[RAW_PART].p_offset = 0;
   2369 	lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
   2370 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
   2371 	lp->d_npartitions = RAW_PART + 1;
   2372 
   2373 	lp->d_magic = DISKMAGIC;
   2374 	lp->d_magic2 = DISKMAGIC;
   2375 	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
   2376 
   2377 }
   2378 /*
   2379  * Read the disklabel from the raid device.  If one is not present, fake one
   2380  * up.
   2381  */
   2382 static void
   2383 raidgetdisklabel(dev_t dev)
   2384 {
   2385 	int     unit = raidunit(dev);
   2386 	struct raid_softc *rs;
   2387 	const char   *errstring;
   2388 	struct disklabel *lp;
   2389 	struct cpu_disklabel *clp;
   2390 	RF_Raid_t *raidPtr;
   2391 
   2392 	if ((rs = raidget(unit)) == NULL)
   2393 		return;
   2394 
   2395 	lp = rs->sc_dkdev.dk_label;
   2396 	clp = rs->sc_dkdev.dk_cpulabel;
   2397 
   2398 	db1_printf(("Getting the disklabel...\n"));
   2399 
   2400 	memset(clp, 0, sizeof(*clp));
   2401 
   2402 	raidPtr = &rs->sc_r;
   2403 
   2404 	raidgetdefaultlabel(raidPtr, rs, lp);
   2405 
   2406 	/*
   2407 	 * Call the generic disklabel extraction routine.
   2408 	 */
   2409 	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
   2410 	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
   2411 	if (errstring)
   2412 		raidmakedisklabel(rs);
   2413 	else {
   2414 		int     i;
   2415 		struct partition *pp;
   2416 
   2417 		/*
   2418 		 * Sanity check whether the found disklabel is valid.
   2419 		 *
   2420 		 * This is necessary since total size of the raid device
   2421 		 * may vary when an interleave is changed even though exactly
   2422 		 * same components are used, and old disklabel may used
   2423 		 * if that is found.
   2424 		 */
   2425 		if (lp->d_secperunit != rs->sc_size)
   2426 			printf("raid%d: WARNING: %s: "
   2427 			    "total sector size in disklabel (%" PRIu32 ") != "
   2428 			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
   2429 			    lp->d_secperunit, rs->sc_size);
   2430 		for (i = 0; i < lp->d_npartitions; i++) {
   2431 			pp = &lp->d_partitions[i];
   2432 			if (pp->p_offset + pp->p_size > rs->sc_size)
   2433 				printf("raid%d: WARNING: %s: end of partition `%c' "
   2434 				       "exceeds the size of raid (%" PRIu64 ")\n",
   2435 				       unit, rs->sc_xname, 'a' + i, rs->sc_size);
   2436 		}
   2437 	}
   2438 
   2439 }
   2440 /*
   2441  * Take care of things one might want to take care of in the event
   2442  * that a disklabel isn't present.
   2443  */
   2444 static void
   2445 raidmakedisklabel(struct raid_softc *rs)
   2446 {
   2447 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2448 	db1_printf(("Making a label..\n"));
   2449 
   2450 	/*
   2451 	 * For historical reasons, if there's no disklabel present
   2452 	 * the raw partition must be marked FS_BSDFFS.
   2453 	 */
   2454 
   2455 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
   2456 
   2457 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
   2458 
   2459 	lp->d_checksum = dkcksum(lp);
   2460 }
   2461 /*
   2462  * Wait interruptibly for an exclusive lock.
   2463  *
   2464  * XXX
   2465  * Several drivers do this; it should be abstracted and made MP-safe.
   2466  * (Hmm... where have we seen this warning before :->  GO )
   2467  */
   2468 static int
   2469 raidlock(struct raid_softc *rs)
   2470 {
   2471 	int     error;
   2472 
   2473 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2474 		rs->sc_flags |= RAIDF_WANTED;
   2475 		if ((error =
   2476 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
   2477 			return (error);
   2478 	}
   2479 	rs->sc_flags |= RAIDF_LOCKED;
   2480 	return (0);
   2481 }
   2482 /*
   2483  * Unlock and wake up any waiters.
   2484  */
   2485 static void
   2486 raidunlock(struct raid_softc *rs)
   2487 {
   2488 
   2489 	rs->sc_flags &= ~RAIDF_LOCKED;
   2490 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2491 		rs->sc_flags &= ~RAIDF_WANTED;
   2492 		wakeup(rs);
   2493 	}
   2494 }
   2495 
   2496 
   2497 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2498 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2499 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2500 
   2501 static daddr_t
   2502 rf_component_info_offset(void)
   2503 {
   2504 
   2505 	return RF_COMPONENT_INFO_OFFSET;
   2506 }
   2507 
   2508 static daddr_t
   2509 rf_component_info_size(unsigned secsize)
   2510 {
   2511 	daddr_t info_size;
   2512 
   2513 	KASSERT(secsize);
   2514 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2515 		info_size = secsize;
   2516 	else
   2517 		info_size = RF_COMPONENT_INFO_SIZE;
   2518 
   2519 	return info_size;
   2520 }
   2521 
   2522 static daddr_t
   2523 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2524 {
   2525 	daddr_t map_offset;
   2526 
   2527 	KASSERT(raidPtr->bytesPerSector);
   2528 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2529 		map_offset = raidPtr->bytesPerSector;
   2530 	else
   2531 		map_offset = RF_COMPONENT_INFO_SIZE;
   2532 	map_offset += rf_component_info_offset();
   2533 
   2534 	return map_offset;
   2535 }
   2536 
   2537 static daddr_t
   2538 rf_parity_map_size(RF_Raid_t *raidPtr)
   2539 {
   2540 	daddr_t map_size;
   2541 
   2542 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2543 		map_size = raidPtr->bytesPerSector;
   2544 	else
   2545 		map_size = RF_PARITY_MAP_SIZE;
   2546 
   2547 	return map_size;
   2548 }
   2549 
   2550 int
   2551 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2552 {
   2553 	RF_ComponentLabel_t *clabel;
   2554 
   2555 	clabel = raidget_component_label(raidPtr, col);
   2556 	clabel->clean = RF_RAID_CLEAN;
   2557 	raidflush_component_label(raidPtr, col);
   2558 	return(0);
   2559 }
   2560 
   2561 
   2562 int
   2563 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2564 {
   2565 	RF_ComponentLabel_t *clabel;
   2566 
   2567 	clabel = raidget_component_label(raidPtr, col);
   2568 	clabel->clean = RF_RAID_DIRTY;
   2569 	raidflush_component_label(raidPtr, col);
   2570 	return(0);
   2571 }
   2572 
   2573 int
   2574 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2575 {
   2576 	KASSERT(raidPtr->bytesPerSector);
   2577 	return raidread_component_label(raidPtr->bytesPerSector,
   2578 	    raidPtr->Disks[col].dev,
   2579 	    raidPtr->raid_cinfo[col].ci_vp,
   2580 	    &raidPtr->raid_cinfo[col].ci_label);
   2581 }
   2582 
   2583 RF_ComponentLabel_t *
   2584 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2585 {
   2586 	return &raidPtr->raid_cinfo[col].ci_label;
   2587 }
   2588 
   2589 int
   2590 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2591 {
   2592 	RF_ComponentLabel_t *label;
   2593 
   2594 	label = &raidPtr->raid_cinfo[col].ci_label;
   2595 	label->mod_counter = raidPtr->mod_counter;
   2596 #ifndef RF_NO_PARITY_MAP
   2597 	label->parity_map_modcount = label->mod_counter;
   2598 #endif
   2599 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2600 	    raidPtr->Disks[col].dev,
   2601 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2602 }
   2603 
   2604 
   2605 static int
   2606 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2607     RF_ComponentLabel_t *clabel)
   2608 {
   2609 	return raidread_component_area(dev, b_vp, clabel,
   2610 	    sizeof(RF_ComponentLabel_t),
   2611 	    rf_component_info_offset(),
   2612 	    rf_component_info_size(secsize));
   2613 }
   2614 
   2615 /* ARGSUSED */
   2616 static int
   2617 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2618     size_t msize, daddr_t offset, daddr_t dsize)
   2619 {
   2620 	struct buf *bp;
   2621 	const struct bdevsw *bdev;
   2622 	int error;
   2623 
   2624 	/* XXX should probably ensure that we don't try to do this if
   2625 	   someone has changed rf_protected_sectors. */
   2626 
   2627 	if (b_vp == NULL) {
   2628 		/* For whatever reason, this component is not valid.
   2629 		   Don't try to read a component label from it. */
   2630 		return(EINVAL);
   2631 	}
   2632 
   2633 	/* get a block of the appropriate size... */
   2634 	bp = geteblk((int)dsize);
   2635 	bp->b_dev = dev;
   2636 
   2637 	/* get our ducks in a row for the read */
   2638 	bp->b_blkno = offset / DEV_BSIZE;
   2639 	bp->b_bcount = dsize;
   2640 	bp->b_flags |= B_READ;
   2641  	bp->b_resid = dsize;
   2642 
   2643 	bdev = bdevsw_lookup(bp->b_dev);
   2644 	if (bdev == NULL)
   2645 		return (ENXIO);
   2646 	(*bdev->d_strategy)(bp);
   2647 
   2648 	error = biowait(bp);
   2649 
   2650 	if (!error) {
   2651 		memcpy(data, bp->b_data, msize);
   2652 	}
   2653 
   2654 	brelse(bp, 0);
   2655 	return(error);
   2656 }
   2657 
   2658 
   2659 static int
   2660 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2661     RF_ComponentLabel_t *clabel)
   2662 {
   2663 	return raidwrite_component_area(dev, b_vp, clabel,
   2664 	    sizeof(RF_ComponentLabel_t),
   2665 	    rf_component_info_offset(),
   2666 	    rf_component_info_size(secsize), 0);
   2667 }
   2668 
   2669 /* ARGSUSED */
   2670 static int
   2671 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2672     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2673 {
   2674 	struct buf *bp;
   2675 	const struct bdevsw *bdev;
   2676 	int error;
   2677 
   2678 	/* get a block of the appropriate size... */
   2679 	bp = geteblk((int)dsize);
   2680 	bp->b_dev = dev;
   2681 
   2682 	/* get our ducks in a row for the write */
   2683 	bp->b_blkno = offset / DEV_BSIZE;
   2684 	bp->b_bcount = dsize;
   2685 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2686  	bp->b_resid = dsize;
   2687 
   2688 	memset(bp->b_data, 0, dsize);
   2689 	memcpy(bp->b_data, data, msize);
   2690 
   2691 	bdev = bdevsw_lookup(bp->b_dev);
   2692 	if (bdev == NULL)
   2693 		return (ENXIO);
   2694 	(*bdev->d_strategy)(bp);
   2695 	if (asyncp)
   2696 		return 0;
   2697 	error = biowait(bp);
   2698 	brelse(bp, 0);
   2699 	if (error) {
   2700 #if 1
   2701 		printf("Failed to write RAID component info!\n");
   2702 #endif
   2703 	}
   2704 
   2705 	return(error);
   2706 }
   2707 
   2708 void
   2709 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2710 {
   2711 	int c;
   2712 
   2713 	for (c = 0; c < raidPtr->numCol; c++) {
   2714 		/* Skip dead disks. */
   2715 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2716 			continue;
   2717 		/* XXXjld: what if an error occurs here? */
   2718 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2719 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2720 		    RF_PARITYMAP_NBYTE,
   2721 		    rf_parity_map_offset(raidPtr),
   2722 		    rf_parity_map_size(raidPtr), 0);
   2723 	}
   2724 }
   2725 
   2726 void
   2727 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2728 {
   2729 	struct rf_paritymap_ondisk tmp;
   2730 	int c,first;
   2731 
   2732 	first=1;
   2733 	for (c = 0; c < raidPtr->numCol; c++) {
   2734 		/* Skip dead disks. */
   2735 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2736 			continue;
   2737 		raidread_component_area(raidPtr->Disks[c].dev,
   2738 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2739 		    RF_PARITYMAP_NBYTE,
   2740 		    rf_parity_map_offset(raidPtr),
   2741 		    rf_parity_map_size(raidPtr));
   2742 		if (first) {
   2743 			memcpy(map, &tmp, sizeof(*map));
   2744 			first = 0;
   2745 		} else {
   2746 			rf_paritymap_merge(map, &tmp);
   2747 		}
   2748 	}
   2749 }
   2750 
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	/* bump the counter so these label writes supersede older ones */
	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	/* now the in-use spares, which sit past the last regular column */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare is standing in for */
			/* NOTE(review): scol is not reset per spare; if no
			   column matches this sparecol, clabel->column below
			   inherits the previous iteration's value (or -1)
			   -- confirm that is intended. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2810 
   2811 
/*
 * Write updated component labels (with a bumped mod_counter) to every
 * optimal component and every in-use spare.  When 'final' is
 * RF_FINAL_COMPONENT_UPDATE and the parity is known clean, the labels
 * are additionally marked clean, so the next configuration does not
 * force a parity rewrite.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			/* flush first; only then mark clean (if allowed) */
			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare replaced; scol keeps
			   its previous value if no match is found */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2886 
   2887 void
   2888 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2889 {
   2890 
   2891 	if (vp != NULL) {
   2892 		if (auto_configured == 1) {
   2893 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2894 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2895 			vput(vp);
   2896 
   2897 		} else {
   2898 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2899 		}
   2900 	}
   2901 }
   2902 
   2903 
   2904 void
   2905 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2906 {
   2907 	int r,c;
   2908 	struct vnode *vp;
   2909 	int acd;
   2910 
   2911 
   2912 	/* We take this opportunity to close the vnodes like we should.. */
   2913 
   2914 	for (c = 0; c < raidPtr->numCol; c++) {
   2915 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2916 		acd = raidPtr->Disks[c].auto_configured;
   2917 		rf_close_component(raidPtr, vp, acd);
   2918 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2919 		raidPtr->Disks[c].auto_configured = 0;
   2920 	}
   2921 
   2922 	for (r = 0; r < raidPtr->numSpare; r++) {
   2923 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2924 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2925 		rf_close_component(raidPtr, vp, acd);
   2926 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2927 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2928 	}
   2929 }
   2930 
   2931 
/*
 * Kernel thread body: fail the component named in 'req' and, if
 * RF_FDFLAGS_RECON is set, reconstruct its contents onto a spare.
 * Frees the request and exits the thread when done.
 */
void
rf_ReconThread(struct rf_recon_req *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();		/* block disk interrupts for the duration */
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* second arg: 1 = also initiate reconstruction to a spare */
	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2953 
/*
 * Kernel thread body: rewrite all parity on the array.  On success the
 * in-core parity status is marked clean (the on-disk clean bits are
 * only set at shutdown).  Wakes any thread waiting in shutdown for the
 * rewrite to finish, then exits.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2984 
   2985 
/*
 * Kernel thread body: copy reconstructed data from the spare back onto
 * a replaced component, then exit.  The in-progress flag lets ioctls
 * report/refuse concurrent operations.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   3000 
   3001 
/*
 * Kernel thread body: reconstruct the named column in place (i.e. onto
 * the same device, e.g. after a disk was replaced at the same location).
 * Frees the request and exits when the reconstruction completes.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   3019 
/*
 * Read the component label from the already-open vnode 'vp'.  If it
 * looks like a sane RAIDframe component (reasonable label, claimed
 * partition size fits in 'size'), prepend a new RF_AutoConfig_t entry
 * to 'ac_list'; the entry takes ownership of the still-open vnode.
 * Otherwise the vnode is closed and released here.  On memory
 * exhaustion the entire list built so far is torn down and NULL is
 * returned.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* free everything accumulated so far -- the caller
		       gets NULL back and must not reuse its old list */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: not a component, so drop the label and vnode */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   3077 
   3078 RF_AutoConfig_t *
   3079 rf_find_raid_components(void)
   3080 {
   3081 	struct vnode *vp;
   3082 	struct disklabel label;
   3083 	device_t dv;
   3084 	deviter_t di;
   3085 	dev_t dev;
   3086 	int bmajor, bminor, wedge, rf_part_found;
   3087 	int error;
   3088 	int i;
   3089 	RF_AutoConfig_t *ac_list;
   3090 	uint64_t numsecs;
   3091 	unsigned secsize;
   3092 
   3093 	/* initialize the AutoConfig list */
   3094 	ac_list = NULL;
   3095 
   3096 	/* we begin by trolling through *all* the devices on the system */
   3097 
   3098 	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
   3099 	     dv = deviter_next(&di)) {
   3100 
   3101 		/* we are only interested in disks... */
   3102 		if (device_class(dv) != DV_DISK)
   3103 			continue;
   3104 
   3105 		/* we don't care about floppies... */
   3106 		if (device_is_a(dv, "fd")) {
   3107 			continue;
   3108 		}
   3109 
   3110 		/* we don't care about CD's... */
   3111 		if (device_is_a(dv, "cd")) {
   3112 			continue;
   3113 		}
   3114 
   3115 		/* we don't care about md's... */
   3116 		if (device_is_a(dv, "md")) {
   3117 			continue;
   3118 		}
   3119 
   3120 		/* hdfd is the Atari/Hades floppy driver */
   3121 		if (device_is_a(dv, "hdfd")) {
   3122 			continue;
   3123 		}
   3124 
   3125 		/* fdisa is the Atari/Milan floppy driver */
   3126 		if (device_is_a(dv, "fdisa")) {
   3127 			continue;
   3128 		}
   3129 
   3130 		/* need to find the device_name_to_block_device_major stuff */
   3131 		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
   3132 
   3133 		rf_part_found = 0; /*No raid partition as yet*/
   3134 
   3135 		/* get a vnode for the raw partition of this disk */
   3136 
   3137 		wedge = device_is_a(dv, "dk");
   3138 		bminor = minor(device_unit(dv));
   3139 		dev = wedge ? makedev(bmajor, bminor) :
   3140 		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
   3141 		if (bdevvp(dev, &vp))
   3142 			panic("RAID can't alloc vnode");
   3143 
   3144 		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
   3145 
   3146 		if (error) {
   3147 			/* "Who cares."  Continue looking
   3148 			   for something that exists*/
   3149 			vput(vp);
   3150 			continue;
   3151 		}
   3152 
   3153 		error = getdisksize(vp, &numsecs, &secsize);
   3154 		if (error) {
   3155 			vput(vp);
   3156 			continue;
   3157 		}
   3158 		if (wedge) {
   3159 			struct dkwedge_info dkw;
   3160 			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
   3161 			    NOCRED);
   3162 			if (error) {
   3163 				printf("RAIDframe: can't get wedge info for "
   3164 				    "dev %s (%d)\n", device_xname(dv), error);
   3165 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3166 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3167 				vput(vp);
   3168 				continue;
   3169 			}
   3170 
   3171 			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
   3172 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3173 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3174 				vput(vp);
   3175 				continue;
   3176 			}
   3177 
   3178 			ac_list = rf_get_component(ac_list, dev, vp,
   3179 			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
   3180 			rf_part_found = 1; /*There is a raid component on this disk*/
   3181 			continue;
   3182 		}
   3183 
   3184 		/* Ok, the disk exists.  Go get the disklabel. */
   3185 		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
   3186 		if (error) {
   3187 			/*
   3188 			 * XXX can't happen - open() would
   3189 			 * have errored out (or faked up one)
   3190 			 */
   3191 			if (error != ENOTTY)
   3192 				printf("RAIDframe: can't get label for dev "
   3193 				    "%s (%d)\n", device_xname(dv), error);
   3194 		}
   3195 
   3196 		/* don't need this any more.  We'll allocate it again
   3197 		   a little later if we really do... */
   3198 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3199 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3200 		vput(vp);
   3201 
   3202 		if (error)
   3203 			continue;
   3204 
   3205 		rf_part_found = 0; /*No raid partitions yet*/
   3206 		for (i = 0; i < label.d_npartitions; i++) {
   3207 			char cname[sizeof(ac_list->devname)];
   3208 
   3209 			/* We only support partitions marked as RAID */
   3210 			if (label.d_partitions[i].p_fstype != FS_RAID)
   3211 				continue;
   3212 
   3213 			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
   3214 			if (bdevvp(dev, &vp))
   3215 				panic("RAID can't alloc vnode");
   3216 
   3217 			error = VOP_OPEN(vp, FREAD, NOCRED);
   3218 			if (error) {
   3219 				/* Whatever... */
   3220 				vput(vp);
   3221 				continue;
   3222 			}
   3223 			snprintf(cname, sizeof(cname), "%s%c",
   3224 			    device_xname(dv), 'a' + i);
   3225 			ac_list = rf_get_component(ac_list, dev, vp, cname,
   3226 				label.d_partitions[i].p_size, numsecs, secsize);
   3227 				rf_part_found = 1; /*There is at least one raid partition on this disk*/
   3228 		}
   3229 
   3230 		/*
   3231 		 *If there is no raid component on this disk, either in a
   3232 		 *disklabel or inside a wedge, check the raw partition as well,
   3233 		 *as it is possible to configure raid components on raw disk
   3234 		 *devices.
   3235 		 */
   3236 
   3237 		if (!rf_part_found) {
   3238 			char cname[sizeof(ac_list->devname)];
   3239 
   3240 			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
   3241 			if (bdevvp(dev, &vp))
   3242 				panic("RAID can't alloc vnode");
   3243 
   3244 			error = VOP_OPEN(vp, FREAD, NOCRED);
   3245 			if (error) {
   3246 				/* Whatever... */
   3247 				vput(vp);
   3248 				continue;
   3249 			}
   3250 			snprintf(cname, sizeof(cname), "%s%c",
   3251 			    device_xname(dv), 'a' + RAW_PART);
   3252 			ac_list = rf_get_component(ac_list, dev, vp, cname,
   3253 				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
   3254 		}
   3255 	}
   3256 	deviter_release(&di);
   3257 	return ac_list;
   3258 }
   3259 
   3260 
   3261 int
   3262 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3263 {
   3264 
   3265 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3266 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3267 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3268 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3269 	    clabel->row >=0 &&
   3270 	    clabel->column >= 0 &&
   3271 	    clabel->num_rows > 0 &&
   3272 	    clabel->num_columns > 0 &&
   3273 	    clabel->row < clabel->num_rows &&
   3274 	    clabel->column < clabel->num_columns &&
   3275 	    clabel->blockSize > 0 &&
   3276 	    /*
   3277 	     * numBlocksHi may contain garbage, but it is ok since
   3278 	     * the type is unsigned.  If it is really garbage,
   3279 	     * rf_fix_old_label_size() will fix it.
   3280 	     */
   3281 	    rf_component_label_numblocks(clabel) > 0) {
   3282 		/*
   3283 		 * label looks reasonable enough...
   3284 		 * let's make sure it has no old garbage.
   3285 		 */
   3286 		if (numsecs)
   3287 			rf_fix_old_label_size(clabel, numsecs);
   3288 		return(1);
   3289 	}
   3290 	return(0);
   3291 }
   3292 
   3293 
   3294 /*
   3295  * For reasons yet unknown, some old component labels have garbage in
   3296  * the newer numBlocksHi region, and this causes lossage.  Since those
   3297  * disks will also have numsecs set to less than 32 bits of sectors,
   3298  * we can determine when this corruption has occurred, and fix it.
   3299  *
   3300  * The exact same problem, with the same unknown reason, happens to
   3301  * the partitionSizeHi member as well.
   3302  */
   3303 static void
   3304 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3305 {
   3306 
   3307 	if (numsecs < ((uint64_t)1 << 32)) {
   3308 		if (clabel->numBlocksHi) {
   3309 			printf("WARNING: total sectors < 32 bits, yet "
   3310 			       "numBlocksHi set\n"
   3311 			       "WARNING: resetting numBlocksHi to zero.\n");
   3312 			clabel->numBlocksHi = 0;
   3313 		}
   3314 
   3315 		if (clabel->partitionSizeHi) {
   3316 			printf("WARNING: total sectors < 32 bits, yet "
   3317 			       "partitionSizeHi set\n"
   3318 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3319 			clabel->partitionSizeHi = 0;
   3320 		}
   3321 	}
   3322 }
   3323 
   3324 
#ifdef DEBUG
/*
 * Debug helper: dump the interesting fields of a component label to
 * the console.  Compiled in only under DEBUG.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;

	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Contains root partition: %s\n",
	       clabel->root_partition ? "Yes" : "No");
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3355 
   3356 RF_ConfigSet_t *
   3357 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3358 {
   3359 	RF_AutoConfig_t *ac;
   3360 	RF_ConfigSet_t *config_sets;
   3361 	RF_ConfigSet_t *cset;
   3362 	RF_AutoConfig_t *ac_next;
   3363 
   3364 
   3365 	config_sets = NULL;
   3366 
   3367 	/* Go through the AutoConfig list, and figure out which components
   3368 	   belong to what sets.  */
   3369 	ac = ac_list;
   3370 	while(ac!=NULL) {
   3371 		/* we're going to putz with ac->next, so save it here
   3372 		   for use at the end of the loop */
   3373 		ac_next = ac->next;
   3374 
   3375 		if (config_sets == NULL) {
   3376 			/* will need at least this one... */
   3377 			config_sets = (RF_ConfigSet_t *)
   3378 				malloc(sizeof(RF_ConfigSet_t),
   3379 				       M_RAIDFRAME, M_NOWAIT);
   3380 			if (config_sets == NULL) {
   3381 				panic("rf_create_auto_sets: No memory!");
   3382 			}
   3383 			/* this one is easy :) */
   3384 			config_sets->ac = ac;
   3385 			config_sets->next = NULL;
   3386 			config_sets->rootable = 0;
   3387 			ac->next = NULL;
   3388 		} else {
   3389 			/* which set does this component fit into? */
   3390 			cset = config_sets;
   3391 			while(cset!=NULL) {
   3392 				if (rf_does_it_fit(cset, ac)) {
   3393 					/* looks like it matches... */
   3394 					ac->next = cset->ac;
   3395 					cset->ac = ac;
   3396 					break;
   3397 				}
   3398 				cset = cset->next;
   3399 			}
   3400 			if (cset==NULL) {
   3401 				/* didn't find a match above... new set..*/
   3402 				cset = (RF_ConfigSet_t *)
   3403 					malloc(sizeof(RF_ConfigSet_t),
   3404 					       M_RAIDFRAME, M_NOWAIT);
   3405 				if (cset == NULL) {
   3406 					panic("rf_create_auto_sets: No memory!");
   3407 				}
   3408 				cset->ac = ac;
   3409 				ac->next = NULL;
   3410 				cset->next = config_sets;
   3411 				cset->rootable = 0;
   3412 				config_sets = cset;
   3413 			}
   3414 		}
   3415 		ac = ac_next;
   3416 	}
   3417 
   3418 
   3419 	return(config_sets);
   3420 }
   3421 
/*
 * Decide whether component 'ac' belongs to configuration set 'cset' by
 * comparing its component label field-by-field against the label of
 * the set's first member.  Returns 1 on a match, 0 otherwise.
 */
static int
rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
{
	RF_ComponentLabel_t *clabel1, *clabel2;

	/* If this one matches the *first* one in the set, that's good
	   enough, since the other members of the set would have been
	   through here too... */
	/* note that we are not checking partitionSize here..

	   Note that we are also not checking the mod_counters here.
	   If everything else matches except the mod_counter, that's
	   good enough for this test.  We will deal with the mod_counters
	   a little later in the autoconfiguration process.

	    (clabel1->mod_counter == clabel2->mod_counter) &&

	   The reason we don't check for this is that failed disks
	   will have lower modification counts.  If those disks are
	   not added to the set they used to belong to, then they will
	   form their own set, which may result in 2 different sets,
	   for example, competing to be configured at raid0, and
	   perhaps competing to be the root filesystem set.  If the
	   wrong ones get configured, or both attempt to become /,
	   weird behaviour and or serious lossage will occur.  Thus we
	   need to bring them into the fold here, and kick them out at
	   a later point.

	*/

	clabel1 = cset->ac->clabel;
	clabel2 = ac->clabel;
	if ((clabel1->version == clabel2->version) &&
	    (clabel1->serial_number == clabel2->serial_number) &&
	    (clabel1->num_rows == clabel2->num_rows) &&
	    (clabel1->num_columns == clabel2->num_columns) &&
	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
	    (clabel1->parityConfig == clabel2->parityConfig) &&
	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
	    (clabel1->blockSize == clabel2->blockSize) &&
	    rf_component_label_numblocks(clabel1) ==
	    rf_component_label_numblocks(clabel2) &&
	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
	    (clabel1->root_partition == clabel2->root_partition) &&
	    (clabel1->last_unit == clabel2->last_unit) &&
	    (clabel1->config_order == clabel2->config_order)) {
		/* if it get's here, it almost *has* to be a match */
	} else {
		/* it's not consistent with somebody in the set..
		   punt */
		return(0);
	}
	/* all was fine.. it must fit... */
	return(1);
}
   3479 
/*
 * Determine whether the configuration set has enough live components
 * (those stamped with the highest mod_counter seen in the set) to be
 * configured.  Returns 1 if the set is viable, 0 otherwise.
 *
 * RAID 1 gets special treatment: components are considered in
 * even/odd mirror pairs, and the set is only rejected when both
 * halves of the same pair are missing.  For RAID 0 no component may
 * be missing; for RAID 4/5 at most one may be.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* the "right" mod_counter is the maximum over all members;
	   stale (failed) components carry lower values */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* look for a current (mod_counter matches) component
		   occupying column c */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3582 
/*
 * Build an RF_Config_t for auto-configuration from the component
 * labels on the 'ac' list.  The array-wide parameters are taken from
 * the first component's label; the device name of each component is
 * placed at its labelled column.  'raidPtr' is currently unused here
 * but kept for interface symmetry with the rest of the configuration
 * path.
 */
void
rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
			RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int i;

	clabel = ac->clabel;

	/* 1. Fill in the common stuff */
	config->numRow = clabel->num_rows = 1;
	config->numCol = clabel->num_columns;
	config->numSpare = 0; /* XXX should this be set here? */
	config->sectPerSU = clabel->sectPerSU;
	config->SUsPerPU = clabel->SUsPerPU;
	config->SUsPerRU = clabel->SUsPerRU;
	config->parityConfig = clabel->parityConfig;
	/* XXX... */
	strcpy(config->diskQueueType,"fifo");
	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
	config->layoutSpecificSize = 0; /* XXX ?? */

	/* 2. Place each component's device name at its column */
	while(ac!=NULL) {
		/* row/col values will be in range due to the checks
		   in reasonable_label() */
		strcpy(config->devnames[0][ac->clabel->column],
		       ac->devname);
		ac = ac->next;
	}

	/* 3. No debug variables for auto-configured arrays */
	for(i=0;i<RF_MAXDBGV;i++) {
		config->debugVars[i][0] = 0;
	}
}
   3617 
   3618 int
   3619 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3620 {
   3621 	RF_ComponentLabel_t *clabel;
   3622 	int column;
   3623 	int sparecol;
   3624 
   3625 	raidPtr->autoconfigure = new_value;
   3626 
   3627 	for(column=0; column<raidPtr->numCol; column++) {
   3628 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3629 			clabel = raidget_component_label(raidPtr, column);
   3630 			clabel->autoconfigure = new_value;
   3631 			raidflush_component_label(raidPtr, column);
   3632 		}
   3633 	}
   3634 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3635 		sparecol = raidPtr->numCol + column;
   3636 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3637 			clabel = raidget_component_label(raidPtr, sparecol);
   3638 			clabel->autoconfigure = new_value;
   3639 			raidflush_component_label(raidPtr, sparecol);
   3640 		}
   3641 	}
   3642 	return(new_value);
   3643 }
   3644 
   3645 int
   3646 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3647 {
   3648 	RF_ComponentLabel_t *clabel;
   3649 	int column;
   3650 	int sparecol;
   3651 
   3652 	raidPtr->root_partition = new_value;
   3653 	for(column=0; column<raidPtr->numCol; column++) {
   3654 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3655 			clabel = raidget_component_label(raidPtr, column);
   3656 			clabel->root_partition = new_value;
   3657 			raidflush_component_label(raidPtr, column);
   3658 		}
   3659 	}
   3660 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3661 		sparecol = raidPtr->numCol + column;
   3662 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3663 			clabel = raidget_component_label(raidPtr, sparecol);
   3664 			clabel->root_partition = new_value;
   3665 			raidflush_component_label(raidPtr, sparecol);
   3666 		}
   3667 	}
   3668 	return(new_value);
   3669 }
   3670 
   3671 void
   3672 rf_release_all_vps(RF_ConfigSet_t *cset)
   3673 {
   3674 	RF_AutoConfig_t *ac;
   3675 
   3676 	ac = cset->ac;
   3677 	while(ac!=NULL) {
   3678 		/* Close the vp, and give it back */
   3679 		if (ac->vp) {
   3680 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3681 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
   3682 			vput(ac->vp);
   3683 			ac->vp = NULL;
   3684 		}
   3685 		ac = ac->next;
   3686 	}
   3687 }
   3688 
   3689 
   3690 void
   3691 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3692 {
   3693 	RF_AutoConfig_t *ac;
   3694 	RF_AutoConfig_t *next_ac;
   3695 
   3696 	ac = cset->ac;
   3697 	while(ac!=NULL) {
   3698 		next_ac = ac->next;
   3699 		/* nuke the label */
   3700 		free(ac->clabel, M_RAIDFRAME);
   3701 		/* cleanup the config structure */
   3702 		free(ac, M_RAIDFRAME);
   3703 		/* "next.." */
   3704 		ac = next_ac;
   3705 	}
   3706 	/* and, finally, nuke the config set */
   3707 	free(cset, M_RAIDFRAME);
   3708 }
   3709 
   3710 
/*
 * Initialize 'clabel' with the array-wide state from 'raidPtr':
 * version, serial/mod counters, geometry, layout parameters, and the
 * configuration flags.  The label is marked dirty and its status set
 * to optimal; per-column fields (row/column) are left for the caller
 * to fill in.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3743 
/*
 * Bring up an auto-detected RAID set described by cset.  Builds an
 * RF_Config_t from the component labels, picks a unit number, and runs
 * the normal configuration path.  Returns the configured softc, or
 * NULL if allocation or configuration failed.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("Out of mem!?!?\n");
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* Start at the unit recorded in the label; if that unit is
	   already configured, scan upward for the first free one. */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID); sc->sc_r.valid != 0; sc = raidget(++raidID))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		/* Success: attach the disk and mark parity regions
		   dirty until the next parity check completes. */
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		if (cset->ac->clabel->root_partition==1) {
			/* everything configured just fine.  Make a note
			   that this set is eligible to be root. */
			cset->rootable = 1;
			/* XXX do this here? */
			raidPtr->root_partition = 1;
		}
	} else {
		/* Configuration failed; give the unit back. */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3809 
   3810 void
   3811 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
   3812 {
   3813 	struct buf *bp;
   3814 	struct raid_softc *rs;
   3815 
   3816 	bp = (struct buf *)desc->bp;
   3817 	rs = desc->raidPtr->softc;
   3818 	disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid),
   3819 	    (bp->b_flags & B_READ));
   3820 }
   3821 
/*
 * Initialize a RAIDframe resource pool at IPL_BIO, pre-allocating
 * xmin items and capping the pool's idle-item count at xmax.
 * w_chan is the wait-channel name shown by tools such as ps(1).
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	/* xmax items may sit idle before excess ones are reclaimed */
	pool_sethiwat(p, xmax);
	/* populate the pool up front so allocations won't sleep early */
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
   3831 
   3832 /*
   3833  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buf_queue to see
   3834  * if there is IO pending and if that IO could possibly be done for a
   3835  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3836  * otherwise.
   3837  *
   3838  */
   3839 
   3840 int
   3841 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3842 {
   3843 	struct raid_softc *rs = raidPtr->softc;
   3844 	if ((bufq_peek(rs->buf_queue) != NULL) && raidPtr->openings > 0) {
   3845 		/* there is work to do */
   3846 		return 0;
   3847 	}
   3848 	/* default is nothing to do */
   3849 	return 1;
   3850 }
   3851 
   3852 int
   3853 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3854 {
   3855 	uint64_t numsecs;
   3856 	unsigned secsize;
   3857 	int error;
   3858 
   3859 	error = getdisksize(vp, &numsecs, &secsize);
   3860 	if (error == 0) {
   3861 		diskPtr->blockSize = secsize;
   3862 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3863 		diskPtr->partitionSize = numsecs;
   3864 		return 0;
   3865 	}
   3866 	return error;
   3867 }
   3868 
/*
 * Autoconfiguration match routine.  raid is a pseudo-device, so it
 * always matches.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3874 
/*
 * Autoconfiguration attach routine.  Intentionally empty: real setup
 * happens later, when a RAID set is configured (see raidinit and the
 * auto-configuration path).
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{

}
   3880 
   3881 
   3882 static int
   3883 raid_detach(device_t self, int flags)
   3884 {
   3885 	int error;
   3886 	struct raid_softc *rs = raidget(device_unit(self));
   3887 
   3888 	if (rs == NULL)
   3889 		return ENXIO;
   3890 
   3891 	if ((error = raidlock(rs)) != 0)
   3892 		return (error);
   3893 
   3894 	error = raid_detach_unlocked(rs);
   3895 
   3896 	raidunlock(rs);
   3897 
   3898 	/* XXXkd: raidput(rs) ??? */
   3899 
   3900 	return error;
   3901 }
   3902 
   3903 static void
   3904 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3905 {
   3906 	struct disk_geom *dg = &rs->sc_dkdev.dk_geom;
   3907 
   3908 	memset(dg, 0, sizeof(*dg));
   3909 
   3910 	dg->dg_secperunit = raidPtr->totalSectors;
   3911 	dg->dg_secsize = raidPtr->bytesPerSector;
   3912 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   3913 	dg->dg_ntracks = 4 * raidPtr->numCol;
   3914 
   3915 	disk_set_info(rs->sc_dev, &rs->sc_dkdev, NULL);
   3916 }
   3917 
   3918 /*
   3919  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3920  * We end up returning whatever error was returned by the first cache flush
   3921  * that fails.
   3922  */
   3923 
   3924 int
   3925 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3926 {
   3927 	int c, sparecol;
   3928 	int e,error;
   3929 	int force = 1;
   3930 
   3931 	error = 0;
   3932 	for (c = 0; c < raidPtr->numCol; c++) {
   3933 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3934 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3935 					  &force, FWRITE, NOCRED);
   3936 			if (e) {
   3937 				if (e != ENODEV)
   3938 					printf("raid%d: cache flush to component %s failed.\n",
   3939 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3940 				if (error == 0) {
   3941 					error = e;
   3942 				}
   3943 			}
   3944 		}
   3945 	}
   3946 
   3947 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3948 		sparecol = raidPtr->numCol + c;
   3949 		/* Need to ensure that the reconstruct actually completed! */
   3950 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3951 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3952 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3953 			if (e) {
   3954 				if (e != ENODEV)
   3955 					printf("raid%d: cache flush to component %s failed.\n",
   3956 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3957 				if (error == 0) {
   3958 					error = e;
   3959 				}
   3960 			}
   3961 		}
   3962 	}
   3963 	return error;
   3964 }
   3965