Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.315
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.315 2014/11/04 07:51:55 mlelstv Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.315 2014/11/04 07:51:55 mlelstv Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 
    130 #include <prop/proplib.h>
    131 
    132 #include <dev/raidframe/raidframevar.h>
    133 #include <dev/raidframe/raidframeio.h>
    134 #include <dev/raidframe/rf_paritymap.h>
    135 
    136 #include "rf_raid.h"
    137 #include "rf_copyback.h"
    138 #include "rf_dag.h"
    139 #include "rf_dagflags.h"
    140 #include "rf_desc.h"
    141 #include "rf_diskqueue.h"
    142 #include "rf_etimer.h"
    143 #include "rf_general.h"
    144 #include "rf_kintf.h"
    145 #include "rf_options.h"
    146 #include "rf_driver.h"
    147 #include "rf_parityscan.h"
    148 #include "rf_threadstuff.h"
    149 
    150 #ifdef COMPAT_50
    151 #include "rf_compat50.h"
    152 #endif
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    162 static rf_declare_mutex2(rf_sparet_wait_mutex);
    163 static rf_declare_cond2(rf_sparet_wait_cv);
    164 static rf_declare_cond2(rf_sparet_resp_cv);
    165 
    166 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    167 						 * spare table */
    168 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    169 						 * installation process */
    170 #endif
    171 
    172 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    173 
    174 /* prototypes */
    175 static void KernelWakeupFunc(struct buf *);
    176 static void InitBP(struct buf *, struct vnode *, unsigned,
    177     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    178     void *, int, struct proc *);
    179 struct raid_softc;
    180 static void raidinit(struct raid_softc *);
    181 
    182 void raidattach(int);
    183 static int raid_match(device_t, cfdata_t, void *);
    184 static void raid_attach(device_t, device_t, void *);
    185 static int raid_detach(device_t, int);
    186 
    187 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    188     daddr_t, daddr_t);
    189 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t, int);
    191 
    192 static int raidwrite_component_label(unsigned,
    193     dev_t, struct vnode *, RF_ComponentLabel_t *);
    194 static int raidread_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 
    197 
    198 dev_type_open(raidopen);
    199 dev_type_close(raidclose);
    200 dev_type_read(raidread);
    201 dev_type_write(raidwrite);
    202 dev_type_ioctl(raidioctl);
    203 dev_type_strategy(raidstrategy);
    204 dev_type_dump(raiddump);
    205 dev_type_size(raidsize);
    206 
/* Block-device switch: entry points for the block view of raidN. */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

/* Character-device switch: entry points for the raw view of raidN. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

/* Glue handed to the generic disk(9) layer: strategy + transfer clamp. */
static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
    234 
/* Per-unit (raidN) software state. */
struct raid_softc {
	device_t sc_dev;	/* autoconf device handle */
	int	sc_unit;	/* unit number of this raid device */
	int     sc_flags;	/* flags */
	int     sc_cflags;	/* configuration flags */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	struct disk sc_dkdev;	/* generic disk device info */
	struct bufq_state *buf_queue;	/* used for the device queue */
	RF_Raid_t sc_r;		/* RAIDframe per-array state */
	LIST_ENTRY(raid_softc) sc_link;	/* entry on the global `raids' list */
};
/* sc_flags */
#define RAIDF_INITED	0x01	/* unit has been initialized */
#define RAIDF_WLABEL	0x02	/* label area is writable */
#define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
#define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
#define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
#define RAIDF_LOCKED	0x80	/* unit is locked */

/* Extract the unit number encoded in a raid dev_t. */
#define	raidunit(x)	DISKUNIT(x)
    256 
    257 extern struct cfdriver raid_cd;
    258 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    259     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    260     DVF_DETACH_SHUTDOWN);
    261 
    262 /*
    263  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    264  * Be aware that large numbers can allow the driver to consume a lot of
    265  * kernel memory, especially on writes, and in degraded mode reads.
    266  *
    267  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    268  * a single 64K write will typically require 64K for the old data,
    269  * 64K for the old parity, and 64K for the new parity, for a total
    270  * of 192K (if the parity buffer is not re-used immediately).
    271  * Even it if is used immediately, that's still 128K, which when multiplied
    272  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    273  *
    274  * Now in degraded mode, for example, a 64K read on the above setup may
    275  * require data reconstruction, which will require *all* of the 4 remaining
    276  * disks to participate -- 4 * 32K/disk == 128K again.
    277  */
    278 
    279 #ifndef RAIDOUTSTANDING
    280 #define RAIDOUTSTANDING   6
    281 #endif
    282 
    283 #define RAIDLABELDEV(dev)	\
    284 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    285 
    286 /* declared here, and made public, for the benefit of KVM stuff.. */
    287 
    288 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
    289 				     struct disklabel *);
    290 static void raidgetdisklabel(dev_t);
    291 static void raidmakedisklabel(struct raid_softc *);
    292 
    293 static int raidlock(struct raid_softc *);
    294 static void raidunlock(struct raid_softc *);
    295 
    296 static int raid_detach_unlocked(struct raid_softc *);
    297 
    298 static void rf_markalldirty(RF_Raid_t *);
    299 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    300 
    301 void rf_ReconThread(struct rf_recon_req *);
    302 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    303 void rf_CopybackThread(RF_Raid_t *raidPtr);
    304 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    305 int rf_autoconfig(device_t);
    306 void rf_buildroothack(RF_ConfigSet_t *);
    307 
    308 RF_AutoConfig_t *rf_find_raid_components(void);
    309 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    310 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    311 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    312 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    313 int rf_set_autoconfig(RF_Raid_t *, int);
    314 int rf_set_rootpartition(RF_Raid_t *, int);
    315 void rf_release_all_vps(RF_ConfigSet_t *);
    316 void rf_cleanup_config_set(RF_ConfigSet_t *);
    317 int rf_have_enough_components(RF_ConfigSet_t *);
    318 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    319 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    320 
    321 /*
    322  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    323  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    324  * in the kernel config file.
    325  */
    326 #ifdef RAID_AUTOCONFIG
    327 int raidautoconfig = 1;
    328 #else
    329 int raidautoconfig = 0;
    330 #endif
    331 static bool raidautoconfigdone = false;
    332 
    333 struct RF_Pools_s rf_pools;
    334 
    335 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    336 static kmutex_t raid_lock;
    337 
    338 static struct raid_softc *
    339 raidcreate(int unit) {
    340 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    341 	if (sc == NULL) {
    342 #ifdef DIAGNOSTIC
    343 		printf("%s: out of memory\n", __func__);
    344 #endif
    345 		return NULL;
    346 	}
    347 	sc->sc_unit = unit;
    348 	bufq_alloc(&sc->buf_queue, "fcfs", BUFQ_SORT_RAWBLOCK);
    349 	return sc;
    350 }
    351 
/*
 * Release a softc created by raidcreate(): free its buffer queue,
 * then the softc itself.  The softc must not be on the global
 * `raids' list when this is called.
 */
static void
raiddestroy(struct raid_softc *sc) {
	bufq_free(sc->buf_queue);
	kmem_free(sc, sizeof(*sc));
}
    357 
    358 static struct raid_softc *
    359 raidget(int unit) {
    360 	struct raid_softc *sc;
    361 	if (unit < 0) {
    362 #ifdef DIAGNOSTIC
    363 		panic("%s: unit %d!", __func__, unit);
    364 #endif
    365 		return NULL;
    366 	}
    367 	mutex_enter(&raid_lock);
    368 	LIST_FOREACH(sc, &raids, sc_link) {
    369 		if (sc->sc_unit == unit) {
    370 			mutex_exit(&raid_lock);
    371 			return sc;
    372 		}
    373 	}
    374 	mutex_exit(&raid_lock);
    375 	if ((sc = raidcreate(unit)) == NULL)
    376 		return NULL;
    377 	mutex_enter(&raid_lock);
    378 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    379 	mutex_exit(&raid_lock);
    380 	return sc;
    381 }
    382 
/*
 * Unlink a softc from the global `raids' list (under raid_lock)
 * and free it.
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    390 
/*
 * Pseudo-device attach hook, called once at boot.  The `num'
 * argument (requested number of units) is unused here; units are
 * created lazily by raidget().  Initializes global RAIDframe state,
 * attaches the autoconf glue, and registers a finalizer that will
 * autoconfigure RAID sets after all real hardware has been found.
 */
void
raidattach(int num)
{
	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	/* This is where all the initialization stuff gets done. */

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	if (rf_BootRaidframe() == 0)
		aprint_verbose("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		aprint_error("raidattach: config_cfattach_attach failed?\n");
	}

	raidautoconfigdone = false;

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
}
    423 
    424 int
    425 rf_autoconfig(device_t self)
    426 {
    427 	RF_AutoConfig_t *ac_list;
    428 	RF_ConfigSet_t *config_sets;
    429 
    430 	if (!raidautoconfig || raidautoconfigdone == true)
    431 		return (0);
    432 
    433 	/* XXX This code can only be run once. */
    434 	raidautoconfigdone = true;
    435 
    436 #ifdef __HAVE_CPU_BOOTCONF
    437 	/*
    438 	 * 0. find the boot device if needed first so we can use it later
    439 	 * this needs to be done before we autoconfigure any raid sets,
    440 	 * because if we use wedges we are not going to be able to open
    441 	 * the boot device later
    442 	 */
    443 	if (booted_device == NULL)
    444 		cpu_bootconf();
    445 #endif
    446 	/* 1. locate all RAID components on the system */
    447 	aprint_debug("Searching for RAID components...\n");
    448 	ac_list = rf_find_raid_components();
    449 
    450 	/* 2. Sort them into their respective sets. */
    451 	config_sets = rf_create_auto_sets(ac_list);
    452 
    453 	/*
    454 	 * 3. Evaluate each set and configure the valid ones.
    455 	 * This gets done in rf_buildroothack().
    456 	 */
    457 	rf_buildroothack(config_sets);
    458 
    459 	return 1;
    460 }
    461 
    462 static int
    463 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    464 	const char *bootname = device_xname(bdv);
    465 	size_t len = strlen(bootname);
    466 
    467 	for (int col = 0; col < r->numCol; col++) {
    468 		const char *devname = r->Disks[col].devname;
    469 		devname += sizeof("/dev/") - 1;
    470 		if (strncmp(devname, "dk", 2) == 0) {
    471 			const char *parent =
    472 			    dkwedge_get_parent_name(r->Disks[col].dev);
    473 			if (parent != NULL)
    474 				devname = parent;
    475 		}
    476 		if (strncmp(devname, bootname, len) == 0) {
    477 			struct raid_softc *sc = r->softc;
    478 			aprint_debug("raid%d includes boot device %s\n",
    479 			    sc->sc_unit, devname);
    480 			return 1;
    481 		}
    482 	}
    483 	return 0;
    484 }
    485 
/*
 * Walk the list of candidate configuration sets, autoconfigure the
 * eligible ones, and release the resources of the rest.  Every set on
 * the list is consumed (rf_cleanup_config_set()).  Afterwards, unless
 * the user hardwired a root device (rootspec), try to decide whether
 * one of the configured, rootable sets should become booted_device:
 * with exactly one candidate we use it outright; with several we keep
 * only those that physically contain the boot device, and if that
 * still doesn't narrow it to one we fall back to RB_ASKNAME.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	struct raid_softc *sc, *rsc;

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok\n",
				    sc->sc_unit);
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		if (rsc->sc_dkdev.dk_nwedges != 0) {
			/* cset is NULL here, but sizeof() is unevaluated,
			   so using it for the buffer size is safe. */
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume 'a' */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(rsc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
		} else
			candidate_root = rsc->sc_dev;
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_partition = 0;	/* XXX assume 'a' */
		}
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* Keep only configured, root-eligible sets that contain
		   the boot device. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = rsc->sc_dev;
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    596 
    597 
    598 int
    599 raidsize(dev_t dev)
    600 {
    601 	struct raid_softc *rs;
    602 	struct disklabel *lp;
    603 	int     part, unit, omask, size;
    604 
    605 	unit = raidunit(dev);
    606 	if ((rs = raidget(unit)) == NULL)
    607 		return -1;
    608 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    609 		return (-1);
    610 
    611 	part = DISKPART(dev);
    612 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
    613 	lp = rs->sc_dkdev.dk_label;
    614 
    615 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
    616 		return (-1);
    617 
    618 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
    619 		size = -1;
    620 	else
    621 		size = lp->d_partitions[part].p_size *
    622 		    (lp->d_secsize / DEV_BSIZE);
    623 
    624 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
    625 		return (-1);
    626 
    627 	return (size);
    628 
    629 }
    630 
/*
 * Kernel crash-dump entry point.  Only RAID 1 sets (one data column,
 * one parity column) are supported; the dump is written directly to a
 * single live component, preferring the master, then a spare that
 * replaced the master, then the slave, then a spare that replaced the
 * slave.  `blkno' is relative to the partition being dumped to; the
 * component partition offset plus RF_PROTECTED_SECTORS is added before
 * calling the component's d_dump routine.  Returns 0 or an errno.
 */
int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const struct bdevsw *bdev;
	struct disklabel *lp;
	RF_Raid_t *raidPtr;
	daddr_t offset;
	int     part, c, sparecol, j, scol, dumpto;
	int     error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;

	raidPtr = &rs->sc_r;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;


	if ((error = raidlock(rs)) != 0)
		return error;

	/* Dumps must be a whole number of DEV_BSIZE blocks. */
	if (size % DEV_BSIZE != 0) {
		error = EINVAL;
		goto out;
	}

	/* NOTE(review): blkno is a (signed) daddr_t but is printed with
	   PRIu64 below -- harmless for sane values, but PRId64 would be
	   more accurate.  Confirm before changing the message format. */
	if (blkno + size / DEV_BSIZE > rs->sc_size) {
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
		    size / DEV_BSIZE, rs->sc_size);
		error = EINVAL;
		goto out;
	}

	part = DISKPART(dev);
	lp = rs->sc_dkdev.dk_label;
	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	/*
	   Note that blkno is relative to this particular partition.
	   By adding the offset of this partition in the RAID
	   set, and also adding RF_PROTECTED_SECTORS, we get a
	   value that is relative to the partition used for the
	   underlying component.
	*/

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno + offset, va, size);

out:
	raidunlock(rs);

	return error;
}
    767 /* ARGSUSED */
/*
 * Open the raid device.  Serialized against configuration changes via
 * raidlock().  Fails with EBUSY if the unit is shutting down, or if
 * wedges exist on the unit and a non-raw partition is requested; fails
 * with ENXIO for a nonexistent/unused partition.  On the very first
 * open of an initialized set, the component labels are marked dirty so
 * that an unclean shutdown can be detected later.
 */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int     part, pmask;
	int     error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* First open of an initialized unit: (re)read the disklabel. */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}


	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	raidunlock(rs);

	return (error);


}
    851 /* ARGSUSED */
/*
 * Close the raid device: clear this partition's open bit for the
 * given format (char/block), and on the last close of an initialized
 * unit push a final component-label update out to the components.
 * Always returns 0 once the unit lock has been obtained.
 */
int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	int     error = 0;
	int     part;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;

	if ((error = raidlock(rs)) != 0)
		return (error);

	part = DISKPART(dev);

	/* ...that much closer to allowing unconfiguration... */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
		break;
	}
	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* Last one... device is not unconfigured yet.
		   Device shutdown has taken care of setting the
		   clean bits if RAIDF_INITED is not set
		   mark things as clean... */

		rf_update_component_labels(&rs->sc_r,
						 RF_FINAL_COMPONENT_UPDATE);

		/* If the kernel is shutting down, it will detach
		 * this RAID set soon enough.
		 */
	}

	raidunlock(rs);
	return (0);

}
    900 
/*
 * raidstrategy: buffer I/O entry point for the RAID pseudo-disk.
 *
 * Validates the request against the device state and partition/media
 * bounds, then enqueues the buf on the per-unit queue and wakes the
 * I/O thread via the iodone condition variable.  On any early-out
 * error or zero-length transfer the buf is completed immediately with
 * biodone().
 */
void
raidstrategy(struct buf *bp)
{
	unsigned int unit = raidunit(bp->b_dev);
	RF_Raid_t *raidPtr;
	int     wlabel;
	struct raid_softc *rs;

	/* Reject I/O to nonexistent or unconfigured units. */
	if ((rs = raidget(unit)) == NULL) {
		bp->b_error = ENXIO;
		goto done;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto done;
	}
	raidPtr = &rs->sc_r;
	if (!raidPtr->valid) {
		bp->b_error = ENODEV;
		goto done;
	}
	/* Zero-length transfer: nothing to do, complete successfully. */
	if (bp->b_bcount == 0) {
		db1_printf(("b_bcount is zero..\n"));
		goto done;
	}

	/*
	 * Do bounds checking and adjust transfer.  If there's an
	 * error, the bounds check will flag that for us.
	 */

	/* Writing the label area is allowed while labelling is in progress. */
	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
	if (DISKPART(bp->b_dev) == RAW_PART) {
		uint64_t size; /* device size in DEV_BSIZE unit */

		/*
		 * Convert totalSectors (in native sectors) to DEV_BSIZE
		 * units, shifting in whichever direction the sector size
		 * requires.
		 */
		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
			size = raidPtr->totalSectors <<
			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
		} else {
			size = raidPtr->totalSectors >>
			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
		}
		/* <= 0: error (flagged in bp) or zero-length at EOF. */
		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
			goto done;
		}
	} else {
		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
			db1_printf(("Bounds check failed!!:%d %d\n",
				(int) bp->b_blkno, (int) wlabel));
			goto done;
		}
	}

	rf_lock_mutex2(raidPtr->iodone_lock);

	bp->b_resid = 0;

	/* stuff it onto our queue */
	bufq_put(rs->buf_queue, bp);

	/* scheduled the IO to happen at the next convenient time */
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);

	return;

done:
	/* Complete the buf here; b_error (if any) was set above. */
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}
    971 /* ARGSUSED */
    972 int
    973 raidread(dev_t dev, struct uio *uio, int flags)
    974 {
    975 	int     unit = raidunit(dev);
    976 	struct raid_softc *rs;
    977 
    978 	if ((rs = raidget(unit)) == NULL)
    979 		return ENXIO;
    980 
    981 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    982 		return (ENXIO);
    983 
    984 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    985 
    986 }
    987 /* ARGSUSED */
    988 int
    989 raidwrite(dev_t dev, struct uio *uio, int flags)
    990 {
    991 	int     unit = raidunit(dev);
    992 	struct raid_softc *rs;
    993 
    994 	if ((rs = raidget(unit)) == NULL)
    995 		return ENXIO;
    996 
    997 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    998 		return (ENXIO);
    999 
   1000 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
   1001 
   1002 }
   1003 
   1004 static int
   1005 raid_detach_unlocked(struct raid_softc *rs)
   1006 {
   1007 	int error;
   1008 	RF_Raid_t *raidPtr;
   1009 
   1010 	raidPtr = &rs->sc_r;
   1011 
   1012 	/*
   1013 	 * If somebody has a partition mounted, we shouldn't
   1014 	 * shutdown.
   1015 	 */
   1016 	if (rs->sc_dkdev.dk_openmask != 0)
   1017 		return EBUSY;
   1018 
   1019 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1020 		;	/* not initialized: nothing to do */
   1021 	else if ((error = rf_Shutdown(raidPtr)) != 0)
   1022 		return error;
   1023 	else
   1024 		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
   1025 
   1026 	/* Detach the disk. */
   1027 	dkwedge_delall(&rs->sc_dkdev);
   1028 	disk_detach(&rs->sc_dkdev);
   1029 	disk_destroy(&rs->sc_dkdev);
   1030 
   1031 	aprint_normal_dev(rs->sc_dev, "detached\n");
   1032 
   1033 	return 0;
   1034 }
   1035 
   1036 int
   1037 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1038 {
   1039 	int     unit = raidunit(dev);
   1040 	int     error = 0;
   1041 	int     part, pmask, s;
   1042 	cfdata_t cf;
   1043 	struct raid_softc *rs;
   1044 	RF_Config_t *k_cfg, *u_cfg;
   1045 	RF_Raid_t *raidPtr;
   1046 	RF_RaidDisk_t *diskPtr;
   1047 	RF_AccTotals_t *totals;
   1048 	RF_DeviceConfig_t *d_cfg, **ucfgp;
   1049 	u_char *specific_buf;
   1050 	int retcode = 0;
   1051 	int column;
   1052 /*	int raidid; */
   1053 	struct rf_recon_req *rrcopy, *rr;
   1054 	RF_ComponentLabel_t *clabel;
   1055 	RF_ComponentLabel_t *ci_label;
   1056 	RF_ComponentLabel_t **clabel_ptr;
   1057 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1058 	RF_SingleComponent_t component;
   1059 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
   1060 	int i, j, d;
   1061 #ifdef __HAVE_OLD_DISKLABEL
   1062 	struct disklabel newlabel;
   1063 #endif
   1064 	struct dkwedge_info *dkw;
   1065 
   1066 	if ((rs = raidget(unit)) == NULL)
   1067 		return ENXIO;
   1068 	raidPtr = &rs->sc_r;
   1069 
   1070 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1071 		(int) DISKPART(dev), (int) unit, cmd));
   1072 
   1073 	/* Must be open for writes for these commands... */
   1074 	switch (cmd) {
   1075 #ifdef DIOCGSECTORSIZE
   1076 	case DIOCGSECTORSIZE:
   1077 		*(u_int *)data = raidPtr->bytesPerSector;
   1078 		return 0;
   1079 	case DIOCGMEDIASIZE:
   1080 		*(off_t *)data =
   1081 		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
   1082 		return 0;
   1083 #endif
   1084 	case DIOCSDINFO:
   1085 	case DIOCWDINFO:
   1086 #ifdef __HAVE_OLD_DISKLABEL
   1087 	case ODIOCWDINFO:
   1088 	case ODIOCSDINFO:
   1089 #endif
   1090 	case DIOCWLABEL:
   1091 	case DIOCAWEDGE:
   1092 	case DIOCDWEDGE:
   1093 	case DIOCMWEDGES:
   1094 	case DIOCSSTRATEGY:
   1095 		if ((flag & FWRITE) == 0)
   1096 			return (EBADF);
   1097 	}
   1098 
   1099 	/* Must be initialized for these... */
   1100 	switch (cmd) {
   1101 	case DIOCGDINFO:
   1102 	case DIOCSDINFO:
   1103 	case DIOCWDINFO:
   1104 #ifdef __HAVE_OLD_DISKLABEL
   1105 	case ODIOCGDINFO:
   1106 	case ODIOCWDINFO:
   1107 	case ODIOCSDINFO:
   1108 	case ODIOCGDEFLABEL:
   1109 #endif
   1110 	case DIOCGPART:
   1111 	case DIOCWLABEL:
   1112 	case DIOCGDEFLABEL:
   1113 	case DIOCAWEDGE:
   1114 	case DIOCDWEDGE:
   1115 	case DIOCLWEDGES:
   1116 	case DIOCMWEDGES:
   1117 	case DIOCCACHESYNC:
   1118 	case RAIDFRAME_SHUTDOWN:
   1119 	case RAIDFRAME_REWRITEPARITY:
   1120 	case RAIDFRAME_GET_INFO:
   1121 	case RAIDFRAME_RESET_ACCTOTALS:
   1122 	case RAIDFRAME_GET_ACCTOTALS:
   1123 	case RAIDFRAME_KEEP_ACCTOTALS:
   1124 	case RAIDFRAME_GET_SIZE:
   1125 	case RAIDFRAME_FAIL_DISK:
   1126 	case RAIDFRAME_COPYBACK:
   1127 	case RAIDFRAME_CHECK_RECON_STATUS:
   1128 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1129 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1130 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1131 	case RAIDFRAME_ADD_HOT_SPARE:
   1132 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1133 	case RAIDFRAME_INIT_LABELS:
   1134 	case RAIDFRAME_REBUILD_IN_PLACE:
   1135 	case RAIDFRAME_CHECK_PARITY:
   1136 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1137 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1138 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1139 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1140 	case RAIDFRAME_SET_AUTOCONFIG:
   1141 	case RAIDFRAME_SET_ROOT:
   1142 	case RAIDFRAME_DELETE_COMPONENT:
   1143 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1144 	case RAIDFRAME_PARITYMAP_STATUS:
   1145 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1146 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1147 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1148 	case DIOCGSTRATEGY:
   1149 	case DIOCSSTRATEGY:
   1150 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1151 			return (ENXIO);
   1152 	}
   1153 
   1154 	switch (cmd) {
   1155 #ifdef COMPAT_50
   1156 	case RAIDFRAME_GET_INFO50:
   1157 		return rf_get_info50(raidPtr, data);
   1158 
   1159 	case RAIDFRAME_CONFIGURE50:
   1160 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1161 			return retcode;
   1162 		goto config;
   1163 #endif
   1164 		/* configure the system */
   1165 	case RAIDFRAME_CONFIGURE:
   1166 
   1167 		if (raidPtr->valid) {
   1168 			/* There is a valid RAID set running on this unit! */
   1169 			printf("raid%d: Device already configured!\n",unit);
   1170 			return(EINVAL);
   1171 		}
   1172 
   1173 		/* copy-in the configuration information */
   1174 		/* data points to a pointer to the configuration structure */
   1175 
   1176 		u_cfg = *((RF_Config_t **) data);
   1177 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1178 		if (k_cfg == NULL) {
   1179 			return (ENOMEM);
   1180 		}
   1181 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1182 		if (retcode) {
   1183 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1184 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1185 				retcode));
   1186 			return (retcode);
   1187 		}
   1188 		goto config;
   1189 	config:
   1190 		/* allocate a buffer for the layout-specific data, and copy it
   1191 		 * in */
   1192 		if (k_cfg->layoutSpecificSize) {
   1193 			if (k_cfg->layoutSpecificSize > 10000) {
   1194 				/* sanity check */
   1195 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1196 				return (EINVAL);
   1197 			}
   1198 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1199 			    (u_char *));
   1200 			if (specific_buf == NULL) {
   1201 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1202 				return (ENOMEM);
   1203 			}
   1204 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1205 			    k_cfg->layoutSpecificSize);
   1206 			if (retcode) {
   1207 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1208 				RF_Free(specific_buf,
   1209 					k_cfg->layoutSpecificSize);
   1210 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1211 					retcode));
   1212 				return (retcode);
   1213 			}
   1214 		} else
   1215 			specific_buf = NULL;
   1216 		k_cfg->layoutSpecific = specific_buf;
   1217 
   1218 		/* should do some kind of sanity check on the configuration.
   1219 		 * Store the sum of all the bytes in the last byte? */
   1220 
   1221 		/* configure the system */
   1222 
   1223 		/*
   1224 		 * Clear the entire RAID descriptor, just to make sure
   1225 		 *  there is no stale data left in the case of a
   1226 		 *  reconfiguration
   1227 		 */
   1228 		memset(raidPtr, 0, sizeof(*raidPtr));
   1229 		raidPtr->softc = rs;
   1230 		raidPtr->raidid = unit;
   1231 
   1232 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1233 
   1234 		if (retcode == 0) {
   1235 
   1236 			/* allow this many simultaneous IO's to
   1237 			   this RAID device */
   1238 			raidPtr->openings = RAIDOUTSTANDING;
   1239 
   1240 			raidinit(rs);
   1241 			rf_markalldirty(raidPtr);
   1242 		}
   1243 		/* free the buffers.  No return code here. */
   1244 		if (k_cfg->layoutSpecificSize) {
   1245 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1246 		}
   1247 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1248 
   1249 		return (retcode);
   1250 
   1251 		/* shutdown the system */
   1252 	case RAIDFRAME_SHUTDOWN:
   1253 
   1254 		part = DISKPART(dev);
   1255 		pmask = (1 << part);
   1256 
   1257 		if ((error = raidlock(rs)) != 0)
   1258 			return (error);
   1259 
   1260 		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
   1261 		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
   1262 			(rs->sc_dkdev.dk_copenmask & pmask)))
   1263 			retcode = EBUSY;
   1264 		else {
   1265 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1266 			rs->sc_dkdev.dk_copenmask &= ~pmask;
   1267 			rs->sc_dkdev.dk_bopenmask &= ~pmask;
   1268 			rs->sc_dkdev.dk_openmask &= ~pmask;
   1269 			retcode = 0;
   1270 		}
   1271 
   1272 		raidunlock(rs);
   1273 
   1274 		if (retcode != 0)
   1275 			return retcode;
   1276 
   1277 		/* free the pseudo device attach bits */
   1278 
   1279 		cf = device_cfdata(rs->sc_dev);
   1280 		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
   1281 			free(cf, M_RAIDFRAME);
   1282 
   1283 		return (retcode);
   1284 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1285 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1286 		/* need to read the component label for the disk indicated
   1287 		   by row,column in clabel */
   1288 
   1289 		/*
   1290 		 * Perhaps there should be an option to skip the in-core
   1291 		 * copy and hit the disk, as with disklabel(8).
   1292 		 */
   1293 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1294 
   1295 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1296 
   1297 		if (retcode) {
   1298 			RF_Free(clabel, sizeof(*clabel));
   1299 			return retcode;
   1300 		}
   1301 
   1302 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1303 
   1304 		column = clabel->column;
   1305 
   1306 		if ((column < 0) || (column >= raidPtr->numCol +
   1307 		    raidPtr->numSpare)) {
   1308 			RF_Free(clabel, sizeof(*clabel));
   1309 			return EINVAL;
   1310 		}
   1311 
   1312 		RF_Free(clabel, sizeof(*clabel));
   1313 
   1314 		clabel = raidget_component_label(raidPtr, column);
   1315 
   1316 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1317 
   1318 #if 0
   1319 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1320 		clabel = (RF_ComponentLabel_t *) data;
   1321 
   1322 		/* XXX check the label for valid stuff... */
   1323 		/* Note that some things *should not* get modified --
   1324 		   the user should be re-initing the labels instead of
   1325 		   trying to patch things.
   1326 		   */
   1327 
   1328 		raidid = raidPtr->raidid;
   1329 #ifdef DEBUG
   1330 		printf("raid%d: Got component label:\n", raidid);
   1331 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1332 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1333 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1334 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1335 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1336 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1337 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1338 #endif
   1339 		clabel->row = 0;
   1340 		column = clabel->column;
   1341 
   1342 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1343 			return(EINVAL);
   1344 		}
   1345 
   1346 		/* XXX this isn't allowed to do anything for now :-) */
   1347 
   1348 		/* XXX and before it is, we need to fill in the rest
   1349 		   of the fields!?!?!?! */
   1350 		memcpy(raidget_component_label(raidPtr, column),
   1351 		    clabel, sizeof(*clabel));
   1352 		raidflush_component_label(raidPtr, column);
   1353 		return (0);
   1354 #endif
   1355 
   1356 	case RAIDFRAME_INIT_LABELS:
   1357 		clabel = (RF_ComponentLabel_t *) data;
   1358 		/*
   1359 		   we only want the serial number from
   1360 		   the above.  We get all the rest of the information
   1361 		   from the config that was used to create this RAID
   1362 		   set.
   1363 		   */
   1364 
   1365 		raidPtr->serial_number = clabel->serial_number;
   1366 
   1367 		for(column=0;column<raidPtr->numCol;column++) {
   1368 			diskPtr = &raidPtr->Disks[column];
   1369 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1370 				ci_label = raidget_component_label(raidPtr,
   1371 				    column);
   1372 				/* Zeroing this is important. */
   1373 				memset(ci_label, 0, sizeof(*ci_label));
   1374 				raid_init_component_label(raidPtr, ci_label);
   1375 				ci_label->serial_number =
   1376 				    raidPtr->serial_number;
   1377 				ci_label->row = 0; /* we dont' pretend to support more */
   1378 				rf_component_label_set_partitionsize(ci_label,
   1379 				    diskPtr->partitionSize);
   1380 				ci_label->column = column;
   1381 				raidflush_component_label(raidPtr, column);
   1382 			}
   1383 			/* XXXjld what about the spares? */
   1384 		}
   1385 
   1386 		return (retcode);
   1387 	case RAIDFRAME_SET_AUTOCONFIG:
   1388 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1389 		printf("raid%d: New autoconfig value is: %d\n",
   1390 		       raidPtr->raidid, d);
   1391 		*(int *) data = d;
   1392 		return (retcode);
   1393 
   1394 	case RAIDFRAME_SET_ROOT:
   1395 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1396 		printf("raid%d: New rootpartition value is: %d\n",
   1397 		       raidPtr->raidid, d);
   1398 		*(int *) data = d;
   1399 		return (retcode);
   1400 
   1401 		/* initialize all parity */
   1402 	case RAIDFRAME_REWRITEPARITY:
   1403 
   1404 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1405 			/* Parity for RAID 0 is trivially correct */
   1406 			raidPtr->parity_good = RF_RAID_CLEAN;
   1407 			return(0);
   1408 		}
   1409 
   1410 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1411 			/* Re-write is already in progress! */
   1412 			return(EINVAL);
   1413 		}
   1414 
   1415 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1416 					   rf_RewriteParityThread,
   1417 					   raidPtr,"raid_parity");
   1418 		return (retcode);
   1419 
   1420 
   1421 	case RAIDFRAME_ADD_HOT_SPARE:
   1422 		sparePtr = (RF_SingleComponent_t *) data;
   1423 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1424 		retcode = rf_add_hot_spare(raidPtr, &component);
   1425 		return(retcode);
   1426 
   1427 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1428 		return(retcode);
   1429 
   1430 	case RAIDFRAME_DELETE_COMPONENT:
   1431 		componentPtr = (RF_SingleComponent_t *)data;
   1432 		memcpy( &component, componentPtr,
   1433 			sizeof(RF_SingleComponent_t));
   1434 		retcode = rf_delete_component(raidPtr, &component);
   1435 		return(retcode);
   1436 
   1437 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1438 		componentPtr = (RF_SingleComponent_t *)data;
   1439 		memcpy( &component, componentPtr,
   1440 			sizeof(RF_SingleComponent_t));
   1441 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1442 		return(retcode);
   1443 
   1444 	case RAIDFRAME_REBUILD_IN_PLACE:
   1445 
   1446 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1447 			/* Can't do this on a RAID 0!! */
   1448 			return(EINVAL);
   1449 		}
   1450 
   1451 		if (raidPtr->recon_in_progress == 1) {
   1452 			/* a reconstruct is already in progress! */
   1453 			return(EINVAL);
   1454 		}
   1455 
   1456 		componentPtr = (RF_SingleComponent_t *) data;
   1457 		memcpy( &component, componentPtr,
   1458 			sizeof(RF_SingleComponent_t));
   1459 		component.row = 0; /* we don't support any more */
   1460 		column = component.column;
   1461 
   1462 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1463 			return(EINVAL);
   1464 		}
   1465 
   1466 		rf_lock_mutex2(raidPtr->mutex);
   1467 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1468 		    (raidPtr->numFailures > 0)) {
   1469 			/* XXX 0 above shouldn't be constant!!! */
   1470 			/* some component other than this has failed.
   1471 			   Let's not make things worse than they already
   1472 			   are... */
   1473 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1474 			       raidPtr->raidid);
   1475 			printf("raid%d:     Col: %d   Too many failures.\n",
   1476 			       raidPtr->raidid, column);
   1477 			rf_unlock_mutex2(raidPtr->mutex);
   1478 			return (EINVAL);
   1479 		}
   1480 		if (raidPtr->Disks[column].status ==
   1481 		    rf_ds_reconstructing) {
   1482 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1483 			       raidPtr->raidid);
   1484 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1485 
   1486 			rf_unlock_mutex2(raidPtr->mutex);
   1487 			return (EINVAL);
   1488 		}
   1489 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1490 			rf_unlock_mutex2(raidPtr->mutex);
   1491 			return (EINVAL);
   1492 		}
   1493 		rf_unlock_mutex2(raidPtr->mutex);
   1494 
   1495 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1496 		if (rrcopy == NULL)
   1497 			return(ENOMEM);
   1498 
   1499 		rrcopy->raidPtr = (void *) raidPtr;
   1500 		rrcopy->col = column;
   1501 
   1502 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1503 					   rf_ReconstructInPlaceThread,
   1504 					   rrcopy,"raid_reconip");
   1505 		return(retcode);
   1506 
   1507 	case RAIDFRAME_GET_INFO:
   1508 		if (!raidPtr->valid)
   1509 			return (ENODEV);
   1510 		ucfgp = (RF_DeviceConfig_t **) data;
   1511 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1512 			  (RF_DeviceConfig_t *));
   1513 		if (d_cfg == NULL)
   1514 			return (ENOMEM);
   1515 		d_cfg->rows = 1; /* there is only 1 row now */
   1516 		d_cfg->cols = raidPtr->numCol;
   1517 		d_cfg->ndevs = raidPtr->numCol;
   1518 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1519 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1520 			return (ENOMEM);
   1521 		}
   1522 		d_cfg->nspares = raidPtr->numSpare;
   1523 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1524 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1525 			return (ENOMEM);
   1526 		}
   1527 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1528 		d = 0;
   1529 		for (j = 0; j < d_cfg->cols; j++) {
   1530 			d_cfg->devs[d] = raidPtr->Disks[j];
   1531 			d++;
   1532 		}
   1533 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1534 			d_cfg->spares[i] = raidPtr->Disks[j];
   1535 		}
   1536 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1537 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1538 
   1539 		return (retcode);
   1540 
   1541 	case RAIDFRAME_CHECK_PARITY:
   1542 		*(int *) data = raidPtr->parity_good;
   1543 		return (0);
   1544 
   1545 	case RAIDFRAME_PARITYMAP_STATUS:
   1546 		if (rf_paritymap_ineligible(raidPtr))
   1547 			return EINVAL;
   1548 		rf_paritymap_status(raidPtr->parity_map,
   1549 		    (struct rf_pmstat *)data);
   1550 		return 0;
   1551 
   1552 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1553 		if (rf_paritymap_ineligible(raidPtr))
   1554 			return EINVAL;
   1555 		if (raidPtr->parity_map == NULL)
   1556 			return ENOENT; /* ??? */
   1557 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1558 			(struct rf_pmparams *)data, 1))
   1559 			return EINVAL;
   1560 		return 0;
   1561 
   1562 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1563 		if (rf_paritymap_ineligible(raidPtr))
   1564 			return EINVAL;
   1565 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1566 		return 0;
   1567 
   1568 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1569 		if (rf_paritymap_ineligible(raidPtr))
   1570 			return EINVAL;
   1571 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1572 		/* XXX should errors be passed up? */
   1573 		return 0;
   1574 
   1575 	case RAIDFRAME_RESET_ACCTOTALS:
   1576 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1577 		return (0);
   1578 
   1579 	case RAIDFRAME_GET_ACCTOTALS:
   1580 		totals = (RF_AccTotals_t *) data;
   1581 		*totals = raidPtr->acc_totals;
   1582 		return (0);
   1583 
   1584 	case RAIDFRAME_KEEP_ACCTOTALS:
   1585 		raidPtr->keep_acc_totals = *(int *)data;
   1586 		return (0);
   1587 
   1588 	case RAIDFRAME_GET_SIZE:
   1589 		*(int *) data = raidPtr->totalSectors;
   1590 		return (0);
   1591 
   1592 		/* fail a disk & optionally start reconstruction */
   1593 	case RAIDFRAME_FAIL_DISK:
   1594 
   1595 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1596 			/* Can't do this on a RAID 0!! */
   1597 			return(EINVAL);
   1598 		}
   1599 
   1600 		rr = (struct rf_recon_req *) data;
   1601 		rr->row = 0;
   1602 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1603 			return (EINVAL);
   1604 
   1605 
   1606 		rf_lock_mutex2(raidPtr->mutex);
   1607 		if (raidPtr->status == rf_rs_reconstructing) {
   1608 			/* you can't fail a disk while we're reconstructing! */
   1609 			/* XXX wrong for RAID6 */
   1610 			rf_unlock_mutex2(raidPtr->mutex);
   1611 			return (EINVAL);
   1612 		}
   1613 		if ((raidPtr->Disks[rr->col].status ==
   1614 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1615 			/* some other component has failed.  Let's not make
   1616 			   things worse. XXX wrong for RAID6 */
   1617 			rf_unlock_mutex2(raidPtr->mutex);
   1618 			return (EINVAL);
   1619 		}
   1620 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1621 			/* Can't fail a spared disk! */
   1622 			rf_unlock_mutex2(raidPtr->mutex);
   1623 			return (EINVAL);
   1624 		}
   1625 		rf_unlock_mutex2(raidPtr->mutex);
   1626 
   1627 		/* make a copy of the recon request so that we don't rely on
   1628 		 * the user's buffer */
   1629 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1630 		if (rrcopy == NULL)
   1631 			return(ENOMEM);
   1632 		memcpy(rrcopy, rr, sizeof(*rr));
   1633 		rrcopy->raidPtr = (void *) raidPtr;
   1634 
   1635 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1636 					   rf_ReconThread,
   1637 					   rrcopy,"raid_recon");
   1638 		return (0);
   1639 
   1640 		/* invoke a copyback operation after recon on whatever disk
   1641 		 * needs it, if any */
   1642 	case RAIDFRAME_COPYBACK:
   1643 
   1644 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1645 			/* This makes no sense on a RAID 0!! */
   1646 			return(EINVAL);
   1647 		}
   1648 
   1649 		if (raidPtr->copyback_in_progress == 1) {
   1650 			/* Copyback is already in progress! */
   1651 			return(EINVAL);
   1652 		}
   1653 
   1654 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1655 					   rf_CopybackThread,
   1656 					   raidPtr,"raid_copyback");
   1657 		return (retcode);
   1658 
   1659 		/* return the percentage completion of reconstruction */
   1660 	case RAIDFRAME_CHECK_RECON_STATUS:
   1661 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1662 			/* This makes no sense on a RAID 0, so tell the
   1663 			   user it's done. */
   1664 			*(int *) data = 100;
   1665 			return(0);
   1666 		}
   1667 		if (raidPtr->status != rf_rs_reconstructing)
   1668 			*(int *) data = 100;
   1669 		else {
   1670 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1671 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1672 			} else {
   1673 				*(int *) data = 0;
   1674 			}
   1675 		}
   1676 		return (0);
   1677 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1678 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1679 		if (raidPtr->status != rf_rs_reconstructing) {
   1680 			progressInfo.remaining = 0;
   1681 			progressInfo.completed = 100;
   1682 			progressInfo.total = 100;
   1683 		} else {
   1684 			progressInfo.total =
   1685 				raidPtr->reconControl->numRUsTotal;
   1686 			progressInfo.completed =
   1687 				raidPtr->reconControl->numRUsComplete;
   1688 			progressInfo.remaining = progressInfo.total -
   1689 				progressInfo.completed;
   1690 		}
   1691 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1692 				  sizeof(RF_ProgressInfo_t));
   1693 		return (retcode);
   1694 
   1695 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1696 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1697 			/* This makes no sense on a RAID 0, so tell the
   1698 			   user it's done. */
   1699 			*(int *) data = 100;
   1700 			return(0);
   1701 		}
   1702 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1703 			*(int *) data = 100 *
   1704 				raidPtr->parity_rewrite_stripes_done /
   1705 				raidPtr->Layout.numStripe;
   1706 		} else {
   1707 			*(int *) data = 100;
   1708 		}
   1709 		return (0);
   1710 
   1711 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1712 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1713 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1714 			progressInfo.total = raidPtr->Layout.numStripe;
   1715 			progressInfo.completed =
   1716 				raidPtr->parity_rewrite_stripes_done;
   1717 			progressInfo.remaining = progressInfo.total -
   1718 				progressInfo.completed;
   1719 		} else {
   1720 			progressInfo.remaining = 0;
   1721 			progressInfo.completed = 100;
   1722 			progressInfo.total = 100;
   1723 		}
   1724 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1725 				  sizeof(RF_ProgressInfo_t));
   1726 		return (retcode);
   1727 
   1728 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1729 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1730 			/* This makes no sense on a RAID 0 */
   1731 			*(int *) data = 100;
   1732 			return(0);
   1733 		}
   1734 		if (raidPtr->copyback_in_progress == 1) {
   1735 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1736 				raidPtr->Layout.numStripe;
   1737 		} else {
   1738 			*(int *) data = 100;
   1739 		}
   1740 		return (0);
   1741 
   1742 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1743 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1744 		if (raidPtr->copyback_in_progress == 1) {
   1745 			progressInfo.total = raidPtr->Layout.numStripe;
   1746 			progressInfo.completed =
   1747 				raidPtr->copyback_stripes_done;
   1748 			progressInfo.remaining = progressInfo.total -
   1749 				progressInfo.completed;
   1750 		} else {
   1751 			progressInfo.remaining = 0;
   1752 			progressInfo.completed = 100;
   1753 			progressInfo.total = 100;
   1754 		}
   1755 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1756 				  sizeof(RF_ProgressInfo_t));
   1757 		return (retcode);
   1758 
   1759 		/* the sparetable daemon calls this to wait for the kernel to
   1760 		 * need a spare table. this ioctl does not return until a
   1761 		 * spare table is needed. XXX -- calling mpsleep here in the
   1762 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1763 		 * -- I should either compute the spare table in the kernel,
   1764 		 * or have a different -- XXX XXX -- interface (a different
   1765 		 * character device) for delivering the table     -- XXX */
   1766 #if 0
   1767 	case RAIDFRAME_SPARET_WAIT:
   1768 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1769 		while (!rf_sparet_wait_queue)
   1770 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1771 		waitreq = rf_sparet_wait_queue;
   1772 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1773 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1774 
   1775 		/* structure assignment */
   1776 		*((RF_SparetWait_t *) data) = *waitreq;
   1777 
   1778 		RF_Free(waitreq, sizeof(*waitreq));
   1779 		return (0);
   1780 
   1781 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1782 		 * code in it that will cause the dameon to exit */
   1783 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1784 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1785 		waitreq->fcol = -1;
   1786 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1787 		waitreq->next = rf_sparet_wait_queue;
   1788 		rf_sparet_wait_queue = waitreq;
   1789 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1790 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1791 		return (0);
   1792 
   1793 		/* used by the spare table daemon to deliver a spare table
   1794 		 * into the kernel */
   1795 	case RAIDFRAME_SEND_SPARET:
   1796 
   1797 		/* install the spare table */
   1798 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1799 
   1800 		/* respond to the requestor.  the return status of the spare
   1801 		 * table installation is passed in the "fcol" field */
   1802 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1803 		waitreq->fcol = retcode;
   1804 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1805 		waitreq->next = rf_sparet_resp_queue;
   1806 		rf_sparet_resp_queue = waitreq;
   1807 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1808 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1809 
   1810 		return (retcode);
   1811 #endif
   1812 
   1813 	default:
   1814 		break; /* fall through to the os-specific code below */
   1815 
   1816 	}
   1817 
   1818 	if (!raidPtr->valid)
   1819 		return (EINVAL);
   1820 
   1821 	/*
   1822 	 * Add support for "regular" device ioctls here.
   1823 	 */
   1824 
   1825 	error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
   1826 	if (error != EPASSTHROUGH)
   1827 		return (error);
   1828 
   1829 	switch (cmd) {
   1830 	case DIOCGDINFO:
   1831 		*(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
   1832 		break;
   1833 #ifdef __HAVE_OLD_DISKLABEL
   1834 	case ODIOCGDINFO:
   1835 		newlabel = *(rs->sc_dkdev.dk_label);
   1836 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1837 			return ENOTTY;
   1838 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1839 		break;
   1840 #endif
   1841 
   1842 	case DIOCGPART:
   1843 		((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
   1844 		((struct partinfo *) data)->part =
   1845 		    &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
   1846 		break;
   1847 
   1848 	case DIOCWDINFO:
   1849 	case DIOCSDINFO:
   1850 #ifdef __HAVE_OLD_DISKLABEL
   1851 	case ODIOCWDINFO:
   1852 	case ODIOCSDINFO:
   1853 #endif
   1854 	{
   1855 		struct disklabel *lp;
   1856 #ifdef __HAVE_OLD_DISKLABEL
   1857 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
   1858 			memset(&newlabel, 0, sizeof newlabel);
   1859 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
   1860 			lp = &newlabel;
   1861 		} else
   1862 #endif
   1863 		lp = (struct disklabel *)data;
   1864 
   1865 		if ((error = raidlock(rs)) != 0)
   1866 			return (error);
   1867 
   1868 		rs->sc_flags |= RAIDF_LABELLING;
   1869 
   1870 		error = setdisklabel(rs->sc_dkdev.dk_label,
   1871 		    lp, 0, rs->sc_dkdev.dk_cpulabel);
   1872 		if (error == 0) {
   1873 			if (cmd == DIOCWDINFO
   1874 #ifdef __HAVE_OLD_DISKLABEL
   1875 			    || cmd == ODIOCWDINFO
   1876 #endif
   1877 			   )
   1878 				error = writedisklabel(RAIDLABELDEV(dev),
   1879 				    raidstrategy, rs->sc_dkdev.dk_label,
   1880 				    rs->sc_dkdev.dk_cpulabel);
   1881 		}
   1882 		rs->sc_flags &= ~RAIDF_LABELLING;
   1883 
   1884 		raidunlock(rs);
   1885 
   1886 		if (error)
   1887 			return (error);
   1888 		break;
   1889 	}
   1890 
   1891 	case DIOCWLABEL:
   1892 		if (*(int *) data != 0)
   1893 			rs->sc_flags |= RAIDF_WLABEL;
   1894 		else
   1895 			rs->sc_flags &= ~RAIDF_WLABEL;
   1896 		break;
   1897 
   1898 	case DIOCGDEFLABEL:
   1899 		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
   1900 		break;
   1901 
   1902 #ifdef __HAVE_OLD_DISKLABEL
   1903 	case ODIOCGDEFLABEL:
   1904 		raidgetdefaultlabel(raidPtr, rs, &newlabel);
   1905 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1906 			return ENOTTY;
   1907 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1908 		break;
   1909 #endif
   1910 
   1911 	case DIOCAWEDGE:
   1912 	case DIOCDWEDGE:
   1913 	    	dkw = (void *)data;
   1914 
   1915 		/* If the ioctl happens here, the parent is us. */
   1916 		(void)strcpy(dkw->dkw_parent, rs->sc_xname);
   1917 		return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
   1918 
   1919 	case DIOCLWEDGES:
   1920 		return dkwedge_list(&rs->sc_dkdev,
   1921 		    (struct dkwedge_list *)data, l);
   1922 	case DIOCMWEDGES:
   1923 		dkwedge_discover(&rs->sc_dkdev);
   1924 		return 0;
   1925 	case DIOCCACHESYNC:
   1926 		return rf_sync_component_caches(raidPtr);
   1927 
   1928 	case DIOCGSTRATEGY:
   1929 	    {
   1930 		struct disk_strategy *dks = (void *)data;
   1931 
   1932 		s = splbio();
   1933 		strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
   1934 		    sizeof(dks->dks_name));
   1935 		splx(s);
   1936 		dks->dks_paramlen = 0;
   1937 
   1938 		return 0;
   1939 	    }
   1940 
   1941 	case DIOCSSTRATEGY:
   1942 	    {
   1943 		struct disk_strategy *dks = (void *)data;
   1944 		struct bufq_state *new;
   1945 		struct bufq_state *old;
   1946 
   1947 		if (dks->dks_param != NULL) {
   1948 			return EINVAL;
   1949 		}
   1950 		dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
   1951 		error = bufq_alloc(&new, dks->dks_name,
   1952 		    BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
   1953 		if (error) {
   1954 			return error;
   1955 		}
   1956 		s = splbio();
   1957 		old = rs->buf_queue;
   1958 		bufq_move(new, old);
   1959 		rs->buf_queue = new;
   1960 		splx(s);
   1961 		bufq_free(old);
   1962 
   1963 		return 0;
   1964 	    }
   1965 
   1966 	default:
   1967 		retcode = ENOTTY;
   1968 	}
   1969 	return (retcode);
   1970 
   1971 }
   1972 
   1973 
   1974 /* raidinit -- complete the rest of the initialization for the
   1975    RAIDframe device.  */
   1976 
   1977 
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	int     unit;
	RF_Raid_t *raidPtr = &rs->sc_r;

	unit = raidPtr->raidid;


	/* XXX should check return code first... */
	rs->sc_flags |= RAIDF_INITED;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);

	/* attach the pseudo device; cf describes the device instance
	 * handed to autoconf (freed only on attach failure below) */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	rs->sc_dev = config_attach_pseudo(cf);

	if (rs->sc_dev == NULL) {
		/* roll back the INITED flag set above and release cf */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		rs->sc_flags &= ~RAIDF_INITED;
		free(cf, M_RAIDFRAME);
		return;
	}

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */

	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
	disk_attach(&rs->sc_dkdev);
	disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* auto-discover any wedges on the new disk */
	dkwedge_discover(&rs->sc_dkdev);

	rf_set_geometry(rs, raidPtr);

}
   2029 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   2030 /* wake up the daemon & tell it to get us a spare table
   2031  * XXX
   2032  * the entries in the queues should be tagged with the raidPtr
   2033  * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
   2035  * XXX
   2036  *
   2037  * XXX This code is not currently used. GO
   2038  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* post the request on the wait queue and wake the daemon
	 * blocked in RAIDFRAME_SPARET_WAIT */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* rf_wait_cond2() releases the mutex while asleep and re-takes
	 * it on wakeup (condvar semantics), so the response queue is
	 * only examined with the lock held */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* the daemon passes the status back in fcol */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   2062 #endif
   2063 
   2064 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   2065  * bp & passes it down.
   2066  * any calls originating in the kernel must use non-blocking I/O
   2067  * do some extra sanity checking to return "appropriate" error values for
   2068  * certain conditions (to make some standard utilities work)
   2069  *
   2070  * Formerly known as: rf_DoAccessKernel
   2071  */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	struct raid_softc *rs;
	int     do_async;
	struct buf *bp;
	int rc;

	rs = raidPtr->softc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* drop the lock: rf_update_component_labels() must not be
		 * called with raidPtr->mutex held */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	/* Invariant: raidPtr->mutex is held whenever the loop condition
	 * is evaluated; the body releases it while working and re-takes
	 * it (including on every `continue` path) before looping. */
	while (raidPtr->openings > 0) {
		rf_unlock_mutex2(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
			/* nothing more to do */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		/* convert DEV_BSIZE units to RAID sectors */
		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* bounds check; the (sum < x) comparisons also catch
		 * arithmetic wrap-around of the unsigned sum */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* reject transfers that are not a multiple of the sector
		 * size */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		/* claim one opening for this access */
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->openings--;
		rf_unlock_mutex2(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			/* NOTE(review): disk_busy() was called above;
			 * presumably the biodone() completion path balances
			 * it with disk_unbusy() -- confirm. */
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		rf_lock_mutex2(raidPtr->mutex);
	}
	rf_unlock_mutex2(raidPtr->mutex);
}
   2189 
   2190 
   2191 
   2192 
   2193 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2194 
   2195 int
   2196 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2197 {
   2198 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2199 	struct buf *bp;
   2200 
   2201 	req->queue = queue;
   2202 	bp = req->bp;
   2203 
   2204 	switch (req->type) {
   2205 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2206 		/* XXX need to do something extra here.. */
   2207 		/* I'm leaving this in, as I've never actually seen it used,
   2208 		 * and I'd like folks to report it... GO */
   2209 		printf(("WAKEUP CALLED\n"));
   2210 		queue->numOutstanding++;
   2211 
   2212 		bp->b_flags = 0;
   2213 		bp->b_private = req;
   2214 
   2215 		KernelWakeupFunc(bp);
   2216 		break;
   2217 
   2218 	case RF_IO_TYPE_READ:
   2219 	case RF_IO_TYPE_WRITE:
   2220 #if RF_ACC_TRACE > 0
   2221 		if (req->tracerec) {
   2222 			RF_ETIMER_START(req->tracerec->timer);
   2223 		}
   2224 #endif
   2225 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2226 		    op, queue->rf_cinfo->ci_dev,
   2227 		    req->sectorOffset, req->numSector,
   2228 		    req->buf, KernelWakeupFunc, (void *) req,
   2229 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2230 
   2231 		if (rf_debugKernelAccess) {
   2232 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2233 				(long) bp->b_blkno));
   2234 		}
   2235 		queue->numOutstanding++;
   2236 		queue->last_deq_sector = req->sectorOffset;
   2237 		/* acc wouldn't have been let in if there were any pending
   2238 		 * reqs at any other priority */
   2239 		queue->curPriority = req->priority;
   2240 
   2241 		db1_printf(("Going for %c to unit %d col %d\n",
   2242 			    req->type, queue->raidPtr->raidid,
   2243 			    queue->col));
   2244 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2245 			(int) req->sectorOffset, (int) req->numSector,
   2246 			(int) (req->numSector <<
   2247 			    queue->raidPtr->logBytesPerSector),
   2248 			(int) queue->raidPtr->logBytesPerSector));
   2249 
   2250 		/*
   2251 		 * XXX: drop lock here since this can block at
   2252 		 * least with backing SCSI devices.  Retake it
   2253 		 * to minimize fuss with calling interfaces.
   2254 		 */
   2255 
   2256 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2257 		bdev_strategy(bp);
   2258 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2259 		break;
   2260 
   2261 	default:
   2262 		panic("bad req->type in rf_DispatchKernelIO");
   2263 	}
   2264 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2265 
   2266 	return (0);
   2267 }
   2268 /* this is the callback function associated with a I/O invoked from
   2269    kernel code.
   2270  */
/* b_iodone callback for component I/O issued via InitBP() from
 * rf_DispatchKernelIO().  Records the completion status, optionally
 * fails the component, and hands the request to the raidio thread. */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* InitBP() stashed the request pointer in b_private */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error.  Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* raidstart() notices numNewFailures and updates
			 * the component labels */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2336 
   2337 
   2338 /*
   2339  * initialize a buf structure for doing an I/O in the kernel.
   2340  */
   2341 static void
   2342 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2343        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2344        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2345        struct proc *b_proc)
   2346 {
   2347 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2348 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2349 	bp->b_oflags = 0;
   2350 	bp->b_cflags = 0;
   2351 	bp->b_bcount = numSect << logBytesPerSector;
   2352 	bp->b_bufsize = bp->b_bcount;
   2353 	bp->b_error = 0;
   2354 	bp->b_dev = dev;
   2355 	bp->b_data = bf;
   2356 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2357 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2358 	if (bp->b_bcount == 0) {
   2359 		panic("bp->b_bcount is zero in InitBP!!");
   2360 	}
   2361 	bp->b_proc = b_proc;
   2362 	bp->b_iodone = cbFunc;
   2363 	bp->b_private = cbArg;
   2364 }
   2365 
   2366 static void
   2367 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
   2368 		    struct disklabel *lp)
   2369 {
   2370 	memset(lp, 0, sizeof(*lp));
   2371 
   2372 	/* fabricate a label... */
   2373 	if (raidPtr->totalSectors > UINT32_MAX)
   2374 		lp->d_secperunit = UINT32_MAX;
   2375 	else
   2376 		lp->d_secperunit = raidPtr->totalSectors;
   2377 	lp->d_secsize = raidPtr->bytesPerSector;
   2378 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   2379 	lp->d_ntracks = 4 * raidPtr->numCol;
   2380 	lp->d_ncylinders = raidPtr->totalSectors /
   2381 		(lp->d_nsectors * lp->d_ntracks);
   2382 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
   2383 
   2384 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
   2385 	lp->d_type = DTYPE_RAID;
   2386 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
   2387 	lp->d_rpm = 3600;
   2388 	lp->d_interleave = 1;
   2389 	lp->d_flags = 0;
   2390 
   2391 	lp->d_partitions[RAW_PART].p_offset = 0;
   2392 	lp->d_partitions[RAW_PART].p_size = lp->d_secperunit;
   2393 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
   2394 	lp->d_npartitions = RAW_PART + 1;
   2395 
   2396 	lp->d_magic = DISKMAGIC;
   2397 	lp->d_magic2 = DISKMAGIC;
   2398 	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
   2399 
   2400 }
   2401 /*
   2402  * Read the disklabel from the raid device.  If one is not present, fake one
   2403  * up.
   2404  */
static void
raidgetdisklabel(dev_t dev)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const char   *errstring;
	struct disklabel *lp;
	struct cpu_disklabel *clp;
	RF_Raid_t *raidPtr;

	if ((rs = raidget(unit)) == NULL)
		return;

	lp = rs->sc_dkdev.dk_label;
	clp = rs->sc_dkdev.dk_cpulabel;

	db1_printf(("Getting the disklabel...\n"));

	memset(clp, 0, sizeof(*clp));

	raidPtr = &rs->sc_r;

	/* start from a fabricated default so a failed read below still
	 * leaves a usable label in place */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		raidmakedisklabel(rs);
	else {
		int     i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since the total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * the same components are used, and an old disklabel may be
		 * used if that is found.
		 */
		/* If d_secperunit was not clamped (i.e. is below
		 * UINT32_MAX) it must match the raid size exactly;
		 * if it was clamped it may only be smaller. */
		if (lp->d_secperunit < UINT32_MAX ?
		    lp->d_secperunit != rs->sc_size :
		    lp->d_secperunit > rs->sc_size)
			printf("raid%d: WARNING: %s: "
			    "total sector size in disklabel (%ju) != "
			    "the size of raid (%ju)\n", unit, rs->sc_xname,
			    (uintmax_t)lp->d_secperunit,
			    (uintmax_t)rs->sc_size);
		/* warn about partitions extending past the end of the set */
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("raid%d: WARNING: %s: end of partition `%c' "
				       "exceeds the size of raid (%ju)\n",
				       unit, rs->sc_xname, 'a' + i,
				       (uintmax_t)rs->sc_size);
		}
	}

}
   2467 /*
   2468  * Take care of things one might want to take care of in the event
   2469  * that a disklabel isn't present.
   2470  */
   2471 static void
   2472 raidmakedisklabel(struct raid_softc *rs)
   2473 {
   2474 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2475 	db1_printf(("Making a label..\n"));
   2476 
   2477 	/*
   2478 	 * For historical reasons, if there's no disklabel present
   2479 	 * the raw partition must be marked FS_BSDFFS.
   2480 	 */
   2481 
   2482 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
   2483 
   2484 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
   2485 
   2486 	lp->d_checksum = dkcksum(lp);
   2487 }
   2488 /*
   2489  * Wait interruptibly for an exclusive lock.
   2490  *
   2491  * XXX
   2492  * Several drivers do this; it should be abstracted and made MP-safe.
   2493  * (Hmm... where have we seen this warning before :->  GO )
   2494  */
   2495 static int
   2496 raidlock(struct raid_softc *rs)
   2497 {
   2498 	int     error;
   2499 
   2500 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2501 		rs->sc_flags |= RAIDF_WANTED;
   2502 		if ((error =
   2503 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
   2504 			return (error);
   2505 	}
   2506 	rs->sc_flags |= RAIDF_LOCKED;
   2507 	return (0);
   2508 }
   2509 /*
   2510  * Unlock and wake up any waiters.
   2511  */
   2512 static void
   2513 raidunlock(struct raid_softc *rs)
   2514 {
   2515 
   2516 	rs->sc_flags &= ~RAIDF_LOCKED;
   2517 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2518 		rs->sc_flags &= ~RAIDF_WANTED;
   2519 		wakeup(rs);
   2520 	}
   2521 }
   2522 
   2523 
   2524 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2525 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2526 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2527 
static daddr_t
rf_component_info_offset(void)
{

	/* fixed byte offset of the component info area on each component */
	return RF_COMPONENT_INFO_OFFSET;
}
   2534 
   2535 static daddr_t
   2536 rf_component_info_size(unsigned secsize)
   2537 {
   2538 	daddr_t info_size;
   2539 
   2540 	KASSERT(secsize);
   2541 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2542 		info_size = secsize;
   2543 	else
   2544 		info_size = RF_COMPONENT_INFO_SIZE;
   2545 
   2546 	return info_size;
   2547 }
   2548 
   2549 static daddr_t
   2550 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2551 {
   2552 	daddr_t map_offset;
   2553 
   2554 	KASSERT(raidPtr->bytesPerSector);
   2555 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2556 		map_offset = raidPtr->bytesPerSector;
   2557 	else
   2558 		map_offset = RF_COMPONENT_INFO_SIZE;
   2559 	map_offset += rf_component_info_offset();
   2560 
   2561 	return map_offset;
   2562 }
   2563 
   2564 static daddr_t
   2565 rf_parity_map_size(RF_Raid_t *raidPtr)
   2566 {
   2567 	daddr_t map_size;
   2568 
   2569 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2570 		map_size = raidPtr->bytesPerSector;
   2571 	else
   2572 		map_size = RF_PARITY_MAP_SIZE;
   2573 
   2574 	return map_size;
   2575 }
   2576 
   2577 int
   2578 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2579 {
   2580 	RF_ComponentLabel_t *clabel;
   2581 
   2582 	clabel = raidget_component_label(raidPtr, col);
   2583 	clabel->clean = RF_RAID_CLEAN;
   2584 	raidflush_component_label(raidPtr, col);
   2585 	return(0);
   2586 }
   2587 
   2588 
   2589 int
   2590 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2591 {
   2592 	RF_ComponentLabel_t *clabel;
   2593 
   2594 	clabel = raidget_component_label(raidPtr, col);
   2595 	clabel->clean = RF_RAID_DIRTY;
   2596 	raidflush_component_label(raidPtr, col);
   2597 	return(0);
   2598 }
   2599 
   2600 int
   2601 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2602 {
   2603 	KASSERT(raidPtr->bytesPerSector);
   2604 	return raidread_component_label(raidPtr->bytesPerSector,
   2605 	    raidPtr->Disks[col].dev,
   2606 	    raidPtr->raid_cinfo[col].ci_vp,
   2607 	    &raidPtr->raid_cinfo[col].ci_label);
   2608 }
   2609 
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	/* return the in-core label only; this never touches the disk */
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2615 
   2616 int
   2617 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2618 {
   2619 	RF_ComponentLabel_t *label;
   2620 
   2621 	label = &raidPtr->raid_cinfo[col].ci_label;
   2622 	label->mod_counter = raidPtr->mod_counter;
   2623 #ifndef RF_NO_PARITY_MAP
   2624 	label->parity_map_modcount = label->mod_counter;
   2625 #endif
   2626 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2627 	    raidPtr->Disks[col].dev,
   2628 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2629 }
   2630 
   2631 
/* Read the component label from `dev' into *clabel.  The on-disk area
 * is at least one sector (rf_component_info_size()), but only
 * sizeof(*clabel) bytes are copied out. */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
   2641 
   2642 /* ARGSUSED */
   2643 static int
   2644 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2645     size_t msize, daddr_t offset, daddr_t dsize)
   2646 {
   2647 	struct buf *bp;
   2648 	const struct bdevsw *bdev;
   2649 	int error;
   2650 
   2651 	/* XXX should probably ensure that we don't try to do this if
   2652 	   someone has changed rf_protected_sectors. */
   2653 
   2654 	if (b_vp == NULL) {
   2655 		/* For whatever reason, this component is not valid.
   2656 		   Don't try to read a component label from it. */
   2657 		return(EINVAL);
   2658 	}
   2659 
   2660 	/* get a block of the appropriate size... */
   2661 	bp = geteblk((int)dsize);
   2662 	bp->b_dev = dev;
   2663 
   2664 	/* get our ducks in a row for the read */
   2665 	bp->b_blkno = offset / DEV_BSIZE;
   2666 	bp->b_bcount = dsize;
   2667 	bp->b_flags |= B_READ;
   2668  	bp->b_resid = dsize;
   2669 
   2670 	bdev = bdevsw_lookup(bp->b_dev);
   2671 	if (bdev == NULL)
   2672 		return (ENXIO);
   2673 	(*bdev->d_strategy)(bp);
   2674 
   2675 	error = biowait(bp);
   2676 
   2677 	if (!error) {
   2678 		memcpy(data, bp->b_data, msize);
   2679 	}
   2680 
   2681 	brelse(bp, 0);
   2682 	return(error);
   2683 }
   2684 
   2685 
/* Write *clabel to the component label area of `dev'.  The write is
 * synchronous (asyncp == 0) and covers the whole info area, padded
 * with zeros past sizeof(*clabel). */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
   2695 
   2696 /* ARGSUSED */
   2697 static int
   2698 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2699     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2700 {
   2701 	struct buf *bp;
   2702 	const struct bdevsw *bdev;
   2703 	int error;
   2704 
   2705 	/* get a block of the appropriate size... */
   2706 	bp = geteblk((int)dsize);
   2707 	bp->b_dev = dev;
   2708 
   2709 	/* get our ducks in a row for the write */
   2710 	bp->b_blkno = offset / DEV_BSIZE;
   2711 	bp->b_bcount = dsize;
   2712 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2713  	bp->b_resid = dsize;
   2714 
   2715 	memset(bp->b_data, 0, dsize);
   2716 	memcpy(bp->b_data, data, msize);
   2717 
   2718 	bdev = bdevsw_lookup(bp->b_dev);
   2719 	if (bdev == NULL)
   2720 		return (ENXIO);
   2721 	(*bdev->d_strategy)(bp);
   2722 	if (asyncp)
   2723 		return 0;
   2724 	error = biowait(bp);
   2725 	brelse(bp, 0);
   2726 	if (error) {
   2727 #if 1
   2728 		printf("Failed to write RAID component info!\n");
   2729 #endif
   2730 	}
   2731 
   2732 	return(error);
   2733 }
   2734 
   2735 void
   2736 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2737 {
   2738 	int c;
   2739 
   2740 	for (c = 0; c < raidPtr->numCol; c++) {
   2741 		/* Skip dead disks. */
   2742 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2743 			continue;
   2744 		/* XXXjld: what if an error occurs here? */
   2745 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2746 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2747 		    RF_PARITYMAP_NBYTE,
   2748 		    rf_parity_map_offset(raidPtr),
   2749 		    rf_parity_map_size(raidPtr), 0);
   2750 	}
   2751 }
   2752 
   2753 void
   2754 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2755 {
   2756 	struct rf_paritymap_ondisk tmp;
   2757 	int c,first;
   2758 
   2759 	first=1;
   2760 	for (c = 0; c < raidPtr->numCol; c++) {
   2761 		/* Skip dead disks. */
   2762 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2763 			continue;
   2764 		raidread_component_area(raidPtr->Disks[c].dev,
   2765 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2766 		    RF_PARITYMAP_NBYTE,
   2767 		    rf_parity_map_offset(raidPtr),
   2768 		    rf_parity_map_size(raidPtr));
   2769 		if (first) {
   2770 			memcpy(map, &tmp, sizeof(*map));
   2771 			first = 0;
   2772 		} else {
   2773 			rf_paritymap_merge(map, &tmp);
   2774 		}
   2775 	}
   2776 }
   2777 
/* Bump the modification counter and mark every live component label
 * (including in-use spares) dirty, so an unclean shutdown is
 * detectable at next configuration time. */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): if no column maps to this spare,
			   scol keeps its previous value (initially -1) and
			   that is what gets written below -- confirm a
			   used spare always has a matching spareCol. */

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2837 
   2838 
/*
 * Refresh and flush the component labels of all optimal components and
 * all in-use spares.  When "final" is RF_FINAL_COMPONENT_UPDATE and the
 * parity is known clean, also set the clean bit on each label (used at
 * shutdown/unconfigure time).
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which data column this spare replaced.
			   NOTE(review): scol stays -1 if no column refers
			   to sparecol — presumably impossible for a used
			   spare; confirm. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2913 
   2914 void
   2915 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2916 {
   2917 
   2918 	if (vp != NULL) {
   2919 		if (auto_configured == 1) {
   2920 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2921 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2922 			vput(vp);
   2923 
   2924 		} else {
   2925 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2926 		}
   2927 	}
   2928 }
   2929 
   2930 
   2931 void
   2932 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2933 {
   2934 	int r,c;
   2935 	struct vnode *vp;
   2936 	int acd;
   2937 
   2938 
   2939 	/* We take this opportunity to close the vnodes like we should.. */
   2940 
   2941 	for (c = 0; c < raidPtr->numCol; c++) {
   2942 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2943 		acd = raidPtr->Disks[c].auto_configured;
   2944 		rf_close_component(raidPtr, vp, acd);
   2945 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2946 		raidPtr->Disks[c].auto_configured = 0;
   2947 	}
   2948 
   2949 	for (r = 0; r < raidPtr->numSpare; r++) {
   2950 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2951 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2952 		rf_close_component(raidPtr, vp, acd);
   2953 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2954 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2955 	}
   2956 }
   2957 
   2958 
/*
 * Kernel-thread body: fail component req->col of the given set and,
 * if RF_FDFLAGS_RECON is set in req->flags, reconstruct its contents
 * onto a spare.  Ownership of req transfers to this thread; it is
 * freed here.  Never returns (exits via kthread_exit()).
 */
void
rf_ReconThread(struct rf_recon_req *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();		/* raise to block-I/O interrupt level */
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* we own req; release it once its contents have been consumed */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2980 
/*
 * Kernel-thread body: rewrite all parity for the set.  On success the
 * set is marked parity-clean; on failure an error is logged and the
 * parity remains marked dirty.  Wakes anyone sleeping on
 * parity_rewrite_in_progress (the shutdown path) before exiting.
 * Never returns (exits via kthread_exit()).
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();		/* raise to block-I/O interrupt level */
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   3011 
   3012 
/*
 * Kernel-thread body: copy reconstructed data from an in-use spare
 * back to the (replaced) original component.  Never returns (exits
 * via kthread_exit()).
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();		/* raise to block-I/O interrupt level */
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   3027 
   3028 
/*
 * Kernel-thread body: reconstruct component req->col in place (onto
 * the same disk slot, e.g. after a disk has been replaced).  Ownership
 * of req transfers to this thread; it is freed here.  Never returns
 * (exits via kthread_exit()).
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();		/* raise to block-I/O interrupt level */
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));	/* we own req; release it */
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   3046 
/*
 * Probe one candidate device/partition for a RAIDframe component label.
 * If a label is read and looks reasonable, a new RF_AutoConfig_t that
 * takes ownership of vp and the label is prepended to ac_list.
 * Otherwise the label memory is freed and vp is closed and released.
 * On allocation failure the entire ac_list is torn down and NULL is
 * returned.  Returns the (possibly new) head of the list.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* Out of memory: free every entry (and its label)
		       accumulated so far, then give up on autoconfig. */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;	/* list entry now owns the vnode */
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: no usable label, so drop the vnode too */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   3104 
/*
 * Scan every disk-class device in the system for RAIDframe components.
 * For each disk we probe, in order: wedge info (dk devices), the
 * disklabel's FS_RAID partitions, and finally the raw partition if
 * nothing else matched.  Each candidate is handed to rf_get_component()
 * which appends it to the returned RF_AutoConfig_t list.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	     dv = deviter_next(&di)) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		rf_part_found = 0; /*No raid partition as yet*/

		/* get a vnode for the raw partition of this disk */

		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		error = getdisksize(vp, &numsecs, &secsize);
		if (error) {
			vput(vp);
			continue;
		}
		if (wedge) {
			/* dk wedge: ask the wedge for its type/size and
			   probe it directly — no disklabel involved. */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				/* NOTE(review): opened FREAD|FSILENT but
				   closed FREAD|FWRITE — confirm intended. */
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			/* rf_get_component takes ownership of vp */
			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
			rf_part_found = 1; /*There is a raid component on this disk*/
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		/* Probe every partition the disklabel marks as RAID. */
		rf_part_found = 0; /*No raid partitions yet*/
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			/* rf_get_component takes ownership of vp */
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
		}

		/*
		 *If there is no raid component on this disk, either in a
		 *disklabel or inside a wedge, check the raw partition as well,
		 *as it is possible to configure raid components on raw disk
		 *devices.
		 */

		if (!rf_part_found) {
			char cname[sizeof(ac_list->devname)];

			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + RAW_PART);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
		}
	}
	deviter_release(&di);
	return ac_list;
}
   3286 
   3287 
   3288 int
   3289 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3290 {
   3291 
   3292 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3293 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3294 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3295 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3296 	    clabel->row >=0 &&
   3297 	    clabel->column >= 0 &&
   3298 	    clabel->num_rows > 0 &&
   3299 	    clabel->num_columns > 0 &&
   3300 	    clabel->row < clabel->num_rows &&
   3301 	    clabel->column < clabel->num_columns &&
   3302 	    clabel->blockSize > 0 &&
   3303 	    /*
   3304 	     * numBlocksHi may contain garbage, but it is ok since
   3305 	     * the type is unsigned.  If it is really garbage,
   3306 	     * rf_fix_old_label_size() will fix it.
   3307 	     */
   3308 	    rf_component_label_numblocks(clabel) > 0) {
   3309 		/*
   3310 		 * label looks reasonable enough...
   3311 		 * let's make sure it has no old garbage.
   3312 		 */
   3313 		if (numsecs)
   3314 			rf_fix_old_label_size(clabel, numsecs);
   3315 		return(1);
   3316 	}
   3317 	return(0);
   3318 }
   3319 
   3320 
   3321 /*
   3322  * For reasons yet unknown, some old component labels have garbage in
   3323  * the newer numBlocksHi region, and this causes lossage.  Since those
   3324  * disks will also have numsecs set to less than 32 bits of sectors,
   3325  * we can determine when this corruption has occurred, and fix it.
   3326  *
   3327  * The exact same problem, with the same unknown reason, happens to
   3328  * the partitionSizeHi member as well.
   3329  */
   3330 static void
   3331 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3332 {
   3333 
   3334 	if (numsecs < ((uint64_t)1 << 32)) {
   3335 		if (clabel->numBlocksHi) {
   3336 			printf("WARNING: total sectors < 32 bits, yet "
   3337 			       "numBlocksHi set\n"
   3338 			       "WARNING: resetting numBlocksHi to zero.\n");
   3339 			clabel->numBlocksHi = 0;
   3340 		}
   3341 
   3342 		if (clabel->partitionSizeHi) {
   3343 			printf("WARNING: total sectors < 32 bits, yet "
   3344 			       "partitionSizeHi set\n"
   3345 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3346 			clabel->partitionSizeHi = 0;
   3347 		}
   3348 	}
   3349 }
   3350 
   3351 
#ifdef DEBUG
/*
 * Debug helper: dump the interesting fields of a component label to
 * the console.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* root_partition is masked to two bits, so four names suffice */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3385 
   3386 RF_ConfigSet_t *
   3387 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3388 {
   3389 	RF_AutoConfig_t *ac;
   3390 	RF_ConfigSet_t *config_sets;
   3391 	RF_ConfigSet_t *cset;
   3392 	RF_AutoConfig_t *ac_next;
   3393 
   3394 
   3395 	config_sets = NULL;
   3396 
   3397 	/* Go through the AutoConfig list, and figure out which components
   3398 	   belong to what sets.  */
   3399 	ac = ac_list;
   3400 	while(ac!=NULL) {
   3401 		/* we're going to putz with ac->next, so save it here
   3402 		   for use at the end of the loop */
   3403 		ac_next = ac->next;
   3404 
   3405 		if (config_sets == NULL) {
   3406 			/* will need at least this one... */
   3407 			config_sets = (RF_ConfigSet_t *)
   3408 				malloc(sizeof(RF_ConfigSet_t),
   3409 				       M_RAIDFRAME, M_NOWAIT);
   3410 			if (config_sets == NULL) {
   3411 				panic("rf_create_auto_sets: No memory!");
   3412 			}
   3413 			/* this one is easy :) */
   3414 			config_sets->ac = ac;
   3415 			config_sets->next = NULL;
   3416 			config_sets->rootable = 0;
   3417 			ac->next = NULL;
   3418 		} else {
   3419 			/* which set does this component fit into? */
   3420 			cset = config_sets;
   3421 			while(cset!=NULL) {
   3422 				if (rf_does_it_fit(cset, ac)) {
   3423 					/* looks like it matches... */
   3424 					ac->next = cset->ac;
   3425 					cset->ac = ac;
   3426 					break;
   3427 				}
   3428 				cset = cset->next;
   3429 			}
   3430 			if (cset==NULL) {
   3431 				/* didn't find a match above... new set..*/
   3432 				cset = (RF_ConfigSet_t *)
   3433 					malloc(sizeof(RF_ConfigSet_t),
   3434 					       M_RAIDFRAME, M_NOWAIT);
   3435 				if (cset == NULL) {
   3436 					panic("rf_create_auto_sets: No memory!");
   3437 				}
   3438 				cset->ac = ac;
   3439 				ac->next = NULL;
   3440 				cset->next = config_sets;
   3441 				cset->rootable = 0;
   3442 				config_sets = cset;
   3443 			}
   3444 		}
   3445 		ac = ac_next;
   3446 	}
   3447 
   3448 
   3449 	return(config_sets);
   3450 }
   3451 
   3452 static int
   3453 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3454 {
   3455 	RF_ComponentLabel_t *clabel1, *clabel2;
   3456 
   3457 	/* If this one matches the *first* one in the set, that's good
   3458 	   enough, since the other members of the set would have been
   3459 	   through here too... */
   3460 	/* note that we are not checking partitionSize here..
   3461 
   3462 	   Note that we are also not checking the mod_counters here.
   3463 	   If everything else matches except the mod_counter, that's
   3464 	   good enough for this test.  We will deal with the mod_counters
   3465 	   a little later in the autoconfiguration process.
   3466 
   3467 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3468 
   3469 	   The reason we don't check for this is that failed disks
   3470 	   will have lower modification counts.  If those disks are
   3471 	   not added to the set they used to belong to, then they will
   3472 	   form their own set, which may result in 2 different sets,
   3473 	   for example, competing to be configured at raid0, and
   3474 	   perhaps competing to be the root filesystem set.  If the
   3475 	   wrong ones get configured, or both attempt to become /,
   3476 	   weird behaviour and or serious lossage will occur.  Thus we
   3477 	   need to bring them into the fold here, and kick them out at
   3478 	   a later point.
   3479 
   3480 	*/
   3481 
   3482 	clabel1 = cset->ac->clabel;
   3483 	clabel2 = ac->clabel;
   3484 	if ((clabel1->version == clabel2->version) &&
   3485 	    (clabel1->serial_number == clabel2->serial_number) &&
   3486 	    (clabel1->num_rows == clabel2->num_rows) &&
   3487 	    (clabel1->num_columns == clabel2->num_columns) &&
   3488 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3489 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3490 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3491 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3492 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3493 	    (clabel1->blockSize == clabel2->blockSize) &&
   3494 	    rf_component_label_numblocks(clabel1) ==
   3495 	    rf_component_label_numblocks(clabel2) &&
   3496 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3497 	    (clabel1->root_partition == clabel2->root_partition) &&
   3498 	    (clabel1->last_unit == clabel2->last_unit) &&
   3499 	    (clabel1->config_order == clabel2->config_order)) {
   3500 		/* if it get's here, it almost *has* to be a match */
   3501 	} else {
   3502 		/* it's not consistent with somebody in the set..
   3503 		   punt */
   3504 		return(0);
   3505 	}
   3506 	/* all was fine.. it must fit... */
   3507 	return(1);
   3508 }
   3509 
/*
 * Decide whether a configuration set has enough live, current
 * components to be configured.  A component counts only if its
 * mod_counter equals the set's maximum (stale components are treated
 * as missing).  RAID 1 gets special even/odd pair accounting: the set
 * is only hopeless when both members of a mirror pair are missing.
 * Returns 1 when configuration can be attempted, 0 otherwise.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* The set's authoritative mod_counter is the maximum over all
	   members; members with a lower count are out of date. */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* look for a current (mod_counter matches) component
		   claiming column c */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate one. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3612 
/*
 * Build an RF_Config_t from a set's component labels so the set can be
 * configured.  Geometry and queueing parameters come from the first
 * component's label; device names are filled in per column from each
 * member of the list.
 *
 * NOTE(review): the raidPtr parameter is unused here — confirm whether
 * it is kept only for interface stability.
 */
void
rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
			RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int i;

	clabel = ac->clabel;

	/* 1. Fill in the common stuff */
	/* note: this also forces the label's num_rows to 1 */
	config->numRow = clabel->num_rows = 1;
	config->numCol = clabel->num_columns;
	config->numSpare = 0; /* XXX should this be set here? */
	config->sectPerSU = clabel->sectPerSU;
	config->SUsPerPU = clabel->SUsPerPU;
	config->SUsPerRU = clabel->SUsPerRU;
	config->parityConfig = clabel->parityConfig;
	/* XXX... */
	strcpy(config->diskQueueType,"fifo");
	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
	config->layoutSpecificSize = 0; /* XXX ?? */

	/* 2. Record each component's device name at its column slot. */
	while(ac!=NULL) {
		/* row/col values will be in range due to the checks
		   in reasonable_label() */
		strcpy(config->devnames[0][ac->clabel->column],
		       ac->devname);
		ac = ac->next;
	}

	/* 3. No debug variables for autoconfigured sets. */
	for(i=0;i<RF_MAXDBGV;i++) {
		config->debugVars[i][0] = 0;
	}
}
   3647 
   3648 int
   3649 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3650 {
   3651 	RF_ComponentLabel_t *clabel;
   3652 	int column;
   3653 	int sparecol;
   3654 
   3655 	raidPtr->autoconfigure = new_value;
   3656 
   3657 	for(column=0; column<raidPtr->numCol; column++) {
   3658 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3659 			clabel = raidget_component_label(raidPtr, column);
   3660 			clabel->autoconfigure = new_value;
   3661 			raidflush_component_label(raidPtr, column);
   3662 		}
   3663 	}
   3664 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3665 		sparecol = raidPtr->numCol + column;
   3666 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3667 			clabel = raidget_component_label(raidPtr, sparecol);
   3668 			clabel->autoconfigure = new_value;
   3669 			raidflush_component_label(raidPtr, sparecol);
   3670 		}
   3671 	}
   3672 	return(new_value);
   3673 }
   3674 
   3675 int
   3676 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3677 {
   3678 	RF_ComponentLabel_t *clabel;
   3679 	int column;
   3680 	int sparecol;
   3681 
   3682 	raidPtr->root_partition = new_value;
   3683 	for(column=0; column<raidPtr->numCol; column++) {
   3684 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3685 			clabel = raidget_component_label(raidPtr, column);
   3686 			clabel->root_partition = new_value;
   3687 			raidflush_component_label(raidPtr, column);
   3688 		}
   3689 	}
   3690 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3691 		sparecol = raidPtr->numCol + column;
   3692 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3693 			clabel = raidget_component_label(raidPtr, sparecol);
   3694 			clabel->root_partition = new_value;
   3695 			raidflush_component_label(raidPtr, sparecol);
   3696 		}
   3697 	}
   3698 	return(new_value);
   3699 }
   3700 
   3701 void
   3702 rf_release_all_vps(RF_ConfigSet_t *cset)
   3703 {
   3704 	RF_AutoConfig_t *ac;
   3705 
   3706 	ac = cset->ac;
   3707 	while(ac!=NULL) {
   3708 		/* Close the vp, and give it back */
   3709 		if (ac->vp) {
   3710 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3711 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
   3712 			vput(ac->vp);
   3713 			ac->vp = NULL;
   3714 		}
   3715 		ac = ac->next;
   3716 	}
   3717 }
   3718 
   3719 
   3720 void
   3721 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3722 {
   3723 	RF_AutoConfig_t *ac;
   3724 	RF_AutoConfig_t *next_ac;
   3725 
   3726 	ac = cset->ac;
   3727 	while(ac!=NULL) {
   3728 		next_ac = ac->next;
   3729 		/* nuke the label */
   3730 		free(ac->clabel, M_RAIDFRAME);
   3731 		/* cleanup the config structure */
   3732 		free(ac, M_RAIDFRAME);
   3733 		/* "next.." */
   3734 		ac = next_ac;
   3735 	}
   3736 	/* and, finally, nuke the config set */
   3737 	free(cset, M_RAIDFRAME);
   3738 }
   3739 
   3740 
/*
 * Initialize a component label from the current state of the set:
 * identity (serial/mod counters), geometry, layout parameters, and
 * configuration flags.  The label is marked dirty and optimal; the
 * caller adjusts column/status afterwards as needed and flushes it.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	/* fill in the parity-map portion of the label as well */
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3773 
/*
 * Bring one autoconfigured RAID set to life: pick a raid unit,
 * build an RF_Config_t from the autoconfig data, and configure the
 * driver.  Returns the attached softc on success, NULL on failure.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("Out of mem!?!?\n");
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/*
	 * Start at the unit recorded in the first component label and
	 * walk upward until an unconfigured unit is found.
	 * NOTE(review): assumes raidget() always returns a softc and
	 * never NULL -- confirm against raidget()'s implementation.
	 */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID); sc->sc_r.valid != 0; sc = raidget(++raidID))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		/* Mark components dirty until parity is known good. */
		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* Configuration failed: release the unit we grabbed. */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3847 
   3848 void
   3849 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
   3850 {
   3851 	struct buf *bp;
   3852 	struct raid_softc *rs;
   3853 
   3854 	bp = (struct buf *)desc->bp;
   3855 	rs = desc->raidPtr->softc;
   3856 	disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid),
   3857 	    (bp->b_flags & B_READ));
   3858 }
   3859 
/*
 * Initialize a RAIDframe memory pool at IPL_BIO with the given item
 * size and wait channel, pre-primed and bounded between xmin and
 * xmax cached items.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);	/* cache at most xmax idle items */
	pool_prime(p, xmin);	/* pre-allocate xmin items up front */
	pool_setlowat(p, xmin);	/* keep at least xmin items cached */
}
   3869 
   3870 /*
   3871  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buf_queue to see
   3872  * if there is IO pending and if that IO could possibly be done for a
   3873  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3874  * otherwise.
   3875  *
   3876  */
   3877 
   3878 int
   3879 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3880 {
   3881 	struct raid_softc *rs = raidPtr->softc;
   3882 	if ((bufq_peek(rs->buf_queue) != NULL) && raidPtr->openings > 0) {
   3883 		/* there is work to do */
   3884 		return 0;
   3885 	}
   3886 	/* default is nothing to do */
   3887 	return 1;
   3888 }
   3889 
   3890 int
   3891 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3892 {
   3893 	uint64_t numsecs;
   3894 	unsigned secsize;
   3895 	int error;
   3896 
   3897 	error = getdisksize(vp, &numsecs, &secsize);
   3898 	if (error == 0) {
   3899 		diskPtr->blockSize = secsize;
   3900 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3901 		diskPtr->partitionSize = numsecs;
   3902 		return 0;
   3903 	}
   3904 	return error;
   3905 }
   3906 
   3907 static int
   3908 raid_match(device_t self, cfdata_t cfdata, void *aux)
   3909 {
   3910 	return 1;
   3911 }
   3912 
/*
 * Autoconf attach routine.  Intentionally empty: all real setup for a
 * raid unit happens later, via ioctl configuration or autoconfiguration.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{

}
   3918 
   3919 
   3920 static int
   3921 raid_detach(device_t self, int flags)
   3922 {
   3923 	int error;
   3924 	struct raid_softc *rs = raidget(device_unit(self));
   3925 
   3926 	if (rs == NULL)
   3927 		return ENXIO;
   3928 
   3929 	if ((error = raidlock(rs)) != 0)
   3930 		return (error);
   3931 
   3932 	error = raid_detach_unlocked(rs);
   3933 
   3934 	raidunlock(rs);
   3935 
   3936 	/* XXXkd: raidput(rs) ??? */
   3937 
   3938 	return error;
   3939 }
   3940 
   3941 static void
   3942 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3943 {
   3944 	struct disk_geom *dg = &rs->sc_dkdev.dk_geom;
   3945 
   3946 	memset(dg, 0, sizeof(*dg));
   3947 
   3948 	dg->dg_secperunit = raidPtr->totalSectors;
   3949 	dg->dg_secsize = raidPtr->bytesPerSector;
   3950 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   3951 	dg->dg_ntracks = 4 * raidPtr->numCol;
   3952 
   3953 	disk_set_info(rs->sc_dev, &rs->sc_dkdev, NULL);
   3954 }
   3955 
   3956 /*
   3957  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3958  * We end up returning whatever error was returned by the first cache flush
   3959  * that fails.
   3960  */
   3961 
   3962 int
   3963 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3964 {
   3965 	int c, sparecol;
   3966 	int e,error;
   3967 	int force = 1;
   3968 
   3969 	error = 0;
   3970 	for (c = 0; c < raidPtr->numCol; c++) {
   3971 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3972 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3973 					  &force, FWRITE, NOCRED);
   3974 			if (e) {
   3975 				if (e != ENODEV)
   3976 					printf("raid%d: cache flush to component %s failed.\n",
   3977 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3978 				if (error == 0) {
   3979 					error = e;
   3980 				}
   3981 			}
   3982 		}
   3983 	}
   3984 
   3985 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3986 		sparecol = raidPtr->numCol + c;
   3987 		/* Need to ensure that the reconstruct actually completed! */
   3988 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3989 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3990 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3991 			if (e) {
   3992 				if (e != ENODEV)
   3993 					printf("raid%d: cache flush to component %s failed.\n",
   3994 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3995 				if (error == 0) {
   3996 					error = e;
   3997 				}
   3998 			}
   3999 		}
   4000 	}
   4001 	return error;
   4002 }
   4003