Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.313
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.313 2014/10/11 12:01:27 mlelstv Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.313 2014/10/11 12:01:27 mlelstv Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 
    130 #include <prop/proplib.h>
    131 
    132 #include <dev/raidframe/raidframevar.h>
    133 #include <dev/raidframe/raidframeio.h>
    134 #include <dev/raidframe/rf_paritymap.h>
    135 
    136 #include "rf_raid.h"
    137 #include "rf_copyback.h"
    138 #include "rf_dag.h"
    139 #include "rf_dagflags.h"
    140 #include "rf_desc.h"
    141 #include "rf_diskqueue.h"
    142 #include "rf_etimer.h"
    143 #include "rf_general.h"
    144 #include "rf_kintf.h"
    145 #include "rf_options.h"
    146 #include "rf_driver.h"
    147 #include "rf_parityscan.h"
    148 #include "rf_threadstuff.h"
    149 
    150 #ifdef COMPAT_50
    151 #include "rf_compat50.h"
    152 #endif
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    162 static rf_declare_mutex2(rf_sparet_wait_mutex);
    163 static rf_declare_cond2(rf_sparet_wait_cv);
    164 static rf_declare_cond2(rf_sparet_resp_cv);
    165 
    166 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    167 						 * spare table */
    168 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    169 						 * installation process */
    170 #endif
    171 
    172 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    173 
    174 /* prototypes */
    175 static void KernelWakeupFunc(struct buf *);
    176 static void InitBP(struct buf *, struct vnode *, unsigned,
    177     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    178     void *, int, struct proc *);
    179 struct raid_softc;
    180 static void raidinit(struct raid_softc *);
    181 
    182 void raidattach(int);
    183 static int raid_match(device_t, cfdata_t, void *);
    184 static void raid_attach(device_t, device_t, void *);
    185 static int raid_detach(device_t, int);
    186 
    187 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    188     daddr_t, daddr_t);
    189 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t, int);
    191 
    192 static int raidwrite_component_label(unsigned,
    193     dev_t, struct vnode *, RF_ComponentLabel_t *);
    194 static int raidread_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 
    197 
    198 dev_type_open(raidopen);
    199 dev_type_close(raidclose);
    200 dev_type_read(raidread);
    201 dev_type_write(raidwrite);
    202 dev_type_ioctl(raidioctl);
    203 dev_type_strategy(raidstrategy);
    204 dev_type_dump(raiddump);
    205 dev_type_size(raidsize);
    206 
/*
 * Block-device switch for the raidN block nodes: routes open/close,
 * strategy, ioctl, crash dumps and size queries into this driver.
 */
     207 const struct bdevsw raid_bdevsw = {
     208 	.d_open = raidopen,
     209 	.d_close = raidclose,
     210 	.d_strategy = raidstrategy,
     211 	.d_ioctl = raidioctl,
     212 	.d_dump = raiddump,
     213 	.d_psize = raidsize,
     214 	.d_discard = nodiscard,
     215 	.d_flag = D_DISK
     216 };
     217 
/*
 * Character-device switch for the raw rraidN nodes.  Raw reads and
 * writes go through raidread/raidwrite; tty/poll/mmap are stubbed.
 */
     218 const struct cdevsw raid_cdevsw = {
     219 	.d_open = raidopen,
     220 	.d_close = raidclose,
     221 	.d_read = raidread,
     222 	.d_write = raidwrite,
     223 	.d_ioctl = raidioctl,
     224 	.d_stop = nostop,
     225 	.d_tty = notty,
     226 	.d_poll = nopoll,
     227 	.d_mmap = nommap,
     228 	.d_kqfilter = nokqfilter,
     229 	.d_discard = nodiscard,
     230 	.d_flag = D_DISK
     231 };
     232 
/* Glue for the generic disk(9) layer: strategy routine + minphys clamp. */
     233 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
    234 
/*
 * Per-unit driver state ("softc") for one raidN pseudo-device.  All
 * softcs live on the global `raids' list (protected by raid_lock) and
 * are looked up by unit number via raidget().
 */
     235 struct raid_softc {
     236 	device_t sc_dev;	/* autoconf(9) device handle */
     237 	int	sc_unit;	/* unit number (the N in raidN) */
     238 	int     sc_flags;	/* state flags; see RAIDF_* below */
     239 	int     sc_cflags;	/* configuration flags */
     240 	uint64_t sc_size;	/* size of the raid device */
     241 	char    sc_xname[20];	/* XXX external name */
     242 	struct disk sc_dkdev;	/* generic disk device info */
     243 	struct bufq_state *buf_queue;	/* used for the device queue */
     244 	RF_Raid_t sc_r;		/* RAIDframe core state for this set */
     245 	LIST_ENTRY(raid_softc) sc_link;	/* linkage on the global list */
     246 };
    247 /* sc_flags */
    248 #define RAIDF_INITED	0x01	/* unit has been initialized */
    249 #define RAIDF_WLABEL	0x02	/* label area is writable */
    250 #define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
    251 #define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
    252 #define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
    253 #define RAIDF_LOCKED	0x80	/* unit is locked */
    254 
    255 #define	raidunit(x)	DISKUNIT(x)
    256 
    257 extern struct cfdriver raid_cd;
    258 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    259     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    260     DVF_DETACH_SHUTDOWN);
    261 
    262 /*
    263  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    264  * Be aware that large numbers can allow the driver to consume a lot of
    265  * kernel memory, especially on writes, and in degraded mode reads.
    266  *
    267  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    268  * a single 64K write will typically require 64K for the old data,
    269  * 64K for the old parity, and 64K for the new parity, for a total
    270  * of 192K (if the parity buffer is not re-used immediately).
    271  * Even it if is used immediately, that's still 128K, which when multiplied
    272  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    273  *
    274  * Now in degraded mode, for example, a 64K read on the above setup may
    275  * require data reconstruction, which will require *all* of the 4 remaining
    276  * disks to participate -- 4 * 32K/disk == 128K again.
    277  */
    278 
    279 #ifndef RAIDOUTSTANDING
    280 #define RAIDOUTSTANDING   6
    281 #endif
    282 
    283 #define RAIDLABELDEV(dev)	\
    284 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    285 
    286 /* declared here, and made public, for the benefit of KVM stuff.. */
    287 
    288 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
    289 				     struct disklabel *);
    290 static void raidgetdisklabel(dev_t);
    291 static void raidmakedisklabel(struct raid_softc *);
    292 
    293 static int raidlock(struct raid_softc *);
    294 static void raidunlock(struct raid_softc *);
    295 
    296 static int raid_detach_unlocked(struct raid_softc *);
    297 
    298 static void rf_markalldirty(RF_Raid_t *);
    299 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    300 
    301 void rf_ReconThread(struct rf_recon_req *);
    302 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    303 void rf_CopybackThread(RF_Raid_t *raidPtr);
    304 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    305 int rf_autoconfig(device_t);
    306 void rf_buildroothack(RF_ConfigSet_t *);
    307 
    308 RF_AutoConfig_t *rf_find_raid_components(void);
    309 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    310 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    311 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    312 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    313 int rf_set_autoconfig(RF_Raid_t *, int);
    314 int rf_set_rootpartition(RF_Raid_t *, int);
    315 void rf_release_all_vps(RF_ConfigSet_t *);
    316 void rf_cleanup_config_set(RF_ConfigSet_t *);
    317 int rf_have_enough_components(RF_ConfigSet_t *);
    318 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    319 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    320 
    321 /*
    322  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    323  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    324  * in the kernel config file.
    325  */
    326 #ifdef RAID_AUTOCONFIG
    327 int raidautoconfig = 1;
    328 #else
    329 int raidautoconfig = 0;
    330 #endif
    331 static bool raidautoconfigdone = false;
    332 
    333 struct RF_Pools_s rf_pools;
    334 
    335 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    336 static kmutex_t raid_lock;
    337 
    338 static struct raid_softc *
    339 raidcreate(int unit) {
    340 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    341 	if (sc == NULL) {
    342 #ifdef DIAGNOSTIC
    343 		printf("%s: out of memory\n", __func__);
    344 #endif
    345 		return NULL;
    346 	}
    347 	sc->sc_unit = unit;
    348 	bufq_alloc(&sc->buf_queue, "fcfs", BUFQ_SORT_RAWBLOCK);
    349 	return sc;
    350 }
    351 
    352 static void
    353 raiddestroy(struct raid_softc *sc) {
    354 	bufq_free(sc->buf_queue);
    355 	kmem_free(sc, sizeof(*sc));
    356 }
    357 
    358 static struct raid_softc *
    359 raidget(int unit) {
    360 	struct raid_softc *sc;
    361 	if (unit < 0) {
    362 #ifdef DIAGNOSTIC
    363 		panic("%s: unit %d!", __func__, unit);
    364 #endif
    365 		return NULL;
    366 	}
    367 	mutex_enter(&raid_lock);
    368 	LIST_FOREACH(sc, &raids, sc_link) {
    369 		if (sc->sc_unit == unit) {
    370 			mutex_exit(&raid_lock);
    371 			return sc;
    372 		}
    373 	}
    374 	mutex_exit(&raid_lock);
    375 	if ((sc = raidcreate(unit)) == NULL)
    376 		return NULL;
    377 	mutex_enter(&raid_lock);
    378 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    379 	mutex_exit(&raid_lock);
    380 	return sc;
    381 }
    382 
    383 static void
    384 raidput(struct raid_softc *sc) {
    385 	mutex_enter(&raid_lock);
    386 	LIST_REMOVE(sc, sc_link);
    387 	mutex_exit(&raid_lock);
    388 	raiddestroy(sc);
    389 }
    390 
/*
 * raidattach: pseudo-device attach hook, called once at boot with the
 * number of requested units (`num', unused -- softcs are created on
 * demand by raidget()).  Initializes the global softc-list lock,
 * boots the RAIDframe core, hooks the driver into autoconf(9), and
 * registers a config finalizer so RAID autoconfiguration runs only
 * after all real hardware devices have been found.
 */
     391 void
     392 raidattach(int num)
     393 {
     394 	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
     395 	/* This is where all the initialization stuff gets done. */
     396 
     397 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Spare-table installation handshake (declustered parity only). */
     398 	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
     399 	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
     400 	rf_init_cond2(rf_sparet_resp_cv, "rfgst");
     401 
     402 	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
     403 #endif
     404 
	/* Bring up the RAIDframe core; nothing else works without it. */
     405 	if (rf_BootRaidframe() == 0)
     406 		aprint_verbose("Kernelized RAIDframe activated\n");
     407 	else
     408 		panic("Serious error booting RAID!!");
     409 
     410 	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
     411 		aprint_error("raidattach: config_cfattach_attach failed?\n");
     412 	}
     413 
     414 	raidautoconfigdone = false;
     415 
     416 	/*
     417 	 * Register a finalizer which will be used to auto-config RAID
     418 	 * sets once all real hardware devices have been found.
     419 	 */
     420 	if (config_finalize_register(NULL, rf_autoconfig) != 0)
     421 		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
     422 }
    423 
    424 int
    425 rf_autoconfig(device_t self)
    426 {
    427 	RF_AutoConfig_t *ac_list;
    428 	RF_ConfigSet_t *config_sets;
    429 
    430 	if (!raidautoconfig || raidautoconfigdone == true)
    431 		return (0);
    432 
    433 	/* XXX This code can only be run once. */
    434 	raidautoconfigdone = true;
    435 
    436 #ifdef __HAVE_CPU_BOOTCONF
    437 	/*
    438 	 * 0. find the boot device if needed first so we can use it later
    439 	 * this needs to be done before we autoconfigure any raid sets,
    440 	 * because if we use wedges we are not going to be able to open
    441 	 * the boot device later
    442 	 */
    443 	if (booted_device == NULL)
    444 		cpu_bootconf();
    445 #endif
    446 	/* 1. locate all RAID components on the system */
    447 	aprint_debug("Searching for RAID components...\n");
    448 	ac_list = rf_find_raid_components();
    449 
    450 	/* 2. Sort them into their respective sets. */
    451 	config_sets = rf_create_auto_sets(ac_list);
    452 
    453 	/*
    454 	 * 3. Evaluate each set and configure the valid ones.
    455 	 * This gets done in rf_buildroothack().
    456 	 */
    457 	rf_buildroothack(config_sets);
    458 
    459 	return 1;
    460 }
    461 
    462 static int
    463 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    464 	const char *bootname = device_xname(bdv);
    465 	size_t len = strlen(bootname);
    466 
    467 	for (int col = 0; col < r->numCol; col++) {
    468 		const char *devname = r->Disks[col].devname;
    469 		devname += sizeof("/dev/") - 1;
    470 		if (strncmp(devname, "dk", 2) == 0) {
    471 			const char *parent =
    472 			    dkwedge_get_parent_name(r->Disks[col].dev);
    473 			if (parent != NULL)
    474 				devname = parent;
    475 		}
    476 		if (strncmp(devname, bootname, len) == 0) {
    477 			struct raid_softc *sc = r->softc;
    478 			aprint_debug("raid%d includes boot device %s\n",
    479 			    sc->sc_unit, devname);
    480 			return 1;
    481 		}
    482 	}
    483 	return 0;
    484 }
    485 
/*
 * rf_buildroothack: second half of autoconfiguration.  Configure every
 * eligible set in `config_sets' (releasing the resources of the rest),
 * then -- unless the user hardwired root via `rootspec' -- try to make
 * a root-able RAID set the boot device:
 *  - exactly one candidate: use it if there is no booted_device, if it
 *    is forced root (root_partition == 1), or if it contains the
 *    original boot device;
 *  - several candidates: keep only those containing booted_device;
 *    if still not unique, ask the user (RB_ASKNAME).
 * All config sets are consumed/cleaned up before returning.
 */
     486 void
     487 rf_buildroothack(RF_ConfigSet_t *config_sets)
     488 {
     489 	RF_ConfigSet_t *cset;
     490 	RF_ConfigSet_t *next_cset;
     491 	int num_root;
     492 	struct raid_softc *sc, *rsc;
     493 
     494 	sc = rsc = NULL;
     495 	num_root = 0;
     496 	cset = config_sets;
     497 	while (cset != NULL) {
     498 		next_cset = cset->next;
     499 		if (rf_have_enough_components(cset) &&
     500 		    cset->ac->clabel->autoconfigure == 1) {
     501 			sc = rf_auto_config_set(cset);
     502 			if (sc != NULL) {
     503 				aprint_debug("raid%d: configured ok\n",
     504 				    sc->sc_unit);
     505 				if (cset->rootable) {
     506 					rsc = sc;
     507 					num_root++;
     508 				}
     509 			} else {
     510 				/* The autoconfig didn't work :( */
     511 				aprint_debug("Autoconfig failed\n");
     512 				rf_release_all_vps(cset);
     513 			}
     514 		} else {
     515 			/* we're not autoconfiguring this set...
     516 			   release the associated resources */
     517 			rf_release_all_vps(cset);
     518 		}
     519 		/* cleanup */
     520 		rf_cleanup_config_set(cset);
     521 		cset = next_cset;
     522 	}
     523 
     524 	/* if the user has specified what the root device should be
     525 	   then we don't touch booted_device or boothowto... */
     526 
     527 	if (rootspec != NULL)
     528 		return;
     529 
     530 	/* a rootable set may have been found; decide whether to use it */
     531 
     532 	/*
     533 	 * XXX: The following code assumes that the root raid
     534 	 * is the first ('a') partition. This is about the best
     535 	 * we can do with a BSD disklabel, but we might be able
     536 	 * to do better with a GPT label, by setting a specified
     537 	 * attribute to indicate the root partition. We can then
     538 	 * stash the partition number in the r->root_partition
     539 	 * high bits (the bottom 2 bits are already used). For
     540 	 * now we just set booted_partition to 0 when we override
     541 	 * root.
     542 	 */
     543 	if (num_root == 1) {
     544 		device_t candidate_root;
     545 		if (rsc->sc_dkdev.dk_nwedges != 0) {
     546 			char cname[sizeof(cset->ac->devname)];
     547 			/* XXX: assume 'a' */
     548 			snprintf(cname, sizeof(cname), "%s%c",
     549 			    device_xname(rsc->sc_dev), 'a');
     550 			candidate_root = dkwedge_find_by_wname(cname);
     551 		} else
     552 			candidate_root = rsc->sc_dev;
     553 		if (booted_device == NULL ||
     554 		    rsc->sc_r.root_partition == 1 ||
     555 		    rf_containsboot(&rsc->sc_r, booted_device)) {
     556 			booted_device = candidate_root;
     557 			booted_partition = 0;	/* XXX assume 'a' */
     558 		}
     559 	} else if (num_root > 1) {
     560 
     561 		/*
     562 		 * Maybe the MD code can help. If it cannot, then
     563 		 * setroot() will discover that we have no
     564 		 * booted_device and will ask the user if nothing was
     565 		 * hardwired in the kernel config file
     566 		 */
     567 		if (booted_device == NULL)
     568 			return;
     569 
     570 		num_root = 0;
     571 		mutex_enter(&raid_lock);
     572 		LIST_FOREACH(sc, &raids, sc_link) {
     573 			RF_Raid_t *r = &sc->sc_r;
     574 			if (r->valid == 0)
     575 				continue;
     576 
     577 			if (r->root_partition == 0)
     578 				continue;
     579 
     580 			if (rf_containsboot(r, booted_device)) {
     581 				num_root++;
     582 				rsc = sc;
     583 			}
     584 		}
     585 		mutex_exit(&raid_lock);
     586 
     587 		if (num_root == 1) {
     588 			booted_device = rsc->sc_dev;
     589 			booted_partition = 0;	/* XXX assume 'a' */
     590 		} else {
     591 			/* we can't guess.. require the user to answer... */
     592 			boothowto |= RB_ASKNAME;
     593 		}
     594 	}
     595 }
    596 
    597 
    598 int
    599 raidsize(dev_t dev)
    600 {
    601 	struct raid_softc *rs;
    602 	struct disklabel *lp;
    603 	int     part, unit, omask, size;
    604 
    605 	unit = raidunit(dev);
    606 	if ((rs = raidget(unit)) == NULL)
    607 		return -1;
    608 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    609 		return (-1);
    610 
    611 	part = DISKPART(dev);
    612 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
    613 	lp = rs->sc_dkdev.dk_label;
    614 
    615 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
    616 		return (-1);
    617 
    618 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
    619 		size = -1;
    620 	else
    621 		size = lp->d_partitions[part].p_size *
    622 		    (lp->d_secsize / DEV_BSIZE);
    623 
    624 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
    625 		return (-1);
    626 
    627 	return (size);
    628 
    629 }
    630 
    631 int
    632 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    633 {
    634 	int     unit = raidunit(dev);
    635 	struct raid_softc *rs;
    636 	const struct bdevsw *bdev;
    637 	struct disklabel *lp;
    638 	RF_Raid_t *raidPtr;
    639 	daddr_t offset;
    640 	int     part, c, sparecol, j, scol, dumpto;
    641 	int     error = 0;
    642 
    643 	if ((rs = raidget(unit)) == NULL)
    644 		return ENXIO;
    645 
    646 	raidPtr = &rs->sc_r;
    647 
    648 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    649 		return ENXIO;
    650 
    651 	/* we only support dumping to RAID 1 sets */
    652 	if (raidPtr->Layout.numDataCol != 1 ||
    653 	    raidPtr->Layout.numParityCol != 1)
    654 		return EINVAL;
    655 
    656 
    657 	if ((error = raidlock(rs)) != 0)
    658 		return error;
    659 
    660 	if (size % DEV_BSIZE != 0) {
    661 		error = EINVAL;
    662 		goto out;
    663 	}
    664 
    665 	if (blkno + size / DEV_BSIZE > rs->sc_size) {
    666 		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
    667 		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
    668 		    size / DEV_BSIZE, rs->sc_size);
    669 		error = EINVAL;
    670 		goto out;
    671 	}
    672 
    673 	part = DISKPART(dev);
    674 	lp = rs->sc_dkdev.dk_label;
    675 	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;
    676 
    677 	/* figure out what device is alive.. */
    678 
    679 	/*
    680 	   Look for a component to dump to.  The preference for the
    681 	   component to dump to is as follows:
    682 	   1) the master
    683 	   2) a used_spare of the master
    684 	   3) the slave
    685 	   4) a used_spare of the slave
    686 	*/
    687 
    688 	dumpto = -1;
    689 	for (c = 0; c < raidPtr->numCol; c++) {
    690 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
    691 			/* this might be the one */
    692 			dumpto = c;
    693 			break;
    694 		}
    695 	}
    696 
    697 	/*
    698 	   At this point we have possibly selected a live master or a
    699 	   live slave.  We now check to see if there is a spared
    700 	   master (or a spared slave), if we didn't find a live master
    701 	   or a live slave.
    702 	*/
    703 
    704 	for (c = 0; c < raidPtr->numSpare; c++) {
    705 		sparecol = raidPtr->numCol + c;
    706 		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
    707 			/* How about this one? */
    708 			scol = -1;
    709 			for(j=0;j<raidPtr->numCol;j++) {
    710 				if (raidPtr->Disks[j].spareCol == sparecol) {
    711 					scol = j;
    712 					break;
    713 				}
    714 			}
    715 			if (scol == 0) {
    716 				/*
    717 				   We must have found a spared master!
    718 				   We'll take that over anything else
    719 				   found so far.  (We couldn't have
    720 				   found a real master before, since
    721 				   this is a used spare, and it's
    722 				   saying that it's replacing the
    723 				   master.)  On reboot (with
    724 				   autoconfiguration turned on)
    725 				   sparecol will become the 1st
    726 				   component (component0) of this set.
    727 				*/
    728 				dumpto = sparecol;
    729 				break;
    730 			} else if (scol != -1) {
    731 				/*
    732 				   Must be a spared slave.  We'll dump
    733 				   to that if we havn't found anything
    734 				   else so far.
    735 				*/
    736 				if (dumpto == -1)
    737 					dumpto = sparecol;
    738 			}
    739 		}
    740 	}
    741 
    742 	if (dumpto == -1) {
    743 		/* we couldn't find any live components to dump to!?!?
    744 		 */
    745 		error = EINVAL;
    746 		goto out;
    747 	}
    748 
    749 	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
    750 
    751 	/*
    752 	   Note that blkno is relative to this particular partition.
    753 	   By adding the offset of this partition in the RAID
    754 	   set, and also adding RF_PROTECTED_SECTORS, we get a
    755 	   value that is relative to the partition used for the
    756 	   underlying component.
    757 	*/
    758 
    759 	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
    760 				blkno + offset, va, size);
    761 
    762 out:
    763 	raidunlock(rs);
    764 
    765 	return error;
    766 }
    767 /* ARGSUSED */
/*
 * raidopen: open entry point for both block and character nodes.
 *
 * Serialized via raidlock().  Fails with EBUSY if the unit is being
 * shut down, or if wedges exist and a non-raw partition is opened.
 * On the first open of a configured unit the disklabel is (re)read
 * and all component labels are marked dirty, so an unclean shutdown
 * is detectable later.  The per-format open masks keep the unit from
 * being unconfigured while any partition is open.
 * Note: the `bad:' label is also the normal exit path (error == 0).
 */
     768 int
     769 raidopen(dev_t dev, int flags, int fmt,
     770     struct lwp *l)
     771 {
     772 	int     unit = raidunit(dev);
     773 	struct raid_softc *rs;
     774 	struct disklabel *lp;
     775 	int     part, pmask;
     776 	int     error = 0;
     777 
     778 	if ((rs = raidget(unit)) == NULL)
     779 		return ENXIO;
     780 	if ((error = raidlock(rs)) != 0)
     781 		return (error);
     782 
     783 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
     784 		error = EBUSY;
     785 		goto bad;
     786 	}
     787 
     788 	lp = rs->sc_dkdev.dk_label;
     789 
     790 	part = DISKPART(dev);
     791 
     792 	/*
     793 	 * If there are wedges, and this is not RAW_PART, then we
     794 	 * need to fail.
     795 	 */
     796 	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
     797 		error = EBUSY;
     798 		goto bad;
     799 	}
     800 	pmask = (1 << part);
     801 
	/* First open of a configured unit: (re)read the disklabel. */
     802 	if ((rs->sc_flags & RAIDF_INITED) &&
     803 	    (rs->sc_dkdev.dk_openmask == 0))
     804 		raidgetdisklabel(dev);
     805 
     806 	/* make sure that this partition exists */
     807 
     808 	if (part != RAW_PART) {
     809 		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
     810 		    ((part >= lp->d_npartitions) ||
     811 			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
     812 			error = ENXIO;
     813 			goto bad;
     814 		}
     815 	}
     816 	/* Prevent this unit from being unconfigured while open. */
     817 	switch (fmt) {
     818 	case S_IFCHR:
     819 		rs->sc_dkdev.dk_copenmask |= pmask;
     820 		break;
     821 
     822 	case S_IFBLK:
     823 		rs->sc_dkdev.dk_bopenmask |= pmask;
     824 		break;
     825 	}
     826 
	/* dk_openmask is still the pre-open value here: 0 == first open. */
     827 	if ((rs->sc_dkdev.dk_openmask == 0) &&
     828 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
     829 		/* First one... mark things as dirty... Note that we *MUST*
     830 		 have done a configure before this.  I DO NOT WANT TO BE
     831 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
     832 		 THAT THEY BELONG TOGETHER!!!!! */
     833 		/* XXX should check to see if we're only open for reading
     834 		   here... If so, we needn't do this, but then need some
     835 		   other way of keeping track of what's happened.. */
     836 
     837 		rf_markalldirty(&rs->sc_r);
     838 	}
     839 
     840 
     841 	rs->sc_dkdev.dk_openmask =
     842 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
     843 
     844 bad:
     845 	raidunlock(rs);
     846 
     847 	return (error);
     848 
     849 
     850 }
    851 /* ARGSUSED */
/*
 * raidclose: close entry point for block and character nodes.
 *
 * Clears this partition's bit from the per-format open mask; when the
 * last partition of a still-configured unit closes, write final
 * ("clean") component labels.  Returns 0 unless the unit cannot be
 * found (ENXIO) or locked.
 */
     852 int
     853 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
     854 {
     855 	int     unit = raidunit(dev);
     856 	struct raid_softc *rs;
     857 	int     error = 0;
     858 	int     part;
     859 
     860 	if ((rs = raidget(unit)) == NULL)
     861 		return ENXIO;
     862 
     863 	if ((error = raidlock(rs)) != 0)
     864 		return (error);
     865 
     866 	part = DISKPART(dev);
     867 
     868 	/* ...that much closer to allowing unconfiguration... */
     869 	switch (fmt) {
     870 	case S_IFCHR:
     871 		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
     872 		break;
     873 
     874 	case S_IFBLK:
     875 		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
     876 		break;
     877 	}
     878 	rs->sc_dkdev.dk_openmask =
     879 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
     880 
     881 	if ((rs->sc_dkdev.dk_openmask == 0) &&
     882 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
     883 		/* Last one... device is not unconfigured yet.
     884 		   Device shutdown has taken care of setting the
     885 		   clean bits if RAIDF_INITED is not set
     886 		   mark things as clean... */
     887 
     888 		rf_update_component_labels(&rs->sc_r,
     889 						 RF_FINAL_COMPONENT_UPDATE);
     890 
     891 		/* If the kernel is shutting down, it will detach
     892 		 * this RAID set soon enough.
     893 		 */
     894 	}
     895 
     896 	raidunlock(rs);
     897 	return (0);
     898 
     899 }
    900 
/*
 * Driver strategy entry point: validate the buffer, bounds-check the
 * transfer against the device/partition, and queue it for the raid
 * I/O thread.  On any early failure the buffer is completed in place
 * with b_resid == b_bcount and the error set in b_error.
 */
void
raidstrategy(struct buf *bp)
{
	unsigned int unit = raidunit(bp->b_dev);
	RF_Raid_t *raidPtr;
	int     wlabel;
	struct raid_softc *rs;

	/* No softc, not configured, or RAID descriptor invalid: reject. */
	if ((rs = raidget(unit)) == NULL) {
		bp->b_error = ENXIO;
		goto done;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto done;
	}
	raidPtr = &rs->sc_r;
	if (!raidPtr->valid) {
		bp->b_error = ENODEV;
		goto done;
	}
	/* Zero-length transfers complete immediately with no error. */
	if (bp->b_bcount == 0) {
		db1_printf(("b_bcount is zero..\n"));
		goto done;
	}

	/*
	 * Do bounds checking and adjust transfer.  If there's an
	 * error, the bounds check will flag that for us.
	 */

	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
	if (DISKPART(bp->b_dev) == RAW_PART) {
		uint64_t size; /* device size in DEV_BSIZE unit */

		/*
		 * Convert totalSectors (in raid-native sectors) to
		 * DEV_BSIZE units; shift direction depends on whether
		 * the raid sector size is larger or smaller than
		 * DEV_BSIZE.
		 */
		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
			size = raidPtr->totalSectors <<
			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
		} else {
			size = raidPtr->totalSectors >>
			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
		}
		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
			goto done;
		}
	} else {
		/* Partition I/O is checked against the disklabel;
		   wlabel permits writes that overlap the label area. */
		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
			db1_printf(("Bounds check failed!!:%d %d\n",
				(int) bp->b_blkno, (int) wlabel));
			goto done;
		}
	}

	/* iodone_lock protects both the queue and the wakeup below. */
	rf_lock_mutex2(raidPtr->iodone_lock);

	bp->b_resid = 0;

	/* stuff it onto our queue */
	bufq_put(rs->buf_queue, bp);

	/* schedule the IO to happen at the next convenient time */
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);

	return;

done:
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}
    971 /* ARGSUSED */
    972 int
    973 raidread(dev_t dev, struct uio *uio, int flags)
    974 {
    975 	int     unit = raidunit(dev);
    976 	struct raid_softc *rs;
    977 
    978 	if ((rs = raidget(unit)) == NULL)
    979 		return ENXIO;
    980 
    981 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    982 		return (ENXIO);
    983 
    984 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    985 
    986 }
    987 /* ARGSUSED */
    988 int
    989 raidwrite(dev_t dev, struct uio *uio, int flags)
    990 {
    991 	int     unit = raidunit(dev);
    992 	struct raid_softc *rs;
    993 
    994 	if ((rs = raidget(unit)) == NULL)
    995 		return ENXIO;
    996 
    997 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    998 		return (ENXIO);
    999 
   1000 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
   1001 
   1002 }
   1003 
   1004 static int
   1005 raid_detach_unlocked(struct raid_softc *rs)
   1006 {
   1007 	int error;
   1008 	RF_Raid_t *raidPtr;
   1009 
   1010 	raidPtr = &rs->sc_r;
   1011 
   1012 	/*
   1013 	 * If somebody has a partition mounted, we shouldn't
   1014 	 * shutdown.
   1015 	 */
   1016 	if (rs->sc_dkdev.dk_openmask != 0)
   1017 		return EBUSY;
   1018 
   1019 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1020 		;	/* not initialized: nothing to do */
   1021 	else if ((error = rf_Shutdown(raidPtr)) != 0)
   1022 		return error;
   1023 	else
   1024 		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
   1025 
   1026 	/* Detach the disk. */
   1027 	dkwedge_delall(&rs->sc_dkdev);
   1028 	disk_detach(&rs->sc_dkdev);
   1029 	disk_destroy(&rs->sc_dkdev);
   1030 
   1031 	aprint_normal_dev(rs->sc_dev, "detached\n");
   1032 
   1033 	return 0;
   1034 }
   1035 
   1036 int
   1037 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1038 {
   1039 	int     unit = raidunit(dev);
   1040 	int     error = 0;
   1041 	int     part, pmask, s;
   1042 	cfdata_t cf;
   1043 	struct raid_softc *rs;
   1044 	RF_Config_t *k_cfg, *u_cfg;
   1045 	RF_Raid_t *raidPtr;
   1046 	RF_RaidDisk_t *diskPtr;
   1047 	RF_AccTotals_t *totals;
   1048 	RF_DeviceConfig_t *d_cfg, **ucfgp;
   1049 	u_char *specific_buf;
   1050 	int retcode = 0;
   1051 	int column;
   1052 /*	int raidid; */
   1053 	struct rf_recon_req *rrcopy, *rr;
   1054 	RF_ComponentLabel_t *clabel;
   1055 	RF_ComponentLabel_t *ci_label;
   1056 	RF_ComponentLabel_t **clabel_ptr;
   1057 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1058 	RF_SingleComponent_t component;
   1059 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
   1060 	int i, j, d;
   1061 #ifdef __HAVE_OLD_DISKLABEL
   1062 	struct disklabel newlabel;
   1063 #endif
   1064 	struct dkwedge_info *dkw;
   1065 
   1066 	if ((rs = raidget(unit)) == NULL)
   1067 		return ENXIO;
   1068 	raidPtr = &rs->sc_r;
   1069 
   1070 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1071 		(int) DISKPART(dev), (int) unit, cmd));
   1072 
   1073 	/* Must be open for writes for these commands... */
   1074 	switch (cmd) {
   1075 #ifdef DIOCGSECTORSIZE
   1076 	case DIOCGSECTORSIZE:
   1077 		*(u_int *)data = raidPtr->bytesPerSector;
   1078 		return 0;
   1079 	case DIOCGMEDIASIZE:
   1080 		*(off_t *)data =
   1081 		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
   1082 		return 0;
   1083 #endif
   1084 	case DIOCSDINFO:
   1085 	case DIOCWDINFO:
   1086 #ifdef __HAVE_OLD_DISKLABEL
   1087 	case ODIOCWDINFO:
   1088 	case ODIOCSDINFO:
   1089 #endif
   1090 	case DIOCWLABEL:
   1091 	case DIOCAWEDGE:
   1092 	case DIOCDWEDGE:
   1093 	case DIOCSSTRATEGY:
   1094 		if ((flag & FWRITE) == 0)
   1095 			return (EBADF);
   1096 	}
   1097 
   1098 	/* Must be initialized for these... */
   1099 	switch (cmd) {
   1100 	case DIOCGDINFO:
   1101 	case DIOCSDINFO:
   1102 	case DIOCWDINFO:
   1103 #ifdef __HAVE_OLD_DISKLABEL
   1104 	case ODIOCGDINFO:
   1105 	case ODIOCWDINFO:
   1106 	case ODIOCSDINFO:
   1107 	case ODIOCGDEFLABEL:
   1108 #endif
   1109 	case DIOCGPART:
   1110 	case DIOCWLABEL:
   1111 	case DIOCGDEFLABEL:
   1112 	case DIOCAWEDGE:
   1113 	case DIOCDWEDGE:
   1114 	case DIOCLWEDGES:
   1115 	case DIOCCACHESYNC:
   1116 	case RAIDFRAME_SHUTDOWN:
   1117 	case RAIDFRAME_REWRITEPARITY:
   1118 	case RAIDFRAME_GET_INFO:
   1119 	case RAIDFRAME_RESET_ACCTOTALS:
   1120 	case RAIDFRAME_GET_ACCTOTALS:
   1121 	case RAIDFRAME_KEEP_ACCTOTALS:
   1122 	case RAIDFRAME_GET_SIZE:
   1123 	case RAIDFRAME_FAIL_DISK:
   1124 	case RAIDFRAME_COPYBACK:
   1125 	case RAIDFRAME_CHECK_RECON_STATUS:
   1126 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1127 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1128 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1129 	case RAIDFRAME_ADD_HOT_SPARE:
   1130 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1131 	case RAIDFRAME_INIT_LABELS:
   1132 	case RAIDFRAME_REBUILD_IN_PLACE:
   1133 	case RAIDFRAME_CHECK_PARITY:
   1134 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1135 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1136 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1137 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1138 	case RAIDFRAME_SET_AUTOCONFIG:
   1139 	case RAIDFRAME_SET_ROOT:
   1140 	case RAIDFRAME_DELETE_COMPONENT:
   1141 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1142 	case RAIDFRAME_PARITYMAP_STATUS:
   1143 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1144 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1145 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1146 	case DIOCGSTRATEGY:
   1147 	case DIOCSSTRATEGY:
   1148 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1149 			return (ENXIO);
   1150 	}
   1151 
   1152 	switch (cmd) {
   1153 #ifdef COMPAT_50
   1154 	case RAIDFRAME_GET_INFO50:
   1155 		return rf_get_info50(raidPtr, data);
   1156 
   1157 	case RAIDFRAME_CONFIGURE50:
   1158 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1159 			return retcode;
   1160 		goto config;
   1161 #endif
   1162 		/* configure the system */
   1163 	case RAIDFRAME_CONFIGURE:
   1164 
   1165 		if (raidPtr->valid) {
   1166 			/* There is a valid RAID set running on this unit! */
   1167 			printf("raid%d: Device already configured!\n",unit);
   1168 			return(EINVAL);
   1169 		}
   1170 
   1171 		/* copy-in the configuration information */
   1172 		/* data points to a pointer to the configuration structure */
   1173 
   1174 		u_cfg = *((RF_Config_t **) data);
   1175 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1176 		if (k_cfg == NULL) {
   1177 			return (ENOMEM);
   1178 		}
   1179 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1180 		if (retcode) {
   1181 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1182 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1183 				retcode));
   1184 			return (retcode);
   1185 		}
   1186 		goto config;
   1187 	config:
   1188 		/* allocate a buffer for the layout-specific data, and copy it
   1189 		 * in */
   1190 		if (k_cfg->layoutSpecificSize) {
   1191 			if (k_cfg->layoutSpecificSize > 10000) {
   1192 				/* sanity check */
   1193 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1194 				return (EINVAL);
   1195 			}
   1196 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1197 			    (u_char *));
   1198 			if (specific_buf == NULL) {
   1199 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1200 				return (ENOMEM);
   1201 			}
   1202 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1203 			    k_cfg->layoutSpecificSize);
   1204 			if (retcode) {
   1205 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1206 				RF_Free(specific_buf,
   1207 					k_cfg->layoutSpecificSize);
   1208 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1209 					retcode));
   1210 				return (retcode);
   1211 			}
   1212 		} else
   1213 			specific_buf = NULL;
   1214 		k_cfg->layoutSpecific = specific_buf;
   1215 
   1216 		/* should do some kind of sanity check on the configuration.
   1217 		 * Store the sum of all the bytes in the last byte? */
   1218 
   1219 		/* configure the system */
   1220 
   1221 		/*
   1222 		 * Clear the entire RAID descriptor, just to make sure
   1223 		 *  there is no stale data left in the case of a
   1224 		 *  reconfiguration
   1225 		 */
   1226 		memset(raidPtr, 0, sizeof(*raidPtr));
   1227 		raidPtr->softc = rs;
   1228 		raidPtr->raidid = unit;
   1229 
   1230 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1231 
   1232 		if (retcode == 0) {
   1233 
   1234 			/* allow this many simultaneous IO's to
   1235 			   this RAID device */
   1236 			raidPtr->openings = RAIDOUTSTANDING;
   1237 
   1238 			raidinit(rs);
   1239 			rf_markalldirty(raidPtr);
   1240 		}
   1241 		/* free the buffers.  No return code here. */
   1242 		if (k_cfg->layoutSpecificSize) {
   1243 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1244 		}
   1245 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1246 
   1247 		return (retcode);
   1248 
   1249 		/* shutdown the system */
   1250 	case RAIDFRAME_SHUTDOWN:
   1251 
   1252 		part = DISKPART(dev);
   1253 		pmask = (1 << part);
   1254 
   1255 		if ((error = raidlock(rs)) != 0)
   1256 			return (error);
   1257 
   1258 		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
   1259 		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
   1260 			(rs->sc_dkdev.dk_copenmask & pmask)))
   1261 			retcode = EBUSY;
   1262 		else {
   1263 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1264 			rs->sc_dkdev.dk_copenmask &= ~pmask;
   1265 			rs->sc_dkdev.dk_bopenmask &= ~pmask;
   1266 			rs->sc_dkdev.dk_openmask &= ~pmask;
   1267 			retcode = 0;
   1268 		}
   1269 
   1270 		raidunlock(rs);
   1271 
   1272 		if (retcode != 0)
   1273 			return retcode;
   1274 
   1275 		/* free the pseudo device attach bits */
   1276 
   1277 		cf = device_cfdata(rs->sc_dev);
   1278 		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
   1279 			free(cf, M_RAIDFRAME);
   1280 
   1281 		return (retcode);
   1282 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1283 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1284 		/* need to read the component label for the disk indicated
   1285 		   by row,column in clabel */
   1286 
   1287 		/*
   1288 		 * Perhaps there should be an option to skip the in-core
   1289 		 * copy and hit the disk, as with disklabel(8).
   1290 		 */
   1291 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1292 
   1293 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1294 
   1295 		if (retcode) {
   1296 			RF_Free(clabel, sizeof(*clabel));
   1297 			return retcode;
   1298 		}
   1299 
   1300 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1301 
   1302 		column = clabel->column;
   1303 
   1304 		if ((column < 0) || (column >= raidPtr->numCol +
   1305 		    raidPtr->numSpare)) {
   1306 			RF_Free(clabel, sizeof(*clabel));
   1307 			return EINVAL;
   1308 		}
   1309 
   1310 		RF_Free(clabel, sizeof(*clabel));
   1311 
   1312 		clabel = raidget_component_label(raidPtr, column);
   1313 
   1314 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1315 
   1316 #if 0
   1317 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1318 		clabel = (RF_ComponentLabel_t *) data;
   1319 
   1320 		/* XXX check the label for valid stuff... */
   1321 		/* Note that some things *should not* get modified --
   1322 		   the user should be re-initing the labels instead of
   1323 		   trying to patch things.
   1324 		   */
   1325 
   1326 		raidid = raidPtr->raidid;
   1327 #ifdef DEBUG
   1328 		printf("raid%d: Got component label:\n", raidid);
   1329 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1330 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1331 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1332 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1333 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1334 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1335 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1336 #endif
   1337 		clabel->row = 0;
   1338 		column = clabel->column;
   1339 
   1340 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1341 			return(EINVAL);
   1342 		}
   1343 
   1344 		/* XXX this isn't allowed to do anything for now :-) */
   1345 
   1346 		/* XXX and before it is, we need to fill in the rest
   1347 		   of the fields!?!?!?! */
   1348 		memcpy(raidget_component_label(raidPtr, column),
   1349 		    clabel, sizeof(*clabel));
   1350 		raidflush_component_label(raidPtr, column);
   1351 		return (0);
   1352 #endif
   1353 
   1354 	case RAIDFRAME_INIT_LABELS:
   1355 		clabel = (RF_ComponentLabel_t *) data;
   1356 		/*
   1357 		   we only want the serial number from
   1358 		   the above.  We get all the rest of the information
   1359 		   from the config that was used to create this RAID
   1360 		   set.
   1361 		   */
   1362 
   1363 		raidPtr->serial_number = clabel->serial_number;
   1364 
   1365 		for(column=0;column<raidPtr->numCol;column++) {
   1366 			diskPtr = &raidPtr->Disks[column];
   1367 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1368 				ci_label = raidget_component_label(raidPtr,
   1369 				    column);
   1370 				/* Zeroing this is important. */
   1371 				memset(ci_label, 0, sizeof(*ci_label));
   1372 				raid_init_component_label(raidPtr, ci_label);
   1373 				ci_label->serial_number =
   1374 				    raidPtr->serial_number;
   1375 				ci_label->row = 0; /* we dont' pretend to support more */
   1376 				rf_component_label_set_partitionsize(ci_label,
   1377 				    diskPtr->partitionSize);
   1378 				ci_label->column = column;
   1379 				raidflush_component_label(raidPtr, column);
   1380 			}
   1381 			/* XXXjld what about the spares? */
   1382 		}
   1383 
   1384 		return (retcode);
   1385 	case RAIDFRAME_SET_AUTOCONFIG:
   1386 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1387 		printf("raid%d: New autoconfig value is: %d\n",
   1388 		       raidPtr->raidid, d);
   1389 		*(int *) data = d;
   1390 		return (retcode);
   1391 
   1392 	case RAIDFRAME_SET_ROOT:
   1393 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1394 		printf("raid%d: New rootpartition value is: %d\n",
   1395 		       raidPtr->raidid, d);
   1396 		*(int *) data = d;
   1397 		return (retcode);
   1398 
   1399 		/* initialize all parity */
   1400 	case RAIDFRAME_REWRITEPARITY:
   1401 
   1402 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1403 			/* Parity for RAID 0 is trivially correct */
   1404 			raidPtr->parity_good = RF_RAID_CLEAN;
   1405 			return(0);
   1406 		}
   1407 
   1408 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1409 			/* Re-write is already in progress! */
   1410 			return(EINVAL);
   1411 		}
   1412 
   1413 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1414 					   rf_RewriteParityThread,
   1415 					   raidPtr,"raid_parity");
   1416 		return (retcode);
   1417 
   1418 
   1419 	case RAIDFRAME_ADD_HOT_SPARE:
   1420 		sparePtr = (RF_SingleComponent_t *) data;
   1421 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1422 		retcode = rf_add_hot_spare(raidPtr, &component);
   1423 		return(retcode);
   1424 
   1425 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1426 		return(retcode);
   1427 
   1428 	case RAIDFRAME_DELETE_COMPONENT:
   1429 		componentPtr = (RF_SingleComponent_t *)data;
   1430 		memcpy( &component, componentPtr,
   1431 			sizeof(RF_SingleComponent_t));
   1432 		retcode = rf_delete_component(raidPtr, &component);
   1433 		return(retcode);
   1434 
   1435 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1436 		componentPtr = (RF_SingleComponent_t *)data;
   1437 		memcpy( &component, componentPtr,
   1438 			sizeof(RF_SingleComponent_t));
   1439 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1440 		return(retcode);
   1441 
   1442 	case RAIDFRAME_REBUILD_IN_PLACE:
   1443 
   1444 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1445 			/* Can't do this on a RAID 0!! */
   1446 			return(EINVAL);
   1447 		}
   1448 
   1449 		if (raidPtr->recon_in_progress == 1) {
   1450 			/* a reconstruct is already in progress! */
   1451 			return(EINVAL);
   1452 		}
   1453 
   1454 		componentPtr = (RF_SingleComponent_t *) data;
   1455 		memcpy( &component, componentPtr,
   1456 			sizeof(RF_SingleComponent_t));
   1457 		component.row = 0; /* we don't support any more */
   1458 		column = component.column;
   1459 
   1460 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1461 			return(EINVAL);
   1462 		}
   1463 
   1464 		rf_lock_mutex2(raidPtr->mutex);
   1465 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1466 		    (raidPtr->numFailures > 0)) {
   1467 			/* XXX 0 above shouldn't be constant!!! */
   1468 			/* some component other than this has failed.
   1469 			   Let's not make things worse than they already
   1470 			   are... */
   1471 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1472 			       raidPtr->raidid);
   1473 			printf("raid%d:     Col: %d   Too many failures.\n",
   1474 			       raidPtr->raidid, column);
   1475 			rf_unlock_mutex2(raidPtr->mutex);
   1476 			return (EINVAL);
   1477 		}
   1478 		if (raidPtr->Disks[column].status ==
   1479 		    rf_ds_reconstructing) {
   1480 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1481 			       raidPtr->raidid);
   1482 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1483 
   1484 			rf_unlock_mutex2(raidPtr->mutex);
   1485 			return (EINVAL);
   1486 		}
   1487 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1488 			rf_unlock_mutex2(raidPtr->mutex);
   1489 			return (EINVAL);
   1490 		}
   1491 		rf_unlock_mutex2(raidPtr->mutex);
   1492 
   1493 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1494 		if (rrcopy == NULL)
   1495 			return(ENOMEM);
   1496 
   1497 		rrcopy->raidPtr = (void *) raidPtr;
   1498 		rrcopy->col = column;
   1499 
   1500 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1501 					   rf_ReconstructInPlaceThread,
   1502 					   rrcopy,"raid_reconip");
   1503 		return(retcode);
   1504 
   1505 	case RAIDFRAME_GET_INFO:
   1506 		if (!raidPtr->valid)
   1507 			return (ENODEV);
   1508 		ucfgp = (RF_DeviceConfig_t **) data;
   1509 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1510 			  (RF_DeviceConfig_t *));
   1511 		if (d_cfg == NULL)
   1512 			return (ENOMEM);
   1513 		d_cfg->rows = 1; /* there is only 1 row now */
   1514 		d_cfg->cols = raidPtr->numCol;
   1515 		d_cfg->ndevs = raidPtr->numCol;
   1516 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1517 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1518 			return (ENOMEM);
   1519 		}
   1520 		d_cfg->nspares = raidPtr->numSpare;
   1521 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1522 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1523 			return (ENOMEM);
   1524 		}
   1525 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1526 		d = 0;
   1527 		for (j = 0; j < d_cfg->cols; j++) {
   1528 			d_cfg->devs[d] = raidPtr->Disks[j];
   1529 			d++;
   1530 		}
   1531 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1532 			d_cfg->spares[i] = raidPtr->Disks[j];
   1533 		}
   1534 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1535 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1536 
   1537 		return (retcode);
   1538 
   1539 	case RAIDFRAME_CHECK_PARITY:
   1540 		*(int *) data = raidPtr->parity_good;
   1541 		return (0);
   1542 
   1543 	case RAIDFRAME_PARITYMAP_STATUS:
   1544 		if (rf_paritymap_ineligible(raidPtr))
   1545 			return EINVAL;
   1546 		rf_paritymap_status(raidPtr->parity_map,
   1547 		    (struct rf_pmstat *)data);
   1548 		return 0;
   1549 
   1550 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1551 		if (rf_paritymap_ineligible(raidPtr))
   1552 			return EINVAL;
   1553 		if (raidPtr->parity_map == NULL)
   1554 			return ENOENT; /* ??? */
   1555 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1556 			(struct rf_pmparams *)data, 1))
   1557 			return EINVAL;
   1558 		return 0;
   1559 
   1560 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1561 		if (rf_paritymap_ineligible(raidPtr))
   1562 			return EINVAL;
   1563 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1564 		return 0;
   1565 
   1566 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1567 		if (rf_paritymap_ineligible(raidPtr))
   1568 			return EINVAL;
   1569 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1570 		/* XXX should errors be passed up? */
   1571 		return 0;
   1572 
   1573 	case RAIDFRAME_RESET_ACCTOTALS:
   1574 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1575 		return (0);
   1576 
   1577 	case RAIDFRAME_GET_ACCTOTALS:
   1578 		totals = (RF_AccTotals_t *) data;
   1579 		*totals = raidPtr->acc_totals;
   1580 		return (0);
   1581 
   1582 	case RAIDFRAME_KEEP_ACCTOTALS:
   1583 		raidPtr->keep_acc_totals = *(int *)data;
   1584 		return (0);
   1585 
   1586 	case RAIDFRAME_GET_SIZE:
   1587 		*(int *) data = raidPtr->totalSectors;
   1588 		return (0);
   1589 
   1590 		/* fail a disk & optionally start reconstruction */
   1591 	case RAIDFRAME_FAIL_DISK:
   1592 
   1593 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1594 			/* Can't do this on a RAID 0!! */
   1595 			return(EINVAL);
   1596 		}
   1597 
   1598 		rr = (struct rf_recon_req *) data;
   1599 		rr->row = 0;
   1600 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1601 			return (EINVAL);
   1602 
   1603 
   1604 		rf_lock_mutex2(raidPtr->mutex);
   1605 		if (raidPtr->status == rf_rs_reconstructing) {
   1606 			/* you can't fail a disk while we're reconstructing! */
   1607 			/* XXX wrong for RAID6 */
   1608 			rf_unlock_mutex2(raidPtr->mutex);
   1609 			return (EINVAL);
   1610 		}
   1611 		if ((raidPtr->Disks[rr->col].status ==
   1612 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1613 			/* some other component has failed.  Let's not make
   1614 			   things worse. XXX wrong for RAID6 */
   1615 			rf_unlock_mutex2(raidPtr->mutex);
   1616 			return (EINVAL);
   1617 		}
   1618 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1619 			/* Can't fail a spared disk! */
   1620 			rf_unlock_mutex2(raidPtr->mutex);
   1621 			return (EINVAL);
   1622 		}
   1623 		rf_unlock_mutex2(raidPtr->mutex);
   1624 
   1625 		/* make a copy of the recon request so that we don't rely on
   1626 		 * the user's buffer */
   1627 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1628 		if (rrcopy == NULL)
   1629 			return(ENOMEM);
   1630 		memcpy(rrcopy, rr, sizeof(*rr));
   1631 		rrcopy->raidPtr = (void *) raidPtr;
   1632 
   1633 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1634 					   rf_ReconThread,
   1635 					   rrcopy,"raid_recon");
   1636 		return (0);
   1637 
   1638 		/* invoke a copyback operation after recon on whatever disk
   1639 		 * needs it, if any */
   1640 	case RAIDFRAME_COPYBACK:
   1641 
   1642 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1643 			/* This makes no sense on a RAID 0!! */
   1644 			return(EINVAL);
   1645 		}
   1646 
   1647 		if (raidPtr->copyback_in_progress == 1) {
   1648 			/* Copyback is already in progress! */
   1649 			return(EINVAL);
   1650 		}
   1651 
   1652 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1653 					   rf_CopybackThread,
   1654 					   raidPtr,"raid_copyback");
   1655 		return (retcode);
   1656 
   1657 		/* return the percentage completion of reconstruction */
   1658 	case RAIDFRAME_CHECK_RECON_STATUS:
   1659 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1660 			/* This makes no sense on a RAID 0, so tell the
   1661 			   user it's done. */
   1662 			*(int *) data = 100;
   1663 			return(0);
   1664 		}
   1665 		if (raidPtr->status != rf_rs_reconstructing)
   1666 			*(int *) data = 100;
   1667 		else {
   1668 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1669 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1670 			} else {
   1671 				*(int *) data = 0;
   1672 			}
   1673 		}
   1674 		return (0);
   1675 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1676 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1677 		if (raidPtr->status != rf_rs_reconstructing) {
   1678 			progressInfo.remaining = 0;
   1679 			progressInfo.completed = 100;
   1680 			progressInfo.total = 100;
   1681 		} else {
   1682 			progressInfo.total =
   1683 				raidPtr->reconControl->numRUsTotal;
   1684 			progressInfo.completed =
   1685 				raidPtr->reconControl->numRUsComplete;
   1686 			progressInfo.remaining = progressInfo.total -
   1687 				progressInfo.completed;
   1688 		}
   1689 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1690 				  sizeof(RF_ProgressInfo_t));
   1691 		return (retcode);
   1692 
   1693 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1694 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1695 			/* This makes no sense on a RAID 0, so tell the
   1696 			   user it's done. */
   1697 			*(int *) data = 100;
   1698 			return(0);
   1699 		}
   1700 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1701 			*(int *) data = 100 *
   1702 				raidPtr->parity_rewrite_stripes_done /
   1703 				raidPtr->Layout.numStripe;
   1704 		} else {
   1705 			*(int *) data = 100;
   1706 		}
   1707 		return (0);
   1708 
   1709 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1710 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1711 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1712 			progressInfo.total = raidPtr->Layout.numStripe;
   1713 			progressInfo.completed =
   1714 				raidPtr->parity_rewrite_stripes_done;
   1715 			progressInfo.remaining = progressInfo.total -
   1716 				progressInfo.completed;
   1717 		} else {
   1718 			progressInfo.remaining = 0;
   1719 			progressInfo.completed = 100;
   1720 			progressInfo.total = 100;
   1721 		}
   1722 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1723 				  sizeof(RF_ProgressInfo_t));
   1724 		return (retcode);
   1725 
   1726 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1727 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1728 			/* This makes no sense on a RAID 0 */
   1729 			*(int *) data = 100;
   1730 			return(0);
   1731 		}
   1732 		if (raidPtr->copyback_in_progress == 1) {
   1733 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1734 				raidPtr->Layout.numStripe;
   1735 		} else {
   1736 			*(int *) data = 100;
   1737 		}
   1738 		return (0);
   1739 
   1740 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1741 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1742 		if (raidPtr->copyback_in_progress == 1) {
   1743 			progressInfo.total = raidPtr->Layout.numStripe;
   1744 			progressInfo.completed =
   1745 				raidPtr->copyback_stripes_done;
   1746 			progressInfo.remaining = progressInfo.total -
   1747 				progressInfo.completed;
   1748 		} else {
   1749 			progressInfo.remaining = 0;
   1750 			progressInfo.completed = 100;
   1751 			progressInfo.total = 100;
   1752 		}
   1753 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1754 				  sizeof(RF_ProgressInfo_t));
   1755 		return (retcode);
   1756 
   1757 		/* the sparetable daemon calls this to wait for the kernel to
   1758 		 * need a spare table. this ioctl does not return until a
   1759 		 * spare table is needed. XXX -- calling mpsleep here in the
   1760 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1761 		 * -- I should either compute the spare table in the kernel,
   1762 		 * or have a different -- XXX XXX -- interface (a different
   1763 		 * character device) for delivering the table     -- XXX */
   1764 #if 0
   1765 	case RAIDFRAME_SPARET_WAIT:
   1766 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1767 		while (!rf_sparet_wait_queue)
   1768 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1769 		waitreq = rf_sparet_wait_queue;
   1770 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1771 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1772 
   1773 		/* structure assignment */
   1774 		*((RF_SparetWait_t *) data) = *waitreq;
   1775 
   1776 		RF_Free(waitreq, sizeof(*waitreq));
   1777 		return (0);
   1778 
   1779 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1780 		 * code in it that will cause the dameon to exit */
   1781 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1782 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1783 		waitreq->fcol = -1;
   1784 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1785 		waitreq->next = rf_sparet_wait_queue;
   1786 		rf_sparet_wait_queue = waitreq;
   1787 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1788 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1789 		return (0);
   1790 
   1791 		/* used by the spare table daemon to deliver a spare table
   1792 		 * into the kernel */
   1793 	case RAIDFRAME_SEND_SPARET:
   1794 
   1795 		/* install the spare table */
   1796 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1797 
   1798 		/* respond to the requestor.  the return status of the spare
   1799 		 * table installation is passed in the "fcol" field */
   1800 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1801 		waitreq->fcol = retcode;
   1802 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1803 		waitreq->next = rf_sparet_resp_queue;
   1804 		rf_sparet_resp_queue = waitreq;
   1805 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1806 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1807 
   1808 		return (retcode);
   1809 #endif
   1810 
   1811 	default:
   1812 		break; /* fall through to the os-specific code below */
   1813 
   1814 	}
   1815 
   1816 	if (!raidPtr->valid)
   1817 		return (EINVAL);
   1818 
   1819 	/*
   1820 	 * Add support for "regular" device ioctls here.
   1821 	 */
   1822 
   1823 	error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
   1824 	if (error != EPASSTHROUGH)
   1825 		return (error);
   1826 
   1827 	switch (cmd) {
   1828 	case DIOCGDINFO:
   1829 		*(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
   1830 		break;
   1831 #ifdef __HAVE_OLD_DISKLABEL
   1832 	case ODIOCGDINFO:
   1833 		newlabel = *(rs->sc_dkdev.dk_label);
   1834 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1835 			return ENOTTY;
   1836 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1837 		break;
   1838 #endif
   1839 
   1840 	case DIOCGPART:
   1841 		((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
   1842 		((struct partinfo *) data)->part =
   1843 		    &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
   1844 		break;
   1845 
   1846 	case DIOCWDINFO:
   1847 	case DIOCSDINFO:
   1848 #ifdef __HAVE_OLD_DISKLABEL
   1849 	case ODIOCWDINFO:
   1850 	case ODIOCSDINFO:
   1851 #endif
   1852 	{
   1853 		struct disklabel *lp;
   1854 #ifdef __HAVE_OLD_DISKLABEL
   1855 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
   1856 			memset(&newlabel, 0, sizeof newlabel);
   1857 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
   1858 			lp = &newlabel;
   1859 		} else
   1860 #endif
   1861 		lp = (struct disklabel *)data;
   1862 
   1863 		if ((error = raidlock(rs)) != 0)
   1864 			return (error);
   1865 
   1866 		rs->sc_flags |= RAIDF_LABELLING;
   1867 
   1868 		error = setdisklabel(rs->sc_dkdev.dk_label,
   1869 		    lp, 0, rs->sc_dkdev.dk_cpulabel);
   1870 		if (error == 0) {
   1871 			if (cmd == DIOCWDINFO
   1872 #ifdef __HAVE_OLD_DISKLABEL
   1873 			    || cmd == ODIOCWDINFO
   1874 #endif
   1875 			   )
   1876 				error = writedisklabel(RAIDLABELDEV(dev),
   1877 				    raidstrategy, rs->sc_dkdev.dk_label,
   1878 				    rs->sc_dkdev.dk_cpulabel);
   1879 		}
   1880 		rs->sc_flags &= ~RAIDF_LABELLING;
   1881 
   1882 		raidunlock(rs);
   1883 
   1884 		if (error)
   1885 			return (error);
   1886 		break;
   1887 	}
   1888 
   1889 	case DIOCWLABEL:
   1890 		if (*(int *) data != 0)
   1891 			rs->sc_flags |= RAIDF_WLABEL;
   1892 		else
   1893 			rs->sc_flags &= ~RAIDF_WLABEL;
   1894 		break;
   1895 
   1896 	case DIOCGDEFLABEL:
   1897 		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
   1898 		break;
   1899 
   1900 #ifdef __HAVE_OLD_DISKLABEL
   1901 	case ODIOCGDEFLABEL:
   1902 		raidgetdefaultlabel(raidPtr, rs, &newlabel);
   1903 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1904 			return ENOTTY;
   1905 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1906 		break;
   1907 #endif
   1908 
   1909 	case DIOCAWEDGE:
   1910 	case DIOCDWEDGE:
   1911 	    	dkw = (void *)data;
   1912 
   1913 		/* If the ioctl happens here, the parent is us. */
   1914 		(void)strcpy(dkw->dkw_parent, rs->sc_xname);
   1915 		return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
   1916 
   1917 	case DIOCLWEDGES:
   1918 		return dkwedge_list(&rs->sc_dkdev,
   1919 		    (struct dkwedge_list *)data, l);
   1920 	case DIOCCACHESYNC:
   1921 		return rf_sync_component_caches(raidPtr);
   1922 
   1923 	case DIOCGSTRATEGY:
   1924 	    {
   1925 		struct disk_strategy *dks = (void *)data;
   1926 
   1927 		s = splbio();
   1928 		strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
   1929 		    sizeof(dks->dks_name));
   1930 		splx(s);
   1931 		dks->dks_paramlen = 0;
   1932 
   1933 		return 0;
   1934 	    }
   1935 
   1936 	case DIOCSSTRATEGY:
   1937 	    {
   1938 		struct disk_strategy *dks = (void *)data;
   1939 		struct bufq_state *new;
   1940 		struct bufq_state *old;
   1941 
   1942 		if (dks->dks_param != NULL) {
   1943 			return EINVAL;
   1944 		}
   1945 		dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
   1946 		error = bufq_alloc(&new, dks->dks_name,
   1947 		    BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
   1948 		if (error) {
   1949 			return error;
   1950 		}
   1951 		s = splbio();
   1952 		old = rs->buf_queue;
   1953 		bufq_move(new, old);
   1954 		rs->buf_queue = new;
   1955 		splx(s);
   1956 		bufq_free(old);
   1957 
   1958 		return 0;
   1959 	    }
   1960 
   1961 	default:
   1962 		retcode = ENOTTY;
   1963 	}
   1964 	return (retcode);
   1965 
   1966 }
   1967 
   1968 
/* raidinit -- complete the rest of the initialization for the
   RAIDframe device: attach the pseudo-device, set up the disk(9)
   structures, and kick off wedge discovery.  On attach failure the
   RAIDF_INITED flag is cleared again and no disk is attached.  */


static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	int     unit;
	RF_Raid_t *raidPtr = &rs->sc_r;

	unit = raidPtr->raidid;


	/* XXX should check return code first... */
	rs->sc_flags |= RAIDF_INITED;

	/* snprintf bounds the write to sizeof(sc_xname) */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	rs->sc_dev = config_attach_pseudo(cf);

	if (rs->sc_dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		/* undo the INITED flag set optimistically above */
		rs->sc_flags &= ~RAIDF_INITED;
		free(cf, M_RAIDFRAME);
		return;
	}

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */

	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
	disk_attach(&rs->sc_dkdev);
	disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* scan for dk(4) wedges on the newly attached device */
	dkwedge_discover(&rs->sc_dkdev);

	rf_set_geometry(rs, raidPtr);

}
   2024 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* enqueue the request and wake the user-land daemon blocked in
	 * the RAIDFRAME_SPARET_WAIT ioctl */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* rf_wait_cond2 drops the mutex while asleep and reacquires it
	 * before returning */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* status of the installation was passed back in fcol */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   2057 #endif
   2058 
   2059 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   2060  * bp & passes it down.
   2061  * any calls originating in the kernel must use non-blocking I/O
   2062  * do some extra sanity checking to return "appropriate" error values for
   2063  * certain conditions (to make some standard utilities work)
   2064  *
   2065  * Formerly known as: rf_DoAccessKernel
   2066  */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	struct raid_softc *rs;
	int     do_async;
	struct buf *bp;
	int rc;

	rs = raidPtr->softc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* dispatch queued buffers while the raid has free openings;
	 * raidPtr->mutex is held at the top of each iteration */
	while (raidPtr->openings > 0) {
		rf_unlock_mutex2(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
			/* nothing more to do */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		/* pb accounts for a trailing partial sector */
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* fail requests that run past the end of the raid; the
		 * (sum < x) comparisons catch arithmetic wrap-around */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* reject transfers that aren't a multiple of the sector size */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		/* consume one opening for this access */
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->openings--;
		rf_unlock_mutex2(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			/* the access never got started; finish the buf here */
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		rf_lock_mutex2(raidPtr->mutex);
	}
	rf_unlock_mutex2(raidPtr->mutex);
}
   2184 
   2185 
   2186 
   2187 
   2188 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2189 
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		/* fake a completed I/O so the normal completion path runs */
		bp->b_flags = 0;
		bp->b_private = req;

		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* set up the buf; KernelWakeupFunc becomes b_iodone and
		 * req rides along as b_private */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
   2263 /* this is the callback function associated with a I/O invoked from
   2264    kernel code.
   2265  */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* the request was stashed in b_private by InitBP() */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error.  Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* raidstart() notices numNewFailures and refreshes
			 * the component labels */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2331 
   2332 
   2333 /*
   2334  * initialize a buf structure for doing an I/O in the kernel.
   2335  */
   2336 static void
   2337 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2338        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2339        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2340        struct proc *b_proc)
   2341 {
   2342 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2343 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2344 	bp->b_oflags = 0;
   2345 	bp->b_cflags = 0;
   2346 	bp->b_bcount = numSect << logBytesPerSector;
   2347 	bp->b_bufsize = bp->b_bcount;
   2348 	bp->b_error = 0;
   2349 	bp->b_dev = dev;
   2350 	bp->b_data = bf;
   2351 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2352 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2353 	if (bp->b_bcount == 0) {
   2354 		panic("bp->b_bcount is zero in InitBP!!");
   2355 	}
   2356 	bp->b_proc = b_proc;
   2357 	bp->b_iodone = cbFunc;
   2358 	bp->b_private = cbArg;
   2359 }
   2360 
   2361 static void
   2362 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
   2363 		    struct disklabel *lp)
   2364 {
   2365 	memset(lp, 0, sizeof(*lp));
   2366 
   2367 	/* fabricate a label... */
   2368 	if (raidPtr->totalSectors > UINT32_MAX)
   2369 		lp->d_secperunit = UINT32_MAX;
   2370 	else
   2371 		lp->d_secperunit = raidPtr->totalSectors;
   2372 	lp->d_secsize = raidPtr->bytesPerSector;
   2373 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   2374 	lp->d_ntracks = 4 * raidPtr->numCol;
   2375 	lp->d_ncylinders = raidPtr->totalSectors /
   2376 		(lp->d_nsectors * lp->d_ntracks);
   2377 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
   2378 
   2379 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
   2380 	lp->d_type = DTYPE_RAID;
   2381 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
   2382 	lp->d_rpm = 3600;
   2383 	lp->d_interleave = 1;
   2384 	lp->d_flags = 0;
   2385 
   2386 	lp->d_partitions[RAW_PART].p_offset = 0;
   2387 	lp->d_partitions[RAW_PART].p_size = lp->d_secperunit;
   2388 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
   2389 	lp->d_npartitions = RAW_PART + 1;
   2390 
   2391 	lp->d_magic = DISKMAGIC;
   2392 	lp->d_magic2 = DISKMAGIC;
   2393 	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
   2394 
   2395 }
   2396 /*
   2397  * Read the disklabel from the raid device.  If one is not present, fake one
   2398  * up.
   2399  */
static void
raidgetdisklabel(dev_t dev)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const char   *errstring;
	struct disklabel *lp;
	struct cpu_disklabel *clp;
	RF_Raid_t *raidPtr;

	/* no softc -> unit was never configured; nothing to label */
	if ((rs = raidget(unit)) == NULL)
		return;

	lp = rs->sc_dkdev.dk_label;
	clp = rs->sc_dkdev.dk_cpulabel;

	db1_printf(("Getting the disklabel...\n"));

	memset(clp, 0, sizeof(*clp));

	raidPtr = &rs->sc_r;

	/* start from a fabricated default label */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		raidmakedisklabel(rs);
	else {
		int     i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * same components are used, and old disklabel may used
		 * if that is found.
		 */
		if (lp->d_secperunit != rs->sc_size)
			printf("raid%d: WARNING: %s: "
			    "total sector size in disklabel (%" PRIu32 ") != "
			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
			    lp->d_secperunit, rs->sc_size);
		/* warn about any partition extending past the raid's end */
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("raid%d: WARNING: %s: end of partition `%c' "
				       "exceeds the size of raid (%" PRIu64 ")\n",
				       unit, rs->sc_xname, 'a' + i, rs->sc_size);
		}
	}

}
   2458 /*
   2459  * Take care of things one might want to take care of in the event
   2460  * that a disklabel isn't present.
   2461  */
/* Fill in defaults for an in-core label when no on-disk label exists. */
static void
raidmakedisklabel(struct raid_softc *rs)
{
	struct disklabel *lp = rs->sc_dkdev.dk_label;
	db1_printf(("Making a label..\n"));

	/*
	 * For historical reasons, if there's no disklabel present
	 * the raw partition must be marked FS_BSDFFS.
	 */

	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;

	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));

	/* recompute after the edits above */
	lp->d_checksum = dkcksum(lp);
}
   2479 /*
   2480  * Wait interruptibly for an exclusive lock.
   2481  *
   2482  * XXX
   2483  * Several drivers do this; it should be abstracted and made MP-safe.
   2484  * (Hmm... where have we seen this warning before :->  GO )
   2485  */
   2486 static int
   2487 raidlock(struct raid_softc *rs)
   2488 {
   2489 	int     error;
   2490 
   2491 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2492 		rs->sc_flags |= RAIDF_WANTED;
   2493 		if ((error =
   2494 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
   2495 			return (error);
   2496 	}
   2497 	rs->sc_flags |= RAIDF_LOCKED;
   2498 	return (0);
   2499 }
   2500 /*
   2501  * Unlock and wake up any waiters.
   2502  */
   2503 static void
   2504 raidunlock(struct raid_softc *rs)
   2505 {
   2506 
   2507 	rs->sc_flags &= ~RAIDF_LOCKED;
   2508 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2509 		rs->sc_flags &= ~RAIDF_WANTED;
   2510 		wakeup(rs);
   2511 	}
   2512 }
   2513 
   2514 
   2515 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2516 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2517 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2518 
/* Byte offset of the component label area from the start of a
 * component (currently a fixed constant). */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}
   2525 
   2526 static daddr_t
   2527 rf_component_info_size(unsigned secsize)
   2528 {
   2529 	daddr_t info_size;
   2530 
   2531 	KASSERT(secsize);
   2532 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2533 		info_size = secsize;
   2534 	else
   2535 		info_size = RF_COMPONENT_INFO_SIZE;
   2536 
   2537 	return info_size;
   2538 }
   2539 
   2540 static daddr_t
   2541 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2542 {
   2543 	daddr_t map_offset;
   2544 
   2545 	KASSERT(raidPtr->bytesPerSector);
   2546 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2547 		map_offset = raidPtr->bytesPerSector;
   2548 	else
   2549 		map_offset = RF_COMPONENT_INFO_SIZE;
   2550 	map_offset += rf_component_info_offset();
   2551 
   2552 	return map_offset;
   2553 }
   2554 
   2555 static daddr_t
   2556 rf_parity_map_size(RF_Raid_t *raidPtr)
   2557 {
   2558 	daddr_t map_size;
   2559 
   2560 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2561 		map_size = raidPtr->bytesPerSector;
   2562 	else
   2563 		map_size = RF_PARITY_MAP_SIZE;
   2564 
   2565 	return map_size;
   2566 }
   2567 
/* Mark component `col' clean in its in-core label and write the label
 * out to the component disk.  Always returns 0. */
int
raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *clabel;

	clabel = raidget_component_label(raidPtr, col);
	clabel->clean = RF_RAID_CLEAN;
	raidflush_component_label(raidPtr, col);
	return(0);
}
   2578 
   2579 
/* Mark component `col' dirty in its in-core label and write the label
 * out to the component disk.  Always returns 0. */
int
raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *clabel;

	clabel = raidget_component_label(raidPtr, col);
	clabel->clean = RF_RAID_DIRTY;
	raidflush_component_label(raidPtr, col);
	return(0);
}
   2590 
/* Read the on-disk component label for `col' into the in-core copy
 * (raid_cinfo[col].ci_label); returns the read error, if any. */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2600 
/* Return a pointer to the in-core component label for `col'.
 * No I/O is performed. */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2606 
/* Stamp the in-core label for `col' with the current modification
 * counter and write it out to the component disk. */
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	/* keep the parity map's counter in lockstep with the label's */
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
   2621 
   2622 
/* Read the component label from its fixed on-disk location on `dev';
 * the area read covers at least one `secsize'-byte sector. */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
   2632 
   2633 /* ARGSUSED */
   2634 static int
   2635 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2636     size_t msize, daddr_t offset, daddr_t dsize)
   2637 {
   2638 	struct buf *bp;
   2639 	const struct bdevsw *bdev;
   2640 	int error;
   2641 
   2642 	/* XXX should probably ensure that we don't try to do this if
   2643 	   someone has changed rf_protected_sectors. */
   2644 
   2645 	if (b_vp == NULL) {
   2646 		/* For whatever reason, this component is not valid.
   2647 		   Don't try to read a component label from it. */
   2648 		return(EINVAL);
   2649 	}
   2650 
   2651 	/* get a block of the appropriate size... */
   2652 	bp = geteblk((int)dsize);
   2653 	bp->b_dev = dev;
   2654 
   2655 	/* get our ducks in a row for the read */
   2656 	bp->b_blkno = offset / DEV_BSIZE;
   2657 	bp->b_bcount = dsize;
   2658 	bp->b_flags |= B_READ;
   2659  	bp->b_resid = dsize;
   2660 
   2661 	bdev = bdevsw_lookup(bp->b_dev);
   2662 	if (bdev == NULL)
   2663 		return (ENXIO);
   2664 	(*bdev->d_strategy)(bp);
   2665 
   2666 	error = biowait(bp);
   2667 
   2668 	if (!error) {
   2669 		memcpy(data, bp->b_data, msize);
   2670 	}
   2671 
   2672 	brelse(bp, 0);
   2673 	return(error);
   2674 }
   2675 
   2676 
/* Write the component label to its fixed on-disk location on `dev';
 * the write is synchronous (asyncp == 0). */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
   2686 
   2687 /* ARGSUSED */
   2688 static int
   2689 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2690     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2691 {
   2692 	struct buf *bp;
   2693 	const struct bdevsw *bdev;
   2694 	int error;
   2695 
   2696 	/* get a block of the appropriate size... */
   2697 	bp = geteblk((int)dsize);
   2698 	bp->b_dev = dev;
   2699 
   2700 	/* get our ducks in a row for the write */
   2701 	bp->b_blkno = offset / DEV_BSIZE;
   2702 	bp->b_bcount = dsize;
   2703 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2704  	bp->b_resid = dsize;
   2705 
   2706 	memset(bp->b_data, 0, dsize);
   2707 	memcpy(bp->b_data, data, msize);
   2708 
   2709 	bdev = bdevsw_lookup(bp->b_dev);
   2710 	if (bdev == NULL)
   2711 		return (ENXIO);
   2712 	(*bdev->d_strategy)(bp);
   2713 	if (asyncp)
   2714 		return 0;
   2715 	error = biowait(bp);
   2716 	brelse(bp, 0);
   2717 	if (error) {
   2718 #if 1
   2719 		printf("Failed to write RAID component info!\n");
   2720 #endif
   2721 	}
   2722 
   2723 	return(error);
   2724 }
   2725 
   2726 void
   2727 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2728 {
   2729 	int c;
   2730 
   2731 	for (c = 0; c < raidPtr->numCol; c++) {
   2732 		/* Skip dead disks. */
   2733 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2734 			continue;
   2735 		/* XXXjld: what if an error occurs here? */
   2736 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2737 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2738 		    RF_PARITYMAP_NBYTE,
   2739 		    rf_parity_map_offset(raidPtr),
   2740 		    rf_parity_map_size(raidPtr), 0);
   2741 	}
   2742 }
   2743 
   2744 void
   2745 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2746 {
   2747 	struct rf_paritymap_ondisk tmp;
   2748 	int c,first;
   2749 
   2750 	first=1;
   2751 	for (c = 0; c < raidPtr->numCol; c++) {
   2752 		/* Skip dead disks. */
   2753 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2754 			continue;
   2755 		raidread_component_area(raidPtr->Disks[c].dev,
   2756 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2757 		    RF_PARITYMAP_NBYTE,
   2758 		    rf_parity_map_offset(raidPtr),
   2759 		    rf_parity_map_size(raidPtr));
   2760 		if (first) {
   2761 			memcpy(map, &tmp, sizeof(*map));
   2762 			first = 0;
   2763 		} else {
   2764 			rf_paritymap_merge(map, &tmp);
   2765 		}
   2766 	}
   2767 }
   2768 
/* Bump the mod counter and mark every live component (and in-use
 * spare) dirty on disk.  */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	/* now handle the spares that have taken over for failed disks */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): if no column maps to this spare,
			   scol keeps its value from a previous iteration
			   (or -1) and that stale column number is written
			   into the label below -- verify this can't happen
			   for an rf_ds_used_spare disk. */

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2828 
   2829 
/*
 * Refresh the component labels of all optimal components and used
 * spares.  When 'final' is RF_FINAL_COMPONENT_UPDATE and parity is
 * known good, also mark the components clean (shutdown path).
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	/* Newer mod_counter makes these labels win over stale copies. */
	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare replaced. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2904 
   2905 void
   2906 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2907 {
   2908 
   2909 	if (vp != NULL) {
   2910 		if (auto_configured == 1) {
   2911 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2912 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2913 			vput(vp);
   2914 
   2915 		} else {
   2916 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2917 		}
   2918 	}
   2919 }
   2920 
   2921 
   2922 void
   2923 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2924 {
   2925 	int r,c;
   2926 	struct vnode *vp;
   2927 	int acd;
   2928 
   2929 
   2930 	/* We take this opportunity to close the vnodes like we should.. */
   2931 
   2932 	for (c = 0; c < raidPtr->numCol; c++) {
   2933 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2934 		acd = raidPtr->Disks[c].auto_configured;
   2935 		rf_close_component(raidPtr, vp, acd);
   2936 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2937 		raidPtr->Disks[c].auto_configured = 0;
   2938 	}
   2939 
   2940 	for (r = 0; r < raidPtr->numSpare; r++) {
   2941 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2942 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2943 		rf_close_component(raidPtr, vp, acd);
   2944 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2945 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2946 	}
   2947 }
   2948 
   2949 
/*
 * Kernel thread body: fail the requested component and, if
 * RF_FDFLAGS_RECON is set, reconstruct its contents onto a spare.
 * Takes ownership of (and frees) the request; never returns.
 */
void
rf_ReconThread(struct rf_recon_req *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	/* Flag visible to status ioctls while the rebuild runs. */
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* The request was allocated by the submitter; we free it here. */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2971 
/*
 * Kernel thread body: rewrite all parity for the set.  On success the
 * in-core parity status becomes clean; on failure an error is logged.
 * Wakes anyone blocked in shutdown waiting for the rewrite to finish.
 * Never returns.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   3002 
   3003 
   3004 void
   3005 rf_CopybackThread(RF_Raid_t *raidPtr)
   3006 {
   3007 	int s;
   3008 
   3009 	raidPtr->copyback_in_progress = 1;
   3010 	s = splbio();
   3011 	rf_CopybackReconstructedData(raidPtr);
   3012 	splx(s);
   3013 	raidPtr->copyback_in_progress = 0;
   3014 
   3015 	/* That's all... */
   3016 	kthread_exit(0);	/* does not return */
   3017 }
   3018 
   3019 
/*
 * Kernel thread body: reconstruct the requested column in place
 * (back onto the same device).  Takes ownership of (and frees) the
 * request; never returns.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   3037 
/*
 * Probe one candidate device for a RAIDframe component label.  If a
 * reasonable label is found, prepend a new RF_AutoConfig_t (which
 * takes ownership of vp and the label) to ac_list; otherwise close
 * and release vp.  Returns the (possibly updated) list head, or NULL
 * after freeing the entire list on allocation failure.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* Out of memory: tear down everything collected so far.
		       (Also reached by goto from the failed ac malloc below.) */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: no usable label, so release both the label
		   buffer and the vnode we were handed. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   3095 
/*
 * Walk every disk device in the system looking for RAIDframe
 * components: wedges of type RAIDFRAME, disklabel partitions of type
 * FS_RAID, and (failing both) the raw partition itself.  Returns a
 * list of RF_AutoConfig_t entries, one per component found; each
 * entry owns an open vnode and a copy of the component label.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	     dv = deviter_next(&di)) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		rf_part_found = 0; /*No raid partition as yet*/

		/* get a vnode for the raw partition of this disk */

		/* Wedges (dk) are addressed by unit; other disks by
		   unit + RAW_PART. */
		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		error = getdisksize(vp, &numsecs, &secsize);
		if (error) {
			vput(vp);
			continue;
		}
		if (wedge) {
			/* Wedge path: accept only RAIDFRAME-typed wedges;
			   rf_get_component takes ownership of vp on this
			   path. */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
			rf_part_found = 1; /*There is a raid component on this disk*/
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		rf_part_found = 0; /*No raid partitions yet*/
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
		}

		/*
		 *If there is no raid component on this disk, either in a
		 *disklabel or inside a wedge, check the raw partition as well,
		 *as it is possible to configure raid components on raw disk
		 *devices.
		 */

		if (!rf_part_found) {
			char cname[sizeof(ac_list->devname)];

			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + RAW_PART);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
		}
	}
	deviter_release(&di);
	return ac_list;
}
   3277 
   3278 
   3279 int
   3280 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3281 {
   3282 
   3283 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3284 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3285 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3286 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3287 	    clabel->row >=0 &&
   3288 	    clabel->column >= 0 &&
   3289 	    clabel->num_rows > 0 &&
   3290 	    clabel->num_columns > 0 &&
   3291 	    clabel->row < clabel->num_rows &&
   3292 	    clabel->column < clabel->num_columns &&
   3293 	    clabel->blockSize > 0 &&
   3294 	    /*
   3295 	     * numBlocksHi may contain garbage, but it is ok since
   3296 	     * the type is unsigned.  If it is really garbage,
   3297 	     * rf_fix_old_label_size() will fix it.
   3298 	     */
   3299 	    rf_component_label_numblocks(clabel) > 0) {
   3300 		/*
   3301 		 * label looks reasonable enough...
   3302 		 * let's make sure it has no old garbage.
   3303 		 */
   3304 		if (numsecs)
   3305 			rf_fix_old_label_size(clabel, numsecs);
   3306 		return(1);
   3307 	}
   3308 	return(0);
   3309 }
   3310 
   3311 
   3312 /*
   3313  * For reasons yet unknown, some old component labels have garbage in
   3314  * the newer numBlocksHi region, and this causes lossage.  Since those
   3315  * disks will also have numsecs set to less than 32 bits of sectors,
   3316  * we can determine when this corruption has occurred, and fix it.
   3317  *
   3318  * The exact same problem, with the same unknown reason, happens to
   3319  * the partitionSizeHi member as well.
   3320  */
   3321 static void
   3322 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3323 {
   3324 
   3325 	if (numsecs < ((uint64_t)1 << 32)) {
   3326 		if (clabel->numBlocksHi) {
   3327 			printf("WARNING: total sectors < 32 bits, yet "
   3328 			       "numBlocksHi set\n"
   3329 			       "WARNING: resetting numBlocksHi to zero.\n");
   3330 			clabel->numBlocksHi = 0;
   3331 		}
   3332 
   3333 		if (clabel->partitionSizeHi) {
   3334 			printf("WARNING: total sectors < 32 bits, yet "
   3335 			       "partitionSizeHi set\n"
   3336 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3337 			clabel->partitionSizeHi = 0;
   3338 		}
   3339 	}
   3340 }
   3341 
   3342 
#ifdef DEBUG
/*
 * Debug helper: dump the contents of a component label to the
 * console in human-readable form.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* Indexed by root_partition & 3 below; last slot catches
	   out-of-range values. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3376 
   3377 RF_ConfigSet_t *
   3378 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3379 {
   3380 	RF_AutoConfig_t *ac;
   3381 	RF_ConfigSet_t *config_sets;
   3382 	RF_ConfigSet_t *cset;
   3383 	RF_AutoConfig_t *ac_next;
   3384 
   3385 
   3386 	config_sets = NULL;
   3387 
   3388 	/* Go through the AutoConfig list, and figure out which components
   3389 	   belong to what sets.  */
   3390 	ac = ac_list;
   3391 	while(ac!=NULL) {
   3392 		/* we're going to putz with ac->next, so save it here
   3393 		   for use at the end of the loop */
   3394 		ac_next = ac->next;
   3395 
   3396 		if (config_sets == NULL) {
   3397 			/* will need at least this one... */
   3398 			config_sets = (RF_ConfigSet_t *)
   3399 				malloc(sizeof(RF_ConfigSet_t),
   3400 				       M_RAIDFRAME, M_NOWAIT);
   3401 			if (config_sets == NULL) {
   3402 				panic("rf_create_auto_sets: No memory!");
   3403 			}
   3404 			/* this one is easy :) */
   3405 			config_sets->ac = ac;
   3406 			config_sets->next = NULL;
   3407 			config_sets->rootable = 0;
   3408 			ac->next = NULL;
   3409 		} else {
   3410 			/* which set does this component fit into? */
   3411 			cset = config_sets;
   3412 			while(cset!=NULL) {
   3413 				if (rf_does_it_fit(cset, ac)) {
   3414 					/* looks like it matches... */
   3415 					ac->next = cset->ac;
   3416 					cset->ac = ac;
   3417 					break;
   3418 				}
   3419 				cset = cset->next;
   3420 			}
   3421 			if (cset==NULL) {
   3422 				/* didn't find a match above... new set..*/
   3423 				cset = (RF_ConfigSet_t *)
   3424 					malloc(sizeof(RF_ConfigSet_t),
   3425 					       M_RAIDFRAME, M_NOWAIT);
   3426 				if (cset == NULL) {
   3427 					panic("rf_create_auto_sets: No memory!");
   3428 				}
   3429 				cset->ac = ac;
   3430 				ac->next = NULL;
   3431 				cset->next = config_sets;
   3432 				cset->rootable = 0;
   3433 				config_sets = cset;
   3434 			}
   3435 		}
   3436 		ac = ac_next;
   3437 	}
   3438 
   3439 
   3440 	return(config_sets);
   3441 }
   3442 
   3443 static int
   3444 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3445 {
   3446 	RF_ComponentLabel_t *clabel1, *clabel2;
   3447 
   3448 	/* If this one matches the *first* one in the set, that's good
   3449 	   enough, since the other members of the set would have been
   3450 	   through here too... */
   3451 	/* note that we are not checking partitionSize here..
   3452 
   3453 	   Note that we are also not checking the mod_counters here.
   3454 	   If everything else matches except the mod_counter, that's
   3455 	   good enough for this test.  We will deal with the mod_counters
   3456 	   a little later in the autoconfiguration process.
   3457 
   3458 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3459 
   3460 	   The reason we don't check for this is that failed disks
   3461 	   will have lower modification counts.  If those disks are
   3462 	   not added to the set they used to belong to, then they will
   3463 	   form their own set, which may result in 2 different sets,
   3464 	   for example, competing to be configured at raid0, and
   3465 	   perhaps competing to be the root filesystem set.  If the
   3466 	   wrong ones get configured, or both attempt to become /,
   3467 	   weird behaviour and or serious lossage will occur.  Thus we
   3468 	   need to bring them into the fold here, and kick them out at
   3469 	   a later point.
   3470 
   3471 	*/
   3472 
   3473 	clabel1 = cset->ac->clabel;
   3474 	clabel2 = ac->clabel;
   3475 	if ((clabel1->version == clabel2->version) &&
   3476 	    (clabel1->serial_number == clabel2->serial_number) &&
   3477 	    (clabel1->num_rows == clabel2->num_rows) &&
   3478 	    (clabel1->num_columns == clabel2->num_columns) &&
   3479 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3480 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3481 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3482 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3483 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3484 	    (clabel1->blockSize == clabel2->blockSize) &&
   3485 	    rf_component_label_numblocks(clabel1) ==
   3486 	    rf_component_label_numblocks(clabel2) &&
   3487 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3488 	    (clabel1->root_partition == clabel2->root_partition) &&
   3489 	    (clabel1->last_unit == clabel2->last_unit) &&
   3490 	    (clabel1->config_order == clabel2->config_order)) {
   3491 		/* if it get's here, it almost *has* to be a match */
   3492 	} else {
   3493 		/* it's not consistent with somebody in the set..
   3494 		   punt */
   3495 		return(0);
   3496 	}
   3497 	/* all was fine.. it must fit... */
   3498 	return(1);
   3499 }
   3500 
/*
 * Decide whether a config set has enough live components to be worth
 * configuring.  Components whose mod_counter is below the set's
 * maximum are treated as failed.  RAID 1 gets special even/odd pair
 * accounting; other levels use a simple missing-component count
 * against per-level limits.  Returns 1 if configurable, 0 if not.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* The correct value is the maximum over all members; stale
	   (failed) components carry a lower counter. */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* Look for a current (mod_counter matches) component
		   claiming column c. */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* Per-level failure tolerance: RAID 0 tolerates none; RAID 4
	   and 5 tolerate a single missing component. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3603 
   3604 void
   3605 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3606 			RF_Raid_t *raidPtr)
   3607 {
   3608 	RF_ComponentLabel_t *clabel;
   3609 	int i;
   3610 
   3611 	clabel = ac->clabel;
   3612 
   3613 	/* 1. Fill in the common stuff */
   3614 	config->numRow = clabel->num_rows = 1;
   3615 	config->numCol = clabel->num_columns;
   3616 	config->numSpare = 0; /* XXX should this be set here? */
   3617 	config->sectPerSU = clabel->sectPerSU;
   3618 	config->SUsPerPU = clabel->SUsPerPU;
   3619 	config->SUsPerRU = clabel->SUsPerRU;
   3620 	config->parityConfig = clabel->parityConfig;
   3621 	/* XXX... */
   3622 	strcpy(config->diskQueueType,"fifo");
   3623 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3624 	config->layoutSpecificSize = 0; /* XXX ?? */
   3625 
   3626 	while(ac!=NULL) {
   3627 		/* row/col values will be in range due to the checks
   3628 		   in reasonable_label() */
   3629 		strcpy(config->devnames[0][ac->clabel->column],
   3630 		       ac->devname);
   3631 		ac = ac->next;
   3632 	}
   3633 
   3634 	for(i=0;i<RF_MAXDBGV;i++) {
   3635 		config->debugVars[i][0] = 0;
   3636 	}
   3637 }
   3638 
   3639 int
   3640 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3641 {
   3642 	RF_ComponentLabel_t *clabel;
   3643 	int column;
   3644 	int sparecol;
   3645 
   3646 	raidPtr->autoconfigure = new_value;
   3647 
   3648 	for(column=0; column<raidPtr->numCol; column++) {
   3649 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3650 			clabel = raidget_component_label(raidPtr, column);
   3651 			clabel->autoconfigure = new_value;
   3652 			raidflush_component_label(raidPtr, column);
   3653 		}
   3654 	}
   3655 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3656 		sparecol = raidPtr->numCol + column;
   3657 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3658 			clabel = raidget_component_label(raidPtr, sparecol);
   3659 			clabel->autoconfigure = new_value;
   3660 			raidflush_component_label(raidPtr, sparecol);
   3661 		}
   3662 	}
   3663 	return(new_value);
   3664 }
   3665 
   3666 int
   3667 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3668 {
   3669 	RF_ComponentLabel_t *clabel;
   3670 	int column;
   3671 	int sparecol;
   3672 
   3673 	raidPtr->root_partition = new_value;
   3674 	for(column=0; column<raidPtr->numCol; column++) {
   3675 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3676 			clabel = raidget_component_label(raidPtr, column);
   3677 			clabel->root_partition = new_value;
   3678 			raidflush_component_label(raidPtr, column);
   3679 		}
   3680 	}
   3681 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3682 		sparecol = raidPtr->numCol + column;
   3683 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3684 			clabel = raidget_component_label(raidPtr, sparecol);
   3685 			clabel->root_partition = new_value;
   3686 			raidflush_component_label(raidPtr, sparecol);
   3687 		}
   3688 	}
   3689 	return(new_value);
   3690 }
   3691 
   3692 void
   3693 rf_release_all_vps(RF_ConfigSet_t *cset)
   3694 {
   3695 	RF_AutoConfig_t *ac;
   3696 
   3697 	ac = cset->ac;
   3698 	while(ac!=NULL) {
   3699 		/* Close the vp, and give it back */
   3700 		if (ac->vp) {
   3701 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3702 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
   3703 			vput(ac->vp);
   3704 			ac->vp = NULL;
   3705 		}
   3706 		ac = ac->next;
   3707 	}
   3708 }
   3709 
   3710 
   3711 void
   3712 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3713 {
   3714 	RF_AutoConfig_t *ac;
   3715 	RF_AutoConfig_t *next_ac;
   3716 
   3717 	ac = cset->ac;
   3718 	while(ac!=NULL) {
   3719 		next_ac = ac->next;
   3720 		/* nuke the label */
   3721 		free(ac->clabel, M_RAIDFRAME);
   3722 		/* cleanup the config structure */
   3723 		free(ac, M_RAIDFRAME);
   3724 		/* "next.." */
   3725 		ac = next_ac;
   3726 	}
   3727 	/* and, finally, nuke the config set */
   3728 	free(cset, M_RAIDFRAME);
   3729 }
   3730 
   3731 
/*
 * Populate a component label with the set-wide values from the
 * in-core RAID state.  Per-component fields (row/column) are left
 * for the caller to fill in.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3764 
/*
 * Auto-configure one config set: build an RF_Config_t from the set's
 * auto-config data, find a free raid unit (preferring the unit the set
 * was last configured on), and configure it.  Returns the configured
 * softc, or NULL if allocation or rf_Configure() failed.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("Out of mem!?!?\n");
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/*
	 * NOTE(review): this loop assumes raidget() always returns a
	 * non-NULL softc (allocating one as needed) -- sc is dereferenced
	 * without a NULL check.  TODO confirm against raidget().
	 */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID); sc->sc_r.valid != 0; sc = raidget(++raidID))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		/* components may disagree; force parity state to dirty */
		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed; release the unit we claimed */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3838 
   3839 void
   3840 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
   3841 {
   3842 	struct buf *bp;
   3843 	struct raid_softc *rs;
   3844 
   3845 	bp = (struct buf *)desc->bp;
   3846 	rs = desc->raidPtr->softc;
   3847 	disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid),
   3848 	    (bp->b_flags & B_READ));
   3849 }
   3850 
/*
 * Initialize a memory pool for RAIDframe use: create it at IPL_BIO,
 * set the high watermark to xmax, pre-allocate xmin items, and set the
 * low watermark to xmin so the pool tries to keep that many available.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
   3860 
   3861 /*
   3862  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buf_queue to see
   3863  * if there is IO pending and if that IO could possibly be done for a
   3864  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3865  * otherwise.
   3866  *
   3867  */
   3868 
   3869 int
   3870 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3871 {
   3872 	struct raid_softc *rs = raidPtr->softc;
   3873 	if ((bufq_peek(rs->buf_queue) != NULL) && raidPtr->openings > 0) {
   3874 		/* there is work to do */
   3875 		return 0;
   3876 	}
   3877 	/* default is nothing to do */
   3878 	return 1;
   3879 }
   3880 
   3881 int
   3882 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3883 {
   3884 	uint64_t numsecs;
   3885 	unsigned secsize;
   3886 	int error;
   3887 
   3888 	error = getdisksize(vp, &numsecs, &secsize);
   3889 	if (error == 0) {
   3890 		diskPtr->blockSize = secsize;
   3891 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3892 		diskPtr->partitionSize = numsecs;
   3893 		return 0;
   3894 	}
   3895 	return error;
   3896 }
   3897 
/*
 * autoconf(9) match function: raid pseudo-devices always match.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3903 
/*
 * autoconf(9) attach function: intentionally empty.  Real setup is
 * deferred until the RAID set is actually configured (raidinit()).
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{

}
   3909 
   3910 
   3911 static int
   3912 raid_detach(device_t self, int flags)
   3913 {
   3914 	int error;
   3915 	struct raid_softc *rs = raidget(device_unit(self));
   3916 
   3917 	if (rs == NULL)
   3918 		return ENXIO;
   3919 
   3920 	if ((error = raidlock(rs)) != 0)
   3921 		return (error);
   3922 
   3923 	error = raid_detach_unlocked(rs);
   3924 
   3925 	raidunlock(rs);
   3926 
   3927 	/* XXXkd: raidput(rs) ??? */
   3928 
   3929 	return error;
   3930 }
   3931 
/*
 * Build a synthetic disk geometry for the RAID set and hand it to the
 * disk(9) layer.  The cylinder/head/sector values are fabricated from
 * the stripe layout (they do not describe real hardware): one "track"
 * per data stripe, and 4 * numCol "tracks" -- apparently chosen to
 * yield plausible numbers; TODO confirm the rationale for the factor 4.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct disk_geom *dg = &rs->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(rs->sc_dev, &rs->sc_dkdev, NULL);
}
   3946 
   3947 /*
   3948  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3949  * We end up returning whatever error was returned by the first cache flush
   3950  * that fails.
   3951  */
   3952 
   3953 int
   3954 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3955 {
   3956 	int c, sparecol;
   3957 	int e,error;
   3958 	int force = 1;
   3959 
   3960 	error = 0;
   3961 	for (c = 0; c < raidPtr->numCol; c++) {
   3962 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3963 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3964 					  &force, FWRITE, NOCRED);
   3965 			if (e) {
   3966 				if (e != ENODEV)
   3967 					printf("raid%d: cache flush to component %s failed.\n",
   3968 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3969 				if (error == 0) {
   3970 					error = e;
   3971 				}
   3972 			}
   3973 		}
   3974 	}
   3975 
   3976 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3977 		sparecol = raidPtr->numCol + c;
   3978 		/* Need to ensure that the reconstruct actually completed! */
   3979 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3980 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3981 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3982 			if (e) {
   3983 				if (e != ENODEV)
   3984 					printf("raid%d: cache flush to component %s failed.\n",
   3985 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3986 				if (error == 0) {
   3987 					error = e;
   3988 				}
   3989 			}
   3990 		}
   3991 	}
   3992 	return error;
   3993 }
   3994