Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.316.2.1
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.316.2.1 2015/04/06 15:18:13 skrll Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.316.2.1 2015/04/06 15:18:13 skrll Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 
    130 #include <prop/proplib.h>
    131 
    132 #include <dev/raidframe/raidframevar.h>
    133 #include <dev/raidframe/raidframeio.h>
    134 #include <dev/raidframe/rf_paritymap.h>
    135 
    136 #include "rf_raid.h"
    137 #include "rf_copyback.h"
    138 #include "rf_dag.h"
    139 #include "rf_dagflags.h"
    140 #include "rf_desc.h"
    141 #include "rf_diskqueue.h"
    142 #include "rf_etimer.h"
    143 #include "rf_general.h"
    144 #include "rf_kintf.h"
    145 #include "rf_options.h"
    146 #include "rf_driver.h"
    147 #include "rf_parityscan.h"
    148 #include "rf_threadstuff.h"
    149 
    150 #ifdef COMPAT_50
    151 #include "rf_compat50.h"
    152 #endif
    153 
/*
 * db1_printf((fmt, args...)) -- level-1 debug output.
 *
 * The argument is a complete, parenthesised printf argument list.  With
 * DEBUG defined the message is printed when rf_kdebug_level is greater
 * than zero; without DEBUG the macro expands to an empty statement.
 *
 * Both variants are wrapped in do { } while (0) so that the macro is a
 * single statement: it can be used as the unbraced body of an if/else
 * without capturing a following "else" or producing a stray ';'.
 */
#ifdef DEBUG
int     rf_kdebug_level = 0;
#define db1_printf(a) do { if (rf_kdebug_level > 0) printf a; } while (0)
#else				/* DEBUG */
#define db1_printf(a) do { } while (0)
#endif				/* DEBUG */
    160 
#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
/*
 * State used to hand spare-table installation requests back and forth;
 * only built when parity declustering support is compiled in.
 */

/* Requests to install a spare table. */
static RF_SparetWait_t *rf_sparet_wait_queue;

/* Responses from the installation process. */
static RF_SparetWait_t *rf_sparet_resp_queue;

/*
 * Mutex and condition variables for the request/response hand-off
 * (initialised in raidattach()).
 */
static rf_declare_mutex2(rf_sparet_wait_mutex);
static rf_declare_cond2(rf_sparet_wait_cv);
static rf_declare_cond2(rf_sparet_resp_cv);
#endif
    171 
    172 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    173 
    174 /* prototypes */
    175 static void KernelWakeupFunc(struct buf *);
    176 static void InitBP(struct buf *, struct vnode *, unsigned,
    177     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    178     void *, int, struct proc *);
    179 struct raid_softc;
    180 static void raidinit(struct raid_softc *);
    181 
    182 void raidattach(int);
    183 static int raid_match(device_t, cfdata_t, void *);
    184 static void raid_attach(device_t, device_t, void *);
    185 static int raid_detach(device_t, int);
    186 
    187 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    188     daddr_t, daddr_t);
    189 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t, int);
    191 
    192 static int raidwrite_component_label(unsigned,
    193     dev_t, struct vnode *, RF_ComponentLabel_t *);
    194 static int raidread_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 
    197 
    198 dev_type_open(raidopen);
    199 dev_type_close(raidclose);
    200 dev_type_read(raidread);
    201 dev_type_write(raidwrite);
    202 dev_type_ioctl(raidioctl);
    203 dev_type_strategy(raidstrategy);
    204 dev_type_dump(raiddump);
    205 dev_type_size(raidsize);
    206 
    207 const struct bdevsw raid_bdevsw = {
    208 	.d_open = raidopen,
    209 	.d_close = raidclose,
    210 	.d_strategy = raidstrategy,
    211 	.d_ioctl = raidioctl,
    212 	.d_dump = raiddump,
    213 	.d_psize = raidsize,
    214 	.d_discard = nodiscard,
    215 	.d_flag = D_DISK
    216 };
    217 
    218 const struct cdevsw raid_cdevsw = {
    219 	.d_open = raidopen,
    220 	.d_close = raidclose,
    221 	.d_read = raidread,
    222 	.d_write = raidwrite,
    223 	.d_ioctl = raidioctl,
    224 	.d_stop = nostop,
    225 	.d_tty = notty,
    226 	.d_poll = nopoll,
    227 	.d_mmap = nommap,
    228 	.d_kqfilter = nokqfilter,
    229 	.d_discard = nodiscard,
    230 	.d_flag = D_DISK
    231 };
    232 
    233 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
    234 
    235 struct raid_softc {
    236 	device_t sc_dev;
    237 	int	sc_unit;
    238 	int     sc_flags;	/* flags */
    239 	int     sc_cflags;	/* configuration flags */
    240 	uint64_t sc_size;	/* size of the raid device */
    241 	char    sc_xname[20];	/* XXX external name */
    242 	struct disk sc_dkdev;	/* generic disk device info */
    243 	struct bufq_state *buf_queue;	/* used for the device queue */
    244 	RF_Raid_t sc_r;
    245 	LIST_ENTRY(raid_softc) sc_link;
    246 };
    247 /* sc_flags */
    248 #define RAIDF_INITED	0x01	/* unit has been initialized */
    249 #define RAIDF_WLABEL	0x02	/* label area is writable */
    250 #define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
    251 #define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
    252 #define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
    253 #define RAIDF_LOCKED	0x80	/* unit is locked */
    254 
    255 #define	raidunit(x)	DISKUNIT(x)
    256 
    257 extern struct cfdriver raid_cd;
    258 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    259     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    260     DVF_DETACH_SHUTDOWN);
    261 
/*
 * RAIDOUTSTANDING is the number of simultaneous I/Os allowed to a RAID
 * device.  Large values can make the driver consume a lot of kernel
 * memory, particularly for writes and for reads in degraded mode.
 *
 * Example: with a stripe width of 64 blocks (32k) and 5 disks, a single
 * 64K write will typically need 64K for the old data, 64K for the old
 * parity and 64K for the new parity -- 192K in total if the parity
 * buffer is not re-used immediately.  Even if it is re-used immediately
 * that is still 128K, which multiplied by, say, 10 requests is 1280K
 * *on top* of the 640K of incoming data.
 *
 * In degraded mode a 64K read on the same setup may require data
 * reconstruction, in which case *all* 4 remaining disks take part:
 * 4 * 32K/disk == 128K again.
 */

#if !defined(RAIDOUTSTANDING)
#define RAIDOUTSTANDING   6
#endif

/* The raw-partition device of the raid unit that `dev' belongs to. */
#define RAIDLABELDEV(dev)	\
	(MAKEDISKDEV(major(dev), raidunit(dev), RAW_PART))
    285 
    286 /* declared here, and made public, for the benefit of KVM stuff.. */
    287 
    288 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
    289 				     struct disklabel *);
    290 static void raidgetdisklabel(dev_t);
    291 static void raidmakedisklabel(struct raid_softc *);
    292 
    293 static int raidlock(struct raid_softc *);
    294 static void raidunlock(struct raid_softc *);
    295 
    296 static int raid_detach_unlocked(struct raid_softc *);
    297 
    298 static void rf_markalldirty(RF_Raid_t *);
    299 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    300 
    301 void rf_ReconThread(struct rf_recon_req *);
    302 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    303 void rf_CopybackThread(RF_Raid_t *raidPtr);
    304 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    305 int rf_autoconfig(device_t);
    306 void rf_buildroothack(RF_ConfigSet_t *);
    307 
    308 RF_AutoConfig_t *rf_find_raid_components(void);
    309 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    310 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    311 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    312 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    313 int rf_set_autoconfig(RF_Raid_t *, int);
    314 int rf_set_rootpartition(RF_Raid_t *, int);
    315 void rf_release_all_vps(RF_ConfigSet_t *);
    316 void rf_cleanup_config_set(RF_ConfigSet_t *);
    317 int rf_have_enough_components(RF_ConfigSet_t *);
    318 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    319 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    320 
    321 /*
    322  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    323  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    324  * in the kernel config file.
    325  */
    326 #ifdef RAID_AUTOCONFIG
    327 int raidautoconfig = 1;
    328 #else
    329 int raidautoconfig = 0;
    330 #endif
    331 static bool raidautoconfigdone = false;
    332 
    333 struct RF_Pools_s rf_pools;
    334 
    335 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    336 static kmutex_t raid_lock;
    337 
    338 static struct raid_softc *
    339 raidcreate(int unit) {
    340 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    341 	if (sc == NULL) {
    342 #ifdef DIAGNOSTIC
    343 		printf("%s: out of memory\n", __func__);
    344 #endif
    345 		return NULL;
    346 	}
    347 	sc->sc_unit = unit;
    348 	bufq_alloc(&sc->buf_queue, "fcfs", BUFQ_SORT_RAWBLOCK);
    349 	return sc;
    350 }
    351 
    352 static void
    353 raiddestroy(struct raid_softc *sc) {
    354 	bufq_free(sc->buf_queue);
    355 	kmem_free(sc, sizeof(*sc));
    356 }
    357 
    358 static struct raid_softc *
    359 raidget(int unit) {
    360 	struct raid_softc *sc;
    361 	if (unit < 0) {
    362 #ifdef DIAGNOSTIC
    363 		panic("%s: unit %d!", __func__, unit);
    364 #endif
    365 		return NULL;
    366 	}
    367 	mutex_enter(&raid_lock);
    368 	LIST_FOREACH(sc, &raids, sc_link) {
    369 		if (sc->sc_unit == unit) {
    370 			mutex_exit(&raid_lock);
    371 			return sc;
    372 		}
    373 	}
    374 	mutex_exit(&raid_lock);
    375 	if ((sc = raidcreate(unit)) == NULL)
    376 		return NULL;
    377 	mutex_enter(&raid_lock);
    378 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    379 	mutex_exit(&raid_lock);
    380 	return sc;
    381 }
    382 
    383 static void
    384 raidput(struct raid_softc *sc) {
    385 	mutex_enter(&raid_lock);
    386 	LIST_REMOVE(sc, sc_link);
    387 	mutex_exit(&raid_lock);
    388 	raiddestroy(sc);
    389 }
    390 
    391 void
    392 raidattach(int num)
    393 {
    394 	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
    395 	/* This is where all the initialization stuff gets done. */
    396 
    397 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    398 	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
    399 	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
    400 	rf_init_cond2(rf_sparet_resp_cv, "rfgst");
    401 
    402 	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
    403 #endif
    404 
    405 	if (rf_BootRaidframe() == 0)
    406 		aprint_verbose("Kernelized RAIDframe activated\n");
    407 	else
    408 		panic("Serious error booting RAID!!");
    409 
    410 	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
    411 		aprint_error("raidattach: config_cfattach_attach failed?\n");
    412 	}
    413 
    414 	raidautoconfigdone = false;
    415 
    416 	/*
    417 	 * Register a finalizer which will be used to auto-config RAID
    418 	 * sets once all real hardware devices have been found.
    419 	 */
    420 	if (config_finalize_register(NULL, rf_autoconfig) != 0)
    421 		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
    422 }
    423 
    424 int
    425 rf_autoconfig(device_t self)
    426 {
    427 	RF_AutoConfig_t *ac_list;
    428 	RF_ConfigSet_t *config_sets;
    429 
    430 	if (!raidautoconfig || raidautoconfigdone == true)
    431 		return (0);
    432 
    433 	/* XXX This code can only be run once. */
    434 	raidautoconfigdone = true;
    435 
    436 #ifdef __HAVE_CPU_BOOTCONF
    437 	/*
    438 	 * 0. find the boot device if needed first so we can use it later
    439 	 * this needs to be done before we autoconfigure any raid sets,
    440 	 * because if we use wedges we are not going to be able to open
    441 	 * the boot device later
    442 	 */
    443 	if (booted_device == NULL)
    444 		cpu_bootconf();
    445 #endif
    446 	/* 1. locate all RAID components on the system */
    447 	aprint_debug("Searching for RAID components...\n");
    448 	ac_list = rf_find_raid_components();
    449 
    450 	/* 2. Sort them into their respective sets. */
    451 	config_sets = rf_create_auto_sets(ac_list);
    452 
    453 	/*
    454 	 * 3. Evaluate each set and configure the valid ones.
    455 	 * This gets done in rf_buildroothack().
    456 	 */
    457 	rf_buildroothack(config_sets);
    458 
    459 	return 1;
    460 }
    461 
    462 static int
    463 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    464 	const char *bootname = device_xname(bdv);
    465 	size_t len = strlen(bootname);
    466 
    467 	for (int col = 0; col < r->numCol; col++) {
    468 		const char *devname = r->Disks[col].devname;
    469 		devname += sizeof("/dev/") - 1;
    470 		if (strncmp(devname, "dk", 2) == 0) {
    471 			const char *parent =
    472 			    dkwedge_get_parent_name(r->Disks[col].dev);
    473 			if (parent != NULL)
    474 				devname = parent;
    475 		}
    476 		if (strncmp(devname, bootname, len) == 0) {
    477 			struct raid_softc *sc = r->softc;
    478 			aprint_debug("raid%d includes boot device %s\n",
    479 			    sc->sc_unit, devname);
    480 			return 1;
    481 		}
    482 	}
    483 	return 0;
    484 }
    485 
    486 void
    487 rf_buildroothack(RF_ConfigSet_t *config_sets)
    488 {
    489 	RF_ConfigSet_t *cset;
    490 	RF_ConfigSet_t *next_cset;
    491 	int num_root;
    492 	struct raid_softc *sc, *rsc;
    493 
    494 	sc = rsc = NULL;
    495 	num_root = 0;
    496 	cset = config_sets;
    497 	while (cset != NULL) {
    498 		next_cset = cset->next;
    499 		if (rf_have_enough_components(cset) &&
    500 		    cset->ac->clabel->autoconfigure == 1) {
    501 			sc = rf_auto_config_set(cset);
    502 			if (sc != NULL) {
    503 				aprint_debug("raid%d: configured ok\n",
    504 				    sc->sc_unit);
    505 				if (cset->rootable) {
    506 					rsc = sc;
    507 					num_root++;
    508 				}
    509 			} else {
    510 				/* The autoconfig didn't work :( */
    511 				aprint_debug("Autoconfig failed\n");
    512 				rf_release_all_vps(cset);
    513 			}
    514 		} else {
    515 			/* we're not autoconfiguring this set...
    516 			   release the associated resources */
    517 			rf_release_all_vps(cset);
    518 		}
    519 		/* cleanup */
    520 		rf_cleanup_config_set(cset);
    521 		cset = next_cset;
    522 	}
    523 
    524 	/* if the user has specified what the root device should be
    525 	   then we don't touch booted_device or boothowto... */
    526 
    527 	if (rootspec != NULL)
    528 		return;
    529 
    530 	/* we found something bootable... */
    531 
    532 	/*
    533 	 * XXX: The following code assumes that the root raid
    534 	 * is the first ('a') partition. This is about the best
    535 	 * we can do with a BSD disklabel, but we might be able
    536 	 * to do better with a GPT label, by setting a specified
    537 	 * attribute to indicate the root partition. We can then
    538 	 * stash the partition number in the r->root_partition
    539 	 * high bits (the bottom 2 bits are already used). For
    540 	 * now we just set booted_partition to 0 when we override
    541 	 * root.
    542 	 */
    543 	if (num_root == 1) {
    544 		device_t candidate_root;
    545 		if (rsc->sc_dkdev.dk_nwedges != 0) {
    546 			char cname[sizeof(cset->ac->devname)];
    547 			/* XXX: assume 'a' */
    548 			snprintf(cname, sizeof(cname), "%s%c",
    549 			    device_xname(rsc->sc_dev), 'a');
    550 			candidate_root = dkwedge_find_by_wname(cname);
    551 		} else
    552 			candidate_root = rsc->sc_dev;
    553 		if (booted_device == NULL ||
    554 		    rsc->sc_r.root_partition == 1 ||
    555 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    556 			booted_device = candidate_root;
    557 			booted_partition = 0;	/* XXX assume 'a' */
    558 		}
    559 	} else if (num_root > 1) {
    560 
    561 		/*
    562 		 * Maybe the MD code can help. If it cannot, then
    563 		 * setroot() will discover that we have no
    564 		 * booted_device and will ask the user if nothing was
    565 		 * hardwired in the kernel config file
    566 		 */
    567 		if (booted_device == NULL)
    568 			return;
    569 
    570 		num_root = 0;
    571 		mutex_enter(&raid_lock);
    572 		LIST_FOREACH(sc, &raids, sc_link) {
    573 			RF_Raid_t *r = &sc->sc_r;
    574 			if (r->valid == 0)
    575 				continue;
    576 
    577 			if (r->root_partition == 0)
    578 				continue;
    579 
    580 			if (rf_containsboot(r, booted_device)) {
    581 				num_root++;
    582 				rsc = sc;
    583 			}
    584 		}
    585 		mutex_exit(&raid_lock);
    586 
    587 		if (num_root == 1) {
    588 			booted_device = rsc->sc_dev;
    589 			booted_partition = 0;	/* XXX assume 'a' */
    590 		} else {
    591 			/* we can't guess.. require the user to answer... */
    592 			boothowto |= RB_ASKNAME;
    593 		}
    594 	}
    595 }
    596 
    597 
    598 int
    599 raidsize(dev_t dev)
    600 {
    601 	struct raid_softc *rs;
    602 	struct disklabel *lp;
    603 	int     part, unit, omask, size;
    604 
    605 	unit = raidunit(dev);
    606 	if ((rs = raidget(unit)) == NULL)
    607 		return -1;
    608 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    609 		return (-1);
    610 
    611 	part = DISKPART(dev);
    612 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
    613 	lp = rs->sc_dkdev.dk_label;
    614 
    615 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
    616 		return (-1);
    617 
    618 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
    619 		size = -1;
    620 	else
    621 		size = lp->d_partitions[part].p_size *
    622 		    (lp->d_secsize / DEV_BSIZE);
    623 
    624 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
    625 		return (-1);
    626 
    627 	return (size);
    628 
    629 }
    630 
    631 int
    632 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    633 {
    634 	int     unit = raidunit(dev);
    635 	struct raid_softc *rs;
    636 	const struct bdevsw *bdev;
    637 	struct disklabel *lp;
    638 	RF_Raid_t *raidPtr;
    639 	daddr_t offset;
    640 	int     part, c, sparecol, j, scol, dumpto;
    641 	int     error = 0;
    642 
    643 	if ((rs = raidget(unit)) == NULL)
    644 		return ENXIO;
    645 
    646 	raidPtr = &rs->sc_r;
    647 
    648 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    649 		return ENXIO;
    650 
    651 	/* we only support dumping to RAID 1 sets */
    652 	if (raidPtr->Layout.numDataCol != 1 ||
    653 	    raidPtr->Layout.numParityCol != 1)
    654 		return EINVAL;
    655 
    656 
    657 	if ((error = raidlock(rs)) != 0)
    658 		return error;
    659 
    660 	if (size % DEV_BSIZE != 0) {
    661 		error = EINVAL;
    662 		goto out;
    663 	}
    664 
    665 	if (blkno + size / DEV_BSIZE > rs->sc_size) {
    666 		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
    667 		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
    668 		    size / DEV_BSIZE, rs->sc_size);
    669 		error = EINVAL;
    670 		goto out;
    671 	}
    672 
    673 	part = DISKPART(dev);
    674 	lp = rs->sc_dkdev.dk_label;
    675 	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;
    676 
    677 	/* figure out what device is alive.. */
    678 
    679 	/*
    680 	   Look for a component to dump to.  The preference for the
    681 	   component to dump to is as follows:
    682 	   1) the master
    683 	   2) a used_spare of the master
    684 	   3) the slave
    685 	   4) a used_spare of the slave
    686 	*/
    687 
    688 	dumpto = -1;
    689 	for (c = 0; c < raidPtr->numCol; c++) {
    690 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
    691 			/* this might be the one */
    692 			dumpto = c;
    693 			break;
    694 		}
    695 	}
    696 
    697 	/*
    698 	   At this point we have possibly selected a live master or a
    699 	   live slave.  We now check to see if there is a spared
    700 	   master (or a spared slave), if we didn't find a live master
    701 	   or a live slave.
    702 	*/
    703 
    704 	for (c = 0; c < raidPtr->numSpare; c++) {
    705 		sparecol = raidPtr->numCol + c;
    706 		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
    707 			/* How about this one? */
    708 			scol = -1;
    709 			for(j=0;j<raidPtr->numCol;j++) {
    710 				if (raidPtr->Disks[j].spareCol == sparecol) {
    711 					scol = j;
    712 					break;
    713 				}
    714 			}
    715 			if (scol == 0) {
    716 				/*
    717 				   We must have found a spared master!
    718 				   We'll take that over anything else
    719 				   found so far.  (We couldn't have
    720 				   found a real master before, since
    721 				   this is a used spare, and it's
    722 				   saying that it's replacing the
    723 				   master.)  On reboot (with
    724 				   autoconfiguration turned on)
    725 				   sparecol will become the 1st
    726 				   component (component0) of this set.
    727 				*/
    728 				dumpto = sparecol;
    729 				break;
    730 			} else if (scol != -1) {
    731 				/*
    732 				   Must be a spared slave.  We'll dump
    733 				   to that if we havn't found anything
    734 				   else so far.
    735 				*/
    736 				if (dumpto == -1)
    737 					dumpto = sparecol;
    738 			}
    739 		}
    740 	}
    741 
    742 	if (dumpto == -1) {
    743 		/* we couldn't find any live components to dump to!?!?
    744 		 */
    745 		error = EINVAL;
    746 		goto out;
    747 	}
    748 
    749 	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
    750 
    751 	/*
    752 	   Note that blkno is relative to this particular partition.
    753 	   By adding the offset of this partition in the RAID
    754 	   set, and also adding RF_PROTECTED_SECTORS, we get a
    755 	   value that is relative to the partition used for the
    756 	   underlying component.
    757 	*/
    758 
    759 	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
    760 				blkno + offset, va, size);
    761 
    762 out:
    763 	raidunlock(rs);
    764 
    765 	return error;
    766 }
    767 /* ARGSUSED */
    768 int
    769 raidopen(dev_t dev, int flags, int fmt,
    770     struct lwp *l)
    771 {
    772 	int     unit = raidunit(dev);
    773 	struct raid_softc *rs;
    774 	struct disklabel *lp;
    775 	int     part, pmask;
    776 	int     error = 0;
    777 
    778 	if ((rs = raidget(unit)) == NULL)
    779 		return ENXIO;
    780 	if ((error = raidlock(rs)) != 0)
    781 		return (error);
    782 
    783 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    784 		error = EBUSY;
    785 		goto bad;
    786 	}
    787 
    788 	lp = rs->sc_dkdev.dk_label;
    789 
    790 	part = DISKPART(dev);
    791 
    792 	/*
    793 	 * If there are wedges, and this is not RAW_PART, then we
    794 	 * need to fail.
    795 	 */
    796 	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
    797 		error = EBUSY;
    798 		goto bad;
    799 	}
    800 	pmask = (1 << part);
    801 
    802 	if ((rs->sc_flags & RAIDF_INITED) &&
    803 	    (rs->sc_dkdev.dk_nwedges == 0) &&
    804 	    (rs->sc_dkdev.dk_openmask == 0))
    805 		raidgetdisklabel(dev);
    806 
    807 	/* make sure that this partition exists */
    808 
    809 	if (part != RAW_PART) {
    810 		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
    811 		    ((part >= lp->d_npartitions) ||
    812 			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
    813 			error = ENXIO;
    814 			goto bad;
    815 		}
    816 	}
    817 	/* Prevent this unit from being unconfigured while open. */
    818 	switch (fmt) {
    819 	case S_IFCHR:
    820 		rs->sc_dkdev.dk_copenmask |= pmask;
    821 		break;
    822 
    823 	case S_IFBLK:
    824 		rs->sc_dkdev.dk_bopenmask |= pmask;
    825 		break;
    826 	}
    827 
    828 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    829 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    830 		/* First one... mark things as dirty... Note that we *MUST*
    831 		 have done a configure before this.  I DO NOT WANT TO BE
    832 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    833 		 THAT THEY BELONG TOGETHER!!!!! */
    834 		/* XXX should check to see if we're only open for reading
    835 		   here... If so, we needn't do this, but then need some
    836 		   other way of keeping track of what's happened.. */
    837 
    838 		rf_markalldirty(&rs->sc_r);
    839 	}
    840 
    841 
    842 	rs->sc_dkdev.dk_openmask =
    843 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    844 
    845 bad:
    846 	raidunlock(rs);
    847 
    848 	return (error);
    849 
    850 
    851 }
    852 /* ARGSUSED */
    853 int
    854 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    855 {
    856 	int     unit = raidunit(dev);
    857 	struct raid_softc *rs;
    858 	int     error = 0;
    859 	int     part;
    860 
    861 	if ((rs = raidget(unit)) == NULL)
    862 		return ENXIO;
    863 
    864 	if ((error = raidlock(rs)) != 0)
    865 		return (error);
    866 
    867 	part = DISKPART(dev);
    868 
    869 	/* ...that much closer to allowing unconfiguration... */
    870 	switch (fmt) {
    871 	case S_IFCHR:
    872 		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
    873 		break;
    874 
    875 	case S_IFBLK:
    876 		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
    877 		break;
    878 	}
    879 	rs->sc_dkdev.dk_openmask =
    880 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    881 
    882 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    883 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    884 		/* Last one... device is not unconfigured yet.
    885 		   Device shutdown has taken care of setting the
    886 		   clean bits if RAIDF_INITED is not set
    887 		   mark things as clean... */
    888 
    889 		rf_update_component_labels(&rs->sc_r,
    890 						 RF_FINAL_COMPONENT_UPDATE);
    891 
    892 		/* If the kernel is shutting down, it will detach
    893 		 * this RAID set soon enough.
    894 		 */
    895 	}
    896 
    897 	raidunlock(rs);
    898 	return (0);
    899 
    900 }
    901 
    902 void
    903 raidstrategy(struct buf *bp)
    904 {
    905 	unsigned int unit = raidunit(bp->b_dev);
    906 	RF_Raid_t *raidPtr;
    907 	int     wlabel;
    908 	struct raid_softc *rs;
    909 
    910 	if ((rs = raidget(unit)) == NULL) {
    911 		bp->b_error = ENXIO;
    912 		goto done;
    913 	}
    914 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    915 		bp->b_error = ENXIO;
    916 		goto done;
    917 	}
    918 	raidPtr = &rs->sc_r;
    919 	if (!raidPtr->valid) {
    920 		bp->b_error = ENODEV;
    921 		goto done;
    922 	}
    923 	if (bp->b_bcount == 0) {
    924 		db1_printf(("b_bcount is zero..\n"));
    925 		goto done;
    926 	}
    927 
    928 	/*
    929 	 * Do bounds checking and adjust transfer.  If there's an
    930 	 * error, the bounds check will flag that for us.
    931 	 */
    932 
    933 	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
    934 	if (DISKPART(bp->b_dev) == RAW_PART) {
    935 		uint64_t size; /* device size in DEV_BSIZE unit */
    936 
    937 		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
    938 			size = raidPtr->totalSectors <<
    939 			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
    940 		} else {
    941 			size = raidPtr->totalSectors >>
    942 			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
    943 		}
    944 		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
    945 			goto done;
    946 		}
    947 	} else {
    948 		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
    949 			db1_printf(("Bounds check failed!!:%d %d\n",
    950 				(int) bp->b_blkno, (int) wlabel));
    951 			goto done;
    952 		}
    953 	}
    954 
    955 	rf_lock_mutex2(raidPtr->iodone_lock);
    956 
    957 	bp->b_resid = 0;
    958 
    959 	/* stuff it onto our queue */
    960 	bufq_put(rs->buf_queue, bp);
    961 
    962 	/* scheduled the IO to happen at the next convenient time */
    963 	rf_signal_cond2(raidPtr->iodone_cv);
    964 	rf_unlock_mutex2(raidPtr->iodone_lock);
    965 
    966 	return;
    967 
    968 done:
    969 	bp->b_resid = bp->b_bcount;
    970 	biodone(bp);
    971 }
    972 /* ARGSUSED */
    973 int
    974 raidread(dev_t dev, struct uio *uio, int flags)
    975 {
    976 	int     unit = raidunit(dev);
    977 	struct raid_softc *rs;
    978 
    979 	if ((rs = raidget(unit)) == NULL)
    980 		return ENXIO;
    981 
    982 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    983 		return (ENXIO);
    984 
    985 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    986 
    987 }
    988 /* ARGSUSED */
    989 int
    990 raidwrite(dev_t dev, struct uio *uio, int flags)
    991 {
    992 	int     unit = raidunit(dev);
    993 	struct raid_softc *rs;
    994 
    995 	if ((rs = raidget(unit)) == NULL)
    996 		return ENXIO;
    997 
    998 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    999 		return (ENXIO);
   1000 
   1001 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
   1002 
   1003 }
   1004 
   1005 static int
   1006 raid_detach_unlocked(struct raid_softc *rs)
   1007 {
   1008 	int error;
   1009 	RF_Raid_t *raidPtr;
   1010 
   1011 	raidPtr = &rs->sc_r;
   1012 
   1013 	/*
   1014 	 * If somebody has a partition mounted, we shouldn't
   1015 	 * shutdown.
   1016 	 */
   1017 	if (rs->sc_dkdev.dk_openmask != 0)
   1018 		return EBUSY;
   1019 
   1020 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1021 		;	/* not initialized: nothing to do */
   1022 	else if ((error = rf_Shutdown(raidPtr)) != 0)
   1023 		return error;
   1024 	else
   1025 		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
   1026 
   1027 	/* Detach the disk. */
   1028 	dkwedge_delall(&rs->sc_dkdev);
   1029 	disk_detach(&rs->sc_dkdev);
   1030 	disk_destroy(&rs->sc_dkdev);
   1031 
   1032 	aprint_normal_dev(rs->sc_dev, "detached\n");
   1033 
   1034 	return 0;
   1035 }
   1036 
   1037 int
   1038 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1039 {
   1040 	int     unit = raidunit(dev);
   1041 	int     error = 0;
   1042 	int     part, pmask, s;
   1043 	cfdata_t cf;
   1044 	struct raid_softc *rs;
   1045 	RF_Config_t *k_cfg, *u_cfg;
   1046 	RF_Raid_t *raidPtr;
   1047 	RF_RaidDisk_t *diskPtr;
   1048 	RF_AccTotals_t *totals;
   1049 	RF_DeviceConfig_t *d_cfg, **ucfgp;
   1050 	u_char *specific_buf;
   1051 	int retcode = 0;
   1052 	int column;
   1053 /*	int raidid; */
   1054 	struct rf_recon_req *rrcopy, *rr;
   1055 	RF_ComponentLabel_t *clabel;
   1056 	RF_ComponentLabel_t *ci_label;
   1057 	RF_ComponentLabel_t **clabel_ptr;
   1058 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1059 	RF_SingleComponent_t component;
   1060 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
   1061 	int i, j, d;
   1062 #ifdef __HAVE_OLD_DISKLABEL
   1063 	struct disklabel newlabel;
   1064 #endif
   1065 
   1066 	if ((rs = raidget(unit)) == NULL)
   1067 		return ENXIO;
   1068 	raidPtr = &rs->sc_r;
   1069 
   1070 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1071 		(int) DISKPART(dev), (int) unit, cmd));
   1072 
   1073 	/* Must be open for writes for these commands... */
   1074 	switch (cmd) {
   1075 #ifdef DIOCGSECTORSIZE
   1076 	case DIOCGSECTORSIZE:
   1077 		*(u_int *)data = raidPtr->bytesPerSector;
   1078 		return 0;
   1079 	case DIOCGMEDIASIZE:
   1080 		*(off_t *)data =
   1081 		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
   1082 		return 0;
   1083 #endif
   1084 	case DIOCSDINFO:
   1085 	case DIOCWDINFO:
   1086 #ifdef __HAVE_OLD_DISKLABEL
   1087 	case ODIOCWDINFO:
   1088 	case ODIOCSDINFO:
   1089 #endif
   1090 	case DIOCWLABEL:
   1091 	case DIOCAWEDGE:
   1092 	case DIOCDWEDGE:
   1093 	case DIOCMWEDGES:
   1094 	case DIOCSSTRATEGY:
   1095 		if ((flag & FWRITE) == 0)
   1096 			return (EBADF);
   1097 	}
   1098 
   1099 	/* Must be initialized for these... */
   1100 	switch (cmd) {
   1101 	case DIOCGDINFO:
   1102 	case DIOCSDINFO:
   1103 	case DIOCWDINFO:
   1104 #ifdef __HAVE_OLD_DISKLABEL
   1105 	case ODIOCGDINFO:
   1106 	case ODIOCWDINFO:
   1107 	case ODIOCSDINFO:
   1108 	case ODIOCGDEFLABEL:
   1109 #endif
   1110 	case DIOCGPART:
   1111 	case DIOCWLABEL:
   1112 	case DIOCGDEFLABEL:
   1113 	case DIOCAWEDGE:
   1114 	case DIOCDWEDGE:
   1115 	case DIOCLWEDGES:
   1116 	case DIOCMWEDGES:
   1117 	case DIOCCACHESYNC:
   1118 	case RAIDFRAME_SHUTDOWN:
   1119 	case RAIDFRAME_REWRITEPARITY:
   1120 	case RAIDFRAME_GET_INFO:
   1121 	case RAIDFRAME_RESET_ACCTOTALS:
   1122 	case RAIDFRAME_GET_ACCTOTALS:
   1123 	case RAIDFRAME_KEEP_ACCTOTALS:
   1124 	case RAIDFRAME_GET_SIZE:
   1125 	case RAIDFRAME_FAIL_DISK:
   1126 	case RAIDFRAME_COPYBACK:
   1127 	case RAIDFRAME_CHECK_RECON_STATUS:
   1128 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1129 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1130 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1131 	case RAIDFRAME_ADD_HOT_SPARE:
   1132 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1133 	case RAIDFRAME_INIT_LABELS:
   1134 	case RAIDFRAME_REBUILD_IN_PLACE:
   1135 	case RAIDFRAME_CHECK_PARITY:
   1136 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1137 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1138 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1139 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1140 	case RAIDFRAME_SET_AUTOCONFIG:
   1141 	case RAIDFRAME_SET_ROOT:
   1142 	case RAIDFRAME_DELETE_COMPONENT:
   1143 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1144 	case RAIDFRAME_PARITYMAP_STATUS:
   1145 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1146 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1147 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1148 	case DIOCGSTRATEGY:
   1149 	case DIOCSSTRATEGY:
   1150 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1151 			return (ENXIO);
   1152 	}
   1153 
   1154 	switch (cmd) {
   1155 #ifdef COMPAT_50
   1156 	case RAIDFRAME_GET_INFO50:
   1157 		return rf_get_info50(raidPtr, data);
   1158 
   1159 	case RAIDFRAME_CONFIGURE50:
   1160 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1161 			return retcode;
   1162 		goto config;
   1163 #endif
   1164 		/* configure the system */
   1165 	case RAIDFRAME_CONFIGURE:
   1166 
   1167 		if (raidPtr->valid) {
   1168 			/* There is a valid RAID set running on this unit! */
   1169 			printf("raid%d: Device already configured!\n",unit);
   1170 			return(EINVAL);
   1171 		}
   1172 
   1173 		/* copy-in the configuration information */
   1174 		/* data points to a pointer to the configuration structure */
   1175 
   1176 		u_cfg = *((RF_Config_t **) data);
   1177 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1178 		if (k_cfg == NULL) {
   1179 			return (ENOMEM);
   1180 		}
   1181 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1182 		if (retcode) {
   1183 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1184 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1185 				retcode));
   1186 			return (retcode);
   1187 		}
   1188 		goto config;
   1189 	config:
   1190 		/* allocate a buffer for the layout-specific data, and copy it
   1191 		 * in */
   1192 		if (k_cfg->layoutSpecificSize) {
   1193 			if (k_cfg->layoutSpecificSize > 10000) {
   1194 				/* sanity check */
   1195 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1196 				return (EINVAL);
   1197 			}
   1198 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1199 			    (u_char *));
   1200 			if (specific_buf == NULL) {
   1201 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1202 				return (ENOMEM);
   1203 			}
   1204 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1205 			    k_cfg->layoutSpecificSize);
   1206 			if (retcode) {
   1207 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1208 				RF_Free(specific_buf,
   1209 					k_cfg->layoutSpecificSize);
   1210 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1211 					retcode));
   1212 				return (retcode);
   1213 			}
   1214 		} else
   1215 			specific_buf = NULL;
   1216 		k_cfg->layoutSpecific = specific_buf;
   1217 
   1218 		/* should do some kind of sanity check on the configuration.
   1219 		 * Store the sum of all the bytes in the last byte? */
   1220 
   1221 		/* configure the system */
   1222 
   1223 		/*
   1224 		 * Clear the entire RAID descriptor, just to make sure
   1225 		 *  there is no stale data left in the case of a
   1226 		 *  reconfiguration
   1227 		 */
   1228 		memset(raidPtr, 0, sizeof(*raidPtr));
   1229 		raidPtr->softc = rs;
   1230 		raidPtr->raidid = unit;
   1231 
   1232 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1233 
   1234 		if (retcode == 0) {
   1235 
   1236 			/* allow this many simultaneous IO's to
   1237 			   this RAID device */
   1238 			raidPtr->openings = RAIDOUTSTANDING;
   1239 
   1240 			raidinit(rs);
   1241 			rf_markalldirty(raidPtr);
   1242 		}
   1243 		/* free the buffers.  No return code here. */
   1244 		if (k_cfg->layoutSpecificSize) {
   1245 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1246 		}
   1247 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1248 
   1249 		return (retcode);
   1250 
   1251 		/* shutdown the system */
   1252 	case RAIDFRAME_SHUTDOWN:
   1253 
   1254 		part = DISKPART(dev);
   1255 		pmask = (1 << part);
   1256 
   1257 		if ((error = raidlock(rs)) != 0)
   1258 			return (error);
   1259 
   1260 		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
   1261 		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
   1262 			(rs->sc_dkdev.dk_copenmask & pmask)))
   1263 			retcode = EBUSY;
   1264 		else {
   1265 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1266 			rs->sc_dkdev.dk_copenmask &= ~pmask;
   1267 			rs->sc_dkdev.dk_bopenmask &= ~pmask;
   1268 			rs->sc_dkdev.dk_openmask &= ~pmask;
   1269 			retcode = 0;
   1270 		}
   1271 
   1272 		raidunlock(rs);
   1273 
   1274 		if (retcode != 0)
   1275 			return retcode;
   1276 
   1277 		/* free the pseudo device attach bits */
   1278 
   1279 		cf = device_cfdata(rs->sc_dev);
   1280 		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
   1281 			free(cf, M_RAIDFRAME);
   1282 
   1283 		return (retcode);
   1284 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1285 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1286 		/* need to read the component label for the disk indicated
   1287 		   by row,column in clabel */
   1288 
   1289 		/*
   1290 		 * Perhaps there should be an option to skip the in-core
   1291 		 * copy and hit the disk, as with disklabel(8).
   1292 		 */
   1293 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1294 
   1295 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1296 
   1297 		if (retcode) {
   1298 			RF_Free(clabel, sizeof(*clabel));
   1299 			return retcode;
   1300 		}
   1301 
   1302 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1303 
   1304 		column = clabel->column;
   1305 
   1306 		if ((column < 0) || (column >= raidPtr->numCol +
   1307 		    raidPtr->numSpare)) {
   1308 			RF_Free(clabel, sizeof(*clabel));
   1309 			return EINVAL;
   1310 		}
   1311 
   1312 		RF_Free(clabel, sizeof(*clabel));
   1313 
   1314 		clabel = raidget_component_label(raidPtr, column);
   1315 
   1316 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1317 
   1318 #if 0
   1319 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1320 		clabel = (RF_ComponentLabel_t *) data;
   1321 
   1322 		/* XXX check the label for valid stuff... */
   1323 		/* Note that some things *should not* get modified --
   1324 		   the user should be re-initing the labels instead of
   1325 		   trying to patch things.
   1326 		   */
   1327 
   1328 		raidid = raidPtr->raidid;
   1329 #ifdef DEBUG
   1330 		printf("raid%d: Got component label:\n", raidid);
   1331 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1332 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1333 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1334 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1335 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1336 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1337 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1338 #endif
   1339 		clabel->row = 0;
   1340 		column = clabel->column;
   1341 
   1342 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1343 			return(EINVAL);
   1344 		}
   1345 
   1346 		/* XXX this isn't allowed to do anything for now :-) */
   1347 
   1348 		/* XXX and before it is, we need to fill in the rest
   1349 		   of the fields!?!?!?! */
   1350 		memcpy(raidget_component_label(raidPtr, column),
   1351 		    clabel, sizeof(*clabel));
   1352 		raidflush_component_label(raidPtr, column);
   1353 		return (0);
   1354 #endif
   1355 
   1356 	case RAIDFRAME_INIT_LABELS:
   1357 		clabel = (RF_ComponentLabel_t *) data;
   1358 		/*
   1359 		   we only want the serial number from
   1360 		   the above.  We get all the rest of the information
   1361 		   from the config that was used to create this RAID
   1362 		   set.
   1363 		   */
   1364 
   1365 		raidPtr->serial_number = clabel->serial_number;
   1366 
   1367 		for(column=0;column<raidPtr->numCol;column++) {
   1368 			diskPtr = &raidPtr->Disks[column];
   1369 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1370 				ci_label = raidget_component_label(raidPtr,
   1371 				    column);
   1372 				/* Zeroing this is important. */
   1373 				memset(ci_label, 0, sizeof(*ci_label));
   1374 				raid_init_component_label(raidPtr, ci_label);
   1375 				ci_label->serial_number =
   1376 				    raidPtr->serial_number;
   1377 				ci_label->row = 0; /* we dont' pretend to support more */
   1378 				rf_component_label_set_partitionsize(ci_label,
   1379 				    diskPtr->partitionSize);
   1380 				ci_label->column = column;
   1381 				raidflush_component_label(raidPtr, column);
   1382 			}
   1383 			/* XXXjld what about the spares? */
   1384 		}
   1385 
   1386 		return (retcode);
   1387 	case RAIDFRAME_SET_AUTOCONFIG:
   1388 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1389 		printf("raid%d: New autoconfig value is: %d\n",
   1390 		       raidPtr->raidid, d);
   1391 		*(int *) data = d;
   1392 		return (retcode);
   1393 
   1394 	case RAIDFRAME_SET_ROOT:
   1395 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1396 		printf("raid%d: New rootpartition value is: %d\n",
   1397 		       raidPtr->raidid, d);
   1398 		*(int *) data = d;
   1399 		return (retcode);
   1400 
   1401 		/* initialize all parity */
   1402 	case RAIDFRAME_REWRITEPARITY:
   1403 
   1404 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1405 			/* Parity for RAID 0 is trivially correct */
   1406 			raidPtr->parity_good = RF_RAID_CLEAN;
   1407 			return(0);
   1408 		}
   1409 
   1410 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1411 			/* Re-write is already in progress! */
   1412 			return(EINVAL);
   1413 		}
   1414 
   1415 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1416 					   rf_RewriteParityThread,
   1417 					   raidPtr,"raid_parity");
   1418 		return (retcode);
   1419 
   1420 
   1421 	case RAIDFRAME_ADD_HOT_SPARE:
   1422 		sparePtr = (RF_SingleComponent_t *) data;
   1423 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1424 		retcode = rf_add_hot_spare(raidPtr, &component);
   1425 		return(retcode);
   1426 
   1427 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1428 		return(retcode);
   1429 
   1430 	case RAIDFRAME_DELETE_COMPONENT:
   1431 		componentPtr = (RF_SingleComponent_t *)data;
   1432 		memcpy( &component, componentPtr,
   1433 			sizeof(RF_SingleComponent_t));
   1434 		retcode = rf_delete_component(raidPtr, &component);
   1435 		return(retcode);
   1436 
   1437 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1438 		componentPtr = (RF_SingleComponent_t *)data;
   1439 		memcpy( &component, componentPtr,
   1440 			sizeof(RF_SingleComponent_t));
   1441 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1442 		return(retcode);
   1443 
   1444 	case RAIDFRAME_REBUILD_IN_PLACE:
   1445 
   1446 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1447 			/* Can't do this on a RAID 0!! */
   1448 			return(EINVAL);
   1449 		}
   1450 
   1451 		if (raidPtr->recon_in_progress == 1) {
   1452 			/* a reconstruct is already in progress! */
   1453 			return(EINVAL);
   1454 		}
   1455 
   1456 		componentPtr = (RF_SingleComponent_t *) data;
   1457 		memcpy( &component, componentPtr,
   1458 			sizeof(RF_SingleComponent_t));
   1459 		component.row = 0; /* we don't support any more */
   1460 		column = component.column;
   1461 
   1462 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1463 			return(EINVAL);
   1464 		}
   1465 
   1466 		rf_lock_mutex2(raidPtr->mutex);
   1467 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1468 		    (raidPtr->numFailures > 0)) {
   1469 			/* XXX 0 above shouldn't be constant!!! */
   1470 			/* some component other than this has failed.
   1471 			   Let's not make things worse than they already
   1472 			   are... */
   1473 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1474 			       raidPtr->raidid);
   1475 			printf("raid%d:     Col: %d   Too many failures.\n",
   1476 			       raidPtr->raidid, column);
   1477 			rf_unlock_mutex2(raidPtr->mutex);
   1478 			return (EINVAL);
   1479 		}
   1480 		if (raidPtr->Disks[column].status ==
   1481 		    rf_ds_reconstructing) {
   1482 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1483 			       raidPtr->raidid);
   1484 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1485 
   1486 			rf_unlock_mutex2(raidPtr->mutex);
   1487 			return (EINVAL);
   1488 		}
   1489 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1490 			rf_unlock_mutex2(raidPtr->mutex);
   1491 			return (EINVAL);
   1492 		}
   1493 		rf_unlock_mutex2(raidPtr->mutex);
   1494 
   1495 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1496 		if (rrcopy == NULL)
   1497 			return(ENOMEM);
   1498 
   1499 		rrcopy->raidPtr = (void *) raidPtr;
   1500 		rrcopy->col = column;
   1501 
   1502 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1503 					   rf_ReconstructInPlaceThread,
   1504 					   rrcopy,"raid_reconip");
   1505 		return(retcode);
   1506 
   1507 	case RAIDFRAME_GET_INFO:
   1508 		if (!raidPtr->valid)
   1509 			return (ENODEV);
   1510 		ucfgp = (RF_DeviceConfig_t **) data;
   1511 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1512 			  (RF_DeviceConfig_t *));
   1513 		if (d_cfg == NULL)
   1514 			return (ENOMEM);
   1515 		d_cfg->rows = 1; /* there is only 1 row now */
   1516 		d_cfg->cols = raidPtr->numCol;
   1517 		d_cfg->ndevs = raidPtr->numCol;
   1518 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1519 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1520 			return (ENOMEM);
   1521 		}
   1522 		d_cfg->nspares = raidPtr->numSpare;
   1523 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1524 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1525 			return (ENOMEM);
   1526 		}
   1527 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1528 		d = 0;
   1529 		for (j = 0; j < d_cfg->cols; j++) {
   1530 			d_cfg->devs[d] = raidPtr->Disks[j];
   1531 			d++;
   1532 		}
   1533 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1534 			d_cfg->spares[i] = raidPtr->Disks[j];
   1535 			if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
   1536 				/* XXX: raidctl(8) expects to see this as a used spare */
   1537 				d_cfg->spares[i].status = rf_ds_used_spare;
   1538 			}
   1539 		}
   1540 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1541 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1542 
   1543 		return (retcode);
   1544 
   1545 	case RAIDFRAME_CHECK_PARITY:
   1546 		*(int *) data = raidPtr->parity_good;
   1547 		return (0);
   1548 
   1549 	case RAIDFRAME_PARITYMAP_STATUS:
   1550 		if (rf_paritymap_ineligible(raidPtr))
   1551 			return EINVAL;
   1552 		rf_paritymap_status(raidPtr->parity_map,
   1553 		    (struct rf_pmstat *)data);
   1554 		return 0;
   1555 
   1556 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1557 		if (rf_paritymap_ineligible(raidPtr))
   1558 			return EINVAL;
   1559 		if (raidPtr->parity_map == NULL)
   1560 			return ENOENT; /* ??? */
   1561 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1562 			(struct rf_pmparams *)data, 1))
   1563 			return EINVAL;
   1564 		return 0;
   1565 
   1566 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1567 		if (rf_paritymap_ineligible(raidPtr))
   1568 			return EINVAL;
   1569 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1570 		return 0;
   1571 
   1572 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1573 		if (rf_paritymap_ineligible(raidPtr))
   1574 			return EINVAL;
   1575 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1576 		/* XXX should errors be passed up? */
   1577 		return 0;
   1578 
   1579 	case RAIDFRAME_RESET_ACCTOTALS:
   1580 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1581 		return (0);
   1582 
   1583 	case RAIDFRAME_GET_ACCTOTALS:
   1584 		totals = (RF_AccTotals_t *) data;
   1585 		*totals = raidPtr->acc_totals;
   1586 		return (0);
   1587 
   1588 	case RAIDFRAME_KEEP_ACCTOTALS:
   1589 		raidPtr->keep_acc_totals = *(int *)data;
   1590 		return (0);
   1591 
   1592 	case RAIDFRAME_GET_SIZE:
   1593 		*(int *) data = raidPtr->totalSectors;
   1594 		return (0);
   1595 
   1596 		/* fail a disk & optionally start reconstruction */
   1597 	case RAIDFRAME_FAIL_DISK:
   1598 
   1599 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1600 			/* Can't do this on a RAID 0!! */
   1601 			return(EINVAL);
   1602 		}
   1603 
   1604 		rr = (struct rf_recon_req *) data;
   1605 		rr->row = 0;
   1606 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1607 			return (EINVAL);
   1608 
   1609 
   1610 		rf_lock_mutex2(raidPtr->mutex);
   1611 		if (raidPtr->status == rf_rs_reconstructing) {
   1612 			/* you can't fail a disk while we're reconstructing! */
   1613 			/* XXX wrong for RAID6 */
   1614 			rf_unlock_mutex2(raidPtr->mutex);
   1615 			return (EINVAL);
   1616 		}
   1617 		if ((raidPtr->Disks[rr->col].status ==
   1618 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1619 			/* some other component has failed.  Let's not make
   1620 			   things worse. XXX wrong for RAID6 */
   1621 			rf_unlock_mutex2(raidPtr->mutex);
   1622 			return (EINVAL);
   1623 		}
   1624 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1625 			/* Can't fail a spared disk! */
   1626 			rf_unlock_mutex2(raidPtr->mutex);
   1627 			return (EINVAL);
   1628 		}
   1629 		rf_unlock_mutex2(raidPtr->mutex);
   1630 
   1631 		/* make a copy of the recon request so that we don't rely on
   1632 		 * the user's buffer */
   1633 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1634 		if (rrcopy == NULL)
   1635 			return(ENOMEM);
   1636 		memcpy(rrcopy, rr, sizeof(*rr));
   1637 		rrcopy->raidPtr = (void *) raidPtr;
   1638 
   1639 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1640 					   rf_ReconThread,
   1641 					   rrcopy,"raid_recon");
   1642 		return (0);
   1643 
   1644 		/* invoke a copyback operation after recon on whatever disk
   1645 		 * needs it, if any */
   1646 	case RAIDFRAME_COPYBACK:
   1647 
   1648 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1649 			/* This makes no sense on a RAID 0!! */
   1650 			return(EINVAL);
   1651 		}
   1652 
   1653 		if (raidPtr->copyback_in_progress == 1) {
   1654 			/* Copyback is already in progress! */
   1655 			return(EINVAL);
   1656 		}
   1657 
   1658 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1659 					   rf_CopybackThread,
   1660 					   raidPtr,"raid_copyback");
   1661 		return (retcode);
   1662 
   1663 		/* return the percentage completion of reconstruction */
   1664 	case RAIDFRAME_CHECK_RECON_STATUS:
   1665 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1666 			/* This makes no sense on a RAID 0, so tell the
   1667 			   user it's done. */
   1668 			*(int *) data = 100;
   1669 			return(0);
   1670 		}
   1671 		if (raidPtr->status != rf_rs_reconstructing)
   1672 			*(int *) data = 100;
   1673 		else {
   1674 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1675 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1676 			} else {
   1677 				*(int *) data = 0;
   1678 			}
   1679 		}
   1680 		return (0);
   1681 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1682 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1683 		if (raidPtr->status != rf_rs_reconstructing) {
   1684 			progressInfo.remaining = 0;
   1685 			progressInfo.completed = 100;
   1686 			progressInfo.total = 100;
   1687 		} else {
   1688 			progressInfo.total =
   1689 				raidPtr->reconControl->numRUsTotal;
   1690 			progressInfo.completed =
   1691 				raidPtr->reconControl->numRUsComplete;
   1692 			progressInfo.remaining = progressInfo.total -
   1693 				progressInfo.completed;
   1694 		}
   1695 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1696 				  sizeof(RF_ProgressInfo_t));
   1697 		return (retcode);
   1698 
   1699 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1700 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1701 			/* This makes no sense on a RAID 0, so tell the
   1702 			   user it's done. */
   1703 			*(int *) data = 100;
   1704 			return(0);
   1705 		}
   1706 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1707 			*(int *) data = 100 *
   1708 				raidPtr->parity_rewrite_stripes_done /
   1709 				raidPtr->Layout.numStripe;
   1710 		} else {
   1711 			*(int *) data = 100;
   1712 		}
   1713 		return (0);
   1714 
   1715 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1716 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1717 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1718 			progressInfo.total = raidPtr->Layout.numStripe;
   1719 			progressInfo.completed =
   1720 				raidPtr->parity_rewrite_stripes_done;
   1721 			progressInfo.remaining = progressInfo.total -
   1722 				progressInfo.completed;
   1723 		} else {
   1724 			progressInfo.remaining = 0;
   1725 			progressInfo.completed = 100;
   1726 			progressInfo.total = 100;
   1727 		}
   1728 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1729 				  sizeof(RF_ProgressInfo_t));
   1730 		return (retcode);
   1731 
   1732 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1733 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1734 			/* This makes no sense on a RAID 0 */
   1735 			*(int *) data = 100;
   1736 			return(0);
   1737 		}
   1738 		if (raidPtr->copyback_in_progress == 1) {
   1739 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1740 				raidPtr->Layout.numStripe;
   1741 		} else {
   1742 			*(int *) data = 100;
   1743 		}
   1744 		return (0);
   1745 
   1746 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1747 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1748 		if (raidPtr->copyback_in_progress == 1) {
   1749 			progressInfo.total = raidPtr->Layout.numStripe;
   1750 			progressInfo.completed =
   1751 				raidPtr->copyback_stripes_done;
   1752 			progressInfo.remaining = progressInfo.total -
   1753 				progressInfo.completed;
   1754 		} else {
   1755 			progressInfo.remaining = 0;
   1756 			progressInfo.completed = 100;
   1757 			progressInfo.total = 100;
   1758 		}
   1759 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1760 				  sizeof(RF_ProgressInfo_t));
   1761 		return (retcode);
   1762 
   1763 		/* the sparetable daemon calls this to wait for the kernel to
   1764 		 * need a spare table. this ioctl does not return until a
   1765 		 * spare table is needed. XXX -- calling mpsleep here in the
   1766 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1767 		 * -- I should either compute the spare table in the kernel,
   1768 		 * or have a different -- XXX XXX -- interface (a different
   1769 		 * character device) for delivering the table     -- XXX */
   1770 #if 0
   1771 	case RAIDFRAME_SPARET_WAIT:
   1772 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1773 		while (!rf_sparet_wait_queue)
   1774 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1775 		waitreq = rf_sparet_wait_queue;
   1776 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1777 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1778 
   1779 		/* structure assignment */
   1780 		*((RF_SparetWait_t *) data) = *waitreq;
   1781 
   1782 		RF_Free(waitreq, sizeof(*waitreq));
   1783 		return (0);
   1784 
   1785 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1786 		 * code in it that will cause the dameon to exit */
   1787 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1788 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1789 		waitreq->fcol = -1;
   1790 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1791 		waitreq->next = rf_sparet_wait_queue;
   1792 		rf_sparet_wait_queue = waitreq;
   1793 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1794 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1795 		return (0);
   1796 
   1797 		/* used by the spare table daemon to deliver a spare table
   1798 		 * into the kernel */
   1799 	case RAIDFRAME_SEND_SPARET:
   1800 
   1801 		/* install the spare table */
   1802 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1803 
   1804 		/* respond to the requestor.  the return status of the spare
   1805 		 * table installation is passed in the "fcol" field */
   1806 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1807 		waitreq->fcol = retcode;
   1808 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1809 		waitreq->next = rf_sparet_resp_queue;
   1810 		rf_sparet_resp_queue = waitreq;
   1811 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1812 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1813 
   1814 		return (retcode);
   1815 #endif
   1816 
   1817 	default:
   1818 		break; /* fall through to the os-specific code below */
   1819 
   1820 	}
   1821 
   1822 	if (!raidPtr->valid)
   1823 		return (EINVAL);
   1824 
   1825 	/*
   1826 	 * Add support for "regular" device ioctls here.
   1827 	 */
   1828 
   1829 	error = disk_ioctl(&rs->sc_dkdev, dev, cmd, data, flag, l);
   1830 	if (error != EPASSTHROUGH)
   1831 		return (error);
   1832 
   1833 	switch (cmd) {
   1834 	case DIOCWDINFO:
   1835 	case DIOCSDINFO:
   1836 #ifdef __HAVE_OLD_DISKLABEL
   1837 	case ODIOCWDINFO:
   1838 	case ODIOCSDINFO:
   1839 #endif
   1840 	{
   1841 		struct disklabel *lp;
   1842 #ifdef __HAVE_OLD_DISKLABEL
   1843 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
   1844 			memset(&newlabel, 0, sizeof newlabel);
   1845 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
   1846 			lp = &newlabel;
   1847 		} else
   1848 #endif
   1849 		lp = (struct disklabel *)data;
   1850 
   1851 		if ((error = raidlock(rs)) != 0)
   1852 			return (error);
   1853 
   1854 		rs->sc_flags |= RAIDF_LABELLING;
   1855 
   1856 		error = setdisklabel(rs->sc_dkdev.dk_label,
   1857 		    lp, 0, rs->sc_dkdev.dk_cpulabel);
   1858 		if (error == 0) {
   1859 			if (cmd == DIOCWDINFO
   1860 #ifdef __HAVE_OLD_DISKLABEL
   1861 			    || cmd == ODIOCWDINFO
   1862 #endif
   1863 			   )
   1864 				error = writedisklabel(RAIDLABELDEV(dev),
   1865 				    raidstrategy, rs->sc_dkdev.dk_label,
   1866 				    rs->sc_dkdev.dk_cpulabel);
   1867 		}
   1868 		rs->sc_flags &= ~RAIDF_LABELLING;
   1869 
   1870 		raidunlock(rs);
   1871 
   1872 		if (error)
   1873 			return (error);
   1874 		break;
   1875 	}
   1876 
   1877 	case DIOCWLABEL:
   1878 		if (*(int *) data != 0)
   1879 			rs->sc_flags |= RAIDF_WLABEL;
   1880 		else
   1881 			rs->sc_flags &= ~RAIDF_WLABEL;
   1882 		break;
   1883 
   1884 	case DIOCGDEFLABEL:
   1885 		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
   1886 		break;
   1887 
   1888 #ifdef __HAVE_OLD_DISKLABEL
   1889 	case ODIOCGDEFLABEL:
   1890 		raidgetdefaultlabel(raidPtr, rs, &newlabel);
   1891 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1892 			return ENOTTY;
   1893 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1894 		break;
   1895 #endif
   1896 
   1897 	case DIOCCACHESYNC:
   1898 		return rf_sync_component_caches(raidPtr);
   1899 
   1900 	case DIOCGSTRATEGY:
   1901 	    {
   1902 		struct disk_strategy *dks = (void *)data;
   1903 
   1904 		s = splbio();
   1905 		strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
   1906 		    sizeof(dks->dks_name));
   1907 		splx(s);
   1908 		dks->dks_paramlen = 0;
   1909 
   1910 		return 0;
   1911 	    }
   1912 
   1913 	case DIOCSSTRATEGY:
   1914 	    {
   1915 		struct disk_strategy *dks = (void *)data;
   1916 		struct bufq_state *new;
   1917 		struct bufq_state *old;
   1918 
   1919 		if (dks->dks_param != NULL) {
   1920 			return EINVAL;
   1921 		}
   1922 		dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
   1923 		error = bufq_alloc(&new, dks->dks_name,
   1924 		    BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
   1925 		if (error) {
   1926 			return error;
   1927 		}
   1928 		s = splbio();
   1929 		old = rs->buf_queue;
   1930 		bufq_move(new, old);
   1931 		rs->buf_queue = new;
   1932 		splx(s);
   1933 		bufq_free(old);
   1934 
   1935 		return 0;
   1936 	    }
   1937 
   1938 	default:
   1939 		retcode = ENOTTY;
   1940 	}
   1941 	return (retcode);
   1942 
   1943 }
   1944 
   1945 
   1946 /* raidinit -- complete the rest of the initialization for the
   1947    RAIDframe device.  */
   1948 
   1949 
   1950 static void
   1951 raidinit(struct raid_softc *rs)
   1952 {
   1953 	cfdata_t cf;
   1954 	int     unit;
   1955 	RF_Raid_t *raidPtr = &rs->sc_r;
   1956 
   1957 	unit = raidPtr->raidid;
   1958 
   1959 
   1960 	/* XXX should check return code first... */
   1961 	rs->sc_flags |= RAIDF_INITED;
   1962 
   1963 	/* XXX doesn't check bounds. */
   1964 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
   1965 
   1966 	/* attach the pseudo device */
   1967 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
   1968 	cf->cf_name = raid_cd.cd_name;
   1969 	cf->cf_atname = raid_cd.cd_name;
   1970 	cf->cf_unit = unit;
   1971 	cf->cf_fstate = FSTATE_STAR;
   1972 
   1973 	rs->sc_dev = config_attach_pseudo(cf);
   1974 
   1975 	if (rs->sc_dev == NULL) {
   1976 		printf("raid%d: config_attach_pseudo failed\n",
   1977 		    raidPtr->raidid);
   1978 		rs->sc_flags &= ~RAIDF_INITED;
   1979 		free(cf, M_RAIDFRAME);
   1980 		return;
   1981 	}
   1982 
   1983 	/* disk_attach actually creates space for the CPU disklabel, among
   1984 	 * other things, so it's critical to call this *BEFORE* we try putzing
   1985 	 * with disklabels. */
   1986 
   1987 	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
   1988 	disk_attach(&rs->sc_dkdev);
   1989 
   1990 	/* XXX There may be a weird interaction here between this, and
   1991 	 * protectedSectors, as used in RAIDframe.  */
   1992 
   1993 	rs->sc_size = raidPtr->totalSectors;
   1994 
   1995 	rf_set_geometry(rs, raidPtr);
   1996 
   1997 	dkwedge_discover(&rs->sc_dkdev);
   1998 
   1999 }
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/*
 * Wake up the daemon & tell it to get us a spare table.
 *
 * The request is pushed onto rf_sparet_wait_queue, the waiters on
 * rf_sparet_wait_cv are woken, and the caller then waits until an entry
 * appears on rf_sparet_resp_queue.  The "fcol" field of that response is
 * returned as the status, and the response entry is freed here.
 *
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	RF_SparetWait_t *resp;
	int     status;

	rf_lock_mutex2(rf_sparet_wait_mutex);

	/* queue our request at the head and announce it */
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* wait (on the same mutex) until a response has been queued */
	while (rf_sparet_resp_queue == NULL)
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);

	/* detach the first response */
	resp = rf_sparet_resp_queue;
	rf_sparet_resp_queue = resp->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	status = resp->fcol;
	/* the response is a different entry from the request passed in */
	RF_Free(resp, sizeof(*resp));
	return (status);
}
#endif
   2034 
   2035 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   2036  * bp & passes it down.
   2037  * any calls originating in the kernel must use non-blocking I/O
   2038  * do some extra sanity checking to return "appropriate" error values for
   2039  * certain conditions (to make some standard utilities work)
   2040  *
   2041  * Formerly known as: rf_DoAccessKernel
   2042  */
   2043 void
   2044 raidstart(RF_Raid_t *raidPtr)
   2045 {
   2046 	RF_SectorCount_t num_blocks, pb, sum;
   2047 	RF_RaidAddr_t raid_addr;
   2048 	struct partition *pp;
   2049 	daddr_t blocknum;
   2050 	struct raid_softc *rs;
   2051 	int     do_async;
   2052 	struct buf *bp;
   2053 	int rc;
   2054 
   2055 	rs = raidPtr->softc;
   2056 	/* quick check to see if anything has died recently */
   2057 	rf_lock_mutex2(raidPtr->mutex);
   2058 	if (raidPtr->numNewFailures > 0) {
   2059 		rf_unlock_mutex2(raidPtr->mutex);
   2060 		rf_update_component_labels(raidPtr,
   2061 					   RF_NORMAL_COMPONENT_UPDATE);
   2062 		rf_lock_mutex2(raidPtr->mutex);
   2063 		raidPtr->numNewFailures--;
   2064 	}
   2065 
   2066 	/* Check to see if we're at the limit... */
   2067 	while (raidPtr->openings > 0) {
   2068 		rf_unlock_mutex2(raidPtr->mutex);
   2069 
   2070 		/* get the next item, if any, from the queue */
   2071 		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
   2072 			/* nothing more to do */
   2073 			return;
   2074 		}
   2075 
   2076 		/* Ok, for the bp we have here, bp->b_blkno is relative to the
   2077 		 * partition.. Need to make it absolute to the underlying
   2078 		 * device.. */
   2079 
   2080 		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
   2081 		if (DISKPART(bp->b_dev) != RAW_PART) {
   2082 			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
   2083 			blocknum += pp->p_offset;
   2084 		}
   2085 
   2086 		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   2087 			    (int) blocknum));
   2088 
   2089 		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   2090 		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   2091 
   2092 		/* *THIS* is where we adjust what block we're going to...
   2093 		 * but DO NOT TOUCH bp->b_blkno!!! */
   2094 		raid_addr = blocknum;
   2095 
   2096 		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   2097 		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   2098 		sum = raid_addr + num_blocks + pb;
   2099 		if (1 || rf_debugKernelAccess) {
   2100 			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   2101 				    (int) raid_addr, (int) sum, (int) num_blocks,
   2102 				    (int) pb, (int) bp->b_resid));
   2103 		}
   2104 		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   2105 		    || (sum < num_blocks) || (sum < pb)) {
   2106 			bp->b_error = ENOSPC;
   2107 			bp->b_resid = bp->b_bcount;
   2108 			biodone(bp);
   2109 			rf_lock_mutex2(raidPtr->mutex);
   2110 			continue;
   2111 		}
   2112 		/*
   2113 		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   2114 		 */
   2115 
   2116 		if (bp->b_bcount & raidPtr->sectorMask) {
   2117 			bp->b_error = EINVAL;
   2118 			bp->b_resid = bp->b_bcount;
   2119 			biodone(bp);
   2120 			rf_lock_mutex2(raidPtr->mutex);
   2121 			continue;
   2122 
   2123 		}
   2124 		db1_printf(("Calling DoAccess..\n"));
   2125 
   2126 
   2127 		rf_lock_mutex2(raidPtr->mutex);
   2128 		raidPtr->openings--;
   2129 		rf_unlock_mutex2(raidPtr->mutex);
   2130 
   2131 		/*
   2132 		 * Everything is async.
   2133 		 */
   2134 		do_async = 1;
   2135 
   2136 		disk_busy(&rs->sc_dkdev);
   2137 
   2138 		/* XXX we're still at splbio() here... do we *really*
   2139 		   need to be? */
   2140 
   2141 		/* don't ever condition on bp->b_flags & B_WRITE.
   2142 		 * always condition on B_READ instead */
   2143 
   2144 		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   2145 				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   2146 				 do_async, raid_addr, num_blocks,
   2147 				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   2148 
   2149 		if (rc) {
   2150 			bp->b_error = rc;
   2151 			bp->b_resid = bp->b_bcount;
   2152 			biodone(bp);
   2153 			/* continue loop */
   2154 		}
   2155 
   2156 		rf_lock_mutex2(raidPtr->mutex);
   2157 	}
   2158 	rf_unlock_mutex2(raidPtr->mutex);
   2159 }
   2160 
   2161 
   2162 
   2163 
   2164 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2165 
   2166 int
   2167 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2168 {
   2169 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2170 	struct buf *bp;
   2171 
   2172 	req->queue = queue;
   2173 	bp = req->bp;
   2174 
   2175 	switch (req->type) {
   2176 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2177 		/* XXX need to do something extra here.. */
   2178 		/* I'm leaving this in, as I've never actually seen it used,
   2179 		 * and I'd like folks to report it... GO */
   2180 		printf(("WAKEUP CALLED\n"));
   2181 		queue->numOutstanding++;
   2182 
   2183 		bp->b_flags = 0;
   2184 		bp->b_private = req;
   2185 
   2186 		KernelWakeupFunc(bp);
   2187 		break;
   2188 
   2189 	case RF_IO_TYPE_READ:
   2190 	case RF_IO_TYPE_WRITE:
   2191 #if RF_ACC_TRACE > 0
   2192 		if (req->tracerec) {
   2193 			RF_ETIMER_START(req->tracerec->timer);
   2194 		}
   2195 #endif
   2196 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2197 		    op, queue->rf_cinfo->ci_dev,
   2198 		    req->sectorOffset, req->numSector,
   2199 		    req->buf, KernelWakeupFunc, (void *) req,
   2200 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2201 
   2202 		if (rf_debugKernelAccess) {
   2203 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2204 				(long) bp->b_blkno));
   2205 		}
   2206 		queue->numOutstanding++;
   2207 		queue->last_deq_sector = req->sectorOffset;
   2208 		/* acc wouldn't have been let in if there were any pending
   2209 		 * reqs at any other priority */
   2210 		queue->curPriority = req->priority;
   2211 
   2212 		db1_printf(("Going for %c to unit %d col %d\n",
   2213 			    req->type, queue->raidPtr->raidid,
   2214 			    queue->col));
   2215 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2216 			(int) req->sectorOffset, (int) req->numSector,
   2217 			(int) (req->numSector <<
   2218 			    queue->raidPtr->logBytesPerSector),
   2219 			(int) queue->raidPtr->logBytesPerSector));
   2220 
   2221 		/*
   2222 		 * XXX: drop lock here since this can block at
   2223 		 * least with backing SCSI devices.  Retake it
   2224 		 * to minimize fuss with calling interfaces.
   2225 		 */
   2226 
   2227 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2228 		bdev_strategy(bp);
   2229 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2230 		break;
   2231 
   2232 	default:
   2233 		panic("bad req->type in rf_DispatchKernelIO");
   2234 	}
   2235 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2236 
   2237 	return (0);
   2238 }
   2239 /* this is the callback function associated with a I/O invoked from
   2240    kernel code.
   2241  */
   2242 static void
   2243 KernelWakeupFunc(struct buf *bp)
   2244 {
   2245 	RF_DiskQueueData_t *req = NULL;
   2246 	RF_DiskQueue_t *queue;
   2247 
   2248 	db1_printf(("recovering the request queue:\n"));
   2249 
   2250 	req = bp->b_private;
   2251 
   2252 	queue = (RF_DiskQueue_t *) req->queue;
   2253 
   2254 	rf_lock_mutex2(queue->raidPtr->iodone_lock);
   2255 
   2256 #if RF_ACC_TRACE > 0
   2257 	if (req->tracerec) {
   2258 		RF_ETIMER_STOP(req->tracerec->timer);
   2259 		RF_ETIMER_EVAL(req->tracerec->timer);
   2260 		rf_lock_mutex2(rf_tracing_mutex);
   2261 		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2262 		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2263 		req->tracerec->num_phys_ios++;
   2264 		rf_unlock_mutex2(rf_tracing_mutex);
   2265 	}
   2266 #endif
   2267 
   2268 	/* XXX Ok, let's get aggressive... If b_error is set, let's go
   2269 	 * ballistic, and mark the component as hosed... */
   2270 
   2271 	if (bp->b_error != 0) {
   2272 		/* Mark the disk as dead */
   2273 		/* but only mark it once... */
   2274 		/* and only if it wouldn't leave this RAID set
   2275 		   completely broken */
   2276 		if (((queue->raidPtr->Disks[queue->col].status ==
   2277 		      rf_ds_optimal) ||
   2278 		     (queue->raidPtr->Disks[queue->col].status ==
   2279 		      rf_ds_used_spare)) &&
   2280 		     (queue->raidPtr->numFailures <
   2281 		      queue->raidPtr->Layout.map->faultsTolerated)) {
   2282 			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
   2283 			       queue->raidPtr->raidid,
   2284 			       bp->b_error,
   2285 			       queue->raidPtr->Disks[queue->col].devname);
   2286 			queue->raidPtr->Disks[queue->col].status =
   2287 			    rf_ds_failed;
   2288 			queue->raidPtr->status = rf_rs_degraded;
   2289 			queue->raidPtr->numFailures++;
   2290 			queue->raidPtr->numNewFailures++;
   2291 		} else {	/* Disk is already dead... */
   2292 			/* printf("Disk already marked as dead!\n"); */
   2293 		}
   2294 
   2295 	}
   2296 
   2297 	/* Fill in the error value */
   2298 	req->error = bp->b_error;
   2299 
   2300 	/* Drop this one on the "finished" queue... */
   2301 	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
   2302 
   2303 	/* Let the raidio thread know there is work to be done. */
   2304 	rf_signal_cond2(queue->raidPtr->iodone_cv);
   2305 
   2306 	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
   2307 }
   2308 
   2309 
   2310 /*
   2311  * initialize a buf structure for doing an I/O in the kernel.
   2312  */
   2313 static void
   2314 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2315        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2316        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2317        struct proc *b_proc)
   2318 {
   2319 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2320 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2321 	bp->b_oflags = 0;
   2322 	bp->b_cflags = 0;
   2323 	bp->b_bcount = numSect << logBytesPerSector;
   2324 	bp->b_bufsize = bp->b_bcount;
   2325 	bp->b_error = 0;
   2326 	bp->b_dev = dev;
   2327 	bp->b_data = bf;
   2328 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2329 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2330 	if (bp->b_bcount == 0) {
   2331 		panic("bp->b_bcount is zero in InitBP!!");
   2332 	}
   2333 	bp->b_proc = b_proc;
   2334 	bp->b_iodone = cbFunc;
   2335 	bp->b_private = cbArg;
   2336 }
   2337 
   2338 static void
   2339 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
   2340 		    struct disklabel *lp)
   2341 {
   2342 	memset(lp, 0, sizeof(*lp));
   2343 
   2344 	/* fabricate a label... */
   2345 	if (raidPtr->totalSectors > UINT32_MAX)
   2346 		lp->d_secperunit = UINT32_MAX;
   2347 	else
   2348 		lp->d_secperunit = raidPtr->totalSectors;
   2349 	lp->d_secsize = raidPtr->bytesPerSector;
   2350 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   2351 	lp->d_ntracks = 4 * raidPtr->numCol;
   2352 	lp->d_ncylinders = raidPtr->totalSectors /
   2353 		(lp->d_nsectors * lp->d_ntracks);
   2354 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
   2355 
   2356 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
   2357 	lp->d_type = DKTYPE_RAID;
   2358 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
   2359 	lp->d_rpm = 3600;
   2360 	lp->d_interleave = 1;
   2361 	lp->d_flags = 0;
   2362 
   2363 	lp->d_partitions[RAW_PART].p_offset = 0;
   2364 	lp->d_partitions[RAW_PART].p_size = lp->d_secperunit;
   2365 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
   2366 	lp->d_npartitions = RAW_PART + 1;
   2367 
   2368 	lp->d_magic = DISKMAGIC;
   2369 	lp->d_magic2 = DISKMAGIC;
   2370 	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
   2371 
   2372 }
   2373 /*
   2374  * Read the disklabel from the raid device.  If one is not present, fake one
   2375  * up.
   2376  */
   2377 static void
   2378 raidgetdisklabel(dev_t dev)
   2379 {
   2380 	int     unit = raidunit(dev);
   2381 	struct raid_softc *rs;
   2382 	const char   *errstring;
   2383 	struct disklabel *lp;
   2384 	struct cpu_disklabel *clp;
   2385 	RF_Raid_t *raidPtr;
   2386 
   2387 	if ((rs = raidget(unit)) == NULL)
   2388 		return;
   2389 
   2390 	lp = rs->sc_dkdev.dk_label;
   2391 	clp = rs->sc_dkdev.dk_cpulabel;
   2392 
   2393 	db1_printf(("Getting the disklabel...\n"));
   2394 
   2395 	memset(clp, 0, sizeof(*clp));
   2396 
   2397 	raidPtr = &rs->sc_r;
   2398 
   2399 	raidgetdefaultlabel(raidPtr, rs, lp);
   2400 
   2401 	/*
   2402 	 * Call the generic disklabel extraction routine.
   2403 	 */
   2404 	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
   2405 	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
   2406 	if (errstring)
   2407 		raidmakedisklabel(rs);
   2408 	else {
   2409 		int     i;
   2410 		struct partition *pp;
   2411 
   2412 		/*
   2413 		 * Sanity check whether the found disklabel is valid.
   2414 		 *
   2415 		 * This is necessary since total size of the raid device
   2416 		 * may vary when an interleave is changed even though exactly
   2417 		 * same components are used, and old disklabel may used
   2418 		 * if that is found.
   2419 		 */
   2420 		if (lp->d_secperunit < UINT32_MAX ?
   2421 		    lp->d_secperunit != rs->sc_size :
   2422 		    lp->d_secperunit > rs->sc_size)
   2423 			printf("raid%d: WARNING: %s: "
   2424 			    "total sector size in disklabel (%ju) != "
   2425 			    "the size of raid (%ju)\n", unit, rs->sc_xname,
   2426 			    (uintmax_t)lp->d_secperunit,
   2427 			    (uintmax_t)rs->sc_size);
   2428 		for (i = 0; i < lp->d_npartitions; i++) {
   2429 			pp = &lp->d_partitions[i];
   2430 			if (pp->p_offset + pp->p_size > rs->sc_size)
   2431 				printf("raid%d: WARNING: %s: end of partition `%c' "
   2432 				       "exceeds the size of raid (%ju)\n",
   2433 				       unit, rs->sc_xname, 'a' + i,
   2434 				       (uintmax_t)rs->sc_size);
   2435 		}
   2436 	}
   2437 
   2438 }
   2439 /*
   2440  * Take care of things one might want to take care of in the event
   2441  * that a disklabel isn't present.
   2442  */
   2443 static void
   2444 raidmakedisklabel(struct raid_softc *rs)
   2445 {
   2446 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2447 	db1_printf(("Making a label..\n"));
   2448 
   2449 	/*
   2450 	 * For historical reasons, if there's no disklabel present
   2451 	 * the raw partition must be marked FS_BSDFFS.
   2452 	 */
   2453 
   2454 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
   2455 
   2456 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
   2457 
   2458 	lp->d_checksum = dkcksum(lp);
   2459 }
   2460 /*
   2461  * Wait interruptibly for an exclusive lock.
   2462  *
   2463  * XXX
   2464  * Several drivers do this; it should be abstracted and made MP-safe.
   2465  * (Hmm... where have we seen this warning before :->  GO )
   2466  */
   2467 static int
   2468 raidlock(struct raid_softc *rs)
   2469 {
   2470 	int     error;
   2471 
   2472 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2473 		rs->sc_flags |= RAIDF_WANTED;
   2474 		if ((error =
   2475 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
   2476 			return (error);
   2477 	}
   2478 	rs->sc_flags |= RAIDF_LOCKED;
   2479 	return (0);
   2480 }
   2481 /*
   2482  * Unlock and wake up any waiters.
   2483  */
   2484 static void
   2485 raidunlock(struct raid_softc *rs)
   2486 {
   2487 
   2488 	rs->sc_flags &= ~RAIDF_LOCKED;
   2489 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2490 		rs->sc_flags &= ~RAIDF_WANTED;
   2491 		wakeup(rs);
   2492 	}
   2493 }
   2494 
   2495 
   2496 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2497 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2498 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2499 
   2500 static daddr_t
   2501 rf_component_info_offset(void)
   2502 {
   2503 
   2504 	return RF_COMPONENT_INFO_OFFSET;
   2505 }
   2506 
   2507 static daddr_t
   2508 rf_component_info_size(unsigned secsize)
   2509 {
   2510 	daddr_t info_size;
   2511 
   2512 	KASSERT(secsize);
   2513 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2514 		info_size = secsize;
   2515 	else
   2516 		info_size = RF_COMPONENT_INFO_SIZE;
   2517 
   2518 	return info_size;
   2519 }
   2520 
   2521 static daddr_t
   2522 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2523 {
   2524 	daddr_t map_offset;
   2525 
   2526 	KASSERT(raidPtr->bytesPerSector);
   2527 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2528 		map_offset = raidPtr->bytesPerSector;
   2529 	else
   2530 		map_offset = RF_COMPONENT_INFO_SIZE;
   2531 	map_offset += rf_component_info_offset();
   2532 
   2533 	return map_offset;
   2534 }
   2535 
   2536 static daddr_t
   2537 rf_parity_map_size(RF_Raid_t *raidPtr)
   2538 {
   2539 	daddr_t map_size;
   2540 
   2541 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2542 		map_size = raidPtr->bytesPerSector;
   2543 	else
   2544 		map_size = RF_PARITY_MAP_SIZE;
   2545 
   2546 	return map_size;
   2547 }
   2548 
   2549 int
   2550 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2551 {
   2552 	RF_ComponentLabel_t *clabel;
   2553 
   2554 	clabel = raidget_component_label(raidPtr, col);
   2555 	clabel->clean = RF_RAID_CLEAN;
   2556 	raidflush_component_label(raidPtr, col);
   2557 	return(0);
   2558 }
   2559 
   2560 
   2561 int
   2562 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2563 {
   2564 	RF_ComponentLabel_t *clabel;
   2565 
   2566 	clabel = raidget_component_label(raidPtr, col);
   2567 	clabel->clean = RF_RAID_DIRTY;
   2568 	raidflush_component_label(raidPtr, col);
   2569 	return(0);
   2570 }
   2571 
   2572 int
   2573 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2574 {
   2575 	KASSERT(raidPtr->bytesPerSector);
   2576 	return raidread_component_label(raidPtr->bytesPerSector,
   2577 	    raidPtr->Disks[col].dev,
   2578 	    raidPtr->raid_cinfo[col].ci_vp,
   2579 	    &raidPtr->raid_cinfo[col].ci_label);
   2580 }
   2581 
   2582 RF_ComponentLabel_t *
   2583 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2584 {
   2585 	return &raidPtr->raid_cinfo[col].ci_label;
   2586 }
   2587 
   2588 int
   2589 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2590 {
   2591 	RF_ComponentLabel_t *label;
   2592 
   2593 	label = &raidPtr->raid_cinfo[col].ci_label;
   2594 	label->mod_counter = raidPtr->mod_counter;
   2595 #ifndef RF_NO_PARITY_MAP
   2596 	label->parity_map_modcount = label->mod_counter;
   2597 #endif
   2598 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2599 	    raidPtr->Disks[col].dev,
   2600 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2601 }
   2602 
   2603 
   2604 static int
   2605 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2606     RF_ComponentLabel_t *clabel)
   2607 {
   2608 	return raidread_component_area(dev, b_vp, clabel,
   2609 	    sizeof(RF_ComponentLabel_t),
   2610 	    rf_component_info_offset(),
   2611 	    rf_component_info_size(secsize));
   2612 }
   2613 
   2614 /* ARGSUSED */
   2615 static int
   2616 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2617     size_t msize, daddr_t offset, daddr_t dsize)
   2618 {
   2619 	struct buf *bp;
   2620 	const struct bdevsw *bdev;
   2621 	int error;
   2622 
   2623 	/* XXX should probably ensure that we don't try to do this if
   2624 	   someone has changed rf_protected_sectors. */
   2625 
   2626 	if (b_vp == NULL) {
   2627 		/* For whatever reason, this component is not valid.
   2628 		   Don't try to read a component label from it. */
   2629 		return(EINVAL);
   2630 	}
   2631 
   2632 	/* get a block of the appropriate size... */
   2633 	bp = geteblk((int)dsize);
   2634 	bp->b_dev = dev;
   2635 
   2636 	/* get our ducks in a row for the read */
   2637 	bp->b_blkno = offset / DEV_BSIZE;
   2638 	bp->b_bcount = dsize;
   2639 	bp->b_flags |= B_READ;
   2640  	bp->b_resid = dsize;
   2641 
   2642 	bdev = bdevsw_lookup(bp->b_dev);
   2643 	if (bdev == NULL)
   2644 		return (ENXIO);
   2645 	(*bdev->d_strategy)(bp);
   2646 
   2647 	error = biowait(bp);
   2648 
   2649 	if (!error) {
   2650 		memcpy(data, bp->b_data, msize);
   2651 	}
   2652 
   2653 	brelse(bp, 0);
   2654 	return(error);
   2655 }
   2656 
   2657 
   2658 static int
   2659 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2660     RF_ComponentLabel_t *clabel)
   2661 {
   2662 	return raidwrite_component_area(dev, b_vp, clabel,
   2663 	    sizeof(RF_ComponentLabel_t),
   2664 	    rf_component_info_offset(),
   2665 	    rf_component_info_size(secsize), 0);
   2666 }
   2667 
   2668 /* ARGSUSED */
   2669 static int
   2670 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2671     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2672 {
   2673 	struct buf *bp;
   2674 	const struct bdevsw *bdev;
   2675 	int error;
   2676 
   2677 	/* get a block of the appropriate size... */
   2678 	bp = geteblk((int)dsize);
   2679 	bp->b_dev = dev;
   2680 
   2681 	/* get our ducks in a row for the write */
   2682 	bp->b_blkno = offset / DEV_BSIZE;
   2683 	bp->b_bcount = dsize;
   2684 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2685  	bp->b_resid = dsize;
   2686 
   2687 	memset(bp->b_data, 0, dsize);
   2688 	memcpy(bp->b_data, data, msize);
   2689 
   2690 	bdev = bdevsw_lookup(bp->b_dev);
   2691 	if (bdev == NULL)
   2692 		return (ENXIO);
   2693 	(*bdev->d_strategy)(bp);
   2694 	if (asyncp)
   2695 		return 0;
   2696 	error = biowait(bp);
   2697 	brelse(bp, 0);
   2698 	if (error) {
   2699 #if 1
   2700 		printf("Failed to write RAID component info!\n");
   2701 #endif
   2702 	}
   2703 
   2704 	return(error);
   2705 }
   2706 
   2707 void
   2708 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2709 {
   2710 	int c;
   2711 
   2712 	for (c = 0; c < raidPtr->numCol; c++) {
   2713 		/* Skip dead disks. */
   2714 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2715 			continue;
   2716 		/* XXXjld: what if an error occurs here? */
   2717 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2718 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2719 		    RF_PARITYMAP_NBYTE,
   2720 		    rf_parity_map_offset(raidPtr),
   2721 		    rf_parity_map_size(raidPtr), 0);
   2722 	}
   2723 }
   2724 
   2725 void
   2726 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2727 {
   2728 	struct rf_paritymap_ondisk tmp;
   2729 	int c,first;
   2730 
   2731 	first=1;
   2732 	for (c = 0; c < raidPtr->numCol; c++) {
   2733 		/* Skip dead disks. */
   2734 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2735 			continue;
   2736 		raidread_component_area(raidPtr->Disks[c].dev,
   2737 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2738 		    RF_PARITYMAP_NBYTE,
   2739 		    rf_parity_map_offset(raidPtr),
   2740 		    rf_parity_map_size(raidPtr));
   2741 		if (first) {
   2742 			memcpy(map, &tmp, sizeof(*map));
   2743 			first = 0;
   2744 		} else {
   2745 			rf_paritymap_merge(map, &tmp);
   2746 		}
   2747 	}
   2748 }
   2749 
   2750 void
   2751 rf_markalldirty(RF_Raid_t *raidPtr)
   2752 {
   2753 	RF_ComponentLabel_t *clabel;
   2754 	int sparecol;
   2755 	int c;
   2756 	int j;
   2757 	int scol = -1;
   2758 
   2759 	raidPtr->mod_counter++;
   2760 	for (c = 0; c < raidPtr->numCol; c++) {
   2761 		/* we don't want to touch (at all) a disk that has
   2762 		   failed */
   2763 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
   2764 			clabel = raidget_component_label(raidPtr, c);
   2765 			if (clabel->status == rf_ds_spared) {
   2766 				/* XXX do something special...
   2767 				   but whatever you do, don't
   2768 				   try to access it!! */
   2769 			} else {
   2770 				raidmarkdirty(raidPtr, c);
   2771 			}
   2772 		}
   2773 	}
   2774 
   2775 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2776 		sparecol = raidPtr->numCol + c;
   2777 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2778 			/*
   2779 
   2780 			   we claim this disk is "optimal" if it's
   2781 			   rf_ds_used_spare, as that means it should be
   2782 			   directly substitutable for the disk it replaced.
   2783 			   We note that too...
   2784 
   2785 			 */
   2786 
   2787 			for(j=0;j<raidPtr->numCol;j++) {
   2788 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2789 					scol = j;
   2790 					break;
   2791 				}
   2792 			}
   2793 
   2794 			clabel = raidget_component_label(raidPtr, sparecol);
   2795 			/* make sure status is noted */
   2796 
   2797 			raid_init_component_label(raidPtr, clabel);
   2798 
   2799 			clabel->row = 0;
   2800 			clabel->column = scol;
   2801 			/* Note: we *don't* change status from rf_ds_used_spare
   2802 			   to rf_ds_optimal */
   2803 			/* clabel.status = rf_ds_optimal; */
   2804 
   2805 			raidmarkdirty(raidPtr, sparecol);
   2806 		}
   2807 	}
   2808 }
   2809 
   2810 
   2811 void
   2812 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
   2813 {
   2814 	RF_ComponentLabel_t *clabel;
   2815 	int sparecol;
   2816 	int c;
   2817 	int j;
   2818 	int scol;
   2819 
   2820 	scol = -1;
   2821 
   2822 	/* XXX should do extra checks to make sure things really are clean,
   2823 	   rather than blindly setting the clean bit... */
   2824 
   2825 	raidPtr->mod_counter++;
   2826 
   2827 	for (c = 0; c < raidPtr->numCol; c++) {
   2828 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   2829 			clabel = raidget_component_label(raidPtr, c);
   2830 			/* make sure status is noted */
   2831 			clabel->status = rf_ds_optimal;
   2832 
   2833 			/* note what unit we are configured as */
   2834 			clabel->last_unit = raidPtr->raidid;
   2835 
   2836 			raidflush_component_label(raidPtr, c);
   2837 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2838 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2839 					raidmarkclean(raidPtr, c);
   2840 				}
   2841 			}
   2842 		}
   2843 		/* else we don't touch it.. */
   2844 	}
   2845 
   2846 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2847 		sparecol = raidPtr->numCol + c;
   2848 		/* Need to ensure that the reconstruct actually completed! */
   2849 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2850 			/*
   2851 
   2852 			   we claim this disk is "optimal" if it's
   2853 			   rf_ds_used_spare, as that means it should be
   2854 			   directly substitutable for the disk it replaced.
   2855 			   We note that too...
   2856 
   2857 			 */
   2858 
   2859 			for(j=0;j<raidPtr->numCol;j++) {
   2860 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2861 					scol = j;
   2862 					break;
   2863 				}
   2864 			}
   2865 
   2866 			/* XXX shouldn't *really* need this... */
   2867 			clabel = raidget_component_label(raidPtr, sparecol);
   2868 			/* make sure status is noted */
   2869 
   2870 			raid_init_component_label(raidPtr, clabel);
   2871 
   2872 			clabel->column = scol;
   2873 			clabel->status = rf_ds_optimal;
   2874 			clabel->last_unit = raidPtr->raidid;
   2875 
   2876 			raidflush_component_label(raidPtr, sparecol);
   2877 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2878 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2879 					raidmarkclean(raidPtr, sparecol);
   2880 				}
   2881 			}
   2882 		}
   2883 	}
   2884 }
   2885 
   2886 void
   2887 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2888 {
   2889 
   2890 	if (vp != NULL) {
   2891 		if (auto_configured == 1) {
   2892 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2893 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2894 			vput(vp);
   2895 
   2896 		} else {
   2897 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2898 		}
   2899 	}
   2900 }
   2901 
   2902 
   2903 void
   2904 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2905 {
   2906 	int r,c;
   2907 	struct vnode *vp;
   2908 	int acd;
   2909 
   2910 
   2911 	/* We take this opportunity to close the vnodes like we should.. */
   2912 
   2913 	for (c = 0; c < raidPtr->numCol; c++) {
   2914 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2915 		acd = raidPtr->Disks[c].auto_configured;
   2916 		rf_close_component(raidPtr, vp, acd);
   2917 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2918 		raidPtr->Disks[c].auto_configured = 0;
   2919 	}
   2920 
   2921 	for (r = 0; r < raidPtr->numSpare; r++) {
   2922 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2923 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2924 		rf_close_component(raidPtr, vp, acd);
   2925 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2926 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2927 	}
   2928 }
   2929 
   2930 
   2931 void
   2932 rf_ReconThread(struct rf_recon_req *req)
   2933 {
   2934 	int     s;
   2935 	RF_Raid_t *raidPtr;
   2936 
   2937 	s = splbio();
   2938 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2939 	raidPtr->recon_in_progress = 1;
   2940 
   2941 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2942 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2943 
   2944 	RF_Free(req, sizeof(*req));
   2945 
   2946 	raidPtr->recon_in_progress = 0;
   2947 	splx(s);
   2948 
   2949 	/* That's all... */
   2950 	kthread_exit(0);	/* does not return */
   2951 }
   2952 
   2953 void
   2954 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2955 {
   2956 	int retcode;
   2957 	int s;
   2958 
   2959 	raidPtr->parity_rewrite_stripes_done = 0;
   2960 	raidPtr->parity_rewrite_in_progress = 1;
   2961 	s = splbio();
   2962 	retcode = rf_RewriteParity(raidPtr);
   2963 	splx(s);
   2964 	if (retcode) {
   2965 		printf("raid%d: Error re-writing parity (%d)!\n",
   2966 		    raidPtr->raidid, retcode);
   2967 	} else {
   2968 		/* set the clean bit!  If we shutdown correctly,
   2969 		   the clean bit on each component label will get
   2970 		   set */
   2971 		raidPtr->parity_good = RF_RAID_CLEAN;
   2972 	}
   2973 	raidPtr->parity_rewrite_in_progress = 0;
   2974 
   2975 	/* Anyone waiting for us to stop?  If so, inform them... */
   2976 	if (raidPtr->waitShutdown) {
   2977 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2978 	}
   2979 
   2980 	/* That's all... */
   2981 	kthread_exit(0);	/* does not return */
   2982 }
   2983 
   2984 
   2985 void
   2986 rf_CopybackThread(RF_Raid_t *raidPtr)
   2987 {
   2988 	int s;
   2989 
   2990 	raidPtr->copyback_in_progress = 1;
   2991 	s = splbio();
   2992 	rf_CopybackReconstructedData(raidPtr);
   2993 	splx(s);
   2994 	raidPtr->copyback_in_progress = 0;
   2995 
   2996 	/* That's all... */
   2997 	kthread_exit(0);	/* does not return */
   2998 }
   2999 
   3000 
   3001 void
   3002 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
   3003 {
   3004 	int s;
   3005 	RF_Raid_t *raidPtr;
   3006 
   3007 	s = splbio();
   3008 	raidPtr = req->raidPtr;
   3009 	raidPtr->recon_in_progress = 1;
   3010 	rf_ReconstructInPlace(raidPtr, req->col);
   3011 	RF_Free(req, sizeof(*req));
   3012 	raidPtr->recon_in_progress = 0;
   3013 	splx(s);
   3014 
   3015 	/* That's all... */
   3016 	kthread_exit(0);	/* does not return */
   3017 }
   3018 
   3019 static RF_AutoConfig_t *
   3020 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
   3021     const char *cname, RF_SectorCount_t size, uint64_t numsecs,
   3022     unsigned secsize)
   3023 {
   3024 	int good_one = 0;
   3025 	RF_ComponentLabel_t *clabel;
   3026 	RF_AutoConfig_t *ac;
   3027 
   3028 	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
   3029 	if (clabel == NULL) {
   3030 oomem:
   3031 		    while(ac_list) {
   3032 			    ac = ac_list;
   3033 			    if (ac->clabel)
   3034 				    free(ac->clabel, M_RAIDFRAME);
   3035 			    ac_list = ac_list->next;
   3036 			    free(ac, M_RAIDFRAME);
   3037 		    }
   3038 		    printf("RAID auto config: out of memory!\n");
   3039 		    return NULL; /* XXX probably should panic? */
   3040 	}
   3041 
   3042 	if (!raidread_component_label(secsize, dev, vp, clabel)) {
   3043 		/* Got the label.  Does it look reasonable? */
   3044 		if (rf_reasonable_label(clabel, numsecs) &&
   3045 		    (rf_component_label_partitionsize(clabel) <= size)) {
   3046 #ifdef DEBUG
   3047 			printf("Component on: %s: %llu\n",
   3048 				cname, (unsigned long long)size);
   3049 			rf_print_component_label(clabel);
   3050 #endif
   3051 			/* if it's reasonable, add it, else ignore it. */
   3052 			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
   3053 				M_NOWAIT);
   3054 			if (ac == NULL) {
   3055 				free(clabel, M_RAIDFRAME);
   3056 				goto oomem;
   3057 			}
   3058 			strlcpy(ac->devname, cname, sizeof(ac->devname));
   3059 			ac->dev = dev;
   3060 			ac->vp = vp;
   3061 			ac->clabel = clabel;
   3062 			ac->next = ac_list;
   3063 			ac_list = ac;
   3064 			good_one = 1;
   3065 		}
   3066 	}
   3067 	if (!good_one) {
   3068 		/* cleanup */
   3069 		free(clabel, M_RAIDFRAME);
   3070 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3071 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3072 		vput(vp);
   3073 	}
   3074 	return ac_list;
   3075 }
   3076 
   3077 RF_AutoConfig_t *
   3078 rf_find_raid_components(void)
   3079 {
   3080 	struct vnode *vp;
   3081 	struct disklabel label;
   3082 	device_t dv;
   3083 	deviter_t di;
   3084 	dev_t dev;
   3085 	int bmajor, bminor, wedge, rf_part_found;
   3086 	int error;
   3087 	int i;
   3088 	RF_AutoConfig_t *ac_list;
   3089 	uint64_t numsecs;
   3090 	unsigned secsize;
   3091 
   3092 	/* initialize the AutoConfig list */
   3093 	ac_list = NULL;
   3094 
   3095 	/* we begin by trolling through *all* the devices on the system */
   3096 
   3097 	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
   3098 	     dv = deviter_next(&di)) {
   3099 
   3100 		/* we are only interested in disks... */
   3101 		if (device_class(dv) != DV_DISK)
   3102 			continue;
   3103 
   3104 		/* we don't care about floppies... */
   3105 		if (device_is_a(dv, "fd")) {
   3106 			continue;
   3107 		}
   3108 
   3109 		/* we don't care about CD's... */
   3110 		if (device_is_a(dv, "cd")) {
   3111 			continue;
   3112 		}
   3113 
   3114 		/* we don't care about md's... */
   3115 		if (device_is_a(dv, "md")) {
   3116 			continue;
   3117 		}
   3118 
   3119 		/* hdfd is the Atari/Hades floppy driver */
   3120 		if (device_is_a(dv, "hdfd")) {
   3121 			continue;
   3122 		}
   3123 
   3124 		/* fdisa is the Atari/Milan floppy driver */
   3125 		if (device_is_a(dv, "fdisa")) {
   3126 			continue;
   3127 		}
   3128 
   3129 		/* need to find the device_name_to_block_device_major stuff */
   3130 		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
   3131 
   3132 		rf_part_found = 0; /*No raid partition as yet*/
   3133 
   3134 		/* get a vnode for the raw partition of this disk */
   3135 
   3136 		wedge = device_is_a(dv, "dk");
   3137 		bminor = minor(device_unit(dv));
   3138 		dev = wedge ? makedev(bmajor, bminor) :
   3139 		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
   3140 		if (bdevvp(dev, &vp))
   3141 			panic("RAID can't alloc vnode");
   3142 
   3143 		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
   3144 
   3145 		if (error) {
   3146 			/* "Who cares."  Continue looking
   3147 			   for something that exists*/
   3148 			vput(vp);
   3149 			continue;
   3150 		}
   3151 
   3152 		error = getdisksize(vp, &numsecs, &secsize);
   3153 		if (error) {
   3154 			vput(vp);
   3155 			continue;
   3156 		}
   3157 		if (wedge) {
   3158 			struct dkwedge_info dkw;
   3159 			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
   3160 			    NOCRED);
   3161 			if (error) {
   3162 				printf("RAIDframe: can't get wedge info for "
   3163 				    "dev %s (%d)\n", device_xname(dv), error);
   3164 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3165 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3166 				vput(vp);
   3167 				continue;
   3168 			}
   3169 
   3170 			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
   3171 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3172 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3173 				vput(vp);
   3174 				continue;
   3175 			}
   3176 
   3177 			ac_list = rf_get_component(ac_list, dev, vp,
   3178 			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
   3179 			rf_part_found = 1; /*There is a raid component on this disk*/
   3180 			continue;
   3181 		}
   3182 
   3183 		/* Ok, the disk exists.  Go get the disklabel. */
   3184 		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
   3185 		if (error) {
   3186 			/*
   3187 			 * XXX can't happen - open() would
   3188 			 * have errored out (or faked up one)
   3189 			 */
   3190 			if (error != ENOTTY)
   3191 				printf("RAIDframe: can't get label for dev "
   3192 				    "%s (%d)\n", device_xname(dv), error);
   3193 		}
   3194 
   3195 		/* don't need this any more.  We'll allocate it again
   3196 		   a little later if we really do... */
   3197 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3198 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3199 		vput(vp);
   3200 
   3201 		if (error)
   3202 			continue;
   3203 
   3204 		rf_part_found = 0; /*No raid partitions yet*/
   3205 		for (i = 0; i < label.d_npartitions; i++) {
   3206 			char cname[sizeof(ac_list->devname)];
   3207 
   3208 			/* We only support partitions marked as RAID */
   3209 			if (label.d_partitions[i].p_fstype != FS_RAID)
   3210 				continue;
   3211 
   3212 			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
   3213 			if (bdevvp(dev, &vp))
   3214 				panic("RAID can't alloc vnode");
   3215 
   3216 			error = VOP_OPEN(vp, FREAD, NOCRED);
   3217 			if (error) {
   3218 				/* Whatever... */
   3219 				vput(vp);
   3220 				continue;
   3221 			}
   3222 			snprintf(cname, sizeof(cname), "%s%c",
   3223 			    device_xname(dv), 'a' + i);
   3224 			ac_list = rf_get_component(ac_list, dev, vp, cname,
   3225 				label.d_partitions[i].p_size, numsecs, secsize);
   3226 				rf_part_found = 1; /*There is at least one raid partition on this disk*/
   3227 		}
   3228 
   3229 		/*
   3230 		 *If there is no raid component on this disk, either in a
   3231 		 *disklabel or inside a wedge, check the raw partition as well,
   3232 		 *as it is possible to configure raid components on raw disk
   3233 		 *devices.
   3234 		 */
   3235 
   3236 		if (!rf_part_found) {
   3237 			char cname[sizeof(ac_list->devname)];
   3238 
   3239 			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
   3240 			if (bdevvp(dev, &vp))
   3241 				panic("RAID can't alloc vnode");
   3242 
   3243 			error = VOP_OPEN(vp, FREAD, NOCRED);
   3244 			if (error) {
   3245 				/* Whatever... */
   3246 				vput(vp);
   3247 				continue;
   3248 			}
   3249 			snprintf(cname, sizeof(cname), "%s%c",
   3250 			    device_xname(dv), 'a' + RAW_PART);
   3251 			ac_list = rf_get_component(ac_list, dev, vp, cname,
   3252 				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
   3253 		}
   3254 	}
   3255 	deviter_release(&di);
   3256 	return ac_list;
   3257 }
   3258 
   3259 
   3260 int
   3261 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3262 {
   3263 
   3264 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3265 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3266 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3267 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3268 	    clabel->row >=0 &&
   3269 	    clabel->column >= 0 &&
   3270 	    clabel->num_rows > 0 &&
   3271 	    clabel->num_columns > 0 &&
   3272 	    clabel->row < clabel->num_rows &&
   3273 	    clabel->column < clabel->num_columns &&
   3274 	    clabel->blockSize > 0 &&
   3275 	    /*
   3276 	     * numBlocksHi may contain garbage, but it is ok since
   3277 	     * the type is unsigned.  If it is really garbage,
   3278 	     * rf_fix_old_label_size() will fix it.
   3279 	     */
   3280 	    rf_component_label_numblocks(clabel) > 0) {
   3281 		/*
   3282 		 * label looks reasonable enough...
   3283 		 * let's make sure it has no old garbage.
   3284 		 */
   3285 		if (numsecs)
   3286 			rf_fix_old_label_size(clabel, numsecs);
   3287 		return(1);
   3288 	}
   3289 	return(0);
   3290 }
   3291 
   3292 
   3293 /*
   3294  * For reasons yet unknown, some old component labels have garbage in
   3295  * the newer numBlocksHi region, and this causes lossage.  Since those
   3296  * disks will also have numsecs set to less than 32 bits of sectors,
   3297  * we can determine when this corruption has occurred, and fix it.
   3298  *
   3299  * The exact same problem, with the same unknown reason, happens to
   3300  * the partitionSizeHi member as well.
   3301  */
   3302 static void
   3303 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3304 {
   3305 
   3306 	if (numsecs < ((uint64_t)1 << 32)) {
   3307 		if (clabel->numBlocksHi) {
   3308 			printf("WARNING: total sectors < 32 bits, yet "
   3309 			       "numBlocksHi set\n"
   3310 			       "WARNING: resetting numBlocksHi to zero.\n");
   3311 			clabel->numBlocksHi = 0;
   3312 		}
   3313 
   3314 		if (clabel->partitionSizeHi) {
   3315 			printf("WARNING: total sectors < 32 bits, yet "
   3316 			       "partitionSizeHi set\n"
   3317 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3318 			clabel->partitionSizeHi = 0;
   3319 		}
   3320 	}
   3321 }
   3322 
   3323 
   3324 #ifdef DEBUG
   3325 void
   3326 rf_print_component_label(RF_ComponentLabel_t *clabel)
   3327 {
   3328 	uint64_t numBlocks;
   3329 	static const char *rp[] = {
   3330 	    "No", "Force", "Soft", "*invalid*"
   3331 	};
   3332 
   3333 
   3334 	numBlocks = rf_component_label_numblocks(clabel);
   3335 
   3336 	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
   3337 	       clabel->row, clabel->column,
   3338 	       clabel->num_rows, clabel->num_columns);
   3339 	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
   3340 	       clabel->version, clabel->serial_number,
   3341 	       clabel->mod_counter);
   3342 	printf("   Clean: %s Status: %d\n",
   3343 	       clabel->clean ? "Yes" : "No", clabel->status);
   3344 	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
   3345 	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
   3346 	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
   3347 	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
   3348 	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
   3349 	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
   3350 	printf("   Last configured as: raid%d\n", clabel->last_unit);
   3351 #if 0
   3352 	   printf("   Config order: %d\n", clabel->config_order);
   3353 #endif
   3354 
   3355 }
   3356 #endif
   3357 
   3358 RF_ConfigSet_t *
   3359 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3360 {
   3361 	RF_AutoConfig_t *ac;
   3362 	RF_ConfigSet_t *config_sets;
   3363 	RF_ConfigSet_t *cset;
   3364 	RF_AutoConfig_t *ac_next;
   3365 
   3366 
   3367 	config_sets = NULL;
   3368 
   3369 	/* Go through the AutoConfig list, and figure out which components
   3370 	   belong to what sets.  */
   3371 	ac = ac_list;
   3372 	while(ac!=NULL) {
   3373 		/* we're going to putz with ac->next, so save it here
   3374 		   for use at the end of the loop */
   3375 		ac_next = ac->next;
   3376 
   3377 		if (config_sets == NULL) {
   3378 			/* will need at least this one... */
   3379 			config_sets = (RF_ConfigSet_t *)
   3380 				malloc(sizeof(RF_ConfigSet_t),
   3381 				       M_RAIDFRAME, M_NOWAIT);
   3382 			if (config_sets == NULL) {
   3383 				panic("rf_create_auto_sets: No memory!");
   3384 			}
   3385 			/* this one is easy :) */
   3386 			config_sets->ac = ac;
   3387 			config_sets->next = NULL;
   3388 			config_sets->rootable = 0;
   3389 			ac->next = NULL;
   3390 		} else {
   3391 			/* which set does this component fit into? */
   3392 			cset = config_sets;
   3393 			while(cset!=NULL) {
   3394 				if (rf_does_it_fit(cset, ac)) {
   3395 					/* looks like it matches... */
   3396 					ac->next = cset->ac;
   3397 					cset->ac = ac;
   3398 					break;
   3399 				}
   3400 				cset = cset->next;
   3401 			}
   3402 			if (cset==NULL) {
   3403 				/* didn't find a match above... new set..*/
   3404 				cset = (RF_ConfigSet_t *)
   3405 					malloc(sizeof(RF_ConfigSet_t),
   3406 					       M_RAIDFRAME, M_NOWAIT);
   3407 				if (cset == NULL) {
   3408 					panic("rf_create_auto_sets: No memory!");
   3409 				}
   3410 				cset->ac = ac;
   3411 				ac->next = NULL;
   3412 				cset->next = config_sets;
   3413 				cset->rootable = 0;
   3414 				config_sets = cset;
   3415 			}
   3416 		}
   3417 		ac = ac_next;
   3418 	}
   3419 
   3420 
   3421 	return(config_sets);
   3422 }
   3423 
   3424 static int
   3425 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3426 {
   3427 	RF_ComponentLabel_t *clabel1, *clabel2;
   3428 
   3429 	/* If this one matches the *first* one in the set, that's good
   3430 	   enough, since the other members of the set would have been
   3431 	   through here too... */
   3432 	/* note that we are not checking partitionSize here..
   3433 
   3434 	   Note that we are also not checking the mod_counters here.
   3435 	   If everything else matches except the mod_counter, that's
   3436 	   good enough for this test.  We will deal with the mod_counters
   3437 	   a little later in the autoconfiguration process.
   3438 
   3439 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3440 
   3441 	   The reason we don't check for this is that failed disks
   3442 	   will have lower modification counts.  If those disks are
   3443 	   not added to the set they used to belong to, then they will
   3444 	   form their own set, which may result in 2 different sets,
   3445 	   for example, competing to be configured at raid0, and
   3446 	   perhaps competing to be the root filesystem set.  If the
   3447 	   wrong ones get configured, or both attempt to become /,
   3448 	   weird behaviour and or serious lossage will occur.  Thus we
   3449 	   need to bring them into the fold here, and kick them out at
   3450 	   a later point.
   3451 
   3452 	*/
   3453 
   3454 	clabel1 = cset->ac->clabel;
   3455 	clabel2 = ac->clabel;
   3456 	if ((clabel1->version == clabel2->version) &&
   3457 	    (clabel1->serial_number == clabel2->serial_number) &&
   3458 	    (clabel1->num_rows == clabel2->num_rows) &&
   3459 	    (clabel1->num_columns == clabel2->num_columns) &&
   3460 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3461 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3462 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3463 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3464 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3465 	    (clabel1->blockSize == clabel2->blockSize) &&
   3466 	    rf_component_label_numblocks(clabel1) ==
   3467 	    rf_component_label_numblocks(clabel2) &&
   3468 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3469 	    (clabel1->root_partition == clabel2->root_partition) &&
   3470 	    (clabel1->last_unit == clabel2->last_unit) &&
   3471 	    (clabel1->config_order == clabel2->config_order)) {
   3472 		/* if it get's here, it almost *has* to be a match */
   3473 	} else {
   3474 		/* it's not consistent with somebody in the set..
   3475 		   punt */
   3476 		return(0);
   3477 	}
   3478 	/* all was fine.. it must fit... */
   3479 	return(1);
   3480 }
   3481 
   3482 int
   3483 rf_have_enough_components(RF_ConfigSet_t *cset)
   3484 {
   3485 	RF_AutoConfig_t *ac;
   3486 	RF_AutoConfig_t *auto_config;
   3487 	RF_ComponentLabel_t *clabel;
   3488 	int c;
   3489 	int num_cols;
   3490 	int num_missing;
   3491 	int mod_counter;
   3492 	int mod_counter_found;
   3493 	int even_pair_failed;
   3494 	char parity_type;
   3495 
   3496 
   3497 	/* check to see that we have enough 'live' components
   3498 	   of this set.  If so, we can configure it if necessary */
   3499 
   3500 	num_cols = cset->ac->clabel->num_columns;
   3501 	parity_type = cset->ac->clabel->parityConfig;
   3502 
   3503 	/* XXX Check for duplicate components!?!?!? */
   3504 
   3505 	/* Determine what the mod_counter is supposed to be for this set. */
   3506 
   3507 	mod_counter_found = 0;
   3508 	mod_counter = 0;
   3509 	ac = cset->ac;
   3510 	while(ac!=NULL) {
   3511 		if (mod_counter_found==0) {
   3512 			mod_counter = ac->clabel->mod_counter;
   3513 			mod_counter_found = 1;
   3514 		} else {
   3515 			if (ac->clabel->mod_counter > mod_counter) {
   3516 				mod_counter = ac->clabel->mod_counter;
   3517 			}
   3518 		}
   3519 		ac = ac->next;
   3520 	}
   3521 
   3522 	num_missing = 0;
   3523 	auto_config = cset->ac;
   3524 
   3525 	even_pair_failed = 0;
   3526 	for(c=0; c<num_cols; c++) {
   3527 		ac = auto_config;
   3528 		while(ac!=NULL) {
   3529 			if ((ac->clabel->column == c) &&
   3530 			    (ac->clabel->mod_counter == mod_counter)) {
   3531 				/* it's this one... */
   3532 #ifdef DEBUG
   3533 				printf("Found: %s at %d\n",
   3534 				       ac->devname,c);
   3535 #endif
   3536 				break;
   3537 			}
   3538 			ac=ac->next;
   3539 		}
   3540 		if (ac==NULL) {
   3541 				/* Didn't find one here! */
   3542 				/* special case for RAID 1, especially
   3543 				   where there are more than 2
   3544 				   components (where RAIDframe treats
   3545 				   things a little differently :( ) */
   3546 			if (parity_type == '1') {
   3547 				if (c%2 == 0) { /* even component */
   3548 					even_pair_failed = 1;
   3549 				} else { /* odd component.  If
   3550 					    we're failed, and
   3551 					    so is the even
   3552 					    component, it's
   3553 					    "Good Night, Charlie" */
   3554 					if (even_pair_failed == 1) {
   3555 						return(0);
   3556 					}
   3557 				}
   3558 			} else {
   3559 				/* normal accounting */
   3560 				num_missing++;
   3561 			}
   3562 		}
   3563 		if ((parity_type == '1') && (c%2 == 1)) {
   3564 				/* Just did an even component, and we didn't
   3565 				   bail.. reset the even_pair_failed flag,
   3566 				   and go on to the next component.... */
   3567 			even_pair_failed = 0;
   3568 		}
   3569 	}
   3570 
   3571 	clabel = cset->ac->clabel;
   3572 
   3573 	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
   3574 	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
   3575 	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
   3576 		/* XXX this needs to be made *much* more general */
   3577 		/* Too many failures */
   3578 		return(0);
   3579 	}
   3580 	/* otherwise, all is well, and we've got enough to take a kick
   3581 	   at autoconfiguring this set */
   3582 	return(1);
   3583 }
   3584 
   3585 void
   3586 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3587 			RF_Raid_t *raidPtr)
   3588 {
   3589 	RF_ComponentLabel_t *clabel;
   3590 	int i;
   3591 
   3592 	clabel = ac->clabel;
   3593 
   3594 	/* 1. Fill in the common stuff */
   3595 	config->numRow = clabel->num_rows = 1;
   3596 	config->numCol = clabel->num_columns;
   3597 	config->numSpare = 0; /* XXX should this be set here? */
   3598 	config->sectPerSU = clabel->sectPerSU;
   3599 	config->SUsPerPU = clabel->SUsPerPU;
   3600 	config->SUsPerRU = clabel->SUsPerRU;
   3601 	config->parityConfig = clabel->parityConfig;
   3602 	/* XXX... */
   3603 	strcpy(config->diskQueueType,"fifo");
   3604 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3605 	config->layoutSpecificSize = 0; /* XXX ?? */
   3606 
   3607 	while(ac!=NULL) {
   3608 		/* row/col values will be in range due to the checks
   3609 		   in reasonable_label() */
   3610 		strcpy(config->devnames[0][ac->clabel->column],
   3611 		       ac->devname);
   3612 		ac = ac->next;
   3613 	}
   3614 
   3615 	for(i=0;i<RF_MAXDBGV;i++) {
   3616 		config->debugVars[i][0] = 0;
   3617 	}
   3618 }
   3619 
   3620 int
   3621 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3622 {
   3623 	RF_ComponentLabel_t *clabel;
   3624 	int column;
   3625 	int sparecol;
   3626 
   3627 	raidPtr->autoconfigure = new_value;
   3628 
   3629 	for(column=0; column<raidPtr->numCol; column++) {
   3630 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3631 			clabel = raidget_component_label(raidPtr, column);
   3632 			clabel->autoconfigure = new_value;
   3633 			raidflush_component_label(raidPtr, column);
   3634 		}
   3635 	}
   3636 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3637 		sparecol = raidPtr->numCol + column;
   3638 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3639 			clabel = raidget_component_label(raidPtr, sparecol);
   3640 			clabel->autoconfigure = new_value;
   3641 			raidflush_component_label(raidPtr, sparecol);
   3642 		}
   3643 	}
   3644 	return(new_value);
   3645 }
   3646 
   3647 int
   3648 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3649 {
   3650 	RF_ComponentLabel_t *clabel;
   3651 	int column;
   3652 	int sparecol;
   3653 
   3654 	raidPtr->root_partition = new_value;
   3655 	for(column=0; column<raidPtr->numCol; column++) {
   3656 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3657 			clabel = raidget_component_label(raidPtr, column);
   3658 			clabel->root_partition = new_value;
   3659 			raidflush_component_label(raidPtr, column);
   3660 		}
   3661 	}
   3662 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3663 		sparecol = raidPtr->numCol + column;
   3664 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3665 			clabel = raidget_component_label(raidPtr, sparecol);
   3666 			clabel->root_partition = new_value;
   3667 			raidflush_component_label(raidPtr, sparecol);
   3668 		}
   3669 	}
   3670 	return(new_value);
   3671 }
   3672 
   3673 void
   3674 rf_release_all_vps(RF_ConfigSet_t *cset)
   3675 {
   3676 	RF_AutoConfig_t *ac;
   3677 
   3678 	ac = cset->ac;
   3679 	while(ac!=NULL) {
   3680 		/* Close the vp, and give it back */
   3681 		if (ac->vp) {
   3682 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3683 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
   3684 			vput(ac->vp);
   3685 			ac->vp = NULL;
   3686 		}
   3687 		ac = ac->next;
   3688 	}
   3689 }
   3690 
   3691 
   3692 void
   3693 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3694 {
   3695 	RF_AutoConfig_t *ac;
   3696 	RF_AutoConfig_t *next_ac;
   3697 
   3698 	ac = cset->ac;
   3699 	while(ac!=NULL) {
   3700 		next_ac = ac->next;
   3701 		/* nuke the label */
   3702 		free(ac->clabel, M_RAIDFRAME);
   3703 		/* cleanup the config structure */
   3704 		free(ac, M_RAIDFRAME);
   3705 		/* "next.." */
   3706 		ac = next_ac;
   3707 	}
   3708 	/* and, finally, nuke the config set */
   3709 	free(cset, M_RAIDFRAME);
   3710 }
   3711 
   3712 
   3713 void
   3714 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   3715 {
   3716 	/* current version number */
   3717 	clabel->version = RF_COMPONENT_LABEL_VERSION;
   3718 	clabel->serial_number = raidPtr->serial_number;
   3719 	clabel->mod_counter = raidPtr->mod_counter;
   3720 
   3721 	clabel->num_rows = 1;
   3722 	clabel->num_columns = raidPtr->numCol;
   3723 	clabel->clean = RF_RAID_DIRTY; /* not clean */
   3724 	clabel->status = rf_ds_optimal; /* "It's good!" */
   3725 
   3726 	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
   3727 	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
   3728 	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
   3729 
   3730 	clabel->blockSize = raidPtr->bytesPerSector;
   3731 	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
   3732 
   3733 	/* XXX not portable */
   3734 	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
   3735 	clabel->maxOutstanding = raidPtr->maxOutstanding;
   3736 	clabel->autoconfigure = raidPtr->autoconfigure;
   3737 	clabel->root_partition = raidPtr->root_partition;
   3738 	clabel->last_unit = raidPtr->raidid;
   3739 	clabel->config_order = raidPtr->config_order;
   3740 
   3741 #ifndef RF_NO_PARITY_MAP
   3742 	rf_paritymap_init_label(raidPtr->parity_map, clabel);
   3743 #endif
   3744 }
   3745 
   3746 struct raid_softc *
   3747 rf_auto_config_set(RF_ConfigSet_t *cset)
   3748 {
   3749 	RF_Raid_t *raidPtr;
   3750 	RF_Config_t *config;
   3751 	int raidID;
   3752 	struct raid_softc *sc;
   3753 
   3754 #ifdef DEBUG
   3755 	printf("RAID autoconfigure\n");
   3756 #endif
   3757 
   3758 	/* 1. Create a config structure */
   3759 	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
   3760 	if (config == NULL) {
   3761 		printf("Out of mem!?!?\n");
   3762 				/* XXX do something more intelligent here. */
   3763 		return NULL;
   3764 	}
   3765 
   3766 	/*
   3767 	   2. Figure out what RAID ID this one is supposed to live at
   3768 	   See if we can get the same RAID dev that it was configured
   3769 	   on last time..
   3770 	*/
   3771 
   3772 	raidID = cset->ac->clabel->last_unit;
   3773 	for (sc = raidget(raidID); sc->sc_r.valid != 0; sc = raidget(++raidID))
   3774 		continue;
   3775 #ifdef DEBUG
   3776 	printf("Configuring raid%d:\n",raidID);
   3777 #endif
   3778 
   3779 	raidPtr = &sc->sc_r;
   3780 
   3781 	/* XXX all this stuff should be done SOMEWHERE ELSE! */
   3782 	raidPtr->softc = sc;
   3783 	raidPtr->raidid = raidID;
   3784 	raidPtr->openings = RAIDOUTSTANDING;
   3785 
   3786 	/* 3. Build the configuration structure */
   3787 	rf_create_configuration(cset->ac, config, raidPtr);
   3788 
   3789 	/* 4. Do the configuration */
   3790 	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
   3791 		raidinit(sc);
   3792 
   3793 		rf_markalldirty(raidPtr);
   3794 		raidPtr->autoconfigure = 1; /* XXX do this here? */
   3795 		switch (cset->ac->clabel->root_partition) {
   3796 		case 1:	/* Force Root */
   3797 		case 2:	/* Soft Root: root when boot partition part of raid */
   3798 			/*
   3799 			 * everything configured just fine.  Make a note
   3800 			 * that this set is eligible to be root,
   3801 			 * or forced to be root
   3802 			 */
   3803 			cset->rootable = cset->ac->clabel->root_partition;
   3804 			/* XXX do this here? */
   3805 			raidPtr->root_partition = cset->rootable;
   3806 			break;
   3807 		default:
   3808 			break;
   3809 		}
   3810 	} else {
   3811 		raidput(sc);
   3812 		sc = NULL;
   3813 	}
   3814 
   3815 	/* 5. Cleanup */
   3816 	free(config, M_RAIDFRAME);
   3817 	return sc;
   3818 }
   3819 
   3820 void
   3821 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
   3822 {
   3823 	struct buf *bp;
   3824 	struct raid_softc *rs;
   3825 
   3826 	bp = (struct buf *)desc->bp;
   3827 	rs = desc->raidPtr->softc;
   3828 	disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid),
   3829 	    (bp->b_flags & B_READ));
   3830 }
   3831 
   3832 void
   3833 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3834 	     size_t xmin, size_t xmax)
   3835 {
   3836 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3837 	pool_sethiwat(p, xmax);
   3838 	pool_prime(p, xmin);
   3839 	pool_setlowat(p, xmin);
   3840 }
   3841 
   3842 /*
   3843  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buf_queue to see
   3844  * if there is IO pending and if that IO could possibly be done for a
   3845  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3846  * otherwise.
   3847  *
   3848  */
   3849 
   3850 int
   3851 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3852 {
   3853 	struct raid_softc *rs = raidPtr->softc;
   3854 	if ((bufq_peek(rs->buf_queue) != NULL) && raidPtr->openings > 0) {
   3855 		/* there is work to do */
   3856 		return 0;
   3857 	}
   3858 	/* default is nothing to do */
   3859 	return 1;
   3860 }
   3861 
   3862 int
   3863 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3864 {
   3865 	uint64_t numsecs;
   3866 	unsigned secsize;
   3867 	int error;
   3868 
   3869 	error = getdisksize(vp, &numsecs, &secsize);
   3870 	if (error == 0) {
   3871 		diskPtr->blockSize = secsize;
   3872 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3873 		diskPtr->partitionSize = numsecs;
   3874 		return 0;
   3875 	}
   3876 	return error;
   3877 }
   3878 
   3879 static int
   3880 raid_match(device_t self, cfdata_t cfdata, void *aux)
   3881 {
   3882 	return 1;
   3883 }
   3884 
   3885 static void
   3886 raid_attach(device_t parent, device_t self, void *aux)
   3887 {
   3888 
   3889 }
   3890 
   3891 
   3892 static int
   3893 raid_detach(device_t self, int flags)
   3894 {
   3895 	int error;
   3896 	struct raid_softc *rs = raidget(device_unit(self));
   3897 
   3898 	if (rs == NULL)
   3899 		return ENXIO;
   3900 
   3901 	if ((error = raidlock(rs)) != 0)
   3902 		return (error);
   3903 
   3904 	error = raid_detach_unlocked(rs);
   3905 
   3906 	raidunlock(rs);
   3907 
   3908 	/* XXXkd: raidput(rs) ??? */
   3909 
   3910 	return error;
   3911 }
   3912 
   3913 static void
   3914 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3915 {
   3916 	struct disk_geom *dg = &rs->sc_dkdev.dk_geom;
   3917 
   3918 	memset(dg, 0, sizeof(*dg));
   3919 
   3920 	dg->dg_secperunit = raidPtr->totalSectors;
   3921 	dg->dg_secsize = raidPtr->bytesPerSector;
   3922 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   3923 	dg->dg_ntracks = 4 * raidPtr->numCol;
   3924 
   3925 	disk_set_info(rs->sc_dev, &rs->sc_dkdev, NULL);
   3926 }
   3927 
   3928 /*
   3929  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3930  * We end up returning whatever error was returned by the first cache flush
   3931  * that fails.
   3932  */
   3933 
   3934 int
   3935 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3936 {
   3937 	int c, sparecol;
   3938 	int e,error;
   3939 	int force = 1;
   3940 
   3941 	error = 0;
   3942 	for (c = 0; c < raidPtr->numCol; c++) {
   3943 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3944 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3945 					  &force, FWRITE, NOCRED);
   3946 			if (e) {
   3947 				if (e != ENODEV)
   3948 					printf("raid%d: cache flush to component %s failed.\n",
   3949 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3950 				if (error == 0) {
   3951 					error = e;
   3952 				}
   3953 			}
   3954 		}
   3955 	}
   3956 
   3957 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3958 		sparecol = raidPtr->numCol + c;
   3959 		/* Need to ensure that the reconstruct actually completed! */
   3960 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3961 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3962 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3963 			if (e) {
   3964 				if (e != ENODEV)
   3965 					printf("raid%d: cache flush to component %s failed.\n",
   3966 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3967 				if (error == 0) {
   3968 					error = e;
   3969 				}
   3970 			}
   3971 		}
   3972 	}
   3973 	return error;
   3974 }
   3975