Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.302
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.302 2013/04/29 21:21:10 christos Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.302 2013/04/29 21:21:10 christos Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 
    130 #include <prop/proplib.h>
    131 
    132 #include <dev/raidframe/raidframevar.h>
    133 #include <dev/raidframe/raidframeio.h>
    134 #include <dev/raidframe/rf_paritymap.h>
    135 
    136 #include "rf_raid.h"
    137 #include "rf_copyback.h"
    138 #include "rf_dag.h"
    139 #include "rf_dagflags.h"
    140 #include "rf_desc.h"
    141 #include "rf_diskqueue.h"
    142 #include "rf_etimer.h"
    143 #include "rf_general.h"
    144 #include "rf_kintf.h"
    145 #include "rf_options.h"
    146 #include "rf_driver.h"
    147 #include "rf_parityscan.h"
    148 #include "rf_threadstuff.h"
    149 
    150 #ifdef COMPAT_50
    151 #include "rf_compat50.h"
    152 #endif
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    162 static rf_declare_mutex2(rf_sparet_wait_mutex);
    163 static rf_declare_cond2(rf_sparet_wait_cv);
    164 static rf_declare_cond2(rf_sparet_resp_cv);
    165 
    166 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    167 						 * spare table */
    168 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    169 						 * installation process */
    170 #endif
    171 
    172 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    173 
    174 /* prototypes */
    175 static void KernelWakeupFunc(struct buf *);
    176 static void InitBP(struct buf *, struct vnode *, unsigned,
    177     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    178     void *, int, struct proc *);
    179 struct raid_softc;
    180 static void raidinit(struct raid_softc *);
    181 
    182 void raidattach(int);
    183 static int raid_match(device_t, cfdata_t, void *);
    184 static void raid_attach(device_t, device_t, void *);
    185 static int raid_detach(device_t, int);
    186 
    187 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    188     daddr_t, daddr_t);
    189 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t, int);
    191 
    192 static int raidwrite_component_label(unsigned,
    193     dev_t, struct vnode *, RF_ComponentLabel_t *);
    194 static int raidread_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 
    197 
    198 dev_type_open(raidopen);
    199 dev_type_close(raidclose);
    200 dev_type_read(raidread);
    201 dev_type_write(raidwrite);
    202 dev_type_ioctl(raidioctl);
    203 dev_type_strategy(raidstrategy);
    204 dev_type_dump(raiddump);
    205 dev_type_size(raidsize);
    206 
    207 const struct bdevsw raid_bdevsw = {
    208 	raidopen, raidclose, raidstrategy, raidioctl,
    209 	raiddump, raidsize, D_DISK
    210 };
    211 
    212 const struct cdevsw raid_cdevsw = {
    213 	raidopen, raidclose, raidread, raidwrite, raidioctl,
    214 	nostop, notty, nopoll, nommap, nokqfilter, D_DISK
    215 };
    216 
    217 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
    218 
    219 struct raid_softc {
    220 	device_t sc_dev;
    221 	int	sc_unit;
    222 	int     sc_flags;	/* flags */
    223 	int     sc_cflags;	/* configuration flags */
    224 	uint64_t sc_size;	/* size of the raid device */
    225 	char    sc_xname[20];	/* XXX external name */
    226 	struct disk sc_dkdev;	/* generic disk device info */
    227 	struct bufq_state *buf_queue;	/* used for the device queue */
    228 	RF_Raid_t sc_r;
    229 	LIST_ENTRY(raid_softc) sc_link;
    230 };
    231 /* sc_flags */
    232 #define RAIDF_INITED	0x01	/* unit has been initialized */
    233 #define RAIDF_WLABEL	0x02	/* label area is writable */
    234 #define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
    235 #define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
    236 #define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
    237 #define RAIDF_LOCKED	0x80	/* unit is locked */
    238 
    239 #define	raidunit(x)	DISKUNIT(x)
    240 
    241 extern struct cfdriver raid_cd;
    242 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    243     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    244     DVF_DETACH_SHUTDOWN);
    245 
    246 /*
    247  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    248  * Be aware that large numbers can allow the driver to consume a lot of
    249  * kernel memory, especially on writes, and in degraded mode reads.
    250  *
    251  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    252  * a single 64K write will typically require 64K for the old data,
    253  * 64K for the old parity, and 64K for the new parity, for a total
    254  * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
    256  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    257  *
    258  * Now in degraded mode, for example, a 64K read on the above setup may
    259  * require data reconstruction, which will require *all* of the 4 remaining
    260  * disks to participate -- 4 * 32K/disk == 128K again.
    261  */
    262 
    263 #ifndef RAIDOUTSTANDING
    264 #define RAIDOUTSTANDING   6
    265 #endif
    266 
    267 #define RAIDLABELDEV(dev)	\
    268 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    269 
    270 /* declared here, and made public, for the benefit of KVM stuff.. */
    271 
    272 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
    273 				     struct disklabel *);
    274 static void raidgetdisklabel(dev_t);
    275 static void raidmakedisklabel(struct raid_softc *);
    276 
    277 static int raidlock(struct raid_softc *);
    278 static void raidunlock(struct raid_softc *);
    279 
    280 static int raid_detach_unlocked(struct raid_softc *);
    281 
    282 static void rf_markalldirty(RF_Raid_t *);
    283 static void rf_set_properties(struct raid_softc *, RF_Raid_t *);
    284 
    285 void rf_ReconThread(struct rf_recon_req *);
    286 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    287 void rf_CopybackThread(RF_Raid_t *raidPtr);
    288 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    289 int rf_autoconfig(device_t);
    290 void rf_buildroothack(RF_ConfigSet_t *);
    291 
    292 RF_AutoConfig_t *rf_find_raid_components(void);
    293 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    294 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    295 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    296 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    297 int rf_set_autoconfig(RF_Raid_t *, int);
    298 int rf_set_rootpartition(RF_Raid_t *, int);
    299 void rf_release_all_vps(RF_ConfigSet_t *);
    300 void rf_cleanup_config_set(RF_ConfigSet_t *);
    301 int rf_have_enough_components(RF_ConfigSet_t *);
    302 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    303 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    304 
    305 /*
    306  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    307  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    308  * in the kernel config file.
    309  */
    310 #ifdef RAID_AUTOCONFIG
    311 int raidautoconfig = 1;
    312 #else
    313 int raidautoconfig = 0;
    314 #endif
    315 static bool raidautoconfigdone = false;
    316 
    317 struct RF_Pools_s rf_pools;
    318 
    319 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    320 static kmutex_t raid_lock;
    321 
    322 static struct raid_softc *
    323 raidcreate(int unit) {
    324 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    325 	if (sc == NULL) {
    326 #ifdef DIAGNOSTIC
    327 		printf("%s: out of memory\n", __func__);
    328 #endif
    329 		return NULL;
    330 	}
    331 	sc->sc_unit = unit;
    332 	bufq_alloc(&sc->buf_queue, BUFQ_DISK_DEFAULT_STRAT, BUFQ_SORT_RAWBLOCK);
    333 	return sc;
    334 }
    335 
/*
 * Release a softc created by raidcreate(): free its buffer queue,
 * then the structure itself.  Caller must have removed it from any
 * global list first.
 */
static void
raiddestroy(struct raid_softc *sc) {
	bufq_free(sc->buf_queue);
	kmem_free(sc, sizeof(*sc));
}
    341 
    342 static struct raid_softc *
    343 raidget(int unit) {
    344 	struct raid_softc *sc;
    345 	if (unit < 0) {
    346 #ifdef DIAGNOSTIC
    347 		panic("%s: unit %d!", __func__, unit);
    348 #endif
    349 		return NULL;
    350 	}
    351 	mutex_enter(&raid_lock);
    352 	LIST_FOREACH(sc, &raids, sc_link) {
    353 		if (sc->sc_unit == unit) {
    354 			mutex_exit(&raid_lock);
    355 			return sc;
    356 		}
    357 	}
    358 	mutex_exit(&raid_lock);
    359 	if ((sc = raidcreate(unit)) == NULL)
    360 		return NULL;
    361 	mutex_enter(&raid_lock);
    362 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    363 	mutex_exit(&raid_lock);
    364 	return sc;
    365 }
    366 
/*
 * Remove a softc from the global "raids" list (under raid_lock) and
 * free it.  Counterpart of raidget() for units being torn down.
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    374 
/*
 * Driver-wide initialization, called once at boot.  "num" (the unit
 * count hinted by config) is unused here: softcs are created lazily
 * by raidget().  Boots the RAIDframe core, attaches the autoconf
 * glue, and registers a finalizer so RAID sets are auto-configured
 * only after all real hardware has been found.
 */
void
raidattach(int num)
{
	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	/* This is where all the initialization stuff gets done. */

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Synchronization for the spare-table installation handshake. */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* A failure here means the core RAIDframe engine is unusable. */
	if (rf_BootRaidframe() == 0)
		aprint_verbose("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		aprint_error("raidattach: config_cfattach_attach failed?\n");
	}

	raidautoconfigdone = false;

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
}
    407 
    408 int
    409 rf_autoconfig(device_t self)
    410 {
    411 	RF_AutoConfig_t *ac_list;
    412 	RF_ConfigSet_t *config_sets;
    413 
    414 	if (!raidautoconfig || raidautoconfigdone == true)
    415 		return (0);
    416 
    417 	/* XXX This code can only be run once. */
    418 	raidautoconfigdone = true;
    419 
    420 	/* 1. locate all RAID components on the system */
    421 	aprint_debug("Searching for RAID components...\n");
    422 	ac_list = rf_find_raid_components();
    423 
    424 	/* 2. Sort them into their respective sets. */
    425 	config_sets = rf_create_auto_sets(ac_list);
    426 
    427 	/*
    428 	 * 3. Evaluate each set and configure the valid ones.
    429 	 * This gets done in rf_buildroothack().
    430 	 */
    431 	rf_buildroothack(config_sets);
    432 
    433 	return 1;
    434 }
    435 
/*
 * Configure each eligible auto-config set and, if exactly one
 * configured set is marked rootable, point booted_device at it so it
 * can become the root device.  With multiple rootable candidates, try
 * to disambiguate via the MD root code; failing that, ask the user.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int col;
	int num_root;
	char *devname;
	struct raid_softc *sc, *rsc;

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		/* cset is consumed below; grab the link first. */
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok\n",
				    sc->sc_unit);
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	if (num_root == 1) {
		if (rsc->sc_dkdev.dk_nwedges != 0) {
			/* XXX: How do we find the real root partition? */
			/* NOTE(review): buffer sized via sizeof through the
			   type only; cset is exhausted (NULL) at this point,
			   but sizeof does not evaluate its operand. */
			char cname[sizeof(cset->ac->devname)];
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(rsc->sc_dev), 'a');
			booted_device = dkwedge_find_by_wname(cname);
		} else
			booted_device = rsc->sc_dev;
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */

		if (booted_device == NULL)
			cpu_rootconf();
		if (booted_device == NULL)
			return;

		/* Count the rootable sets that contain the boot device. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			for (col = 0; col < r->numCol; col++) {
				/* Skip the "/dev/" prefix of the component name. */
				devname = r->Disks[col].devname;
				devname += sizeof("/dev/") - 1;
				/* NOTE(review): prefix match -- a boot device
				   named "wd1" would also match a component on
				   "wd10"; confirm this is intended. */
				if (strncmp(devname, device_xname(booted_device),
					    strlen(device_xname(booted_device))) != 0)
					continue;
				aprint_debug("raid%d includes boot device %s\n",
				       sc->sc_unit, devname);
				num_root++;
				rsc = sc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = rsc->sc_dev;
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    539 
    540 
    541 int
    542 raidsize(dev_t dev)
    543 {
    544 	struct raid_softc *rs;
    545 	struct disklabel *lp;
    546 	int     part, unit, omask, size;
    547 
    548 	unit = raidunit(dev);
    549 	if ((rs = raidget(unit)) == NULL)
    550 		return -1;
    551 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    552 		return (-1);
    553 
    554 	part = DISKPART(dev);
    555 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
    556 	lp = rs->sc_dkdev.dk_label;
    557 
    558 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
    559 		return (-1);
    560 
    561 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
    562 		size = -1;
    563 	else
    564 		size = lp->d_partitions[part].p_size *
    565 		    (lp->d_secsize / DEV_BSIZE);
    566 
    567 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
    568 		return (-1);
    569 
    570 	return (size);
    571 
    572 }
    573 
/*
 * Kernel crash-dump entry point.  Only RAID 1 sets are supported:
 * pick a live component (or a used spare standing in for one) and
 * forward the dump to that component's block device, offsetting
 * blkno into the component's address space.
 */
int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const struct bdevsw *bdev;
	struct disklabel *lp;
	RF_Raid_t *raidPtr;
	daddr_t offset;
	int     part, c, sparecol, j, scol, dumpto;
	int     error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;

	raidPtr = &rs->sc_r;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;


	if ((error = raidlock(rs)) != 0)
		return error;

	/* Dumps must be whole DEV_BSIZE blocks... */
	if (size % DEV_BSIZE != 0) {
		error = EINVAL;
		goto out;
	}

	/* ...and must fit inside the RAID device. */
	if (blkno + size / DEV_BSIZE > rs->sc_size) {
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
		    size / DEV_BSIZE, rs->sc_size);
		error = EINVAL;
		goto out;
	}

	part = DISKPART(dev);
	lp = rs->sc_dkdev.dk_label;
	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			/* Find which column this spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	/*
	   Note that blkno is relative to this particular partition.
	   By adding the offset of this partition in the RAID
	   set, and also adding RF_PROTECTED_SECTORS, we get a
	   value that is relative to the partition used for the
	   underlying component.
	*/

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno + offset, va, size);

out:
	raidunlock(rs);

	return error;
}
    710 /* ARGSUSED */
/*
 * Open a partition of a RAID unit.  Creates the softc on first
 * reference, validates the partition, records the open in the
 * char/block open masks (which prevents unconfiguration while open),
 * and marks components dirty on the first open of a configured set.
 */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int     part, pmask;
	int     error = 0;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	/* Refuse new opens on a unit that is being torn down. */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* First open of a configured unit: (re)read the disklabel. */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}


	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	raidunlock(rs);

	return (error);


}
    794 /* ARGSUSED */
/*
 * Close a partition of a RAID unit.  Clears the partition's bit in
 * the appropriate open mask; on the last close of a configured unit,
 * writes final (clean) component labels.  Always returns 0.
 */
int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	int     error = 0;
	int     part;

	if ((rs = raidget(unit)) == NULL)
		return ENXIO;

	if ((error = raidlock(rs)) != 0)
		return (error);

	part = DISKPART(dev);

	/* ...that much closer to allowing unconfiguration... */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
		break;
	}
	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* Last one... device is not unconfigured yet.
		   Device shutdown has taken care of setting the
		   clean bits if RAIDF_INITED is not set
		   mark things as clean... */

		rf_update_component_labels(&rs->sc_r,
						 RF_FINAL_COMPONENT_UPDATE);

		/* If the kernel is shutting down, it will detach
		 * this RAID set soon enough.
		 */
	}

	raidunlock(rs);
	return (0);

}
    843 
/*
 * Block-I/O entry point: validate the request, bounds-check it
 * against either the raw device size or the disklabel, queue the buf
 * on the unit's bufq, and wake the RAIDframe I/O thread.  Errors are
 * reported by completing the buf with b_error set.
 */
void
raidstrategy(struct buf *bp)
{
	unsigned int unit = raidunit(bp->b_dev);
	RF_Raid_t *raidPtr;
	int     wlabel;
	struct raid_softc *rs;

	if ((rs = raidget(unit)) == NULL) {
		bp->b_error = ENXIO;
		goto done;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto done;
	}
	raidPtr = &rs->sc_r;
	if (!raidPtr->valid) {
		bp->b_error = ENODEV;
		goto done;
	}
	/* Zero-length transfers complete immediately with no error. */
	if (bp->b_bcount == 0) {
		db1_printf(("b_bcount is zero..\n"));
		goto done;
	}

	/*
	 * Do bounds checking and adjust transfer.  If there's an
	 * error, the bounds check will flag that for us.
	 */

	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
	if (DISKPART(bp->b_dev) == RAW_PART) {
		uint64_t size; /* device size in DEV_BSIZE unit */

		/* Convert totalSectors (native sector size) to DEV_BSIZE
		   units, shifting in whichever direction applies. */
		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
			size = raidPtr->totalSectors <<
			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
		} else {
			size = raidPtr->totalSectors >>
			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
		}
		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
			goto done;
		}
	} else {
		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
			db1_printf(("Bounds check failed!!:%d %d\n",
				(int) bp->b_blkno, (int) wlabel));
			goto done;
		}
	}

	rf_lock_mutex2(raidPtr->iodone_lock);

	bp->b_resid = 0;

	/* stuff it onto our queue */
	bufq_put(rs->buf_queue, bp);

	/* scheduled the IO to happen at the next convenient time */
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);

	return;

done:
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}
    914 /* ARGSUSED */
    915 int
    916 raidread(dev_t dev, struct uio *uio, int flags)
    917 {
    918 	int     unit = raidunit(dev);
    919 	struct raid_softc *rs;
    920 
    921 	if ((rs = raidget(unit)) == NULL)
    922 		return ENXIO;
    923 
    924 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    925 		return (ENXIO);
    926 
    927 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    928 
    929 }
    930 /* ARGSUSED */
    931 int
    932 raidwrite(dev_t dev, struct uio *uio, int flags)
    933 {
    934 	int     unit = raidunit(dev);
    935 	struct raid_softc *rs;
    936 
    937 	if ((rs = raidget(unit)) == NULL)
    938 		return ENXIO;
    939 
    940 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    941 		return (ENXIO);
    942 
    943 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    944 
    945 }
    946 
    947 static int
    948 raid_detach_unlocked(struct raid_softc *rs)
    949 {
    950 	int error;
    951 	RF_Raid_t *raidPtr;
    952 
    953 	raidPtr = &rs->sc_r;
    954 
    955 	/*
    956 	 * If somebody has a partition mounted, we shouldn't
    957 	 * shutdown.
    958 	 */
    959 	if (rs->sc_dkdev.dk_openmask != 0)
    960 		return EBUSY;
    961 
    962 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    963 		;	/* not initialized: nothing to do */
    964 	else if ((error = rf_Shutdown(raidPtr)) != 0)
    965 		return error;
    966 	else
    967 		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
    968 
    969 	/* Detach the disk. */
    970 	dkwedge_delall(&rs->sc_dkdev);
    971 	disk_detach(&rs->sc_dkdev);
    972 	disk_destroy(&rs->sc_dkdev);
    973 
    974 	aprint_normal_dev(rs->sc_dev, "detached\n");
    975 
    976 	return 0;
    977 }
    978 
    979 int
    980 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
    981 {
    982 	int     unit = raidunit(dev);
    983 	int     error = 0;
    984 	int     part, pmask, s;
    985 	cfdata_t cf;
    986 	struct raid_softc *rs;
    987 	RF_Config_t *k_cfg, *u_cfg;
    988 	RF_Raid_t *raidPtr;
    989 	RF_RaidDisk_t *diskPtr;
    990 	RF_AccTotals_t *totals;
    991 	RF_DeviceConfig_t *d_cfg, **ucfgp;
    992 	u_char *specific_buf;
    993 	int retcode = 0;
    994 	int column;
    995 /*	int raidid; */
    996 	struct rf_recon_req *rrcopy, *rr;
    997 	RF_ComponentLabel_t *clabel;
    998 	RF_ComponentLabel_t *ci_label;
    999 	RF_ComponentLabel_t **clabel_ptr;
   1000 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1001 	RF_SingleComponent_t component;
   1002 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
   1003 	int i, j, d;
   1004 #ifdef __HAVE_OLD_DISKLABEL
   1005 	struct disklabel newlabel;
   1006 #endif
   1007 	struct dkwedge_info *dkw;
   1008 
   1009 	if ((rs = raidget(unit)) == NULL)
   1010 		return ENXIO;
   1011 	raidPtr = &rs->sc_r;
   1012 
   1013 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1014 		(int) DISKPART(dev), (int) unit, cmd));
   1015 
   1016 	/* Must be open for writes for these commands... */
   1017 	switch (cmd) {
   1018 #ifdef DIOCGSECTORSIZE
   1019 	case DIOCGSECTORSIZE:
   1020 		*(u_int *)data = raidPtr->bytesPerSector;
   1021 		return 0;
   1022 	case DIOCGMEDIASIZE:
   1023 		*(off_t *)data =
   1024 		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
   1025 		return 0;
   1026 #endif
   1027 	case DIOCSDINFO:
   1028 	case DIOCWDINFO:
   1029 #ifdef __HAVE_OLD_DISKLABEL
   1030 	case ODIOCWDINFO:
   1031 	case ODIOCSDINFO:
   1032 #endif
   1033 	case DIOCWLABEL:
   1034 	case DIOCAWEDGE:
   1035 	case DIOCDWEDGE:
   1036 	case DIOCSSTRATEGY:
   1037 		if ((flag & FWRITE) == 0)
   1038 			return (EBADF);
   1039 	}
   1040 
   1041 	/* Must be initialized for these... */
   1042 	switch (cmd) {
   1043 	case DIOCGDINFO:
   1044 	case DIOCSDINFO:
   1045 	case DIOCWDINFO:
   1046 #ifdef __HAVE_OLD_DISKLABEL
   1047 	case ODIOCGDINFO:
   1048 	case ODIOCWDINFO:
   1049 	case ODIOCSDINFO:
   1050 	case ODIOCGDEFLABEL:
   1051 #endif
   1052 	case DIOCGPART:
   1053 	case DIOCWLABEL:
   1054 	case DIOCGDEFLABEL:
   1055 	case DIOCAWEDGE:
   1056 	case DIOCDWEDGE:
   1057 	case DIOCLWEDGES:
   1058 	case DIOCCACHESYNC:
   1059 	case RAIDFRAME_SHUTDOWN:
   1060 	case RAIDFRAME_REWRITEPARITY:
   1061 	case RAIDFRAME_GET_INFO:
   1062 	case RAIDFRAME_RESET_ACCTOTALS:
   1063 	case RAIDFRAME_GET_ACCTOTALS:
   1064 	case RAIDFRAME_KEEP_ACCTOTALS:
   1065 	case RAIDFRAME_GET_SIZE:
   1066 	case RAIDFRAME_FAIL_DISK:
   1067 	case RAIDFRAME_COPYBACK:
   1068 	case RAIDFRAME_CHECK_RECON_STATUS:
   1069 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1070 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1071 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1072 	case RAIDFRAME_ADD_HOT_SPARE:
   1073 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1074 	case RAIDFRAME_INIT_LABELS:
   1075 	case RAIDFRAME_REBUILD_IN_PLACE:
   1076 	case RAIDFRAME_CHECK_PARITY:
   1077 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1078 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1079 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1080 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1081 	case RAIDFRAME_SET_AUTOCONFIG:
   1082 	case RAIDFRAME_SET_ROOT:
   1083 	case RAIDFRAME_DELETE_COMPONENT:
   1084 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1085 	case RAIDFRAME_PARITYMAP_STATUS:
   1086 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1087 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1088 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1089 	case DIOCGSTRATEGY:
   1090 	case DIOCSSTRATEGY:
   1091 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1092 			return (ENXIO);
   1093 	}
   1094 
   1095 	switch (cmd) {
   1096 #ifdef COMPAT_50
   1097 	case RAIDFRAME_GET_INFO50:
   1098 		return rf_get_info50(raidPtr, data);
   1099 
   1100 	case RAIDFRAME_CONFIGURE50:
   1101 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1102 			return retcode;
   1103 		goto config;
   1104 #endif
   1105 		/* configure the system */
   1106 	case RAIDFRAME_CONFIGURE:
   1107 
   1108 		if (raidPtr->valid) {
   1109 			/* There is a valid RAID set running on this unit! */
   1110 			printf("raid%d: Device already configured!\n",unit);
   1111 			return(EINVAL);
   1112 		}
   1113 
   1114 		/* copy-in the configuration information */
   1115 		/* data points to a pointer to the configuration structure */
   1116 
   1117 		u_cfg = *((RF_Config_t **) data);
   1118 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1119 		if (k_cfg == NULL) {
   1120 			return (ENOMEM);
   1121 		}
   1122 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1123 		if (retcode) {
   1124 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1125 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1126 				retcode));
   1127 			return (retcode);
   1128 		}
   1129 		goto config;
   1130 	config:
   1131 		/* allocate a buffer for the layout-specific data, and copy it
   1132 		 * in */
   1133 		if (k_cfg->layoutSpecificSize) {
   1134 			if (k_cfg->layoutSpecificSize > 10000) {
   1135 				/* sanity check */
   1136 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1137 				return (EINVAL);
   1138 			}
   1139 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1140 			    (u_char *));
   1141 			if (specific_buf == NULL) {
   1142 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1143 				return (ENOMEM);
   1144 			}
   1145 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1146 			    k_cfg->layoutSpecificSize);
   1147 			if (retcode) {
   1148 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1149 				RF_Free(specific_buf,
   1150 					k_cfg->layoutSpecificSize);
   1151 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1152 					retcode));
   1153 				return (retcode);
   1154 			}
   1155 		} else
   1156 			specific_buf = NULL;
   1157 		k_cfg->layoutSpecific = specific_buf;
   1158 
   1159 		/* should do some kind of sanity check on the configuration.
   1160 		 * Store the sum of all the bytes in the last byte? */
   1161 
   1162 		/* configure the system */
   1163 
   1164 		/*
   1165 		 * Clear the entire RAID descriptor, just to make sure
   1166 		 *  there is no stale data left in the case of a
   1167 		 *  reconfiguration
   1168 		 */
   1169 		memset(raidPtr, 0, sizeof(*raidPtr));
   1170 		raidPtr->softc = rs;
   1171 		raidPtr->raidid = unit;
   1172 
   1173 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1174 
   1175 		if (retcode == 0) {
   1176 
   1177 			/* allow this many simultaneous IO's to
   1178 			   this RAID device */
   1179 			raidPtr->openings = RAIDOUTSTANDING;
   1180 
   1181 			raidinit(rs);
   1182 			rf_markalldirty(raidPtr);
   1183 		}
   1184 		/* free the buffers.  No return code here. */
   1185 		if (k_cfg->layoutSpecificSize) {
   1186 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1187 		}
   1188 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1189 
   1190 		return (retcode);
   1191 
   1192 		/* shutdown the system */
   1193 	case RAIDFRAME_SHUTDOWN:
   1194 
   1195 		part = DISKPART(dev);
   1196 		pmask = (1 << part);
   1197 
   1198 		if ((error = raidlock(rs)) != 0)
   1199 			return (error);
   1200 
   1201 		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
   1202 		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
   1203 			(rs->sc_dkdev.dk_copenmask & pmask)))
   1204 			retcode = EBUSY;
   1205 		else {
   1206 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1207 			rs->sc_dkdev.dk_copenmask &= ~pmask;
   1208 			rs->sc_dkdev.dk_bopenmask &= ~pmask;
   1209 			rs->sc_dkdev.dk_openmask &= ~pmask;
   1210 			retcode = 0;
   1211 		}
   1212 
   1213 		raidunlock(rs);
   1214 
   1215 		if (retcode != 0)
   1216 			return retcode;
   1217 
   1218 		/* free the pseudo device attach bits */
   1219 
   1220 		cf = device_cfdata(rs->sc_dev);
   1221 		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
   1222 			free(cf, M_RAIDFRAME);
   1223 
   1224 		return (retcode);
   1225 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1226 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1227 		/* need to read the component label for the disk indicated
   1228 		   by row,column in clabel */
   1229 
   1230 		/*
   1231 		 * Perhaps there should be an option to skip the in-core
   1232 		 * copy and hit the disk, as with disklabel(8).
   1233 		 */
   1234 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1235 
   1236 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1237 
   1238 		if (retcode) {
   1239 			RF_Free(clabel, sizeof(*clabel));
   1240 			return retcode;
   1241 		}
   1242 
   1243 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1244 
   1245 		column = clabel->column;
   1246 
   1247 		if ((column < 0) || (column >= raidPtr->numCol +
   1248 		    raidPtr->numSpare)) {
   1249 			RF_Free(clabel, sizeof(*clabel));
   1250 			return EINVAL;
   1251 		}
   1252 
   1253 		RF_Free(clabel, sizeof(*clabel));
   1254 
   1255 		clabel = raidget_component_label(raidPtr, column);
   1256 
   1257 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1258 
   1259 #if 0
   1260 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1261 		clabel = (RF_ComponentLabel_t *) data;
   1262 
   1263 		/* XXX check the label for valid stuff... */
   1264 		/* Note that some things *should not* get modified --
   1265 		   the user should be re-initing the labels instead of
   1266 		   trying to patch things.
   1267 		   */
   1268 
   1269 		raidid = raidPtr->raidid;
   1270 #ifdef DEBUG
   1271 		printf("raid%d: Got component label:\n", raidid);
   1272 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1273 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1274 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1275 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1276 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1277 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1278 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1279 #endif
   1280 		clabel->row = 0;
   1281 		column = clabel->column;
   1282 
   1283 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1284 			return(EINVAL);
   1285 		}
   1286 
   1287 		/* XXX this isn't allowed to do anything for now :-) */
   1288 
   1289 		/* XXX and before it is, we need to fill in the rest
   1290 		   of the fields!?!?!?! */
   1291 		memcpy(raidget_component_label(raidPtr, column),
   1292 		    clabel, sizeof(*clabel));
   1293 		raidflush_component_label(raidPtr, column);
   1294 		return (0);
   1295 #endif
   1296 
   1297 	case RAIDFRAME_INIT_LABELS:
   1298 		clabel = (RF_ComponentLabel_t *) data;
   1299 		/*
   1300 		   we only want the serial number from
   1301 		   the above.  We get all the rest of the information
   1302 		   from the config that was used to create this RAID
   1303 		   set.
   1304 		   */
   1305 
   1306 		raidPtr->serial_number = clabel->serial_number;
   1307 
   1308 		for(column=0;column<raidPtr->numCol;column++) {
   1309 			diskPtr = &raidPtr->Disks[column];
   1310 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1311 				ci_label = raidget_component_label(raidPtr,
   1312 				    column);
   1313 				/* Zeroing this is important. */
   1314 				memset(ci_label, 0, sizeof(*ci_label));
   1315 				raid_init_component_label(raidPtr, ci_label);
   1316 				ci_label->serial_number =
   1317 				    raidPtr->serial_number;
				ci_label->row = 0; /* we don't pretend to support more */
   1319 				rf_component_label_set_partitionsize(ci_label,
   1320 				    diskPtr->partitionSize);
   1321 				ci_label->column = column;
   1322 				raidflush_component_label(raidPtr, column);
   1323 			}
   1324 			/* XXXjld what about the spares? */
   1325 		}
   1326 
   1327 		return (retcode);
   1328 	case RAIDFRAME_SET_AUTOCONFIG:
   1329 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1330 		printf("raid%d: New autoconfig value is: %d\n",
   1331 		       raidPtr->raidid, d);
   1332 		*(int *) data = d;
   1333 		return (retcode);
   1334 
   1335 	case RAIDFRAME_SET_ROOT:
   1336 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1337 		printf("raid%d: New rootpartition value is: %d\n",
   1338 		       raidPtr->raidid, d);
   1339 		*(int *) data = d;
   1340 		return (retcode);
   1341 
   1342 		/* initialize all parity */
   1343 	case RAIDFRAME_REWRITEPARITY:
   1344 
   1345 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1346 			/* Parity for RAID 0 is trivially correct */
   1347 			raidPtr->parity_good = RF_RAID_CLEAN;
   1348 			return(0);
   1349 		}
   1350 
   1351 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1352 			/* Re-write is already in progress! */
   1353 			return(EINVAL);
   1354 		}
   1355 
   1356 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1357 					   rf_RewriteParityThread,
   1358 					   raidPtr,"raid_parity");
   1359 		return (retcode);
   1360 
   1361 
   1362 	case RAIDFRAME_ADD_HOT_SPARE:
   1363 		sparePtr = (RF_SingleComponent_t *) data;
   1364 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1365 		retcode = rf_add_hot_spare(raidPtr, &component);
   1366 		return(retcode);
   1367 
   1368 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1369 		return(retcode);
   1370 
   1371 	case RAIDFRAME_DELETE_COMPONENT:
   1372 		componentPtr = (RF_SingleComponent_t *)data;
   1373 		memcpy( &component, componentPtr,
   1374 			sizeof(RF_SingleComponent_t));
   1375 		retcode = rf_delete_component(raidPtr, &component);
   1376 		return(retcode);
   1377 
   1378 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1379 		componentPtr = (RF_SingleComponent_t *)data;
   1380 		memcpy( &component, componentPtr,
   1381 			sizeof(RF_SingleComponent_t));
   1382 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1383 		return(retcode);
   1384 
   1385 	case RAIDFRAME_REBUILD_IN_PLACE:
   1386 
   1387 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1388 			/* Can't do this on a RAID 0!! */
   1389 			return(EINVAL);
   1390 		}
   1391 
   1392 		if (raidPtr->recon_in_progress == 1) {
   1393 			/* a reconstruct is already in progress! */
   1394 			return(EINVAL);
   1395 		}
   1396 
   1397 		componentPtr = (RF_SingleComponent_t *) data;
   1398 		memcpy( &component, componentPtr,
   1399 			sizeof(RF_SingleComponent_t));
   1400 		component.row = 0; /* we don't support any more */
   1401 		column = component.column;
   1402 
   1403 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1404 			return(EINVAL);
   1405 		}
   1406 
   1407 		rf_lock_mutex2(raidPtr->mutex);
   1408 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1409 		    (raidPtr->numFailures > 0)) {
   1410 			/* XXX 0 above shouldn't be constant!!! */
   1411 			/* some component other than this has failed.
   1412 			   Let's not make things worse than they already
   1413 			   are... */
   1414 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1415 			       raidPtr->raidid);
   1416 			printf("raid%d:     Col: %d   Too many failures.\n",
   1417 			       raidPtr->raidid, column);
   1418 			rf_unlock_mutex2(raidPtr->mutex);
   1419 			return (EINVAL);
   1420 		}
   1421 		if (raidPtr->Disks[column].status ==
   1422 		    rf_ds_reconstructing) {
   1423 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1424 			       raidPtr->raidid);
   1425 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1426 
   1427 			rf_unlock_mutex2(raidPtr->mutex);
   1428 			return (EINVAL);
   1429 		}
   1430 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1431 			rf_unlock_mutex2(raidPtr->mutex);
   1432 			return (EINVAL);
   1433 		}
   1434 		rf_unlock_mutex2(raidPtr->mutex);
   1435 
   1436 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1437 		if (rrcopy == NULL)
   1438 			return(ENOMEM);
   1439 
   1440 		rrcopy->raidPtr = (void *) raidPtr;
   1441 		rrcopy->col = column;
   1442 
   1443 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1444 					   rf_ReconstructInPlaceThread,
   1445 					   rrcopy,"raid_reconip");
   1446 		return(retcode);
   1447 
   1448 	case RAIDFRAME_GET_INFO:
   1449 		if (!raidPtr->valid)
   1450 			return (ENODEV);
   1451 		ucfgp = (RF_DeviceConfig_t **) data;
   1452 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1453 			  (RF_DeviceConfig_t *));
   1454 		if (d_cfg == NULL)
   1455 			return (ENOMEM);
   1456 		d_cfg->rows = 1; /* there is only 1 row now */
   1457 		d_cfg->cols = raidPtr->numCol;
   1458 		d_cfg->ndevs = raidPtr->numCol;
   1459 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1460 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1461 			return (ENOMEM);
   1462 		}
   1463 		d_cfg->nspares = raidPtr->numSpare;
   1464 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1465 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1466 			return (ENOMEM);
   1467 		}
   1468 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1469 		d = 0;
   1470 		for (j = 0; j < d_cfg->cols; j++) {
   1471 			d_cfg->devs[d] = raidPtr->Disks[j];
   1472 			d++;
   1473 		}
   1474 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1475 			d_cfg->spares[i] = raidPtr->Disks[j];
   1476 		}
   1477 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1478 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1479 
   1480 		return (retcode);
   1481 
   1482 	case RAIDFRAME_CHECK_PARITY:
   1483 		*(int *) data = raidPtr->parity_good;
   1484 		return (0);
   1485 
   1486 	case RAIDFRAME_PARITYMAP_STATUS:
   1487 		if (rf_paritymap_ineligible(raidPtr))
   1488 			return EINVAL;
   1489 		rf_paritymap_status(raidPtr->parity_map,
   1490 		    (struct rf_pmstat *)data);
   1491 		return 0;
   1492 
   1493 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1494 		if (rf_paritymap_ineligible(raidPtr))
   1495 			return EINVAL;
   1496 		if (raidPtr->parity_map == NULL)
   1497 			return ENOENT; /* ??? */
   1498 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1499 			(struct rf_pmparams *)data, 1))
   1500 			return EINVAL;
   1501 		return 0;
   1502 
   1503 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1504 		if (rf_paritymap_ineligible(raidPtr))
   1505 			return EINVAL;
   1506 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1507 		return 0;
   1508 
   1509 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1510 		if (rf_paritymap_ineligible(raidPtr))
   1511 			return EINVAL;
   1512 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1513 		/* XXX should errors be passed up? */
   1514 		return 0;
   1515 
   1516 	case RAIDFRAME_RESET_ACCTOTALS:
   1517 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1518 		return (0);
   1519 
   1520 	case RAIDFRAME_GET_ACCTOTALS:
   1521 		totals = (RF_AccTotals_t *) data;
   1522 		*totals = raidPtr->acc_totals;
   1523 		return (0);
   1524 
   1525 	case RAIDFRAME_KEEP_ACCTOTALS:
   1526 		raidPtr->keep_acc_totals = *(int *)data;
   1527 		return (0);
   1528 
   1529 	case RAIDFRAME_GET_SIZE:
   1530 		*(int *) data = raidPtr->totalSectors;
   1531 		return (0);
   1532 
   1533 		/* fail a disk & optionally start reconstruction */
   1534 	case RAIDFRAME_FAIL_DISK:
   1535 
   1536 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1537 			/* Can't do this on a RAID 0!! */
   1538 			return(EINVAL);
   1539 		}
   1540 
   1541 		rr = (struct rf_recon_req *) data;
   1542 		rr->row = 0;
   1543 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1544 			return (EINVAL);
   1545 
   1546 
   1547 		rf_lock_mutex2(raidPtr->mutex);
   1548 		if (raidPtr->status == rf_rs_reconstructing) {
   1549 			/* you can't fail a disk while we're reconstructing! */
   1550 			/* XXX wrong for RAID6 */
   1551 			rf_unlock_mutex2(raidPtr->mutex);
   1552 			return (EINVAL);
   1553 		}
   1554 		if ((raidPtr->Disks[rr->col].status ==
   1555 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1556 			/* some other component has failed.  Let's not make
   1557 			   things worse. XXX wrong for RAID6 */
   1558 			rf_unlock_mutex2(raidPtr->mutex);
   1559 			return (EINVAL);
   1560 		}
   1561 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1562 			/* Can't fail a spared disk! */
   1563 			rf_unlock_mutex2(raidPtr->mutex);
   1564 			return (EINVAL);
   1565 		}
   1566 		rf_unlock_mutex2(raidPtr->mutex);
   1567 
   1568 		/* make a copy of the recon request so that we don't rely on
   1569 		 * the user's buffer */
   1570 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1571 		if (rrcopy == NULL)
   1572 			return(ENOMEM);
   1573 		memcpy(rrcopy, rr, sizeof(*rr));
   1574 		rrcopy->raidPtr = (void *) raidPtr;
   1575 
   1576 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1577 					   rf_ReconThread,
   1578 					   rrcopy,"raid_recon");
   1579 		return (0);
   1580 
   1581 		/* invoke a copyback operation after recon on whatever disk
   1582 		 * needs it, if any */
   1583 	case RAIDFRAME_COPYBACK:
   1584 
   1585 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1586 			/* This makes no sense on a RAID 0!! */
   1587 			return(EINVAL);
   1588 		}
   1589 
   1590 		if (raidPtr->copyback_in_progress == 1) {
   1591 			/* Copyback is already in progress! */
   1592 			return(EINVAL);
   1593 		}
   1594 
   1595 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1596 					   rf_CopybackThread,
   1597 					   raidPtr,"raid_copyback");
   1598 		return (retcode);
   1599 
   1600 		/* return the percentage completion of reconstruction */
   1601 	case RAIDFRAME_CHECK_RECON_STATUS:
   1602 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1603 			/* This makes no sense on a RAID 0, so tell the
   1604 			   user it's done. */
   1605 			*(int *) data = 100;
   1606 			return(0);
   1607 		}
   1608 		if (raidPtr->status != rf_rs_reconstructing)
   1609 			*(int *) data = 100;
   1610 		else {
   1611 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1612 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1613 			} else {
   1614 				*(int *) data = 0;
   1615 			}
   1616 		}
   1617 		return (0);
   1618 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1619 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1620 		if (raidPtr->status != rf_rs_reconstructing) {
   1621 			progressInfo.remaining = 0;
   1622 			progressInfo.completed = 100;
   1623 			progressInfo.total = 100;
   1624 		} else {
   1625 			progressInfo.total =
   1626 				raidPtr->reconControl->numRUsTotal;
   1627 			progressInfo.completed =
   1628 				raidPtr->reconControl->numRUsComplete;
   1629 			progressInfo.remaining = progressInfo.total -
   1630 				progressInfo.completed;
   1631 		}
   1632 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1633 				  sizeof(RF_ProgressInfo_t));
   1634 		return (retcode);
   1635 
   1636 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1637 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1638 			/* This makes no sense on a RAID 0, so tell the
   1639 			   user it's done. */
   1640 			*(int *) data = 100;
   1641 			return(0);
   1642 		}
   1643 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1644 			*(int *) data = 100 *
   1645 				raidPtr->parity_rewrite_stripes_done /
   1646 				raidPtr->Layout.numStripe;
   1647 		} else {
   1648 			*(int *) data = 100;
   1649 		}
   1650 		return (0);
   1651 
   1652 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1653 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1654 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1655 			progressInfo.total = raidPtr->Layout.numStripe;
   1656 			progressInfo.completed =
   1657 				raidPtr->parity_rewrite_stripes_done;
   1658 			progressInfo.remaining = progressInfo.total -
   1659 				progressInfo.completed;
   1660 		} else {
   1661 			progressInfo.remaining = 0;
   1662 			progressInfo.completed = 100;
   1663 			progressInfo.total = 100;
   1664 		}
   1665 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1666 				  sizeof(RF_ProgressInfo_t));
   1667 		return (retcode);
   1668 
   1669 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1670 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1671 			/* This makes no sense on a RAID 0 */
   1672 			*(int *) data = 100;
   1673 			return(0);
   1674 		}
   1675 		if (raidPtr->copyback_in_progress == 1) {
   1676 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1677 				raidPtr->Layout.numStripe;
   1678 		} else {
   1679 			*(int *) data = 100;
   1680 		}
   1681 		return (0);
   1682 
   1683 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1684 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1685 		if (raidPtr->copyback_in_progress == 1) {
   1686 			progressInfo.total = raidPtr->Layout.numStripe;
   1687 			progressInfo.completed =
   1688 				raidPtr->copyback_stripes_done;
   1689 			progressInfo.remaining = progressInfo.total -
   1690 				progressInfo.completed;
   1691 		} else {
   1692 			progressInfo.remaining = 0;
   1693 			progressInfo.completed = 100;
   1694 			progressInfo.total = 100;
   1695 		}
   1696 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1697 				  sizeof(RF_ProgressInfo_t));
   1698 		return (retcode);
   1699 
   1700 		/* the sparetable daemon calls this to wait for the kernel to
   1701 		 * need a spare table. this ioctl does not return until a
   1702 		 * spare table is needed. XXX -- calling mpsleep here in the
   1703 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1704 		 * -- I should either compute the spare table in the kernel,
   1705 		 * or have a different -- XXX XXX -- interface (a different
   1706 		 * character device) for delivering the table     -- XXX */
   1707 #if 0
   1708 	case RAIDFRAME_SPARET_WAIT:
   1709 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1710 		while (!rf_sparet_wait_queue)
   1711 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1712 		waitreq = rf_sparet_wait_queue;
   1713 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1714 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1715 
   1716 		/* structure assignment */
   1717 		*((RF_SparetWait_t *) data) = *waitreq;
   1718 
   1719 		RF_Free(waitreq, sizeof(*waitreq));
   1720 		return (0);
   1721 
   1722 		/* wakes up a process waiting on SPARET_WAIT and puts an error
		 * code in it that will cause the daemon to exit */
   1724 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1725 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1726 		waitreq->fcol = -1;
   1727 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1728 		waitreq->next = rf_sparet_wait_queue;
   1729 		rf_sparet_wait_queue = waitreq;
   1730 		rf_broadcast_conf2(rf_sparet_wait_cv);
   1731 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1732 		return (0);
   1733 
   1734 		/* used by the spare table daemon to deliver a spare table
   1735 		 * into the kernel */
   1736 	case RAIDFRAME_SEND_SPARET:
   1737 
   1738 		/* install the spare table */
   1739 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1740 
   1741 		/* respond to the requestor.  the return status of the spare
   1742 		 * table installation is passed in the "fcol" field */
   1743 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1744 		waitreq->fcol = retcode;
   1745 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1746 		waitreq->next = rf_sparet_resp_queue;
   1747 		rf_sparet_resp_queue = waitreq;
   1748 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1749 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1750 
   1751 		return (retcode);
   1752 #endif
   1753 
   1754 	default:
   1755 		break; /* fall through to the os-specific code below */
   1756 
   1757 	}
   1758 
   1759 	if (!raidPtr->valid)
   1760 		return (EINVAL);
   1761 
   1762 	/*
   1763 	 * Add support for "regular" device ioctls here.
   1764 	 */
   1765 
   1766 	error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
   1767 	if (error != EPASSTHROUGH)
   1768 		return (error);
   1769 
   1770 	switch (cmd) {
   1771 	case DIOCGDINFO:
   1772 		*(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
   1773 		break;
   1774 #ifdef __HAVE_OLD_DISKLABEL
   1775 	case ODIOCGDINFO:
   1776 		newlabel = *(rs->sc_dkdev.dk_label);
   1777 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1778 			return ENOTTY;
   1779 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1780 		break;
   1781 #endif
   1782 
   1783 	case DIOCGPART:
   1784 		((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
   1785 		((struct partinfo *) data)->part =
   1786 		    &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
   1787 		break;
   1788 
   1789 	case DIOCWDINFO:
   1790 	case DIOCSDINFO:
   1791 #ifdef __HAVE_OLD_DISKLABEL
   1792 	case ODIOCWDINFO:
   1793 	case ODIOCSDINFO:
   1794 #endif
   1795 	{
   1796 		struct disklabel *lp;
   1797 #ifdef __HAVE_OLD_DISKLABEL
   1798 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
   1799 			memset(&newlabel, 0, sizeof newlabel);
   1800 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
   1801 			lp = &newlabel;
   1802 		} else
   1803 #endif
   1804 		lp = (struct disklabel *)data;
   1805 
   1806 		if ((error = raidlock(rs)) != 0)
   1807 			return (error);
   1808 
   1809 		rs->sc_flags |= RAIDF_LABELLING;
   1810 
   1811 		error = setdisklabel(rs->sc_dkdev.dk_label,
   1812 		    lp, 0, rs->sc_dkdev.dk_cpulabel);
   1813 		if (error == 0) {
   1814 			if (cmd == DIOCWDINFO
   1815 #ifdef __HAVE_OLD_DISKLABEL
   1816 			    || cmd == ODIOCWDINFO
   1817 #endif
   1818 			   )
   1819 				error = writedisklabel(RAIDLABELDEV(dev),
   1820 				    raidstrategy, rs->sc_dkdev.dk_label,
   1821 				    rs->sc_dkdev.dk_cpulabel);
   1822 		}
   1823 		rs->sc_flags &= ~RAIDF_LABELLING;
   1824 
   1825 		raidunlock(rs);
   1826 
   1827 		if (error)
   1828 			return (error);
   1829 		break;
   1830 	}
   1831 
   1832 	case DIOCWLABEL:
   1833 		if (*(int *) data != 0)
   1834 			rs->sc_flags |= RAIDF_WLABEL;
   1835 		else
   1836 			rs->sc_flags &= ~RAIDF_WLABEL;
   1837 		break;
   1838 
   1839 	case DIOCGDEFLABEL:
   1840 		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
   1841 		break;
   1842 
   1843 #ifdef __HAVE_OLD_DISKLABEL
   1844 	case ODIOCGDEFLABEL:
   1845 		raidgetdefaultlabel(raidPtr, rs, &newlabel);
   1846 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1847 			return ENOTTY;
   1848 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1849 		break;
   1850 #endif
   1851 
   1852 	case DIOCAWEDGE:
   1853 	case DIOCDWEDGE:
   1854 	    	dkw = (void *)data;
   1855 
   1856 		/* If the ioctl happens here, the parent is us. */
   1857 		(void)strcpy(dkw->dkw_parent, rs->sc_xname);
   1858 		return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
   1859 
   1860 	case DIOCLWEDGES:
   1861 		return dkwedge_list(&rs->sc_dkdev,
   1862 		    (struct dkwedge_list *)data, l);
   1863 	case DIOCCACHESYNC:
   1864 		return rf_sync_component_caches(raidPtr);
   1865 
   1866 	case DIOCGSTRATEGY:
   1867 	    {
   1868 		struct disk_strategy *dks = (void *)data;
   1869 
   1870 		s = splbio();
   1871 		strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
   1872 		    sizeof(dks->dks_name));
   1873 		splx(s);
   1874 		dks->dks_paramlen = 0;
   1875 
   1876 		return 0;
   1877 	    }
   1878 
   1879 	case DIOCSSTRATEGY:
   1880 	    {
   1881 		struct disk_strategy *dks = (void *)data;
   1882 		struct bufq_state *new;
   1883 		struct bufq_state *old;
   1884 
   1885 		if (dks->dks_param != NULL) {
   1886 			return EINVAL;
   1887 		}
   1888 		dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
   1889 		error = bufq_alloc(&new, dks->dks_name,
   1890 		    BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
   1891 		if (error) {
   1892 			return error;
   1893 		}
   1894 		s = splbio();
   1895 		old = rs->buf_queue;
   1896 		bufq_move(new, old);
   1897 		rs->buf_queue = new;
   1898 		splx(s);
   1899 		bufq_free(old);
   1900 
   1901 		return 0;
   1902 	    }
   1903 
   1904 	default:
   1905 		retcode = ENOTTY;
   1906 	}
   1907 	return (retcode);
   1908 
   1909 }
   1910 
   1911 
   1912 /* raidinit -- complete the rest of the initialization for the
   1913    RAIDframe device.  */
   1914 
   1915 
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	int     unit;
	RF_Raid_t *raidPtr = &rs->sc_r;

	unit = raidPtr->raidid;


	/* XXX should check return code first... */
	rs->sc_flags |= RAIDF_INITED;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	rs->sc_dev = config_attach_pseudo(cf);

	if (rs->sc_dev == NULL) {
		/* Attach failed: undo the INITED flag set above and free
		 * the cfdata we allocated; the unit stays unconfigured. */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		rs->sc_flags &= ~RAIDF_INITED;
		free(cf, M_RAIDFRAME);
		return;
	}

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */

	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
	disk_attach(&rs->sc_dkdev);
	disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Probe for wedges now that the disk structure exists. */
	dkwedge_discover(&rs->sc_dkdev);

	rf_set_properties(rs, raidPtr);

}
   1967 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1968 /* wake up the daemon & tell it to get us a spare table
   1969  * XXX
   1970  * the entries in the queues should be tagged with the raidPtr
   1971  * so that in the extremely rare case that two recons happen at once,
   1972  * we know for which device were requesting a spare table
   1973  * XXX
   1974  *
   1975  * XXX This code is not currently used. GO
   1976  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Queue the request and wake the daemon that services it. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* rf_wait_cond2() drops the mutex while asleep and reacquires it
	 * before returning (this comment used to say "mpsleep").
	 * NOTE(review): responses are not tagged to requests, so with two
	 * concurrent recons we could pick up the wrong response -- see the
	 * XXX in the comment above this function. */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* The daemon's answer travels back in fcol. */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   2000 #endif
   2001 
   2002 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   2003  * bp & passes it down.
   2004  * any calls originating in the kernel must use non-blocking I/O
   2005  * do some extra sanity checking to return "appropriate" error values for
   2006  * certain conditions (to make some standard utilities work)
   2007  *
   2008  * Formerly known as: rf_DoAccessKernel
   2009  */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	struct raid_softc *rs;
	int     do_async;
	struct buf *bp;
	int rc;

	rs = raidPtr->softc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* drop the mutex across the label update, then retake it */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	/* Invariant: raidPtr->mutex is held each time the while condition
	 * is evaluated, and is released inside the loop body. */
	while (raidPtr->openings > 0) {
		rf_unlock_mutex2(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
			/* nothing more to do */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* Reject requests running past the end of the set; the
		 * "sum <" comparisons also catch arithmetic wrap-around. */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* Reject transfers that aren't a whole number of sectors. */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->openings--;
		rf_unlock_mutex2(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			/* The access never got started; fail the buf here. */
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		rf_lock_mutex2(raidPtr->mutex);
	}
	rf_unlock_mutex2(raidPtr->mutex);
}
   2127 
   2128 
   2129 
   2130 
   2131 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2132 
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	/* Hand one queued request to the underlying component device.
	 * Caller holds the disk queue mutex (see comment above); it is
	 * dropped around bdev_strategy() below.  Always returns 0. */
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		/* NOTE(review): the doubled parens around printf's argument
		 * are harmless (it's just a parenthesized expression) but
		 * look like a db1_printf leftover. */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete the NOP immediately via the normal callback. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Set up bp for the component I/O; KernelWakeupFunc will
		 * run from biodone when it finishes. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
/* This is the callback function associated with an I/O invoked from
   kernel code.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	/* I/O completion callback: installed as bp->b_iodone by InitBP()
	 * (see rf_DispatchKernelIO).  Records the error, possibly fails
	 * the component, and hands the request to the raidio thread. */
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error.  Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures is what triggers a component-label
			 * update from raidstart() later on */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2274 
   2275 
   2276 /*
   2277  * initialize a buf structure for doing an I/O in the kernel.
   2278  */
   2279 static void
   2280 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2281        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2282        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2283        struct proc *b_proc)
   2284 {
   2285 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2286 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2287 	bp->b_oflags = 0;
   2288 	bp->b_cflags = 0;
   2289 	bp->b_bcount = numSect << logBytesPerSector;
   2290 	bp->b_bufsize = bp->b_bcount;
   2291 	bp->b_error = 0;
   2292 	bp->b_dev = dev;
   2293 	bp->b_data = bf;
   2294 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2295 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2296 	if (bp->b_bcount == 0) {
   2297 		panic("bp->b_bcount is zero in InitBP!!");
   2298 	}
   2299 	bp->b_proc = b_proc;
   2300 	bp->b_iodone = cbFunc;
   2301 	bp->b_private = cbArg;
   2302 }
   2303 
   2304 static void
   2305 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
   2306 		    struct disklabel *lp)
   2307 {
   2308 	memset(lp, 0, sizeof(*lp));
   2309 
   2310 	/* fabricate a label... */
   2311 	lp->d_secperunit = raidPtr->totalSectors;
   2312 	lp->d_secsize = raidPtr->bytesPerSector;
   2313 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   2314 	lp->d_ntracks = 4 * raidPtr->numCol;
   2315 	lp->d_ncylinders = raidPtr->totalSectors /
   2316 		(lp->d_nsectors * lp->d_ntracks);
   2317 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
   2318 
   2319 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
   2320 	lp->d_type = DTYPE_RAID;
   2321 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
   2322 	lp->d_rpm = 3600;
   2323 	lp->d_interleave = 1;
   2324 	lp->d_flags = 0;
   2325 
   2326 	lp->d_partitions[RAW_PART].p_offset = 0;
   2327 	lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
   2328 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
   2329 	lp->d_npartitions = RAW_PART + 1;
   2330 
   2331 	lp->d_magic = DISKMAGIC;
   2332 	lp->d_magic2 = DISKMAGIC;
   2333 	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
   2334 
   2335 }
   2336 /*
   2337  * Read the disklabel from the raid device.  If one is not present, fake one
   2338  * up.
   2339  */
static void
raidgetdisklabel(dev_t dev)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const char   *errstring;
	struct disklabel *lp;
	struct cpu_disklabel *clp;
	RF_Raid_t *raidPtr;

	if ((rs = raidget(unit)) == NULL)
		return;

	lp = rs->sc_dkdev.dk_label;
	clp = rs->sc_dkdev.dk_cpulabel;

	db1_printf(("Getting the disklabel...\n"));

	memset(clp, 0, sizeof(*clp));

	raidPtr = &rs->sc_r;

	/* Seed *lp with a fabricated default so there is always a sane
	 * label even if the read below fails. */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		raidmakedisklabel(rs);
	else {
		int     i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * same components are used, and old disklabel may used
		 * if that is found.
		 */
		if (lp->d_secperunit != rs->sc_size)
			printf("raid%d: WARNING: %s: "
			    "total sector size in disklabel (%" PRIu32 ") != "
			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
			    lp->d_secperunit, rs->sc_size);
		/* warn (but don't reject) if any partition runs past the
		 * end of the RAID volume */
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("raid%d: WARNING: %s: end of partition `%c' "
				       "exceeds the size of raid (%" PRIu64 ")\n",
				       unit, rs->sc_xname, 'a' + i, rs->sc_size);
		}
	}

}
   2398 /*
   2399  * Take care of things one might want to take care of in the event
   2400  * that a disklabel isn't present.
   2401  */
   2402 static void
   2403 raidmakedisklabel(struct raid_softc *rs)
   2404 {
   2405 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2406 	db1_printf(("Making a label..\n"));
   2407 
   2408 	/*
   2409 	 * For historical reasons, if there's no disklabel present
   2410 	 * the raw partition must be marked FS_BSDFFS.
   2411 	 */
   2412 
   2413 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
   2414 
   2415 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
   2416 
   2417 	lp->d_checksum = dkcksum(lp);
   2418 }
   2419 /*
   2420  * Wait interruptibly for an exclusive lock.
   2421  *
   2422  * XXX
   2423  * Several drivers do this; it should be abstracted and made MP-safe.
   2424  * (Hmm... where have we seen this warning before :->  GO )
   2425  */
   2426 static int
   2427 raidlock(struct raid_softc *rs)
   2428 {
   2429 	int     error;
   2430 
   2431 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2432 		rs->sc_flags |= RAIDF_WANTED;
   2433 		if ((error =
   2434 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
   2435 			return (error);
   2436 	}
   2437 	rs->sc_flags |= RAIDF_LOCKED;
   2438 	return (0);
   2439 }
   2440 /*
   2441  * Unlock and wake up any waiters.
   2442  */
   2443 static void
   2444 raidunlock(struct raid_softc *rs)
   2445 {
   2446 
   2447 	rs->sc_flags &= ~RAIDF_LOCKED;
   2448 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2449 		rs->sc_flags &= ~RAIDF_WANTED;
   2450 		wakeup(rs);
   2451 	}
   2452 }
   2453 
   2454 
   2455 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2456 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2457 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2458 
   2459 static daddr_t
   2460 rf_component_info_offset(void)
   2461 {
   2462 
   2463 	return RF_COMPONENT_INFO_OFFSET;
   2464 }
   2465 
   2466 static daddr_t
   2467 rf_component_info_size(unsigned secsize)
   2468 {
   2469 	daddr_t info_size;
   2470 
   2471 	KASSERT(secsize);
   2472 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2473 		info_size = secsize;
   2474 	else
   2475 		info_size = RF_COMPONENT_INFO_SIZE;
   2476 
   2477 	return info_size;
   2478 }
   2479 
   2480 static daddr_t
   2481 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2482 {
   2483 	daddr_t map_offset;
   2484 
   2485 	KASSERT(raidPtr->bytesPerSector);
   2486 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2487 		map_offset = raidPtr->bytesPerSector;
   2488 	else
   2489 		map_offset = RF_COMPONENT_INFO_SIZE;
   2490 	map_offset += rf_component_info_offset();
   2491 
   2492 	return map_offset;
   2493 }
   2494 
   2495 static daddr_t
   2496 rf_parity_map_size(RF_Raid_t *raidPtr)
   2497 {
   2498 	daddr_t map_size;
   2499 
   2500 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2501 		map_size = raidPtr->bytesPerSector;
   2502 	else
   2503 		map_size = RF_PARITY_MAP_SIZE;
   2504 
   2505 	return map_size;
   2506 }
   2507 
   2508 int
   2509 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2510 {
   2511 	RF_ComponentLabel_t *clabel;
   2512 
   2513 	clabel = raidget_component_label(raidPtr, col);
   2514 	clabel->clean = RF_RAID_CLEAN;
   2515 	raidflush_component_label(raidPtr, col);
   2516 	return(0);
   2517 }
   2518 
   2519 
   2520 int
   2521 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2522 {
   2523 	RF_ComponentLabel_t *clabel;
   2524 
   2525 	clabel = raidget_component_label(raidPtr, col);
   2526 	clabel->clean = RF_RAID_DIRTY;
   2527 	raidflush_component_label(raidPtr, col);
   2528 	return(0);
   2529 }
   2530 
   2531 int
   2532 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2533 {
   2534 	KASSERT(raidPtr->bytesPerSector);
   2535 	return raidread_component_label(raidPtr->bytesPerSector,
   2536 	    raidPtr->Disks[col].dev,
   2537 	    raidPtr->raid_cinfo[col].ci_vp,
   2538 	    &raidPtr->raid_cinfo[col].ci_label);
   2539 }
   2540 
   2541 RF_ComponentLabel_t *
   2542 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2543 {
   2544 	return &raidPtr->raid_cinfo[col].ci_label;
   2545 }
   2546 
   2547 int
   2548 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2549 {
   2550 	RF_ComponentLabel_t *label;
   2551 
   2552 	label = &raidPtr->raid_cinfo[col].ci_label;
   2553 	label->mod_counter = raidPtr->mod_counter;
   2554 #ifndef RF_NO_PARITY_MAP
   2555 	label->parity_map_modcount = label->mod_counter;
   2556 #endif
   2557 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2558 	    raidPtr->Disks[col].dev,
   2559 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2560 }
   2561 
   2562 
   2563 static int
   2564 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2565     RF_ComponentLabel_t *clabel)
   2566 {
   2567 	return raidread_component_area(dev, b_vp, clabel,
   2568 	    sizeof(RF_ComponentLabel_t),
   2569 	    rf_component_info_offset(),
   2570 	    rf_component_info_size(secsize));
   2571 }
   2572 
   2573 /* ARGSUSED */
   2574 static int
   2575 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2576     size_t msize, daddr_t offset, daddr_t dsize)
   2577 {
   2578 	struct buf *bp;
   2579 	const struct bdevsw *bdev;
   2580 	int error;
   2581 
   2582 	/* XXX should probably ensure that we don't try to do this if
   2583 	   someone has changed rf_protected_sectors. */
   2584 
   2585 	if (b_vp == NULL) {
   2586 		/* For whatever reason, this component is not valid.
   2587 		   Don't try to read a component label from it. */
   2588 		return(EINVAL);
   2589 	}
   2590 
   2591 	/* get a block of the appropriate size... */
   2592 	bp = geteblk((int)dsize);
   2593 	bp->b_dev = dev;
   2594 
   2595 	/* get our ducks in a row for the read */
   2596 	bp->b_blkno = offset / DEV_BSIZE;
   2597 	bp->b_bcount = dsize;
   2598 	bp->b_flags |= B_READ;
   2599  	bp->b_resid = dsize;
   2600 
   2601 	bdev = bdevsw_lookup(bp->b_dev);
   2602 	if (bdev == NULL)
   2603 		return (ENXIO);
   2604 	(*bdev->d_strategy)(bp);
   2605 
   2606 	error = biowait(bp);
   2607 
   2608 	if (!error) {
   2609 		memcpy(data, bp->b_data, msize);
   2610 	}
   2611 
   2612 	brelse(bp, 0);
   2613 	return(error);
   2614 }
   2615 
   2616 
   2617 static int
   2618 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2619     RF_ComponentLabel_t *clabel)
   2620 {
   2621 	return raidwrite_component_area(dev, b_vp, clabel,
   2622 	    sizeof(RF_ComponentLabel_t),
   2623 	    rf_component_info_offset(),
   2624 	    rf_component_info_size(secsize), 0);
   2625 }
   2626 
   2627 /* ARGSUSED */
   2628 static int
   2629 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2630     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2631 {
   2632 	struct buf *bp;
   2633 	const struct bdevsw *bdev;
   2634 	int error;
   2635 
   2636 	/* get a block of the appropriate size... */
   2637 	bp = geteblk((int)dsize);
   2638 	bp->b_dev = dev;
   2639 
   2640 	/* get our ducks in a row for the write */
   2641 	bp->b_blkno = offset / DEV_BSIZE;
   2642 	bp->b_bcount = dsize;
   2643 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2644  	bp->b_resid = dsize;
   2645 
   2646 	memset(bp->b_data, 0, dsize);
   2647 	memcpy(bp->b_data, data, msize);
   2648 
   2649 	bdev = bdevsw_lookup(bp->b_dev);
   2650 	if (bdev == NULL)
   2651 		return (ENXIO);
   2652 	(*bdev->d_strategy)(bp);
   2653 	if (asyncp)
   2654 		return 0;
   2655 	error = biowait(bp);
   2656 	brelse(bp, 0);
   2657 	if (error) {
   2658 #if 1
   2659 		printf("Failed to write RAID component info!\n");
   2660 #endif
   2661 	}
   2662 
   2663 	return(error);
   2664 }
   2665 
   2666 void
   2667 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2668 {
   2669 	int c;
   2670 
   2671 	for (c = 0; c < raidPtr->numCol; c++) {
   2672 		/* Skip dead disks. */
   2673 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2674 			continue;
   2675 		/* XXXjld: what if an error occurs here? */
   2676 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2677 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2678 		    RF_PARITYMAP_NBYTE,
   2679 		    rf_parity_map_offset(raidPtr),
   2680 		    rf_parity_map_size(raidPtr), 0);
   2681 	}
   2682 }
   2683 
   2684 void
   2685 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2686 {
   2687 	struct rf_paritymap_ondisk tmp;
   2688 	int c,first;
   2689 
   2690 	first=1;
   2691 	for (c = 0; c < raidPtr->numCol; c++) {
   2692 		/* Skip dead disks. */
   2693 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2694 			continue;
   2695 		raidread_component_area(raidPtr->Disks[c].dev,
   2696 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2697 		    RF_PARITYMAP_NBYTE,
   2698 		    rf_parity_map_offset(raidPtr),
   2699 		    rf_parity_map_size(raidPtr));
   2700 		if (first) {
   2701 			memcpy(map, &tmp, sizeof(*map));
   2702 			first = 0;
   2703 		} else {
   2704 			rf_paritymap_merge(map, &tmp);
   2705 		}
   2706 	}
   2707 }
   2708 
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	/* Bump the set's mod counter and mark every usable component's
	 * label dirty; used spares get a freshly initialized label that
	 * records which column they substitute for. */
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare stands in for.
			 * NOTE(review): if no column maps to sparecol,
			 * scol keeps its previous value (initially -1)
			 * and that is what gets recorded below. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2768 
   2769 
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	/* Rewrite component labels for all optimal components and used
	 * spares.  When `final' is RF_FINAL_COMPONENT_UPDATE and parity
	 * is known good, the clean bit is set as well. */
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare replaced.
			 * NOTE(review): if no column maps to sparecol,
			 * scol stays -1 and is recorded as the column. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2844 
   2845 void
   2846 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2847 {
   2848 
   2849 	if (vp != NULL) {
   2850 		if (auto_configured == 1) {
   2851 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2852 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2853 			vput(vp);
   2854 
   2855 		} else {
   2856 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2857 		}
   2858 	}
   2859 }
   2860 
   2861 
   2862 void
   2863 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2864 {
   2865 	int r,c;
   2866 	struct vnode *vp;
   2867 	int acd;
   2868 
   2869 
   2870 	/* We take this opportunity to close the vnodes like we should.. */
   2871 
   2872 	for (c = 0; c < raidPtr->numCol; c++) {
   2873 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2874 		acd = raidPtr->Disks[c].auto_configured;
   2875 		rf_close_component(raidPtr, vp, acd);
   2876 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2877 		raidPtr->Disks[c].auto_configured = 0;
   2878 	}
   2879 
   2880 	for (r = 0; r < raidPtr->numSpare; r++) {
   2881 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2882 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2883 		rf_close_component(raidPtr, vp, acd);
   2884 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2885 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2886 	}
   2887 }
   2888 
   2889 
/*
 * Kernel-thread body: fail the requested column (optionally starting a
 * reconstruction onto a spare), then exit.  Takes ownership of 'req'
 * and frees it.  recon_in_progress brackets the operation so other
 * paths can tell a reconstruction is running.
 */
void
rf_ReconThread(struct rf_recon_req *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* second arg nonzero => initiate reconstruction as well */
	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* we own the request structure; done with it now */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2911 
/*
 * Kernel-thread body: rewrite all parity for the set.  On success the
 * in-core parity status is marked clean; on failure an error is logged
 * and the set stays dirty.  Anyone blocked in shutdown waiting on
 * parity_rewrite_in_progress is woken before the thread exits.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	/* progress counter for userland status queries */
	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2942 
   2943 
/*
 * Kernel-thread body: copy reconstructed data back from the spare to
 * the (replaced) original component, then exit.  copyback_in_progress
 * brackets the operation for status reporting.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2958 
   2959 
/*
 * Kernel-thread body: reconstruct the named column in place (onto the
 * same device, e.g. after a disk was replaced in the same slot), then
 * exit.  Takes ownership of 'req' and frees it.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2977 
/*
 * Try to read a RAIDframe component label from the open device 'vp'.
 * If a reasonable label is found, prepend a new RF_AutoConfig_t to
 * 'ac_list' (taking ownership of vp and the label) and return the new
 * list head.  Otherwise close/release vp and return the list unchanged.
 * On allocation failure the entire list is torn down and NULL returned.
 *
 * Ownership notes: on success the vnode and label belong to the list
 * entry; on any failure path vp is closed and vput() here.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* out of memory: free everything accumulated so far */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;	/* list entry now owns the vnode */
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: no label (or bogus label) -- release the device */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   3035 
/*
 * Walk every disk device in the system looking for RAIDframe component
 * labels.  Wedges (dk(4)) are checked via their wedge info; disklabel
 * devices are checked partition-by-partition for FS_RAID partitions;
 * if neither yields a component the raw partition is probed as a last
 * resort.  Returns a linked list of RF_AutoConfig_t entries (each with
 * an open vnode for its component), or NULL if none/OOM.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	     dv = deviter_next(&di)) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		rf_part_found = 0; /*No raid partition as yet*/

		/* get a vnode for the raw partition of this disk */

		/* wedges are whole pseudo-disks; no RAW_PART for them */
		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		error = getdisksize(vp, &numsecs, &secsize);
		if (error) {
			vput(vp);
			continue;
		}
		if (wedge) {
			/* wedge path: probe the whole wedge if it is
			   typed as a RAIDframe component */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			/* rf_get_component() takes ownership of vp */
			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
			rf_part_found = 1; /*There is a raid component on this disk*/
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		rf_part_found = 0; /*No raid partitions yet*/
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
		}

		/*
		 *If there is no raid component on this disk, either in a
		 *disklabel or inside a wedge, check the raw partition as well,
		 *as it is possible to configure raid components on raw disk
		 *devices.
		 */

		if (!rf_part_found) {
			char cname[sizeof(ac_list->devname)];

			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + RAW_PART);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
		}
	}
	deviter_release(&di);
	return ac_list;
}
   3217 
   3218 
   3219 int
   3220 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3221 {
   3222 
   3223 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3224 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3225 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3226 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3227 	    clabel->row >=0 &&
   3228 	    clabel->column >= 0 &&
   3229 	    clabel->num_rows > 0 &&
   3230 	    clabel->num_columns > 0 &&
   3231 	    clabel->row < clabel->num_rows &&
   3232 	    clabel->column < clabel->num_columns &&
   3233 	    clabel->blockSize > 0 &&
   3234 	    /*
   3235 	     * numBlocksHi may contain garbage, but it is ok since
   3236 	     * the type is unsigned.  If it is really garbage,
   3237 	     * rf_fix_old_label_size() will fix it.
   3238 	     */
   3239 	    rf_component_label_numblocks(clabel) > 0) {
   3240 		/*
   3241 		 * label looks reasonable enough...
   3242 		 * let's make sure it has no old garbage.
   3243 		 */
   3244 		if (numsecs)
   3245 			rf_fix_old_label_size(clabel, numsecs);
   3246 		return(1);
   3247 	}
   3248 	return(0);
   3249 }
   3250 
   3251 
   3252 /*
   3253  * For reasons yet unknown, some old component labels have garbage in
   3254  * the newer numBlocksHi region, and this causes lossage.  Since those
   3255  * disks will also have numsecs set to less than 32 bits of sectors,
   3256  * we can determine when this corruption has occurred, and fix it.
   3257  *
   3258  * The exact same problem, with the same unknown reason, happens to
   3259  * the partitionSizeHi member as well.
   3260  */
   3261 static void
   3262 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3263 {
   3264 
   3265 	if (numsecs < ((uint64_t)1 << 32)) {
   3266 		if (clabel->numBlocksHi) {
   3267 			printf("WARNING: total sectors < 32 bits, yet "
   3268 			       "numBlocksHi set\n"
   3269 			       "WARNING: resetting numBlocksHi to zero.\n");
   3270 			clabel->numBlocksHi = 0;
   3271 		}
   3272 
   3273 		if (clabel->partitionSizeHi) {
   3274 			printf("WARNING: total sectors < 32 bits, yet "
   3275 			       "partitionSizeHi set\n"
   3276 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3277 			clabel->partitionSizeHi = 0;
   3278 		}
   3279 	}
   3280 }
   3281 
   3282 
#ifdef DEBUG
/*
 * Debug helper: dump the interesting fields of a component label to
 * the console.  Compiled only with DEBUG kernels.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;

	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Contains root partition: %s\n",
	       clabel->root_partition ? "Yes" : "No");
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
   3313 
   3314 RF_ConfigSet_t *
   3315 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3316 {
   3317 	RF_AutoConfig_t *ac;
   3318 	RF_ConfigSet_t *config_sets;
   3319 	RF_ConfigSet_t *cset;
   3320 	RF_AutoConfig_t *ac_next;
   3321 
   3322 
   3323 	config_sets = NULL;
   3324 
   3325 	/* Go through the AutoConfig list, and figure out which components
   3326 	   belong to what sets.  */
   3327 	ac = ac_list;
   3328 	while(ac!=NULL) {
   3329 		/* we're going to putz with ac->next, so save it here
   3330 		   for use at the end of the loop */
   3331 		ac_next = ac->next;
   3332 
   3333 		if (config_sets == NULL) {
   3334 			/* will need at least this one... */
   3335 			config_sets = (RF_ConfigSet_t *)
   3336 				malloc(sizeof(RF_ConfigSet_t),
   3337 				       M_RAIDFRAME, M_NOWAIT);
   3338 			if (config_sets == NULL) {
   3339 				panic("rf_create_auto_sets: No memory!");
   3340 			}
   3341 			/* this one is easy :) */
   3342 			config_sets->ac = ac;
   3343 			config_sets->next = NULL;
   3344 			config_sets->rootable = 0;
   3345 			ac->next = NULL;
   3346 		} else {
   3347 			/* which set does this component fit into? */
   3348 			cset = config_sets;
   3349 			while(cset!=NULL) {
   3350 				if (rf_does_it_fit(cset, ac)) {
   3351 					/* looks like it matches... */
   3352 					ac->next = cset->ac;
   3353 					cset->ac = ac;
   3354 					break;
   3355 				}
   3356 				cset = cset->next;
   3357 			}
   3358 			if (cset==NULL) {
   3359 				/* didn't find a match above... new set..*/
   3360 				cset = (RF_ConfigSet_t *)
   3361 					malloc(sizeof(RF_ConfigSet_t),
   3362 					       M_RAIDFRAME, M_NOWAIT);
   3363 				if (cset == NULL) {
   3364 					panic("rf_create_auto_sets: No memory!");
   3365 				}
   3366 				cset->ac = ac;
   3367 				ac->next = NULL;
   3368 				cset->next = config_sets;
   3369 				cset->rootable = 0;
   3370 				config_sets = cset;
   3371 			}
   3372 		}
   3373 		ac = ac_next;
   3374 	}
   3375 
   3376 
   3377 	return(config_sets);
   3378 }
   3379 
   3380 static int
   3381 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3382 {
   3383 	RF_ComponentLabel_t *clabel1, *clabel2;
   3384 
   3385 	/* If this one matches the *first* one in the set, that's good
   3386 	   enough, since the other members of the set would have been
   3387 	   through here too... */
   3388 	/* note that we are not checking partitionSize here..
   3389 
   3390 	   Note that we are also not checking the mod_counters here.
   3391 	   If everything else matches except the mod_counter, that's
   3392 	   good enough for this test.  We will deal with the mod_counters
   3393 	   a little later in the autoconfiguration process.
   3394 
   3395 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3396 
   3397 	   The reason we don't check for this is that failed disks
   3398 	   will have lower modification counts.  If those disks are
   3399 	   not added to the set they used to belong to, then they will
   3400 	   form their own set, which may result in 2 different sets,
   3401 	   for example, competing to be configured at raid0, and
   3402 	   perhaps competing to be the root filesystem set.  If the
   3403 	   wrong ones get configured, or both attempt to become /,
   3404 	   weird behaviour and or serious lossage will occur.  Thus we
   3405 	   need to bring them into the fold here, and kick them out at
   3406 	   a later point.
   3407 
   3408 	*/
   3409 
   3410 	clabel1 = cset->ac->clabel;
   3411 	clabel2 = ac->clabel;
   3412 	if ((clabel1->version == clabel2->version) &&
   3413 	    (clabel1->serial_number == clabel2->serial_number) &&
   3414 	    (clabel1->num_rows == clabel2->num_rows) &&
   3415 	    (clabel1->num_columns == clabel2->num_columns) &&
   3416 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3417 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3418 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3419 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3420 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3421 	    (clabel1->blockSize == clabel2->blockSize) &&
   3422 	    rf_component_label_numblocks(clabel1) ==
   3423 	    rf_component_label_numblocks(clabel2) &&
   3424 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3425 	    (clabel1->root_partition == clabel2->root_partition) &&
   3426 	    (clabel1->last_unit == clabel2->last_unit) &&
   3427 	    (clabel1->config_order == clabel2->config_order)) {
   3428 		/* if it get's here, it almost *has* to be a match */
   3429 	} else {
   3430 		/* it's not consistent with somebody in the set..
   3431 		   punt */
   3432 		return(0);
   3433 	}
   3434 	/* all was fine.. it must fit... */
   3435 	return(1);
   3436 }
   3437 
/*
 * Decide whether a config set has enough live components (at the
 * newest mod_counter) to be configured.  For RAID 1 the components
 * are treated as even/odd mirror pairs: losing both halves of a pair
 * is fatal; for RAID 0 any missing component is fatal; for RAID 4/5
 * more than one missing component is fatal.  Returns 1 if the set is
 * configurable, 0 otherwise.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* the set's true mod_counter is the maximum over all members;
	   stale (failed) components carry lower values */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* look for a current (mod_counter matches) component
		   claiming column c */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3540 
   3541 void
   3542 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3543 			RF_Raid_t *raidPtr)
   3544 {
   3545 	RF_ComponentLabel_t *clabel;
   3546 	int i;
   3547 
   3548 	clabel = ac->clabel;
   3549 
   3550 	/* 1. Fill in the common stuff */
   3551 	config->numRow = clabel->num_rows = 1;
   3552 	config->numCol = clabel->num_columns;
   3553 	config->numSpare = 0; /* XXX should this be set here? */
   3554 	config->sectPerSU = clabel->sectPerSU;
   3555 	config->SUsPerPU = clabel->SUsPerPU;
   3556 	config->SUsPerRU = clabel->SUsPerRU;
   3557 	config->parityConfig = clabel->parityConfig;
   3558 	/* XXX... */
   3559 	strcpy(config->diskQueueType,"fifo");
   3560 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3561 	config->layoutSpecificSize = 0; /* XXX ?? */
   3562 
   3563 	while(ac!=NULL) {
   3564 		/* row/col values will be in range due to the checks
   3565 		   in reasonable_label() */
   3566 		strcpy(config->devnames[0][ac->clabel->column],
   3567 		       ac->devname);
   3568 		ac = ac->next;
   3569 	}
   3570 
   3571 	for(i=0;i<RF_MAXDBGV;i++) {
   3572 		config->debugVars[i][0] = 0;
   3573 	}
   3574 }
   3575 
   3576 int
   3577 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3578 {
   3579 	RF_ComponentLabel_t *clabel;
   3580 	int column;
   3581 	int sparecol;
   3582 
   3583 	raidPtr->autoconfigure = new_value;
   3584 
   3585 	for(column=0; column<raidPtr->numCol; column++) {
   3586 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3587 			clabel = raidget_component_label(raidPtr, column);
   3588 			clabel->autoconfigure = new_value;
   3589 			raidflush_component_label(raidPtr, column);
   3590 		}
   3591 	}
   3592 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3593 		sparecol = raidPtr->numCol + column;
   3594 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3595 			clabel = raidget_component_label(raidPtr, sparecol);
   3596 			clabel->autoconfigure = new_value;
   3597 			raidflush_component_label(raidPtr, sparecol);
   3598 		}
   3599 	}
   3600 	return(new_value);
   3601 }
   3602 
   3603 int
   3604 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3605 {
   3606 	RF_ComponentLabel_t *clabel;
   3607 	int column;
   3608 	int sparecol;
   3609 
   3610 	raidPtr->root_partition = new_value;
   3611 	for(column=0; column<raidPtr->numCol; column++) {
   3612 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3613 			clabel = raidget_component_label(raidPtr, column);
   3614 			clabel->root_partition = new_value;
   3615 			raidflush_component_label(raidPtr, column);
   3616 		}
   3617 	}
   3618 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3619 		sparecol = raidPtr->numCol + column;
   3620 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3621 			clabel = raidget_component_label(raidPtr, sparecol);
   3622 			clabel->root_partition = new_value;
   3623 			raidflush_component_label(raidPtr, sparecol);
   3624 		}
   3625 	}
   3626 	return(new_value);
   3627 }
   3628 
   3629 void
   3630 rf_release_all_vps(RF_ConfigSet_t *cset)
   3631 {
   3632 	RF_AutoConfig_t *ac;
   3633 
   3634 	ac = cset->ac;
   3635 	while(ac!=NULL) {
   3636 		/* Close the vp, and give it back */
   3637 		if (ac->vp) {
   3638 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3639 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
   3640 			vput(ac->vp);
   3641 			ac->vp = NULL;
   3642 		}
   3643 		ac = ac->next;
   3644 	}
   3645 }
   3646 
   3647 
   3648 void
   3649 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3650 {
   3651 	RF_AutoConfig_t *ac;
   3652 	RF_AutoConfig_t *next_ac;
   3653 
   3654 	ac = cset->ac;
   3655 	while(ac!=NULL) {
   3656 		next_ac = ac->next;
   3657 		/* nuke the label */
   3658 		free(ac->clabel, M_RAIDFRAME);
   3659 		/* cleanup the config structure */
   3660 		free(ac, M_RAIDFRAME);
   3661 		/* "next.." */
   3662 		ac = next_ac;
   3663 	}
   3664 	/* and, finally, nuke the config set */
   3665 	free(cset, M_RAIDFRAME);
   3666 }
   3667 
   3668 
/*
 * Initialize a component label from the current in-core state of the
 * RAID set.  Per-component fields (row/column) are NOT set here; the
 * caller fills those in afterwards.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3701 
/*
 * Autoconfigure one config set: pick a raid unit (preferring the unit
 * the set was last configured as, falling back to the next free one),
 * build an RF_Config_t from the component labels, and configure the
 * set.  Returns the softc on success, NULL on failure.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("Out of mem!?!?\n");
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* walk upward from the preferred unit until a free one is found */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID); sc->sc_r.valid != 0; sc = raidget(++raidID))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		/* labels start dirty; marked clean again on shutdown */
		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		if (cset->ac->clabel->root_partition==1) {
			/* everything configured just fine.  Make a note
			   that this set is eligible to be root. */
			cset->rootable = 1;
			/* XXX do this here? */
			raidPtr->root_partition = 1;
		}
	} else {
		/* configuration failed: give the unit back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3767 
   3768 void
   3769 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
   3770 {
   3771 	struct buf *bp;
   3772 	struct raid_softc *rs;
   3773 
   3774 	bp = (struct buf *)desc->bp;
   3775 	rs = desc->raidPtr->softc;
   3776 	disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid),
   3777 	    (bp->b_flags & B_READ));
   3778 }
   3779 
/*
 * Initialize a pool used by the RAIDframe driver and pre-allocate
 * `xmin' items so they are available without sleeping; cap the pool
 * at `xmax' items.  `w_chan' is the wait-channel name shown by ps(1).
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	/* Never grow beyond xmax items. */
	pool_sethiwat(p, xmax);
	/* Pre-fill with xmin items and keep at least that many around. */
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
   3789 
   3790 /*
   3791  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buf_queue to see
   3792  * if there is IO pending and if that IO could possibly be done for a
   3793  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3794  * otherwise.
   3795  *
   3796  */
   3797 
   3798 int
   3799 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3800 {
   3801 	struct raid_softc *rs = raidPtr->softc;
   3802 	if ((bufq_peek(rs->buf_queue) != NULL) && raidPtr->openings > 0) {
   3803 		/* there is work to do */
   3804 		return 0;
   3805 	}
   3806 	/* default is nothing to do */
   3807 	return 1;
   3808 }
   3809 
   3810 int
   3811 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3812 {
   3813 	uint64_t numsecs;
   3814 	unsigned secsize;
   3815 	int error;
   3816 
   3817 	error = getdisksize(vp, &numsecs, &secsize);
   3818 	if (error == 0) {
   3819 		diskPtr->blockSize = secsize;
   3820 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3821 		diskPtr->partitionSize = numsecs;
   3822 		return 0;
   3823 	}
   3824 	return error;
   3825 }
   3826 
/*
 * Autoconf match function: raid pseudo-devices always match.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3832 
/*
 * Autoconf attach function.  Nothing to do here; real setup happens
 * when the set is configured (raidinit()).
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{

}
   3838 
   3839 
   3840 static int
   3841 raid_detach(device_t self, int flags)
   3842 {
   3843 	int error;
   3844 	struct raid_softc *rs = device_private(self);
   3845 
   3846 	if ((error = raidlock(rs)) != 0)
   3847 		return (error);
   3848 
   3849 	error = raid_detach_unlocked(rs);
   3850 
   3851 	raidunlock(rs);
   3852 
   3853 	return error;
   3854 }
   3855 
   3856 static void
   3857 rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3858 {
   3859 	prop_dictionary_t disk_info, odisk_info, geom;
   3860 	disk_info = prop_dictionary_create();
   3861 	geom = prop_dictionary_create();
   3862 	prop_dictionary_set_uint64(geom, "sectors-per-unit",
   3863 				   raidPtr->totalSectors);
   3864 	prop_dictionary_set_uint32(geom, "sector-size",
   3865 				   raidPtr->bytesPerSector);
   3866 
   3867 	prop_dictionary_set_uint16(geom, "sectors-per-track",
   3868 				   raidPtr->Layout.dataSectorsPerStripe);
   3869 	prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
   3870 				   4 * raidPtr->numCol);
   3871 
   3872 	prop_dictionary_set_uint64(geom, "cylinders-per-unit",
   3873 	   raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe *
   3874 	   (4 * raidPtr->numCol)));
   3875 
   3876 	prop_dictionary_set(disk_info, "geometry", geom);
   3877 	prop_object_release(geom);
   3878 	prop_dictionary_set(device_properties(rs->sc_dev),
   3879 			    "disk-info", disk_info);
   3880 	odisk_info = rs->sc_dkdev.dk_info;
   3881 	rs->sc_dkdev.dk_info = disk_info;
   3882 	if (odisk_info)
   3883 		prop_object_release(odisk_info);
   3884 }
   3885 
   3886 /*
   3887  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3888  * We end up returning whatever error was returned by the first cache flush
   3889  * that fails.
   3890  */
   3891 
   3892 int
   3893 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3894 {
   3895 	int c, sparecol;
   3896 	int e,error;
   3897 	int force = 1;
   3898 
   3899 	error = 0;
   3900 	for (c = 0; c < raidPtr->numCol; c++) {
   3901 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3902 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3903 					  &force, FWRITE, NOCRED);
   3904 			if (e) {
   3905 				if (e != ENODEV)
   3906 					printf("raid%d: cache flush to component %s failed.\n",
   3907 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3908 				if (error == 0) {
   3909 					error = e;
   3910 				}
   3911 			}
   3912 		}
   3913 	}
   3914 
   3915 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3916 		sparecol = raidPtr->numCol + c;
   3917 		/* Need to ensure that the reconstruct actually completed! */
   3918 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3919 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3920 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3921 			if (e) {
   3922 				if (e != ENODEV)
   3923 					printf("raid%d: cache flush to component %s failed.\n",
   3924 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3925 				if (error == 0) {
   3926 					error = e;
   3927 				}
   3928 			}
   3929 		}
   3930 	}
   3931 	return error;
   3932 }
   3933