      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.305 2014/03/16 05:20:29 dholland Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
      88  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.305 2014/03/16 05:20:29 dholland Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 
    130 #include <prop/proplib.h>
    131 
    132 #include <dev/raidframe/raidframevar.h>
    133 #include <dev/raidframe/raidframeio.h>
    134 #include <dev/raidframe/rf_paritymap.h>
    135 
    136 #include "rf_raid.h"
    137 #include "rf_copyback.h"
    138 #include "rf_dag.h"
    139 #include "rf_dagflags.h"
    140 #include "rf_desc.h"
    141 #include "rf_diskqueue.h"
    142 #include "rf_etimer.h"
    143 #include "rf_general.h"
    144 #include "rf_kintf.h"
    145 #include "rf_options.h"
    146 #include "rf_driver.h"
    147 #include "rf_parityscan.h"
    148 #include "rf_threadstuff.h"
    149 
    150 #ifdef COMPAT_50
    151 #include "rf_compat50.h"
    152 #endif
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    162 static rf_declare_mutex2(rf_sparet_wait_mutex);
    163 static rf_declare_cond2(rf_sparet_wait_cv);
    164 static rf_declare_cond2(rf_sparet_resp_cv);
    165 
    166 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    167 						 * spare table */
    168 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    169 						 * installation process */
    170 #endif
    171 
    172 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    173 
    174 /* prototypes */
    175 static void KernelWakeupFunc(struct buf *);
    176 static void InitBP(struct buf *, struct vnode *, unsigned,
    177     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    178     void *, int, struct proc *);
    179 struct raid_softc;
    180 static void raidinit(struct raid_softc *);
    181 
    182 void raidattach(int);
    183 static int raid_match(device_t, cfdata_t, void *);
    184 static void raid_attach(device_t, device_t, void *);
    185 static int raid_detach(device_t, int);
    186 
    187 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    188     daddr_t, daddr_t);
    189 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t, int);
    191 
    192 static int raidwrite_component_label(unsigned,
    193     dev_t, struct vnode *, RF_ComponentLabel_t *);
    194 static int raidread_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 
    197 
    198 dev_type_open(raidopen);
    199 dev_type_close(raidclose);
    200 dev_type_read(raidread);
    201 dev_type_write(raidwrite);
    202 dev_type_ioctl(raidioctl);
    203 dev_type_strategy(raidstrategy);
    204 dev_type_dump(raiddump);
    205 dev_type_size(raidsize);
    206 
    207 const struct bdevsw raid_bdevsw = {
    208 	.d_open = raidopen,
    209 	.d_close = raidclose,
    210 	.d_strategy = raidstrategy,
    211 	.d_ioctl = raidioctl,
    212 	.d_dump = raiddump,
    213 	.d_psize = raidsize,
    214 	.d_flag = D_DISK
    215 };
    216 
    217 const struct cdevsw raid_cdevsw = {
    218 	.d_open = raidopen,
    219 	.d_close = raidclose,
    220 	.d_read = raidread,
    221 	.d_write = raidwrite,
    222 	.d_ioctl = raidioctl,
    223 	.d_stop = nostop,
    224 	.d_tty = notty,
    225 	.d_poll = nopoll,
    226 	.d_mmap = nommap,
    227 	.d_kqfilter = nokqfilter,
    228 	.d_flag = D_DISK
    229 };
    230 
    231 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
    232 
    233 struct raid_softc {
    234 	device_t sc_dev;
    235 	int	sc_unit;
    236 	int     sc_flags;	/* flags */
    237 	int     sc_cflags;	/* configuration flags */
    238 	uint64_t sc_size;	/* size of the raid device */
    239 	char    sc_xname[20];	/* XXX external name */
    240 	struct disk sc_dkdev;	/* generic disk device info */
    241 	struct bufq_state *buf_queue;	/* used for the device queue */
    242 	RF_Raid_t sc_r;
    243 	LIST_ENTRY(raid_softc) sc_link;
    244 };
    245 /* sc_flags */
    246 #define RAIDF_INITED	0x01	/* unit has been initialized */
    247 #define RAIDF_WLABEL	0x02	/* label area is writable */
    248 #define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
     249 #define RAIDF_SHUTDOWN	0x08	/* unit is being shut down */
    250 #define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
    251 #define RAIDF_LOCKED	0x80	/* unit is locked */
    252 
    253 #define	raidunit(x)	DISKUNIT(x)
    254 
    255 extern struct cfdriver raid_cd;
    256 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    257     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    258     DVF_DETACH_SHUTDOWN);
    259 
    260 /*
     261  * Allow RAIDOUTSTANDING number of simultaneous I/Os to this RAID device.
    262  * Be aware that large numbers can allow the driver to consume a lot of
    263  * kernel memory, especially on writes, and in degraded mode reads.
    264  *
    265  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    266  * a single 64K write will typically require 64K for the old data,
    267  * 64K for the old parity, and 64K for the new parity, for a total
    268  * of 192K (if the parity buffer is not re-used immediately).
     269  * Even if it is used immediately, that's still 128K, which when multiplied
    270  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    271  *
    272  * Now in degraded mode, for example, a 64K read on the above setup may
    273  * require data reconstruction, which will require *all* of the 4 remaining
    274  * disks to participate -- 4 * 32K/disk == 128K again.
    275  */
    276 
    277 #ifndef RAIDOUTSTANDING
    278 #define RAIDOUTSTANDING   6
    279 #endif
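
         /*
          * A rough rule of thumb, following the example above: each
          * outstanding small write can pin roughly three extra buffers of
          * its own size (old data, old parity, new parity) on top of the
          * incoming data, so total buffer usage scales with RAIDOUTSTANDING.
          */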
    280 
    281 #define RAIDLABELDEV(dev)	\
    282 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    283 
    284 /* declared here, and made public, for the benefit of KVM stuff.. */
    285 
    286 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
    287 				     struct disklabel *);
    288 static void raidgetdisklabel(dev_t);
    289 static void raidmakedisklabel(struct raid_softc *);
    290 
    291 static int raidlock(struct raid_softc *);
    292 static void raidunlock(struct raid_softc *);
    293 
    294 static int raid_detach_unlocked(struct raid_softc *);
    295 
    296 static void rf_markalldirty(RF_Raid_t *);
    297 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    298 
    299 void rf_ReconThread(struct rf_recon_req *);
    300 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    301 void rf_CopybackThread(RF_Raid_t *raidPtr);
    302 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    303 int rf_autoconfig(device_t);
    304 void rf_buildroothack(RF_ConfigSet_t *);
    305 
    306 RF_AutoConfig_t *rf_find_raid_components(void);
    307 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    308 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    309 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    310 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    311 int rf_set_autoconfig(RF_Raid_t *, int);
    312 int rf_set_rootpartition(RF_Raid_t *, int);
    313 void rf_release_all_vps(RF_ConfigSet_t *);
    314 void rf_cleanup_config_set(RF_ConfigSet_t *);
    315 int rf_have_enough_components(RF_ConfigSet_t *);
    316 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    317 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    318 
    319 /*
    320  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    321  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    322  * in the kernel config file.
    323  */
    324 #ifdef RAID_AUTOCONFIG
    325 int raidautoconfig = 1;
    326 #else
    327 int raidautoconfig = 0;
    328 #endif
    329 static bool raidautoconfigdone = false;
    330 
    331 struct RF_Pools_s rf_pools;
    332 
    333 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    334 static kmutex_t raid_lock;
    335 
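         /*
          * Allocate a fresh raid_softc for the given unit and set up its
          * buffer queue.  The caller is responsible for linking it onto
          * the global list of RAID sets (see raidget() below).
          */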
    336 static struct raid_softc *
    337 raidcreate(int unit) {
    338 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    339 	if (sc == NULL) {
    340 #ifdef DIAGNOSTIC
    341 		printf("%s: out of memory\n", __func__);
    342 #endif
    343 		return NULL;
    344 	}
    345 	sc->sc_unit = unit;
    346 	bufq_alloc(&sc->buf_queue, BUFQ_DISK_DEFAULT_STRAT, BUFQ_SORT_RAWBLOCK);
    347 	return sc;
    348 }
    349 
    350 static void
    351 raiddestroy(struct raid_softc *sc) {
    352 	bufq_free(sc->buf_queue);
    353 	kmem_free(sc, sizeof(*sc));
    354 }
    355 
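         /*
          * Look up the softc for the given unit, creating and linking a
          * new one on demand.  Returns NULL if the unit number is negative
          * or a new softc cannot be created.
          */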
    356 static struct raid_softc *
    357 raidget(int unit) {
    358 	struct raid_softc *sc;
    359 	if (unit < 0) {
    360 #ifdef DIAGNOSTIC
    361 		panic("%s: unit %d!", __func__, unit);
    362 #endif
    363 		return NULL;
    364 	}
    365 	mutex_enter(&raid_lock);
    366 	LIST_FOREACH(sc, &raids, sc_link) {
    367 		if (sc->sc_unit == unit) {
    368 			mutex_exit(&raid_lock);
    369 			return sc;
    370 		}
    371 	}
    372 	mutex_exit(&raid_lock);
    373 	if ((sc = raidcreate(unit)) == NULL)
    374 		return NULL;
    375 	mutex_enter(&raid_lock);
    376 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    377 	mutex_exit(&raid_lock);
    378 	return sc;
    379 }
    380 
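         /* Unlink the softc from the global list and destroy it. */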
    381 static void
    382 raidput(struct raid_softc *sc) {
    383 	mutex_enter(&raid_lock);
    384 	LIST_REMOVE(sc, sc_link);
    385 	mutex_exit(&raid_lock);
    386 	raiddestroy(sc);
    387 }
    388 
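         /*
          * Pseudo-device attach: initialize the RAIDframe core, attach the
          * cfattach, and register a finalizer so that auto-configuration
          * runs once all real hardware devices have been found.
          */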
    389 void
    390 raidattach(int num)
    391 {
    392 	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
    393 	/* This is where all the initialization stuff gets done. */
    394 
    395 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    396 	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
    397 	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
    398 	rf_init_cond2(rf_sparet_resp_cv, "rfgst");
    399 
    400 	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
    401 #endif
    402 
    403 	if (rf_BootRaidframe() == 0)
    404 		aprint_verbose("Kernelized RAIDframe activated\n");
    405 	else
    406 		panic("Serious error booting RAID!!");
    407 
    408 	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
    409 		aprint_error("raidattach: config_cfattach_attach failed?\n");
    410 	}
    411 
    412 	raidautoconfigdone = false;
    413 
    414 	/*
    415 	 * Register a finalizer which will be used to auto-config RAID
    416 	 * sets once all real hardware devices have been found.
    417 	 */
    418 	if (config_finalize_register(NULL, rf_autoconfig) != 0)
    419 		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
    420 }
    421 
    422 int
    423 rf_autoconfig(device_t self)
    424 {
    425 	RF_AutoConfig_t *ac_list;
    426 	RF_ConfigSet_t *config_sets;
    427 
    428 	if (!raidautoconfig || raidautoconfigdone == true)
    429 		return (0);
    430 
    431 	/* XXX This code can only be run once. */
    432 	raidautoconfigdone = true;
    433 
    434 	/* 1. locate all RAID components on the system */
    435 	aprint_debug("Searching for RAID components...\n");
    436 	ac_list = rf_find_raid_components();
    437 
    438 	/* 2. Sort them into their respective sets. */
    439 	config_sets = rf_create_auto_sets(ac_list);
    440 
    441 	/*
    442 	 * 3. Evaluate each set and configure the valid ones.
    443 	 * This gets done in rf_buildroothack().
    444 	 */
    445 	rf_buildroothack(config_sets);
    446 
    447 	return 1;
    448 }
    449 
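         /*
          * Walk the list of detected configuration sets, auto-configure
          * the eligible ones, and then try to work out which RAID set (if
          * any) holds the root file system.
          */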
    450 void
    451 rf_buildroothack(RF_ConfigSet_t *config_sets)
    452 {
    453 	RF_ConfigSet_t *cset;
    454 	RF_ConfigSet_t *next_cset;
    455 	int col;
    456 	int num_root;
    457 	char *devname;
    458 	struct raid_softc *sc, *rsc;
    459 
    460 	sc = rsc = NULL;
    461 	num_root = 0;
    462 	cset = config_sets;
    463 	while (cset != NULL) {
    464 		next_cset = cset->next;
    465 		if (rf_have_enough_components(cset) &&
    466 		    cset->ac->clabel->autoconfigure == 1) {
    467 			sc = rf_auto_config_set(cset);
    468 			if (sc != NULL) {
    469 				aprint_debug("raid%d: configured ok\n",
    470 				    sc->sc_unit);
    471 				if (cset->rootable) {
    472 					rsc = sc;
    473 					num_root++;
    474 				}
    475 			} else {
    476 				/* The autoconfig didn't work :( */
    477 				aprint_debug("Autoconfig failed\n");
    478 				rf_release_all_vps(cset);
    479 			}
    480 		} else {
    481 			/* we're not autoconfiguring this set...
    482 			   release the associated resources */
    483 			rf_release_all_vps(cset);
    484 		}
    485 		/* cleanup */
    486 		rf_cleanup_config_set(cset);
    487 		cset = next_cset;
    488 	}
    489 
    490 	/* if the user has specified what the root device should be
    491 	   then we don't touch booted_device or boothowto... */
    492 
    493 	if (rootspec != NULL)
    494 		return;
    495 
    496 	/* we found something bootable... */
    497 
    498 	if (num_root == 1) {
    499 		if (rsc->sc_dkdev.dk_nwedges != 0) {
    500 			/* XXX: How do we find the real root partition? */
    501 			char cname[sizeof(cset->ac->devname)];
    502 			snprintf(cname, sizeof(cname), "%s%c",
    503 			    device_xname(rsc->sc_dev), 'a');
    504 			booted_device = dkwedge_find_by_wname(cname);
    505 		} else
    506 			booted_device = rsc->sc_dev;
    507 	} else if (num_root > 1) {
    508 
    509 		/*
    510 		 * Maybe the MD code can help. If it cannot, then
    511 		 * setroot() will discover that we have no
    512 		 * booted_device and will ask the user if nothing was
    513 		 * hardwired in the kernel config file
    514 		 */
    515 
    516 		if (booted_device == NULL)
    517 			cpu_rootconf();
    518 		if (booted_device == NULL)
    519 			return;
    520 
    521 		num_root = 0;
    522 		mutex_enter(&raid_lock);
    523 		LIST_FOREACH(sc, &raids, sc_link) {
    524 			RF_Raid_t *r = &sc->sc_r;
    525 			if (r->valid == 0)
    526 				continue;
    527 
    528 			if (r->root_partition == 0)
    529 				continue;
    530 
    531 			for (col = 0; col < r->numCol; col++) {
    532 				devname = r->Disks[col].devname;
    533 				devname += sizeof("/dev/") - 1;
    534 				if (strncmp(devname, device_xname(booted_device),
    535 					    strlen(device_xname(booted_device))) != 0)
    536 					continue;
    537 				aprint_debug("raid%d includes boot device %s\n",
    538 				       sc->sc_unit, devname);
    539 				num_root++;
    540 				rsc = sc;
    541 			}
    542 		}
    543 		mutex_exit(&raid_lock);
    544 
    545 		if (num_root == 1) {
    546 			booted_device = rsc->sc_dev;
    547 		} else {
    548 			/* we can't guess.. require the user to answer... */
    549 			boothowto |= RB_ASKNAME;
    550 		}
    551 	}
    552 }
    553 
    554 
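         /*
          * Return the size, in DEV_BSIZE sectors, of the indicated
          * partition, for use when sizing dumps; only FS_SWAP partitions
          * are considered here.
          */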
    555 int
    556 raidsize(dev_t dev)
    557 {
    558 	struct raid_softc *rs;
    559 	struct disklabel *lp;
    560 	int     part, unit, omask, size;
    561 
    562 	unit = raidunit(dev);
    563 	if ((rs = raidget(unit)) == NULL)
    564 		return -1;
    565 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    566 		return (-1);
    567 
    568 	part = DISKPART(dev);
    569 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
    570 	lp = rs->sc_dkdev.dk_label;
    571 
    572 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
    573 		return (-1);
    574 
    575 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
    576 		size = -1;
    577 	else
    578 		size = lp->d_partitions[part].p_size *
    579 		    (lp->d_secsize / DEV_BSIZE);
    580 
    581 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
    582 		return (-1);
    583 
    584 	return (size);
    585 
    586 }
    587 
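         /*
          * Dump kernel memory to a live component of this RAID set.  Only
          * RAID 1 sets (one data column, one parity column) are supported;
          * the preferred dump target is chosen as described below.
          */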
    588 int
    589 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    590 {
    591 	int     unit = raidunit(dev);
    592 	struct raid_softc *rs;
    593 	const struct bdevsw *bdev;
    594 	struct disklabel *lp;
    595 	RF_Raid_t *raidPtr;
    596 	daddr_t offset;
    597 	int     part, c, sparecol, j, scol, dumpto;
    598 	int     error = 0;
    599 
    600 	if ((rs = raidget(unit)) == NULL)
    601 		return ENXIO;
    602 
    603 	raidPtr = &rs->sc_r;
    604 
    605 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    606 		return ENXIO;
    607 
    608 	/* we only support dumping to RAID 1 sets */
    609 	if (raidPtr->Layout.numDataCol != 1 ||
    610 	    raidPtr->Layout.numParityCol != 1)
    611 		return EINVAL;
    612 
    613 
    614 	if ((error = raidlock(rs)) != 0)
    615 		return error;
    616 
    617 	if (size % DEV_BSIZE != 0) {
    618 		error = EINVAL;
    619 		goto out;
    620 	}
    621 
    622 	if (blkno + size / DEV_BSIZE > rs->sc_size) {
    623 		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
    624 		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
    625 		    size / DEV_BSIZE, rs->sc_size);
    626 		error = EINVAL;
    627 		goto out;
    628 	}
    629 
    630 	part = DISKPART(dev);
    631 	lp = rs->sc_dkdev.dk_label;
    632 	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;
    633 
    634 	/* figure out what device is alive.. */
    635 
    636 	/*
    637 	   Look for a component to dump to.  The preference for the
    638 	   component to dump to is as follows:
    639 	   1) the master
    640 	   2) a used_spare of the master
    641 	   3) the slave
    642 	   4) a used_spare of the slave
    643 	*/
    644 
    645 	dumpto = -1;
    646 	for (c = 0; c < raidPtr->numCol; c++) {
    647 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
    648 			/* this might be the one */
    649 			dumpto = c;
    650 			break;
    651 		}
    652 	}
    653 
    654 	/*
    655 	   At this point we have possibly selected a live master or a
    656 	   live slave.  We now check to see if there is a spared
    657 	   master (or a spared slave), if we didn't find a live master
    658 	   or a live slave.
    659 	*/
    660 
    661 	for (c = 0; c < raidPtr->numSpare; c++) {
    662 		sparecol = raidPtr->numCol + c;
    663 		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
    664 			/* How about this one? */
    665 			scol = -1;
    666 			for(j=0;j<raidPtr->numCol;j++) {
    667 				if (raidPtr->Disks[j].spareCol == sparecol) {
    668 					scol = j;
    669 					break;
    670 				}
    671 			}
    672 			if (scol == 0) {
    673 				/*
    674 				   We must have found a spared master!
    675 				   We'll take that over anything else
    676 				   found so far.  (We couldn't have
    677 				   found a real master before, since
    678 				   this is a used spare, and it's
    679 				   saying that it's replacing the
    680 				   master.)  On reboot (with
    681 				   autoconfiguration turned on)
    682 				   sparecol will become the 1st
    683 				   component (component0) of this set.
    684 				*/
    685 				dumpto = sparecol;
    686 				break;
    687 			} else if (scol != -1) {
    688 				/*
    689 				   Must be a spared slave.  We'll dump
     690 				   to that if we haven't found anything
    691 				   else so far.
    692 				*/
    693 				if (dumpto == -1)
    694 					dumpto = sparecol;
    695 			}
    696 		}
    697 	}
    698 
    699 	if (dumpto == -1) {
    700 		/* we couldn't find any live components to dump to!?!?
    701 		 */
    702 		error = EINVAL;
    703 		goto out;
    704 	}
    705 
    706 	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
    707 
    708 	/*
    709 	   Note that blkno is relative to this particular partition.
    710 	   By adding the offset of this partition in the RAID
    711 	   set, and also adding RF_PROTECTED_SECTORS, we get a
    712 	   value that is relative to the partition used for the
    713 	   underlying component.
    714 	*/
    715 
    716 	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
    717 				blkno + offset, va, size);
    718 
    719 out:
    720 	raidunlock(rs);
    721 
    722 	return error;
    723 }
    724 /* ARGSUSED */
    725 int
    726 raidopen(dev_t dev, int flags, int fmt,
    727     struct lwp *l)
    728 {
    729 	int     unit = raidunit(dev);
    730 	struct raid_softc *rs;
    731 	struct disklabel *lp;
    732 	int     part, pmask;
    733 	int     error = 0;
    734 
    735 	if ((rs = raidget(unit)) == NULL)
    736 		return ENXIO;
    737 	if ((error = raidlock(rs)) != 0)
    738 		return (error);
    739 
    740 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    741 		error = EBUSY;
    742 		goto bad;
    743 	}
    744 
    745 	lp = rs->sc_dkdev.dk_label;
    746 
    747 	part = DISKPART(dev);
    748 
    749 	/*
    750 	 * If there are wedges, and this is not RAW_PART, then we
    751 	 * need to fail.
    752 	 */
    753 	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
    754 		error = EBUSY;
    755 		goto bad;
    756 	}
    757 	pmask = (1 << part);
    758 
    759 	if ((rs->sc_flags & RAIDF_INITED) &&
    760 	    (rs->sc_dkdev.dk_openmask == 0))
    761 		raidgetdisklabel(dev);
    762 
    763 	/* make sure that this partition exists */
    764 
    765 	if (part != RAW_PART) {
    766 		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
    767 		    ((part >= lp->d_npartitions) ||
    768 			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
    769 			error = ENXIO;
    770 			goto bad;
    771 		}
    772 	}
    773 	/* Prevent this unit from being unconfigured while open. */
    774 	switch (fmt) {
    775 	case S_IFCHR:
    776 		rs->sc_dkdev.dk_copenmask |= pmask;
    777 		break;
    778 
    779 	case S_IFBLK:
    780 		rs->sc_dkdev.dk_bopenmask |= pmask;
    781 		break;
    782 	}
    783 
    784 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    785 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    786 		/* First one... mark things as dirty... Note that we *MUST*
    787 		 have done a configure before this.  I DO NOT WANT TO BE
    788 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    789 		 THAT THEY BELONG TOGETHER!!!!! */
    790 		/* XXX should check to see if we're only open for reading
    791 		   here... If so, we needn't do this, but then need some
    792 		   other way of keeping track of what's happened.. */
    793 
    794 		rf_markalldirty(&rs->sc_r);
    795 	}
    796 
    797 
    798 	rs->sc_dkdev.dk_openmask =
    799 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    800 
    801 bad:
    802 	raidunlock(rs);
    803 
    804 	return (error);
    805 
    806 
    807 }
    808 /* ARGSUSED */
    809 int
    810 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    811 {
    812 	int     unit = raidunit(dev);
    813 	struct raid_softc *rs;
    814 	int     error = 0;
    815 	int     part;
    816 
    817 	if ((rs = raidget(unit)) == NULL)
    818 		return ENXIO;
    819 
    820 	if ((error = raidlock(rs)) != 0)
    821 		return (error);
    822 
    823 	part = DISKPART(dev);
    824 
    825 	/* ...that much closer to allowing unconfiguration... */
    826 	switch (fmt) {
    827 	case S_IFCHR:
    828 		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
    829 		break;
    830 
    831 	case S_IFBLK:
    832 		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
    833 		break;
    834 	}
    835 	rs->sc_dkdev.dk_openmask =
    836 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    837 
    838 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    839 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
     840 		/* Last one... the device is not unconfigured yet.
     841 		   Mark things as clean.  (If RAIDF_INITED were not set,
     842 		   device shutdown would already have taken care of
     843 		   setting the clean bits.) */
    844 
    845 		rf_update_component_labels(&rs->sc_r,
    846 						 RF_FINAL_COMPONENT_UPDATE);
    847 
    848 		/* If the kernel is shutting down, it will detach
    849 		 * this RAID set soon enough.
    850 		 */
    851 	}
    852 
    853 	raidunlock(rs);
    854 	return (0);
    855 
    856 }
    857 
    858 void
    859 raidstrategy(struct buf *bp)
    860 {
    861 	unsigned int unit = raidunit(bp->b_dev);
    862 	RF_Raid_t *raidPtr;
    863 	int     wlabel;
    864 	struct raid_softc *rs;
    865 
    866 	if ((rs = raidget(unit)) == NULL) {
    867 		bp->b_error = ENXIO;
    868 		goto done;
    869 	}
    870 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    871 		bp->b_error = ENXIO;
    872 		goto done;
    873 	}
    874 	raidPtr = &rs->sc_r;
    875 	if (!raidPtr->valid) {
    876 		bp->b_error = ENODEV;
    877 		goto done;
    878 	}
    879 	if (bp->b_bcount == 0) {
    880 		db1_printf(("b_bcount is zero..\n"));
    881 		goto done;
    882 	}
    883 
    884 	/*
    885 	 * Do bounds checking and adjust transfer.  If there's an
    886 	 * error, the bounds check will flag that for us.
    887 	 */
    888 
    889 	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
    890 	if (DISKPART(bp->b_dev) == RAW_PART) {
     891 		uint64_t size; /* device size in DEV_BSIZE units */
    892 
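         		/*
         		 * Convert totalSectors (in native sectors of
         		 * 2^logBytesPerSector bytes) into DEV_BSIZE units;
         		 * e.g. with 2048-byte sectors and the usual 512-byte
         		 * DEV_BSIZE this shifts left by 2.
         		 */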
    893 		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
    894 			size = raidPtr->totalSectors <<
    895 			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
    896 		} else {
    897 			size = raidPtr->totalSectors >>
    898 			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
    899 		}
    900 		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
    901 			goto done;
    902 		}
    903 	} else {
    904 		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
    905 			db1_printf(("Bounds check failed!!:%d %d\n",
    906 				(int) bp->b_blkno, (int) wlabel));
    907 			goto done;
    908 		}
    909 	}
    910 
    911 	rf_lock_mutex2(raidPtr->iodone_lock);
    912 
    913 	bp->b_resid = 0;
    914 
    915 	/* stuff it onto our queue */
    916 	bufq_put(rs->buf_queue, bp);
    917 
     918 	/* schedule the I/O to happen at the next convenient time */
    919 	rf_signal_cond2(raidPtr->iodone_cv);
    920 	rf_unlock_mutex2(raidPtr->iodone_lock);
    921 
    922 	return;
    923 
    924 done:
    925 	bp->b_resid = bp->b_bcount;
    926 	biodone(bp);
    927 }
    928 /* ARGSUSED */
    929 int
    930 raidread(dev_t dev, struct uio *uio, int flags)
    931 {
    932 	int     unit = raidunit(dev);
    933 	struct raid_softc *rs;
    934 
    935 	if ((rs = raidget(unit)) == NULL)
    936 		return ENXIO;
    937 
    938 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    939 		return (ENXIO);
    940 
    941 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    942 
    943 }
    944 /* ARGSUSED */
    945 int
    946 raidwrite(dev_t dev, struct uio *uio, int flags)
    947 {
    948 	int     unit = raidunit(dev);
    949 	struct raid_softc *rs;
    950 
    951 	if ((rs = raidget(unit)) == NULL)
    952 		return ENXIO;
    953 
    954 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    955 		return (ENXIO);
    956 
    957 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    958 
    959 }
    960 
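         /*
          * Tear down the RAID set (if it is initialized) and detach and
          * destroy the disk structures.  Fails with EBUSY while any
          * partition is still open; as the name suggests, any locking is
          * the caller's responsibility.
          */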
    961 static int
    962 raid_detach_unlocked(struct raid_softc *rs)
    963 {
    964 	int error;
    965 	RF_Raid_t *raidPtr;
    966 
    967 	raidPtr = &rs->sc_r;
    968 
    969 	/*
    970 	 * If somebody has a partition mounted, we shouldn't
    971 	 * shutdown.
    972 	 */
    973 	if (rs->sc_dkdev.dk_openmask != 0)
    974 		return EBUSY;
    975 
    976 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    977 		;	/* not initialized: nothing to do */
    978 	else if ((error = rf_Shutdown(raidPtr)) != 0)
    979 		return error;
    980 	else
    981 		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
    982 
    983 	/* Detach the disk. */
    984 	dkwedge_delall(&rs->sc_dkdev);
    985 	disk_detach(&rs->sc_dkdev);
    986 	disk_destroy(&rs->sc_dkdev);
    987 
    988 	aprint_normal_dev(rs->sc_dev, "detached\n");
    989 
    990 	return 0;
    991 }
    992 
    993 int
    994 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
    995 {
    996 	int     unit = raidunit(dev);
    997 	int     error = 0;
    998 	int     part, pmask, s;
    999 	cfdata_t cf;
   1000 	struct raid_softc *rs;
   1001 	RF_Config_t *k_cfg, *u_cfg;
   1002 	RF_Raid_t *raidPtr;
   1003 	RF_RaidDisk_t *diskPtr;
   1004 	RF_AccTotals_t *totals;
   1005 	RF_DeviceConfig_t *d_cfg, **ucfgp;
   1006 	u_char *specific_buf;
   1007 	int retcode = 0;
   1008 	int column;
   1009 /*	int raidid; */
   1010 	struct rf_recon_req *rrcopy, *rr;
   1011 	RF_ComponentLabel_t *clabel;
   1012 	RF_ComponentLabel_t *ci_label;
   1013 	RF_ComponentLabel_t **clabel_ptr;
   1014 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1015 	RF_SingleComponent_t component;
   1016 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
   1017 	int i, j, d;
   1018 #ifdef __HAVE_OLD_DISKLABEL
   1019 	struct disklabel newlabel;
   1020 #endif
   1021 	struct dkwedge_info *dkw;
   1022 
   1023 	if ((rs = raidget(unit)) == NULL)
   1024 		return ENXIO;
   1025 	raidPtr = &rs->sc_r;
   1026 
   1027 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1028 		(int) DISKPART(dev), (int) unit, cmd));
   1029 
   1030 	/* Must be open for writes for these commands... */
   1031 	switch (cmd) {
   1032 #ifdef DIOCGSECTORSIZE
   1033 	case DIOCGSECTORSIZE:
   1034 		*(u_int *)data = raidPtr->bytesPerSector;
   1035 		return 0;
   1036 	case DIOCGMEDIASIZE:
   1037 		*(off_t *)data =
   1038 		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
   1039 		return 0;
   1040 #endif
   1041 	case DIOCSDINFO:
   1042 	case DIOCWDINFO:
   1043 #ifdef __HAVE_OLD_DISKLABEL
   1044 	case ODIOCWDINFO:
   1045 	case ODIOCSDINFO:
   1046 #endif
   1047 	case DIOCWLABEL:
   1048 	case DIOCAWEDGE:
   1049 	case DIOCDWEDGE:
   1050 	case DIOCSSTRATEGY:
   1051 		if ((flag & FWRITE) == 0)
   1052 			return (EBADF);
   1053 	}
   1054 
   1055 	/* Must be initialized for these... */
   1056 	switch (cmd) {
   1057 	case DIOCGDINFO:
   1058 	case DIOCSDINFO:
   1059 	case DIOCWDINFO:
   1060 #ifdef __HAVE_OLD_DISKLABEL
   1061 	case ODIOCGDINFO:
   1062 	case ODIOCWDINFO:
   1063 	case ODIOCSDINFO:
   1064 	case ODIOCGDEFLABEL:
   1065 #endif
   1066 	case DIOCGPART:
   1067 	case DIOCWLABEL:
   1068 	case DIOCGDEFLABEL:
   1069 	case DIOCAWEDGE:
   1070 	case DIOCDWEDGE:
   1071 	case DIOCLWEDGES:
   1072 	case DIOCCACHESYNC:
   1073 	case RAIDFRAME_SHUTDOWN:
   1074 	case RAIDFRAME_REWRITEPARITY:
   1075 	case RAIDFRAME_GET_INFO:
   1076 	case RAIDFRAME_RESET_ACCTOTALS:
   1077 	case RAIDFRAME_GET_ACCTOTALS:
   1078 	case RAIDFRAME_KEEP_ACCTOTALS:
   1079 	case RAIDFRAME_GET_SIZE:
   1080 	case RAIDFRAME_FAIL_DISK:
   1081 	case RAIDFRAME_COPYBACK:
   1082 	case RAIDFRAME_CHECK_RECON_STATUS:
   1083 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1084 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1085 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1086 	case RAIDFRAME_ADD_HOT_SPARE:
   1087 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1088 	case RAIDFRAME_INIT_LABELS:
   1089 	case RAIDFRAME_REBUILD_IN_PLACE:
   1090 	case RAIDFRAME_CHECK_PARITY:
   1091 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1092 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1093 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1094 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1095 	case RAIDFRAME_SET_AUTOCONFIG:
   1096 	case RAIDFRAME_SET_ROOT:
   1097 	case RAIDFRAME_DELETE_COMPONENT:
   1098 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1099 	case RAIDFRAME_PARITYMAP_STATUS:
   1100 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1101 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1102 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1103 	case DIOCGSTRATEGY:
   1104 	case DIOCSSTRATEGY:
   1105 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1106 			return (ENXIO);
   1107 	}
   1108 
   1109 	switch (cmd) {
   1110 #ifdef COMPAT_50
   1111 	case RAIDFRAME_GET_INFO50:
   1112 		return rf_get_info50(raidPtr, data);
   1113 
   1114 	case RAIDFRAME_CONFIGURE50:
   1115 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1116 			return retcode;
   1117 		goto config;
   1118 #endif
   1119 		/* configure the system */
   1120 	case RAIDFRAME_CONFIGURE:
   1121 
   1122 		if (raidPtr->valid) {
   1123 			/* There is a valid RAID set running on this unit! */
   1124 			printf("raid%d: Device already configured!\n",unit);
   1125 			return(EINVAL);
   1126 		}
   1127 
   1128 		/* copy-in the configuration information */
   1129 		/* data points to a pointer to the configuration structure */
   1130 
   1131 		u_cfg = *((RF_Config_t **) data);
   1132 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1133 		if (k_cfg == NULL) {
   1134 			return (ENOMEM);
   1135 		}
   1136 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1137 		if (retcode) {
   1138 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1139 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1140 				retcode));
   1141 			return (retcode);
   1142 		}
   1143 		goto config;
   1144 	config:
   1145 		/* allocate a buffer for the layout-specific data, and copy it
   1146 		 * in */
   1147 		if (k_cfg->layoutSpecificSize) {
   1148 			if (k_cfg->layoutSpecificSize > 10000) {
   1149 				/* sanity check */
   1150 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1151 				return (EINVAL);
   1152 			}
   1153 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1154 			    (u_char *));
   1155 			if (specific_buf == NULL) {
   1156 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1157 				return (ENOMEM);
   1158 			}
   1159 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1160 			    k_cfg->layoutSpecificSize);
   1161 			if (retcode) {
   1162 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1163 				RF_Free(specific_buf,
   1164 					k_cfg->layoutSpecificSize);
   1165 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1166 					retcode));
   1167 				return (retcode);
   1168 			}
   1169 		} else
   1170 			specific_buf = NULL;
   1171 		k_cfg->layoutSpecific = specific_buf;
   1172 
   1173 		/* should do some kind of sanity check on the configuration.
   1174 		 * Store the sum of all the bytes in the last byte? */
   1175 
   1176 		/* configure the system */
   1177 
   1178 		/*
   1179 		 * Clear the entire RAID descriptor, just to make sure
   1180 		 *  there is no stale data left in the case of a
   1181 		 *  reconfiguration
   1182 		 */
   1183 		memset(raidPtr, 0, sizeof(*raidPtr));
   1184 		raidPtr->softc = rs;
   1185 		raidPtr->raidid = unit;
   1186 
   1187 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1188 
   1189 		if (retcode == 0) {
   1190 
   1191 			/* allow this many simultaneous IO's to
   1192 			   this RAID device */
   1193 			raidPtr->openings = RAIDOUTSTANDING;
   1194 
   1195 			raidinit(rs);
   1196 			rf_markalldirty(raidPtr);
   1197 		}
   1198 		/* free the buffers.  No return code here. */
   1199 		if (k_cfg->layoutSpecificSize) {
   1200 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1201 		}
   1202 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1203 
   1204 		return (retcode);
   1205 
   1206 		/* shutdown the system */
   1207 	case RAIDFRAME_SHUTDOWN:
   1208 
   1209 		part = DISKPART(dev);
   1210 		pmask = (1 << part);
   1211 
   1212 		if ((error = raidlock(rs)) != 0)
   1213 			return (error);
   1214 
   1215 		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
   1216 		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
   1217 			(rs->sc_dkdev.dk_copenmask & pmask)))
   1218 			retcode = EBUSY;
   1219 		else {
   1220 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1221 			rs->sc_dkdev.dk_copenmask &= ~pmask;
   1222 			rs->sc_dkdev.dk_bopenmask &= ~pmask;
   1223 			rs->sc_dkdev.dk_openmask &= ~pmask;
   1224 			retcode = 0;
   1225 		}
   1226 
   1227 		raidunlock(rs);
   1228 
   1229 		if (retcode != 0)
   1230 			return retcode;
   1231 
   1232 		/* free the pseudo device attach bits */
   1233 
   1234 		cf = device_cfdata(rs->sc_dev);
   1235 		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
   1236 			free(cf, M_RAIDFRAME);
   1237 
   1238 		return (retcode);
   1239 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1240 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1241 		/* need to read the component label for the disk indicated
   1242 		   by row,column in clabel */
   1243 
   1244 		/*
   1245 		 * Perhaps there should be an option to skip the in-core
   1246 		 * copy and hit the disk, as with disklabel(8).
   1247 		 */
   1248 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1249 
   1250 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1251 
   1252 		if (retcode) {
   1253 			RF_Free(clabel, sizeof(*clabel));
   1254 			return retcode;
   1255 		}
   1256 
   1257 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1258 
   1259 		column = clabel->column;
   1260 
   1261 		if ((column < 0) || (column >= raidPtr->numCol +
   1262 		    raidPtr->numSpare)) {
   1263 			RF_Free(clabel, sizeof(*clabel));
   1264 			return EINVAL;
   1265 		}
   1266 
   1267 		RF_Free(clabel, sizeof(*clabel));
   1268 
   1269 		clabel = raidget_component_label(raidPtr, column);
   1270 
   1271 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1272 
   1273 #if 0
   1274 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1275 		clabel = (RF_ComponentLabel_t *) data;
   1276 
   1277 		/* XXX check the label for valid stuff... */
   1278 		/* Note that some things *should not* get modified --
   1279 		   the user should be re-initing the labels instead of
   1280 		   trying to patch things.
   1281 		   */
   1282 
   1283 		raidid = raidPtr->raidid;
   1284 #ifdef DEBUG
   1285 		printf("raid%d: Got component label:\n", raidid);
   1286 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1287 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1288 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1289 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1290 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1291 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1292 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1293 #endif
   1294 		clabel->row = 0;
   1295 		column = clabel->column;
   1296 
   1297 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1298 			return(EINVAL);
   1299 		}
   1300 
   1301 		/* XXX this isn't allowed to do anything for now :-) */
   1302 
   1303 		/* XXX and before it is, we need to fill in the rest
   1304 		   of the fields!?!?!?! */
   1305 		memcpy(raidget_component_label(raidPtr, column),
   1306 		    clabel, sizeof(*clabel));
   1307 		raidflush_component_label(raidPtr, column);
   1308 		return (0);
   1309 #endif
   1310 
   1311 	case RAIDFRAME_INIT_LABELS:
   1312 		clabel = (RF_ComponentLabel_t *) data;
   1313 		/*
   1314 		   we only want the serial number from
   1315 		   the above.  We get all the rest of the information
   1316 		   from the config that was used to create this RAID
   1317 		   set.
   1318 		   */
   1319 
   1320 		raidPtr->serial_number = clabel->serial_number;
   1321 
   1322 		for(column=0;column<raidPtr->numCol;column++) {
   1323 			diskPtr = &raidPtr->Disks[column];
   1324 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1325 				ci_label = raidget_component_label(raidPtr,
   1326 				    column);
   1327 				/* Zeroing this is important. */
   1328 				memset(ci_label, 0, sizeof(*ci_label));
   1329 				raid_init_component_label(raidPtr, ci_label);
   1330 				ci_label->serial_number =
   1331 				    raidPtr->serial_number;
    1332 				ci_label->row = 0; /* we don't pretend to support more */
   1333 				rf_component_label_set_partitionsize(ci_label,
   1334 				    diskPtr->partitionSize);
   1335 				ci_label->column = column;
   1336 				raidflush_component_label(raidPtr, column);
   1337 			}
   1338 			/* XXXjld what about the spares? */
   1339 		}
   1340 
   1341 		return (retcode);
   1342 	case RAIDFRAME_SET_AUTOCONFIG:
   1343 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1344 		printf("raid%d: New autoconfig value is: %d\n",
   1345 		       raidPtr->raidid, d);
   1346 		*(int *) data = d;
   1347 		return (retcode);
   1348 
   1349 	case RAIDFRAME_SET_ROOT:
   1350 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1351 		printf("raid%d: New rootpartition value is: %d\n",
   1352 		       raidPtr->raidid, d);
   1353 		*(int *) data = d;
   1354 		return (retcode);
   1355 
   1356 		/* initialize all parity */
   1357 	case RAIDFRAME_REWRITEPARITY:
   1358 
   1359 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1360 			/* Parity for RAID 0 is trivially correct */
   1361 			raidPtr->parity_good = RF_RAID_CLEAN;
   1362 			return(0);
   1363 		}
   1364 
   1365 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1366 			/* Re-write is already in progress! */
   1367 			return(EINVAL);
   1368 		}
   1369 
   1370 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1371 					   rf_RewriteParityThread,
   1372 					   raidPtr,"raid_parity");
   1373 		return (retcode);
   1374 
   1375 
   1376 	case RAIDFRAME_ADD_HOT_SPARE:
   1377 		sparePtr = (RF_SingleComponent_t *) data;
   1378 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1379 		retcode = rf_add_hot_spare(raidPtr, &component);
   1380 		return(retcode);
   1381 
   1382 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1383 		return(retcode);
   1384 
   1385 	case RAIDFRAME_DELETE_COMPONENT:
   1386 		componentPtr = (RF_SingleComponent_t *)data;
   1387 		memcpy( &component, componentPtr,
   1388 			sizeof(RF_SingleComponent_t));
   1389 		retcode = rf_delete_component(raidPtr, &component);
   1390 		return(retcode);
   1391 
   1392 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1393 		componentPtr = (RF_SingleComponent_t *)data;
   1394 		memcpy( &component, componentPtr,
   1395 			sizeof(RF_SingleComponent_t));
   1396 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1397 		return(retcode);
   1398 
   1399 	case RAIDFRAME_REBUILD_IN_PLACE:
   1400 
   1401 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1402 			/* Can't do this on a RAID 0!! */
   1403 			return(EINVAL);
   1404 		}
   1405 
   1406 		if (raidPtr->recon_in_progress == 1) {
   1407 			/* a reconstruct is already in progress! */
   1408 			return(EINVAL);
   1409 		}
   1410 
   1411 		componentPtr = (RF_SingleComponent_t *) data;
   1412 		memcpy( &component, componentPtr,
   1413 			sizeof(RF_SingleComponent_t));
   1414 		component.row = 0; /* we don't support any more */
   1415 		column = component.column;
   1416 
   1417 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1418 			return(EINVAL);
   1419 		}
   1420 
   1421 		rf_lock_mutex2(raidPtr->mutex);
   1422 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1423 		    (raidPtr->numFailures > 0)) {
   1424 			/* XXX 0 above shouldn't be constant!!! */
   1425 			/* some component other than this has failed.
   1426 			   Let's not make things worse than they already
   1427 			   are... */
   1428 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1429 			       raidPtr->raidid);
   1430 			printf("raid%d:     Col: %d   Too many failures.\n",
   1431 			       raidPtr->raidid, column);
   1432 			rf_unlock_mutex2(raidPtr->mutex);
   1433 			return (EINVAL);
   1434 		}
   1435 		if (raidPtr->Disks[column].status ==
   1436 		    rf_ds_reconstructing) {
   1437 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1438 			       raidPtr->raidid);
   1439 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1440 
   1441 			rf_unlock_mutex2(raidPtr->mutex);
   1442 			return (EINVAL);
   1443 		}
   1444 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1445 			rf_unlock_mutex2(raidPtr->mutex);
   1446 			return (EINVAL);
   1447 		}
   1448 		rf_unlock_mutex2(raidPtr->mutex);
   1449 
   1450 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1451 		if (rrcopy == NULL)
   1452 			return(ENOMEM);
   1453 
   1454 		rrcopy->raidPtr = (void *) raidPtr;
   1455 		rrcopy->col = column;
   1456 
   1457 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1458 					   rf_ReconstructInPlaceThread,
   1459 					   rrcopy,"raid_reconip");
   1460 		return(retcode);
   1461 
   1462 	case RAIDFRAME_GET_INFO:
   1463 		if (!raidPtr->valid)
   1464 			return (ENODEV);
   1465 		ucfgp = (RF_DeviceConfig_t **) data;
   1466 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1467 			  (RF_DeviceConfig_t *));
   1468 		if (d_cfg == NULL)
   1469 			return (ENOMEM);
   1470 		d_cfg->rows = 1; /* there is only 1 row now */
   1471 		d_cfg->cols = raidPtr->numCol;
   1472 		d_cfg->ndevs = raidPtr->numCol;
   1473 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1474 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1475 			return (ENOMEM);
   1476 		}
   1477 		d_cfg->nspares = raidPtr->numSpare;
   1478 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1479 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1480 			return (ENOMEM);
   1481 		}
   1482 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1483 		d = 0;
   1484 		for (j = 0; j < d_cfg->cols; j++) {
   1485 			d_cfg->devs[d] = raidPtr->Disks[j];
   1486 			d++;
   1487 		}
   1488 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1489 			d_cfg->spares[i] = raidPtr->Disks[j];
   1490 		}
   1491 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1492 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1493 
   1494 		return (retcode);
   1495 
   1496 	case RAIDFRAME_CHECK_PARITY:
   1497 		*(int *) data = raidPtr->parity_good;
   1498 		return (0);
   1499 
   1500 	case RAIDFRAME_PARITYMAP_STATUS:
   1501 		if (rf_paritymap_ineligible(raidPtr))
   1502 			return EINVAL;
   1503 		rf_paritymap_status(raidPtr->parity_map,
   1504 		    (struct rf_pmstat *)data);
   1505 		return 0;
   1506 
   1507 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1508 		if (rf_paritymap_ineligible(raidPtr))
   1509 			return EINVAL;
   1510 		if (raidPtr->parity_map == NULL)
   1511 			return ENOENT; /* ??? */
   1512 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1513 			(struct rf_pmparams *)data, 1))
   1514 			return EINVAL;
   1515 		return 0;
   1516 
   1517 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1518 		if (rf_paritymap_ineligible(raidPtr))
   1519 			return EINVAL;
   1520 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1521 		return 0;
   1522 
   1523 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1524 		if (rf_paritymap_ineligible(raidPtr))
   1525 			return EINVAL;
   1526 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1527 		/* XXX should errors be passed up? */
   1528 		return 0;
   1529 
   1530 	case RAIDFRAME_RESET_ACCTOTALS:
   1531 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1532 		return (0);
   1533 
   1534 	case RAIDFRAME_GET_ACCTOTALS:
   1535 		totals = (RF_AccTotals_t *) data;
   1536 		*totals = raidPtr->acc_totals;
   1537 		return (0);
   1538 
   1539 	case RAIDFRAME_KEEP_ACCTOTALS:
   1540 		raidPtr->keep_acc_totals = *(int *)data;
   1541 		return (0);
   1542 
   1543 	case RAIDFRAME_GET_SIZE:
   1544 		*(int *) data = raidPtr->totalSectors;
   1545 		return (0);
   1546 
   1547 		/* fail a disk & optionally start reconstruction */
   1548 	case RAIDFRAME_FAIL_DISK:
   1549 
   1550 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1551 			/* Can't do this on a RAID 0!! */
   1552 			return(EINVAL);
   1553 		}
   1554 
   1555 		rr = (struct rf_recon_req *) data;
   1556 		rr->row = 0;
   1557 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1558 			return (EINVAL);
   1559 
   1560 
   1561 		rf_lock_mutex2(raidPtr->mutex);
   1562 		if (raidPtr->status == rf_rs_reconstructing) {
   1563 			/* you can't fail a disk while we're reconstructing! */
   1564 			/* XXX wrong for RAID6 */
   1565 			rf_unlock_mutex2(raidPtr->mutex);
   1566 			return (EINVAL);
   1567 		}
   1568 		if ((raidPtr->Disks[rr->col].status ==
   1569 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1570 			/* some other component has failed.  Let's not make
   1571 			   things worse. XXX wrong for RAID6 */
   1572 			rf_unlock_mutex2(raidPtr->mutex);
   1573 			return (EINVAL);
   1574 		}
   1575 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1576 			/* Can't fail a spared disk! */
   1577 			rf_unlock_mutex2(raidPtr->mutex);
   1578 			return (EINVAL);
   1579 		}
   1580 		rf_unlock_mutex2(raidPtr->mutex);
   1581 
   1582 		/* make a copy of the recon request so that we don't rely on
   1583 		 * the user's buffer */
   1584 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1585 		if (rrcopy == NULL)
   1586 			return(ENOMEM);
   1587 		memcpy(rrcopy, rr, sizeof(*rr));
   1588 		rrcopy->raidPtr = (void *) raidPtr;
   1589 
   1590 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1591 					   rf_ReconThread,
   1592 					   rrcopy,"raid_recon");
   1593 		return (0);
   1594 
   1595 		/* invoke a copyback operation after recon on whatever disk
   1596 		 * needs it, if any */
   1597 	case RAIDFRAME_COPYBACK:
   1598 
   1599 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1600 			/* This makes no sense on a RAID 0!! */
   1601 			return(EINVAL);
   1602 		}
   1603 
   1604 		if (raidPtr->copyback_in_progress == 1) {
   1605 			/* Copyback is already in progress! */
   1606 			return(EINVAL);
   1607 		}
   1608 
   1609 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1610 					   rf_CopybackThread,
   1611 					   raidPtr,"raid_copyback");
   1612 		return (retcode);
   1613 
   1614 		/* return the percentage completion of reconstruction */
   1615 	case RAIDFRAME_CHECK_RECON_STATUS:
   1616 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1617 			/* This makes no sense on a RAID 0, so tell the
   1618 			   user it's done. */
   1619 			*(int *) data = 100;
   1620 			return(0);
   1621 		}
   1622 		if (raidPtr->status != rf_rs_reconstructing)
   1623 			*(int *) data = 100;
   1624 		else {
   1625 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1626 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1627 			} else {
   1628 				*(int *) data = 0;
   1629 			}
   1630 		}
   1631 		return (0);
   1632 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1633 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1634 		if (raidPtr->status != rf_rs_reconstructing) {
   1635 			progressInfo.remaining = 0;
   1636 			progressInfo.completed = 100;
   1637 			progressInfo.total = 100;
   1638 		} else {
   1639 			progressInfo.total =
   1640 				raidPtr->reconControl->numRUsTotal;
   1641 			progressInfo.completed =
   1642 				raidPtr->reconControl->numRUsComplete;
   1643 			progressInfo.remaining = progressInfo.total -
   1644 				progressInfo.completed;
   1645 		}
   1646 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1647 				  sizeof(RF_ProgressInfo_t));
   1648 		return (retcode);
   1649 
   1650 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1651 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1652 			/* This makes no sense on a RAID 0, so tell the
   1653 			   user it's done. */
   1654 			*(int *) data = 100;
   1655 			return(0);
   1656 		}
   1657 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1658 			*(int *) data = 100 *
   1659 				raidPtr->parity_rewrite_stripes_done /
   1660 				raidPtr->Layout.numStripe;
   1661 		} else {
   1662 			*(int *) data = 100;
   1663 		}
   1664 		return (0);
   1665 
   1666 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1667 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1668 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1669 			progressInfo.total = raidPtr->Layout.numStripe;
   1670 			progressInfo.completed =
   1671 				raidPtr->parity_rewrite_stripes_done;
   1672 			progressInfo.remaining = progressInfo.total -
   1673 				progressInfo.completed;
   1674 		} else {
   1675 			progressInfo.remaining = 0;
   1676 			progressInfo.completed = 100;
   1677 			progressInfo.total = 100;
   1678 		}
   1679 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1680 				  sizeof(RF_ProgressInfo_t));
   1681 		return (retcode);
   1682 
   1683 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1684 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1685 			/* This makes no sense on a RAID 0 */
   1686 			*(int *) data = 100;
   1687 			return(0);
   1688 		}
   1689 		if (raidPtr->copyback_in_progress == 1) {
   1690 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1691 				raidPtr->Layout.numStripe;
   1692 		} else {
   1693 			*(int *) data = 100;
   1694 		}
   1695 		return (0);
   1696 
   1697 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1698 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1699 		if (raidPtr->copyback_in_progress == 1) {
   1700 			progressInfo.total = raidPtr->Layout.numStripe;
   1701 			progressInfo.completed =
   1702 				raidPtr->copyback_stripes_done;
   1703 			progressInfo.remaining = progressInfo.total -
   1704 				progressInfo.completed;
   1705 		} else {
   1706 			progressInfo.remaining = 0;
   1707 			progressInfo.completed = 100;
   1708 			progressInfo.total = 100;
   1709 		}
   1710 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1711 				  sizeof(RF_ProgressInfo_t));
   1712 		return (retcode);
   1713 
   1714 		/* the sparetable daemon calls this to wait for the kernel to
   1715 		 * need a spare table. this ioctl does not return until a
   1716 		 * spare table is needed. XXX -- calling mpsleep here in the
   1717 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1718 		 * -- I should either compute the spare table in the kernel,
   1719 		 * or have a different -- XXX XXX -- interface (a different
   1720 		 * character device) for delivering the table     -- XXX */
   1721 #if 0
   1722 	case RAIDFRAME_SPARET_WAIT:
   1723 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1724 		while (!rf_sparet_wait_queue)
   1725 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1726 		waitreq = rf_sparet_wait_queue;
   1727 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1728 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1729 
   1730 		/* structure assignment */
   1731 		*((RF_SparetWait_t *) data) = *waitreq;
   1732 
   1733 		RF_Free(waitreq, sizeof(*waitreq));
   1734 		return (0);
   1735 
   1736 		/* wakes up a process waiting on SPARET_WAIT and puts an error
    1737 		 * code in it that will cause the daemon to exit */
   1738 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1739 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1740 		waitreq->fcol = -1;
   1741 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1742 		waitreq->next = rf_sparet_wait_queue;
   1743 		rf_sparet_wait_queue = waitreq;
    1744 		rf_broadcast_cond2(rf_sparet_wait_cv);
   1745 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1746 		return (0);
   1747 
   1748 		/* used by the spare table daemon to deliver a spare table
   1749 		 * into the kernel */
   1750 	case RAIDFRAME_SEND_SPARET:
   1751 
   1752 		/* install the spare table */
   1753 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1754 
   1755 		/* respond to the requestor.  the return status of the spare
   1756 		 * table installation is passed in the "fcol" field */
   1757 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1758 		waitreq->fcol = retcode;
   1759 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1760 		waitreq->next = rf_sparet_resp_queue;
   1761 		rf_sparet_resp_queue = waitreq;
   1762 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1763 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1764 
   1765 		return (retcode);
   1766 #endif
   1767 
   1768 	default:
   1769 		break; /* fall through to the os-specific code below */
   1770 
   1771 	}
   1772 
   1773 	if (!raidPtr->valid)
   1774 		return (EINVAL);
   1775 
   1776 	/*
   1777 	 * Add support for "regular" device ioctls here.
   1778 	 */
   1779 
   1780 	error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
   1781 	if (error != EPASSTHROUGH)
   1782 		return (error);
   1783 
   1784 	switch (cmd) {
   1785 	case DIOCGDINFO:
   1786 		*(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
   1787 		break;
   1788 #ifdef __HAVE_OLD_DISKLABEL
   1789 	case ODIOCGDINFO:
   1790 		newlabel = *(rs->sc_dkdev.dk_label);
   1791 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1792 			return ENOTTY;
   1793 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1794 		break;
   1795 #endif
   1796 
   1797 	case DIOCGPART:
   1798 		((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
   1799 		((struct partinfo *) data)->part =
   1800 		    &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
   1801 		break;
   1802 
   1803 	case DIOCWDINFO:
   1804 	case DIOCSDINFO:
   1805 #ifdef __HAVE_OLD_DISKLABEL
   1806 	case ODIOCWDINFO:
   1807 	case ODIOCSDINFO:
   1808 #endif
   1809 	{
   1810 		struct disklabel *lp;
   1811 #ifdef __HAVE_OLD_DISKLABEL
   1812 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
   1813 			memset(&newlabel, 0, sizeof newlabel);
   1814 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
   1815 			lp = &newlabel;
   1816 		} else
   1817 #endif
   1818 		lp = (struct disklabel *)data;
   1819 
   1820 		if ((error = raidlock(rs)) != 0)
   1821 			return (error);
   1822 
   1823 		rs->sc_flags |= RAIDF_LABELLING;
   1824 
   1825 		error = setdisklabel(rs->sc_dkdev.dk_label,
   1826 		    lp, 0, rs->sc_dkdev.dk_cpulabel);
   1827 		if (error == 0) {
   1828 			if (cmd == DIOCWDINFO
   1829 #ifdef __HAVE_OLD_DISKLABEL
   1830 			    || cmd == ODIOCWDINFO
   1831 #endif
   1832 			   )
   1833 				error = writedisklabel(RAIDLABELDEV(dev),
   1834 				    raidstrategy, rs->sc_dkdev.dk_label,
   1835 				    rs->sc_dkdev.dk_cpulabel);
   1836 		}
   1837 		rs->sc_flags &= ~RAIDF_LABELLING;
   1838 
   1839 		raidunlock(rs);
   1840 
   1841 		if (error)
   1842 			return (error);
   1843 		break;
   1844 	}
   1845 
   1846 	case DIOCWLABEL:
   1847 		if (*(int *) data != 0)
   1848 			rs->sc_flags |= RAIDF_WLABEL;
   1849 		else
   1850 			rs->sc_flags &= ~RAIDF_WLABEL;
   1851 		break;
   1852 
   1853 	case DIOCGDEFLABEL:
   1854 		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
   1855 		break;
   1856 
   1857 #ifdef __HAVE_OLD_DISKLABEL
   1858 	case ODIOCGDEFLABEL:
   1859 		raidgetdefaultlabel(raidPtr, rs, &newlabel);
   1860 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1861 			return ENOTTY;
   1862 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1863 		break;
   1864 #endif
   1865 
   1866 	case DIOCAWEDGE:
   1867 	case DIOCDWEDGE:
   1868 	    	dkw = (void *)data;
   1869 
   1870 		/* If the ioctl happens here, the parent is us. */
   1871 		(void)strcpy(dkw->dkw_parent, rs->sc_xname);
   1872 		return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
   1873 
   1874 	case DIOCLWEDGES:
   1875 		return dkwedge_list(&rs->sc_dkdev,
   1876 		    (struct dkwedge_list *)data, l);
   1877 	case DIOCCACHESYNC:
   1878 		return rf_sync_component_caches(raidPtr);
   1879 
   1880 	case DIOCGSTRATEGY:
   1881 	    {
   1882 		struct disk_strategy *dks = (void *)data;
   1883 
   1884 		s = splbio();
   1885 		strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
   1886 		    sizeof(dks->dks_name));
   1887 		splx(s);
   1888 		dks->dks_paramlen = 0;
   1889 
   1890 		return 0;
   1891 	    }
   1892 
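         	/* replace the buffer queue strategy: allocate the new bufq,
         	   migrate any pending buffers to it under splbio() so that
         	   the swap does not race with new I/O being queued, then
         	   free the old queue */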
   1893 	case DIOCSSTRATEGY:
   1894 	    {
   1895 		struct disk_strategy *dks = (void *)data;
   1896 		struct bufq_state *new;
   1897 		struct bufq_state *old;
   1898 
   1899 		if (dks->dks_param != NULL) {
   1900 			return EINVAL;
   1901 		}
   1902 		dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
   1903 		error = bufq_alloc(&new, dks->dks_name,
   1904 		    BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
   1905 		if (error) {
   1906 			return error;
   1907 		}
   1908 		s = splbio();
   1909 		old = rs->buf_queue;
   1910 		bufq_move(new, old);
   1911 		rs->buf_queue = new;
   1912 		splx(s);
   1913 		bufq_free(old);
   1914 
   1915 		return 0;
   1916 	    }
   1917 
   1918 	default:
   1919 		retcode = ENOTTY;
   1920 	}
   1921 	return (retcode);
   1922 
   1923 }
   1924 
   1925 
   1926 /* raidinit -- complete the rest of the initialization for the
   1927    RAIDframe device.  */
   1928 
   1929 
   1930 static void
   1931 raidinit(struct raid_softc *rs)
   1932 {
   1933 	cfdata_t cf;
   1934 	int     unit;
   1935 	RF_Raid_t *raidPtr = &rs->sc_r;
   1936 
   1937 	unit = raidPtr->raidid;
   1938 
   1939 
   1940 	/* XXX should check return code first... */
   1941 	rs->sc_flags |= RAIDF_INITED;
   1942 
   1943 	/* XXX doesn't check bounds. */
   1944 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
   1945 
   1946 	/* attach the pseudo device */
   1947 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
   1948 	cf->cf_name = raid_cd.cd_name;
   1949 	cf->cf_atname = raid_cd.cd_name;
   1950 	cf->cf_unit = unit;
   1951 	cf->cf_fstate = FSTATE_STAR;
   1952 
   1953 	rs->sc_dev = config_attach_pseudo(cf);
   1954 
   1955 	if (rs->sc_dev == NULL) {
   1956 		printf("raid%d: config_attach_pseudo failed\n",
   1957 		    raidPtr->raidid);
   1958 		rs->sc_flags &= ~RAIDF_INITED;
   1959 		free(cf, M_RAIDFRAME);
   1960 		return;
   1961 	}
   1962 
   1963 	/* disk_attach actually creates space for the CPU disklabel, among
   1964 	 * other things, so it's critical to call this *BEFORE* we try putzing
   1965 	 * with disklabels. */
   1966 
   1967 	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
   1968 	disk_attach(&rs->sc_dkdev);
   1969 	disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);
   1970 
   1971 	/* XXX There may be a weird interaction here between this, and
   1972 	 * protectedSectors, as used in RAIDframe.  */
   1973 
   1974 	rs->sc_size = raidPtr->totalSectors;
   1975 
   1976 	dkwedge_discover(&rs->sc_dkdev);
   1977 
   1978 	rf_set_geometry(rs, raidPtr);
   1979 
   1980 }
   1981 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1982 /* wake up the daemon & tell it to get us a spare table
   1983  * XXX
   1984  * the entries in the queues should be tagged with the raidPtr
   1985  * so that in the extremely rare case that two recons happen at once,
    1986  * we know for which device we're requesting a spare table
   1987  * XXX
   1988  *
   1989  * XXX This code is not currently used. GO
   1990  */
   1991 int
   1992 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
   1993 {
   1994 	int     retcode;
   1995 
   1996 	rf_lock_mutex2(rf_sparet_wait_mutex);
   1997 	req->next = rf_sparet_wait_queue;
   1998 	rf_sparet_wait_queue = req;
   1999 	rf_broadcast_cond2(rf_sparet_wait_cv);
   2000 
   2001 	/* mpsleep unlocks the mutex */
   2002 	while (!rf_sparet_resp_queue) {
   2003 		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
   2004 	}
   2005 	req = rf_sparet_resp_queue;
   2006 	rf_sparet_resp_queue = req->next;
   2007 	rf_unlock_mutex2(rf_sparet_wait_mutex);
   2008 
   2009 	retcode = req->fcol;
   2010 	RF_Free(req, sizeof(*req));	/* this is not the same req as we
   2011 					 * alloc'd */
   2012 	return (retcode);
   2013 }
   2014 #endif
   2015 
   2016 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   2017  * bp & passes it down.
    2018  * any calls originating in the kernel must use non-blocking I/O.
    2019  * We do some extra sanity checking to return "appropriate" error values
    2020  * for certain conditions (to make some standard utilities work).
   2021  *
   2022  * Formerly known as: rf_DoAccessKernel
   2023  */
   2024 void
   2025 raidstart(RF_Raid_t *raidPtr)
   2026 {
   2027 	RF_SectorCount_t num_blocks, pb, sum;
   2028 	RF_RaidAddr_t raid_addr;
   2029 	struct partition *pp;
   2030 	daddr_t blocknum;
   2031 	struct raid_softc *rs;
   2032 	int     do_async;
   2033 	struct buf *bp;
   2034 	int rc;
   2035 
   2036 	rs = raidPtr->softc;
   2037 	/* quick check to see if anything has died recently */
   2038 	rf_lock_mutex2(raidPtr->mutex);
   2039 	if (raidPtr->numNewFailures > 0) {
   2040 		rf_unlock_mutex2(raidPtr->mutex);
   2041 		rf_update_component_labels(raidPtr,
   2042 					   RF_NORMAL_COMPONENT_UPDATE);
   2043 		rf_lock_mutex2(raidPtr->mutex);
   2044 		raidPtr->numNewFailures--;
   2045 	}
   2046 
   2047 	/* Check to see if we're at the limit... */
   2048 	while (raidPtr->openings > 0) {
   2049 		rf_unlock_mutex2(raidPtr->mutex);
   2050 
   2051 		/* get the next item, if any, from the queue */
   2052 		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
   2053 			/* nothing more to do */
   2054 			return;
   2055 		}
   2056 
   2057 		/* Ok, for the bp we have here, bp->b_blkno is relative to the
   2058 		 * partition.. Need to make it absolute to the underlying
   2059 		 * device.. */
   2060 
   2061 		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
   2062 		if (DISKPART(bp->b_dev) != RAW_PART) {
   2063 			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
   2064 			blocknum += pp->p_offset;
   2065 		}
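         		/* e.g. (illustrative numbers only): with DEV_BSHIFT == 9
         		   (512-byte device blocks) and 4096-byte RAID sectors
         		   (logBytesPerSector == 12), b_blkno 80 is byte offset
         		   40960, i.e. RAID sector 10 */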
   2066 
   2067 		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   2068 			    (int) blocknum));
   2069 
   2070 		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   2071 		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   2072 
   2073 		/* *THIS* is where we adjust what block we're going to...
   2074 		 * but DO NOT TOUCH bp->b_blkno!!! */
   2075 		raid_addr = blocknum;
   2076 
   2077 		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   2078 		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   2079 		sum = raid_addr + num_blocks + pb;
   2080 		if (1 || rf_debugKernelAccess) {
   2081 			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   2082 				    (int) raid_addr, (int) sum, (int) num_blocks,
   2083 				    (int) pb, (int) bp->b_resid));
   2084 		}
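         		/* reject requests that run past the end of the array;
         		   the "sum < ..." comparisons also catch wrap-around of
         		   the unsigned sector arithmetic above */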
   2085 		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   2086 		    || (sum < num_blocks) || (sum < pb)) {
   2087 			bp->b_error = ENOSPC;
   2088 			bp->b_resid = bp->b_bcount;
   2089 			biodone(bp);
   2090 			rf_lock_mutex2(raidPtr->mutex);
   2091 			continue;
   2092 		}
   2093 		/*
   2094 		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   2095 		 */
   2096 
   2097 		if (bp->b_bcount & raidPtr->sectorMask) {
   2098 			bp->b_error = EINVAL;
   2099 			bp->b_resid = bp->b_bcount;
   2100 			biodone(bp);
   2101 			rf_lock_mutex2(raidPtr->mutex);
   2102 			continue;
   2103 
   2104 		}
   2105 		db1_printf(("Calling DoAccess..\n"));
   2106 
   2107 
   2108 		rf_lock_mutex2(raidPtr->mutex);
   2109 		raidPtr->openings--;
   2110 		rf_unlock_mutex2(raidPtr->mutex);
   2111 
   2112 		/*
   2113 		 * Everything is async.
   2114 		 */
   2115 		do_async = 1;
   2116 
   2117 		disk_busy(&rs->sc_dkdev);
   2118 
   2119 		/* XXX we're still at splbio() here... do we *really*
   2120 		   need to be? */
   2121 
   2122 		/* don't ever condition on bp->b_flags & B_WRITE.
   2123 		 * always condition on B_READ instead */
   2124 
   2125 		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   2126 				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   2127 				 do_async, raid_addr, num_blocks,
   2128 				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   2129 
   2130 		if (rc) {
   2131 			bp->b_error = rc;
   2132 			bp->b_resid = bp->b_bcount;
   2133 			biodone(bp);
   2134 			/* continue loop */
   2135 		}
   2136 
   2137 		rf_lock_mutex2(raidPtr->mutex);
   2138 	}
   2139 	rf_unlock_mutex2(raidPtr->mutex);
   2140 }
   2141 
   2142 
   2143 
   2144 
   2145 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2146 
   2147 int
   2148 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2149 {
   2150 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2151 	struct buf *bp;
   2152 
   2153 	req->queue = queue;
   2154 	bp = req->bp;
   2155 
   2156 	switch (req->type) {
   2157 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2158 		/* XXX need to do something extra here.. */
   2159 		/* I'm leaving this in, as I've never actually seen it used,
   2160 		 * and I'd like folks to report it... GO */
    2161 		printf("WAKEUP CALLED\n");
   2162 		queue->numOutstanding++;
   2163 
   2164 		bp->b_flags = 0;
   2165 		bp->b_private = req;
   2166 
   2167 		KernelWakeupFunc(bp);
   2168 		break;
   2169 
   2170 	case RF_IO_TYPE_READ:
   2171 	case RF_IO_TYPE_WRITE:
   2172 #if RF_ACC_TRACE > 0
   2173 		if (req->tracerec) {
   2174 			RF_ETIMER_START(req->tracerec->timer);
   2175 		}
   2176 #endif
   2177 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2178 		    op, queue->rf_cinfo->ci_dev,
   2179 		    req->sectorOffset, req->numSector,
   2180 		    req->buf, KernelWakeupFunc, (void *) req,
   2181 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2182 
   2183 		if (rf_debugKernelAccess) {
   2184 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2185 				(long) bp->b_blkno));
   2186 		}
   2187 		queue->numOutstanding++;
   2188 		queue->last_deq_sector = req->sectorOffset;
   2189 		/* acc wouldn't have been let in if there were any pending
   2190 		 * reqs at any other priority */
   2191 		queue->curPriority = req->priority;
   2192 
   2193 		db1_printf(("Going for %c to unit %d col %d\n",
   2194 			    req->type, queue->raidPtr->raidid,
   2195 			    queue->col));
   2196 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2197 			(int) req->sectorOffset, (int) req->numSector,
   2198 			(int) (req->numSector <<
   2199 			    queue->raidPtr->logBytesPerSector),
   2200 			(int) queue->raidPtr->logBytesPerSector));
   2201 
   2202 		/*
   2203 		 * XXX: drop lock here since this can block at
   2204 		 * least with backing SCSI devices.  Retake it
   2205 		 * to minimize fuss with calling interfaces.
   2206 		 */
   2207 
   2208 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2209 		bdev_strategy(bp);
   2210 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2211 		break;
   2212 
   2213 	default:
   2214 		panic("bad req->type in rf_DispatchKernelIO");
   2215 	}
   2216 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2217 
   2218 	return (0);
   2219 }
    2220 /* this is the callback function associated with an I/O invoked from
   2221    kernel code.
   2222  */
   2223 static void
   2224 KernelWakeupFunc(struct buf *bp)
   2225 {
   2226 	RF_DiskQueueData_t *req = NULL;
   2227 	RF_DiskQueue_t *queue;
   2228 
   2229 	db1_printf(("recovering the request queue:\n"));
   2230 
   2231 	req = bp->b_private;
   2232 
   2233 	queue = (RF_DiskQueue_t *) req->queue;
   2234 
   2235 	rf_lock_mutex2(queue->raidPtr->iodone_lock);
   2236 
   2237 #if RF_ACC_TRACE > 0
   2238 	if (req->tracerec) {
   2239 		RF_ETIMER_STOP(req->tracerec->timer);
   2240 		RF_ETIMER_EVAL(req->tracerec->timer);
   2241 		rf_lock_mutex2(rf_tracing_mutex);
   2242 		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2243 		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2244 		req->tracerec->num_phys_ios++;
   2245 		rf_unlock_mutex2(rf_tracing_mutex);
   2246 	}
   2247 #endif
   2248 
   2249 	/* XXX Ok, let's get aggressive... If b_error is set, let's go
   2250 	 * ballistic, and mark the component as hosed... */
   2251 
   2252 	if (bp->b_error != 0) {
   2253 		/* Mark the disk as dead */
   2254 		/* but only mark it once... */
   2255 		/* and only if it wouldn't leave this RAID set
   2256 		   completely broken */
   2257 		if (((queue->raidPtr->Disks[queue->col].status ==
   2258 		      rf_ds_optimal) ||
   2259 		     (queue->raidPtr->Disks[queue->col].status ==
   2260 		      rf_ds_used_spare)) &&
   2261 		     (queue->raidPtr->numFailures <
   2262 		      queue->raidPtr->Layout.map->faultsTolerated)) {
   2263 			printf("raid%d: IO Error.  Marking %s as failed.\n",
   2264 			       queue->raidPtr->raidid,
   2265 			       queue->raidPtr->Disks[queue->col].devname);
   2266 			queue->raidPtr->Disks[queue->col].status =
   2267 			    rf_ds_failed;
   2268 			queue->raidPtr->status = rf_rs_degraded;
   2269 			queue->raidPtr->numFailures++;
   2270 			queue->raidPtr->numNewFailures++;
   2271 		} else {	/* Disk is already dead... */
   2272 			/* printf("Disk already marked as dead!\n"); */
   2273 		}
   2274 
   2275 	}
   2276 
   2277 	/* Fill in the error value */
   2278 	req->error = bp->b_error;
   2279 
   2280 	/* Drop this one on the "finished" queue... */
   2281 	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
   2282 
   2283 	/* Let the raidio thread know there is work to be done. */
   2284 	rf_signal_cond2(queue->raidPtr->iodone_cv);
   2285 
   2286 	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
   2287 }
   2288 
   2289 
   2290 /*
   2291  * initialize a buf structure for doing an I/O in the kernel.
   2292  */
   2293 static void
   2294 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2295        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2296        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2297        struct proc *b_proc)
   2298 {
   2299 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2300 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2301 	bp->b_oflags = 0;
   2302 	bp->b_cflags = 0;
   2303 	bp->b_bcount = numSect << logBytesPerSector;
   2304 	bp->b_bufsize = bp->b_bcount;
   2305 	bp->b_error = 0;
   2306 	bp->b_dev = dev;
   2307 	bp->b_data = bf;
   2308 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2309 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2310 	if (bp->b_bcount == 0) {
   2311 		panic("bp->b_bcount is zero in InitBP!!");
   2312 	}
   2313 	bp->b_proc = b_proc;
   2314 	bp->b_iodone = cbFunc;
   2315 	bp->b_private = cbArg;
   2316 }
   2317 
   2318 static void
   2319 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
   2320 		    struct disklabel *lp)
   2321 {
   2322 	memset(lp, 0, sizeof(*lp));
   2323 
   2324 	/* fabricate a label... */
   2325 	lp->d_secperunit = raidPtr->totalSectors;
   2326 	lp->d_secsize = raidPtr->bytesPerSector;
   2327 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   2328 	lp->d_ntracks = 4 * raidPtr->numCol;
   2329 	lp->d_ncylinders = raidPtr->totalSectors /
   2330 		(lp->d_nsectors * lp->d_ntracks);
   2331 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
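         	/* e.g. (illustrative numbers only): a 4-column set with 96 data
         	   sectors per stripe gives d_nsectors = 96, d_ntracks = 16,
         	   d_secpercyl = 1536 and d_ncylinders = totalSectors / 1536 */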
   2332 
   2333 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
   2334 	lp->d_type = DTYPE_RAID;
   2335 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
   2336 	lp->d_rpm = 3600;
   2337 	lp->d_interleave = 1;
   2338 	lp->d_flags = 0;
   2339 
   2340 	lp->d_partitions[RAW_PART].p_offset = 0;
   2341 	lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
   2342 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
   2343 	lp->d_npartitions = RAW_PART + 1;
   2344 
   2345 	lp->d_magic = DISKMAGIC;
   2346 	lp->d_magic2 = DISKMAGIC;
   2347 	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
   2348 
   2349 }
   2350 /*
   2351  * Read the disklabel from the raid device.  If one is not present, fake one
   2352  * up.
   2353  */
   2354 static void
   2355 raidgetdisklabel(dev_t dev)
   2356 {
   2357 	int     unit = raidunit(dev);
   2358 	struct raid_softc *rs;
   2359 	const char   *errstring;
   2360 	struct disklabel *lp;
   2361 	struct cpu_disklabel *clp;
   2362 	RF_Raid_t *raidPtr;
   2363 
   2364 	if ((rs = raidget(unit)) == NULL)
   2365 		return;
   2366 
   2367 	lp = rs->sc_dkdev.dk_label;
   2368 	clp = rs->sc_dkdev.dk_cpulabel;
   2369 
   2370 	db1_printf(("Getting the disklabel...\n"));
   2371 
   2372 	memset(clp, 0, sizeof(*clp));
   2373 
   2374 	raidPtr = &rs->sc_r;
   2375 
   2376 	raidgetdefaultlabel(raidPtr, rs, lp);
   2377 
   2378 	/*
   2379 	 * Call the generic disklabel extraction routine.
   2380 	 */
   2381 	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
   2382 	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
   2383 	if (errstring)
   2384 		raidmakedisklabel(rs);
   2385 	else {
   2386 		int     i;
   2387 		struct partition *pp;
   2388 
   2389 		/*
   2390 		 * Sanity check whether the found disklabel is valid.
   2391 		 *
    2392 		 * This is necessary since the total size of the raid device
    2393 		 * may vary when the interleave is changed even though exactly
    2394 		 * the same components are used, and an old disklabel may be
    2395 		 * used if one is found.
   2396 		 */
   2397 		if (lp->d_secperunit != rs->sc_size)
   2398 			printf("raid%d: WARNING: %s: "
   2399 			    "total sector size in disklabel (%" PRIu32 ") != "
   2400 			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
   2401 			    lp->d_secperunit, rs->sc_size);
   2402 		for (i = 0; i < lp->d_npartitions; i++) {
   2403 			pp = &lp->d_partitions[i];
   2404 			if (pp->p_offset + pp->p_size > rs->sc_size)
   2405 				printf("raid%d: WARNING: %s: end of partition `%c' "
   2406 				       "exceeds the size of raid (%" PRIu64 ")\n",
   2407 				       unit, rs->sc_xname, 'a' + i, rs->sc_size);
   2408 		}
   2409 	}
   2410 
   2411 }
   2412 /*
   2413  * Take care of things one might want to take care of in the event
   2414  * that a disklabel isn't present.
   2415  */
   2416 static void
   2417 raidmakedisklabel(struct raid_softc *rs)
   2418 {
   2419 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2420 	db1_printf(("Making a label..\n"));
   2421 
   2422 	/*
   2423 	 * For historical reasons, if there's no disklabel present
   2424 	 * the raw partition must be marked FS_BSDFFS.
   2425 	 */
   2426 
   2427 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
   2428 
   2429 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
   2430 
   2431 	lp->d_checksum = dkcksum(lp);
   2432 }
   2433 /*
   2434  * Wait interruptibly for an exclusive lock.
   2435  *
   2436  * XXX
   2437  * Several drivers do this; it should be abstracted and made MP-safe.
   2438  * (Hmm... where have we seen this warning before :->  GO )
   2439  */
   2440 static int
   2441 raidlock(struct raid_softc *rs)
   2442 {
   2443 	int     error;
   2444 
   2445 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2446 		rs->sc_flags |= RAIDF_WANTED;
   2447 		if ((error =
   2448 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
   2449 			return (error);
   2450 	}
   2451 	rs->sc_flags |= RAIDF_LOCKED;
   2452 	return (0);
   2453 }
   2454 /*
   2455  * Unlock and wake up any waiters.
   2456  */
   2457 static void
   2458 raidunlock(struct raid_softc *rs)
   2459 {
   2460 
   2461 	rs->sc_flags &= ~RAIDF_LOCKED;
   2462 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2463 		rs->sc_flags &= ~RAIDF_WANTED;
   2464 		wakeup(rs);
   2465 	}
   2466 }
   2467 
   2468 
   2469 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2470 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2471 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2472 
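         /*
          * Layout implied by the helpers below: the component label lives at
          * byte offset RF_COMPONENT_INFO_OFFSET (16384) on each component and
          * occupies max(sector size, RF_COMPONENT_INFO_SIZE) bytes; the parity
          * map follows immediately after it and occupies
          * max(sector size, RF_PARITYMAP_NBYTE) bytes.
          */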
   2473 static daddr_t
   2474 rf_component_info_offset(void)
   2475 {
   2476 
   2477 	return RF_COMPONENT_INFO_OFFSET;
   2478 }
   2479 
   2480 static daddr_t
   2481 rf_component_info_size(unsigned secsize)
   2482 {
   2483 	daddr_t info_size;
   2484 
   2485 	KASSERT(secsize);
   2486 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2487 		info_size = secsize;
   2488 	else
   2489 		info_size = RF_COMPONENT_INFO_SIZE;
   2490 
   2491 	return info_size;
   2492 }
   2493 
   2494 static daddr_t
   2495 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2496 {
   2497 	daddr_t map_offset;
   2498 
   2499 	KASSERT(raidPtr->bytesPerSector);
   2500 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2501 		map_offset = raidPtr->bytesPerSector;
   2502 	else
   2503 		map_offset = RF_COMPONENT_INFO_SIZE;
   2504 	map_offset += rf_component_info_offset();
   2505 
   2506 	return map_offset;
   2507 }
   2508 
   2509 static daddr_t
   2510 rf_parity_map_size(RF_Raid_t *raidPtr)
   2511 {
   2512 	daddr_t map_size;
   2513 
   2514 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2515 		map_size = raidPtr->bytesPerSector;
   2516 	else
   2517 		map_size = RF_PARITY_MAP_SIZE;
   2518 
   2519 	return map_size;
   2520 }
   2521 
   2522 int
   2523 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2524 {
   2525 	RF_ComponentLabel_t *clabel;
   2526 
   2527 	clabel = raidget_component_label(raidPtr, col);
   2528 	clabel->clean = RF_RAID_CLEAN;
   2529 	raidflush_component_label(raidPtr, col);
   2530 	return(0);
   2531 }
   2532 
   2533 
   2534 int
   2535 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2536 {
   2537 	RF_ComponentLabel_t *clabel;
   2538 
   2539 	clabel = raidget_component_label(raidPtr, col);
   2540 	clabel->clean = RF_RAID_DIRTY;
   2541 	raidflush_component_label(raidPtr, col);
   2542 	return(0);
   2543 }
   2544 
   2545 int
   2546 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2547 {
   2548 	KASSERT(raidPtr->bytesPerSector);
   2549 	return raidread_component_label(raidPtr->bytesPerSector,
   2550 	    raidPtr->Disks[col].dev,
   2551 	    raidPtr->raid_cinfo[col].ci_vp,
   2552 	    &raidPtr->raid_cinfo[col].ci_label);
   2553 }
   2554 
   2555 RF_ComponentLabel_t *
   2556 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2557 {
   2558 	return &raidPtr->raid_cinfo[col].ci_label;
   2559 }
   2560 
   2561 int
   2562 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2563 {
   2564 	RF_ComponentLabel_t *label;
   2565 
   2566 	label = &raidPtr->raid_cinfo[col].ci_label;
   2567 	label->mod_counter = raidPtr->mod_counter;
   2568 #ifndef RF_NO_PARITY_MAP
   2569 	label->parity_map_modcount = label->mod_counter;
   2570 #endif
   2571 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2572 	    raidPtr->Disks[col].dev,
   2573 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2574 }
   2575 
   2576 
   2577 static int
   2578 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2579     RF_ComponentLabel_t *clabel)
   2580 {
   2581 	return raidread_component_area(dev, b_vp, clabel,
   2582 	    sizeof(RF_ComponentLabel_t),
   2583 	    rf_component_info_offset(),
   2584 	    rf_component_info_size(secsize));
   2585 }
   2586 
   2587 /* ARGSUSED */
   2588 static int
   2589 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2590     size_t msize, daddr_t offset, daddr_t dsize)
   2591 {
   2592 	struct buf *bp;
   2593 	const struct bdevsw *bdev;
   2594 	int error;
   2595 
   2596 	/* XXX should probably ensure that we don't try to do this if
   2597 	   someone has changed rf_protected_sectors. */
   2598 
   2599 	if (b_vp == NULL) {
   2600 		/* For whatever reason, this component is not valid.
   2601 		   Don't try to read a component label from it. */
   2602 		return(EINVAL);
   2603 	}
   2604 
   2605 	/* get a block of the appropriate size... */
   2606 	bp = geteblk((int)dsize);
   2607 	bp->b_dev = dev;
   2608 
   2609 	/* get our ducks in a row for the read */
   2610 	bp->b_blkno = offset / DEV_BSIZE;
   2611 	bp->b_bcount = dsize;
   2612 	bp->b_flags |= B_READ;
   2613  	bp->b_resid = dsize;
   2614 
   2615 	bdev = bdevsw_lookup(bp->b_dev);
    2616 	if (bdev == NULL) {
         		brelse(bp, 0);
    2617 		return (ENXIO);
         	}
   2618 	(*bdev->d_strategy)(bp);
   2619 
   2620 	error = biowait(bp);
   2621 
   2622 	if (!error) {
   2623 		memcpy(data, bp->b_data, msize);
   2624 	}
   2625 
   2626 	brelse(bp, 0);
   2627 	return(error);
   2628 }
   2629 
   2630 
   2631 static int
   2632 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2633     RF_ComponentLabel_t *clabel)
   2634 {
   2635 	return raidwrite_component_area(dev, b_vp, clabel,
   2636 	    sizeof(RF_ComponentLabel_t),
   2637 	    rf_component_info_offset(),
   2638 	    rf_component_info_size(secsize), 0);
   2639 }
   2640 
   2641 /* ARGSUSED */
   2642 static int
   2643 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2644     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2645 {
   2646 	struct buf *bp;
   2647 	const struct bdevsw *bdev;
   2648 	int error;
   2649 
   2650 	/* get a block of the appropriate size... */
   2651 	bp = geteblk((int)dsize);
   2652 	bp->b_dev = dev;
   2653 
   2654 	/* get our ducks in a row for the write */
   2655 	bp->b_blkno = offset / DEV_BSIZE;
   2656 	bp->b_bcount = dsize;
   2657 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2658  	bp->b_resid = dsize;
   2659 
   2660 	memset(bp->b_data, 0, dsize);
   2661 	memcpy(bp->b_data, data, msize);
   2662 
   2663 	bdev = bdevsw_lookup(bp->b_dev);
    2664 	if (bdev == NULL) {
         		brelse(bp, 0);
    2665 		return (ENXIO);
         	}
   2666 	(*bdev->d_strategy)(bp);
   2667 	if (asyncp)
   2668 		return 0;
   2669 	error = biowait(bp);
   2670 	brelse(bp, 0);
   2671 	if (error) {
   2672 #if 1
   2673 		printf("Failed to write RAID component info!\n");
   2674 #endif
   2675 	}
   2676 
   2677 	return(error);
   2678 }
   2679 
   2680 void
   2681 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2682 {
   2683 	int c;
   2684 
   2685 	for (c = 0; c < raidPtr->numCol; c++) {
   2686 		/* Skip dead disks. */
   2687 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2688 			continue;
   2689 		/* XXXjld: what if an error occurs here? */
   2690 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2691 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2692 		    RF_PARITYMAP_NBYTE,
   2693 		    rf_parity_map_offset(raidPtr),
   2694 		    rf_parity_map_size(raidPtr), 0);
   2695 	}
   2696 }
   2697 
   2698 void
   2699 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2700 {
   2701 	struct rf_paritymap_ondisk tmp;
   2702 	int c,first;
   2703 
   2704 	first=1;
   2705 	for (c = 0; c < raidPtr->numCol; c++) {
   2706 		/* Skip dead disks. */
   2707 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2708 			continue;
   2709 		raidread_component_area(raidPtr->Disks[c].dev,
   2710 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2711 		    RF_PARITYMAP_NBYTE,
   2712 		    rf_parity_map_offset(raidPtr),
   2713 		    rf_parity_map_size(raidPtr));
   2714 		if (first) {
   2715 			memcpy(map, &tmp, sizeof(*map));
   2716 			first = 0;
   2717 		} else {
   2718 			rf_paritymap_merge(map, &tmp);
   2719 		}
   2720 	}
   2721 }
   2722 
   2723 void
   2724 rf_markalldirty(RF_Raid_t *raidPtr)
   2725 {
   2726 	RF_ComponentLabel_t *clabel;
   2727 	int sparecol;
   2728 	int c;
   2729 	int j;
   2730 	int scol = -1;
   2731 
   2732 	raidPtr->mod_counter++;
   2733 	for (c = 0; c < raidPtr->numCol; c++) {
   2734 		/* we don't want to touch (at all) a disk that has
   2735 		   failed */
   2736 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
   2737 			clabel = raidget_component_label(raidPtr, c);
   2738 			if (clabel->status == rf_ds_spared) {
   2739 				/* XXX do something special...
   2740 				   but whatever you do, don't
   2741 				   try to access it!! */
   2742 			} else {
   2743 				raidmarkdirty(raidPtr, c);
   2744 			}
   2745 		}
   2746 	}
   2747 
   2748 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2749 		sparecol = raidPtr->numCol + c;
   2750 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2751 			/*
   2752 
   2753 			   we claim this disk is "optimal" if it's
   2754 			   rf_ds_used_spare, as that means it should be
   2755 			   directly substitutable for the disk it replaced.
   2756 			   We note that too...
   2757 
   2758 			 */
   2759 
   2760 			for(j=0;j<raidPtr->numCol;j++) {
   2761 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2762 					scol = j;
   2763 					break;
   2764 				}
   2765 			}
   2766 
   2767 			clabel = raidget_component_label(raidPtr, sparecol);
   2768 			/* make sure status is noted */
   2769 
   2770 			raid_init_component_label(raidPtr, clabel);
   2771 
   2772 			clabel->row = 0;
   2773 			clabel->column = scol;
   2774 			/* Note: we *don't* change status from rf_ds_used_spare
   2775 			   to rf_ds_optimal */
   2776 			/* clabel.status = rf_ds_optimal; */
   2777 
   2778 			raidmarkdirty(raidPtr, sparecol);
   2779 		}
   2780 	}
   2781 }
   2782 
   2783 
   2784 void
   2785 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
   2786 {
   2787 	RF_ComponentLabel_t *clabel;
   2788 	int sparecol;
   2789 	int c;
   2790 	int j;
   2791 	int scol;
   2792 
   2793 	scol = -1;
   2794 
   2795 	/* XXX should do extra checks to make sure things really are clean,
   2796 	   rather than blindly setting the clean bit... */
   2797 
   2798 	raidPtr->mod_counter++;
   2799 
   2800 	for (c = 0; c < raidPtr->numCol; c++) {
   2801 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   2802 			clabel = raidget_component_label(raidPtr, c);
   2803 			/* make sure status is noted */
   2804 			clabel->status = rf_ds_optimal;
   2805 
   2806 			/* note what unit we are configured as */
   2807 			clabel->last_unit = raidPtr->raidid;
   2808 
   2809 			raidflush_component_label(raidPtr, c);
   2810 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2811 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2812 					raidmarkclean(raidPtr, c);
   2813 				}
   2814 			}
   2815 		}
   2816 		/* else we don't touch it.. */
   2817 	}
   2818 
   2819 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2820 		sparecol = raidPtr->numCol + c;
   2821 		/* Need to ensure that the reconstruct actually completed! */
   2822 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2823 			/*
   2824 
   2825 			   we claim this disk is "optimal" if it's
   2826 			   rf_ds_used_spare, as that means it should be
   2827 			   directly substitutable for the disk it replaced.
   2828 			   We note that too...
   2829 
   2830 			 */
   2831 
   2832 			for(j=0;j<raidPtr->numCol;j++) {
   2833 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2834 					scol = j;
   2835 					break;
   2836 				}
   2837 			}
   2838 
   2839 			/* XXX shouldn't *really* need this... */
   2840 			clabel = raidget_component_label(raidPtr, sparecol);
   2841 			/* make sure status is noted */
   2842 
   2843 			raid_init_component_label(raidPtr, clabel);
   2844 
   2845 			clabel->column = scol;
   2846 			clabel->status = rf_ds_optimal;
   2847 			clabel->last_unit = raidPtr->raidid;
   2848 
   2849 			raidflush_component_label(raidPtr, sparecol);
   2850 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2851 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2852 					raidmarkclean(raidPtr, sparecol);
   2853 				}
   2854 			}
   2855 		}
   2856 	}
   2857 }
   2858 
   2859 void
   2860 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2861 {
   2862 
   2863 	if (vp != NULL) {
   2864 		if (auto_configured == 1) {
   2865 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2866 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2867 			vput(vp);
   2868 
   2869 		} else {
   2870 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2871 		}
   2872 	}
   2873 }
   2874 
   2875 
   2876 void
   2877 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2878 {
   2879 	int r,c;
   2880 	struct vnode *vp;
   2881 	int acd;
   2882 
   2883 
   2884 	/* We take this opportunity to close the vnodes like we should.. */
   2885 
   2886 	for (c = 0; c < raidPtr->numCol; c++) {
   2887 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2888 		acd = raidPtr->Disks[c].auto_configured;
   2889 		rf_close_component(raidPtr, vp, acd);
   2890 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2891 		raidPtr->Disks[c].auto_configured = 0;
   2892 	}
   2893 
   2894 	for (r = 0; r < raidPtr->numSpare; r++) {
   2895 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2896 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2897 		rf_close_component(raidPtr, vp, acd);
   2898 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2899 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2900 	}
   2901 }
   2902 
   2903 
   2904 void
   2905 rf_ReconThread(struct rf_recon_req *req)
   2906 {
   2907 	int     s;
   2908 	RF_Raid_t *raidPtr;
   2909 
   2910 	s = splbio();
   2911 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2912 	raidPtr->recon_in_progress = 1;
   2913 
   2914 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2915 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2916 
   2917 	RF_Free(req, sizeof(*req));
   2918 
   2919 	raidPtr->recon_in_progress = 0;
   2920 	splx(s);
   2921 
   2922 	/* That's all... */
   2923 	kthread_exit(0);	/* does not return */
   2924 }
   2925 
   2926 void
   2927 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2928 {
   2929 	int retcode;
   2930 	int s;
   2931 
   2932 	raidPtr->parity_rewrite_stripes_done = 0;
   2933 	raidPtr->parity_rewrite_in_progress = 1;
   2934 	s = splbio();
   2935 	retcode = rf_RewriteParity(raidPtr);
   2936 	splx(s);
   2937 	if (retcode) {
   2938 		printf("raid%d: Error re-writing parity (%d)!\n",
   2939 		    raidPtr->raidid, retcode);
   2940 	} else {
   2941 		/* set the clean bit!  If we shutdown correctly,
   2942 		   the clean bit on each component label will get
   2943 		   set */
   2944 		raidPtr->parity_good = RF_RAID_CLEAN;
   2945 	}
   2946 	raidPtr->parity_rewrite_in_progress = 0;
   2947 
   2948 	/* Anyone waiting for us to stop?  If so, inform them... */
   2949 	if (raidPtr->waitShutdown) {
   2950 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2951 	}
   2952 
   2953 	/* That's all... */
   2954 	kthread_exit(0);	/* does not return */
   2955 }
   2956 
   2957 
   2958 void
   2959 rf_CopybackThread(RF_Raid_t *raidPtr)
   2960 {
   2961 	int s;
   2962 
   2963 	raidPtr->copyback_in_progress = 1;
   2964 	s = splbio();
   2965 	rf_CopybackReconstructedData(raidPtr);
   2966 	splx(s);
   2967 	raidPtr->copyback_in_progress = 0;
   2968 
   2969 	/* That's all... */
   2970 	kthread_exit(0);	/* does not return */
   2971 }
   2972 
   2973 
   2974 void
   2975 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
   2976 {
   2977 	int s;
   2978 	RF_Raid_t *raidPtr;
   2979 
   2980 	s = splbio();
   2981 	raidPtr = req->raidPtr;
   2982 	raidPtr->recon_in_progress = 1;
   2983 	rf_ReconstructInPlace(raidPtr, req->col);
   2984 	RF_Free(req, sizeof(*req));
   2985 	raidPtr->recon_in_progress = 0;
   2986 	splx(s);
   2987 
   2988 	/* That's all... */
   2989 	kthread_exit(0);	/* does not return */
   2990 }
   2991 
   2992 static RF_AutoConfig_t *
   2993 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
   2994     const char *cname, RF_SectorCount_t size, uint64_t numsecs,
   2995     unsigned secsize)
   2996 {
   2997 	int good_one = 0;
   2998 	RF_ComponentLabel_t *clabel;
   2999 	RF_AutoConfig_t *ac;
   3000 
   3001 	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
   3002 	if (clabel == NULL) {
   3003 oomem:
   3004 		    while(ac_list) {
   3005 			    ac = ac_list;
   3006 			    if (ac->clabel)
   3007 				    free(ac->clabel, M_RAIDFRAME);
   3008 			    ac_list = ac_list->next;
   3009 			    free(ac, M_RAIDFRAME);
   3010 		    }
   3011 		    printf("RAID auto config: out of memory!\n");
   3012 		    return NULL; /* XXX probably should panic? */
   3013 	}
   3014 
   3015 	if (!raidread_component_label(secsize, dev, vp, clabel)) {
   3016 		/* Got the label.  Does it look reasonable? */
   3017 		if (rf_reasonable_label(clabel, numsecs) &&
   3018 		    (rf_component_label_partitionsize(clabel) <= size)) {
   3019 #ifdef DEBUG
   3020 			printf("Component on: %s: %llu\n",
   3021 				cname, (unsigned long long)size);
   3022 			rf_print_component_label(clabel);
   3023 #endif
   3024 			/* if it's reasonable, add it, else ignore it. */
   3025 			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
   3026 				M_NOWAIT);
   3027 			if (ac == NULL) {
   3028 				free(clabel, M_RAIDFRAME);
   3029 				goto oomem;
   3030 			}
   3031 			strlcpy(ac->devname, cname, sizeof(ac->devname));
   3032 			ac->dev = dev;
   3033 			ac->vp = vp;
   3034 			ac->clabel = clabel;
   3035 			ac->next = ac_list;
   3036 			ac_list = ac;
   3037 			good_one = 1;
   3038 		}
   3039 	}
   3040 	if (!good_one) {
   3041 		/* cleanup */
   3042 		free(clabel, M_RAIDFRAME);
   3043 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3044 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3045 		vput(vp);
   3046 	}
   3047 	return ac_list;
   3048 }
   3049 
   3050 RF_AutoConfig_t *
   3051 rf_find_raid_components(void)
   3052 {
   3053 	struct vnode *vp;
   3054 	struct disklabel label;
   3055 	device_t dv;
   3056 	deviter_t di;
   3057 	dev_t dev;
   3058 	int bmajor, bminor, wedge, rf_part_found;
   3059 	int error;
   3060 	int i;
   3061 	RF_AutoConfig_t *ac_list;
   3062 	uint64_t numsecs;
   3063 	unsigned secsize;
   3064 
   3065 	/* initialize the AutoConfig list */
   3066 	ac_list = NULL;
   3067 
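         	/* for each candidate disk we look, in order, for: a dk(4) wedge
         	   of type DKW_PTYPE_RAIDFRAME, disklabel partitions marked
         	   FS_RAID, and finally the raw partition if neither of those
         	   produced a RAID component */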
   3068 	/* we begin by trolling through *all* the devices on the system */
   3069 
   3070 	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
   3071 	     dv = deviter_next(&di)) {
   3072 
   3073 		/* we are only interested in disks... */
   3074 		if (device_class(dv) != DV_DISK)
   3075 			continue;
   3076 
   3077 		/* we don't care about floppies... */
   3078 		if (device_is_a(dv, "fd")) {
   3079 			continue;
   3080 		}
   3081 
   3082 		/* we don't care about CD's... */
   3083 		if (device_is_a(dv, "cd")) {
   3084 			continue;
   3085 		}
   3086 
   3087 		/* we don't care about md's... */
   3088 		if (device_is_a(dv, "md")) {
   3089 			continue;
   3090 		}
   3091 
   3092 		/* hdfd is the Atari/Hades floppy driver */
   3093 		if (device_is_a(dv, "hdfd")) {
   3094 			continue;
   3095 		}
   3096 
   3097 		/* fdisa is the Atari/Milan floppy driver */
   3098 		if (device_is_a(dv, "fdisa")) {
   3099 			continue;
   3100 		}
   3101 
   3102 		/* need to find the device_name_to_block_device_major stuff */
   3103 		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
   3104 
    3105 		rf_part_found = 0; /* No raid partition as yet */
   3106 
   3107 		/* get a vnode for the raw partition of this disk */
   3108 
   3109 		wedge = device_is_a(dv, "dk");
   3110 		bminor = minor(device_unit(dv));
   3111 		dev = wedge ? makedev(bmajor, bminor) :
   3112 		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
   3113 		if (bdevvp(dev, &vp))
   3114 			panic("RAID can't alloc vnode");
   3115 
   3116 		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
   3117 
   3118 		if (error) {
   3119 			/* "Who cares."  Continue looking
   3120 			   for something that exists*/
   3121 			vput(vp);
   3122 			continue;
   3123 		}
   3124 
   3125 		error = getdisksize(vp, &numsecs, &secsize);
   3126 		if (error) {
   3127 			vput(vp);
   3128 			continue;
   3129 		}
   3130 		if (wedge) {
   3131 			struct dkwedge_info dkw;
   3132 			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
   3133 			    NOCRED);
   3134 			if (error) {
   3135 				printf("RAIDframe: can't get wedge info for "
   3136 				    "dev %s (%d)\n", device_xname(dv), error);
   3137 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3138 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3139 				vput(vp);
   3140 				continue;
   3141 			}
   3142 
   3143 			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
   3144 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3145 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3146 				vput(vp);
   3147 				continue;
   3148 			}
   3149 
   3150 			ac_list = rf_get_component(ac_list, dev, vp,
   3151 			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
    3152 			rf_part_found = 1; /* There is a raid component on this disk */
   3153 			continue;
   3154 		}
   3155 
   3156 		/* Ok, the disk exists.  Go get the disklabel. */
   3157 		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
   3158 		if (error) {
   3159 			/*
   3160 			 * XXX can't happen - open() would
   3161 			 * have errored out (or faked up one)
   3162 			 */
   3163 			if (error != ENOTTY)
   3164 				printf("RAIDframe: can't get label for dev "
   3165 				    "%s (%d)\n", device_xname(dv), error);
   3166 		}
   3167 
   3168 		/* don't need this any more.  We'll allocate it again
   3169 		   a little later if we really do... */
   3170 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3171 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3172 		vput(vp);
   3173 
   3174 		if (error)
   3175 			continue;
   3176 
    3177 		rf_part_found = 0; /* No raid partitions yet */
   3178 		for (i = 0; i < label.d_npartitions; i++) {
   3179 			char cname[sizeof(ac_list->devname)];
   3180 
   3181 			/* We only support partitions marked as RAID */
   3182 			if (label.d_partitions[i].p_fstype != FS_RAID)
   3183 				continue;
   3184 
   3185 			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
   3186 			if (bdevvp(dev, &vp))
   3187 				panic("RAID can't alloc vnode");
   3188 
   3189 			error = VOP_OPEN(vp, FREAD, NOCRED);
   3190 			if (error) {
   3191 				/* Whatever... */
   3192 				vput(vp);
   3193 				continue;
   3194 			}
   3195 			snprintf(cname, sizeof(cname), "%s%c",
   3196 			    device_xname(dv), 'a' + i);
   3197 			ac_list = rf_get_component(ac_list, dev, vp, cname,
   3198 				label.d_partitions[i].p_size, numsecs, secsize);
    3199 			rf_part_found = 1; /* There is at least one raid partition on this disk */
   3200 		}
   3201 
   3202 		/*
    3203 		 * If there is no raid component on this disk, either in a
    3204 		 * disklabel or inside a wedge, check the raw partition as well,
    3205 		 * as it is possible to configure raid components on raw disk
    3206 		 * devices.
   3207 		 */
   3208 
   3209 		if (!rf_part_found) {
   3210 			char cname[sizeof(ac_list->devname)];
   3211 
   3212 			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
   3213 			if (bdevvp(dev, &vp))
   3214 				panic("RAID can't alloc vnode");
   3215 
   3216 			error = VOP_OPEN(vp, FREAD, NOCRED);
   3217 			if (error) {
   3218 				/* Whatever... */
   3219 				vput(vp);
   3220 				continue;
   3221 			}
   3222 			snprintf(cname, sizeof(cname), "%s%c",
   3223 			    device_xname(dv), 'a' + RAW_PART);
   3224 			ac_list = rf_get_component(ac_list, dev, vp, cname,
   3225 				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
   3226 		}
   3227 	}
   3228 	deviter_release(&di);
   3229 	return ac_list;
   3230 }
   3231 
   3232 
   3233 int
   3234 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3235 {
   3236 
   3237 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3238 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3239 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3240 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3241 	    clabel->row >=0 &&
   3242 	    clabel->column >= 0 &&
   3243 	    clabel->num_rows > 0 &&
   3244 	    clabel->num_columns > 0 &&
   3245 	    clabel->row < clabel->num_rows &&
   3246 	    clabel->column < clabel->num_columns &&
   3247 	    clabel->blockSize > 0 &&
   3248 	    /*
   3249 	     * numBlocksHi may contain garbage, but it is ok since
   3250 	     * the type is unsigned.  If it is really garbage,
   3251 	     * rf_fix_old_label_size() will fix it.
   3252 	     */
   3253 	    rf_component_label_numblocks(clabel) > 0) {
   3254 		/*
   3255 		 * label looks reasonable enough...
   3256 		 * let's make sure it has no old garbage.
   3257 		 */
   3258 		if (numsecs)
   3259 			rf_fix_old_label_size(clabel, numsecs);
   3260 		return(1);
   3261 	}
   3262 	return(0);
   3263 }
   3264 
   3265 
   3266 /*
   3267  * For reasons yet unknown, some old component labels have garbage in
   3268  * the newer numBlocksHi region, and this causes lossage.  Since those
   3269  * disks will also have numsecs set to less than 32 bits of sectors,
   3270  * we can determine when this corruption has occurred, and fix it.
   3271  *
   3272  * The exact same problem, with the same unknown reason, happens to
   3273  * the partitionSizeHi member as well.
   3274  */
   3275 static void
   3276 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3277 {
   3278 
   3279 	if (numsecs < ((uint64_t)1 << 32)) {
   3280 		if (clabel->numBlocksHi) {
   3281 			printf("WARNING: total sectors < 32 bits, yet "
   3282 			       "numBlocksHi set\n"
   3283 			       "WARNING: resetting numBlocksHi to zero.\n");
   3284 			clabel->numBlocksHi = 0;
   3285 		}
   3286 
   3287 		if (clabel->partitionSizeHi) {
   3288 			printf("WARNING: total sectors < 32 bits, yet "
   3289 			       "partitionSizeHi set\n"
   3290 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3291 			clabel->partitionSizeHi = 0;
   3292 		}
   3293 	}
   3294 }
   3295 
   3296 
   3297 #ifdef DEBUG
   3298 void
   3299 rf_print_component_label(RF_ComponentLabel_t *clabel)
   3300 {
   3301 	uint64_t numBlocks;
   3302 
   3303 	numBlocks = rf_component_label_numblocks(clabel);
   3304 
   3305 	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
   3306 	       clabel->row, clabel->column,
   3307 	       clabel->num_rows, clabel->num_columns);
   3308 	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
   3309 	       clabel->version, clabel->serial_number,
   3310 	       clabel->mod_counter);
   3311 	printf("   Clean: %s Status: %d\n",
   3312 	       clabel->clean ? "Yes" : "No", clabel->status);
   3313 	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
   3314 	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
   3315 	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
   3316 	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
   3317 	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
   3318 	printf("   Contains root partition: %s\n",
   3319 	       clabel->root_partition ? "Yes" : "No");
   3320 	printf("   Last configured as: raid%d\n", clabel->last_unit);
   3321 #if 0
   3322 	   printf("   Config order: %d\n", clabel->config_order);
   3323 #endif
   3324 
   3325 }
   3326 #endif
   3327 
   3328 RF_ConfigSet_t *
   3329 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3330 {
   3331 	RF_AutoConfig_t *ac;
   3332 	RF_ConfigSet_t *config_sets;
   3333 	RF_ConfigSet_t *cset;
   3334 	RF_AutoConfig_t *ac_next;
   3335 
   3336 
   3337 	config_sets = NULL;
   3338 
   3339 	/* Go through the AutoConfig list, and figure out which components
   3340 	   belong to what sets.  */
   3341 	ac = ac_list;
   3342 	while(ac!=NULL) {
   3343 		/* we're going to putz with ac->next, so save it here
   3344 		   for use at the end of the loop */
   3345 		ac_next = ac->next;
   3346 
   3347 		if (config_sets == NULL) {
   3348 			/* will need at least this one... */
   3349 			config_sets = (RF_ConfigSet_t *)
   3350 				malloc(sizeof(RF_ConfigSet_t),
   3351 				       M_RAIDFRAME, M_NOWAIT);
   3352 			if (config_sets == NULL) {
   3353 				panic("rf_create_auto_sets: No memory!");
   3354 			}
   3355 			/* this one is easy :) */
   3356 			config_sets->ac = ac;
   3357 			config_sets->next = NULL;
   3358 			config_sets->rootable = 0;
   3359 			ac->next = NULL;
   3360 		} else {
   3361 			/* which set does this component fit into? */
   3362 			cset = config_sets;
   3363 			while(cset!=NULL) {
   3364 				if (rf_does_it_fit(cset, ac)) {
   3365 					/* looks like it matches... */
   3366 					ac->next = cset->ac;
   3367 					cset->ac = ac;
   3368 					break;
   3369 				}
   3370 				cset = cset->next;
   3371 			}
   3372 			if (cset==NULL) {
    3373 				/* didn't find a match above... new set.. */
   3374 				cset = (RF_ConfigSet_t *)
   3375 					malloc(sizeof(RF_ConfigSet_t),
   3376 					       M_RAIDFRAME, M_NOWAIT);
   3377 				if (cset == NULL) {
   3378 					panic("rf_create_auto_sets: No memory!");
   3379 				}
   3380 				cset->ac = ac;
   3381 				ac->next = NULL;
   3382 				cset->next = config_sets;
   3383 				cset->rootable = 0;
   3384 				config_sets = cset;
   3385 			}
   3386 		}
   3387 		ac = ac_next;
   3388 	}
   3389 
   3390 
   3391 	return(config_sets);
   3392 }
   3393 
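         /*
          * Return 1 if the component label of 'ac' matches the first member
          * of 'cset' closely enough for 'ac' to belong to that set, and 0
          * if it does not.
          */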
   3394 static int
   3395 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3396 {
   3397 	RF_ComponentLabel_t *clabel1, *clabel2;
   3398 
   3399 	/* If this one matches the *first* one in the set, that's good
   3400 	   enough, since the other members of the set would have been
   3401 	   through here too... */
   3402 	/* note that we are not checking partitionSize here..
   3403 
   3404 	   Note that we are also not checking the mod_counters here.
   3405 	   If everything else matches except the mod_counter, that's
   3406 	   good enough for this test.  We will deal with the mod_counters
   3407 	   a little later in the autoconfiguration process.
   3408 
   3409 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3410 
   3411 	   The reason we don't check for this is that failed disks
   3412 	   will have lower modification counts.  If those disks are
   3413 	   not added to the set they used to belong to, then they will
   3414 	   form their own set, which may result in 2 different sets,
   3415 	   for example, competing to be configured at raid0, and
   3416 	   perhaps competing to be the root filesystem set.  If the
   3417 	   wrong ones get configured, or both attempt to become /,
    3418 	   weird behaviour and/or serious lossage will occur.  Thus we
   3419 	   need to bring them into the fold here, and kick them out at
   3420 	   a later point.
   3421 
   3422 	*/
   3423 
   3424 	clabel1 = cset->ac->clabel;
   3425 	clabel2 = ac->clabel;
   3426 	if ((clabel1->version == clabel2->version) &&
   3427 	    (clabel1->serial_number == clabel2->serial_number) &&
   3428 	    (clabel1->num_rows == clabel2->num_rows) &&
   3429 	    (clabel1->num_columns == clabel2->num_columns) &&
   3430 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3431 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3432 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3433 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3434 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3435 	    (clabel1->blockSize == clabel2->blockSize) &&
   3436 	    rf_component_label_numblocks(clabel1) ==
   3437 	    rf_component_label_numblocks(clabel2) &&
   3438 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3439 	    (clabel1->root_partition == clabel2->root_partition) &&
   3440 	    (clabel1->last_unit == clabel2->last_unit) &&
   3441 	    (clabel1->config_order == clabel2->config_order)) {
    3442 		/* if it gets here, it almost *has* to be a match */
   3443 	} else {
   3444 		/* it's not consistent with somebody in the set..
   3445 		   punt */
   3446 		return(0);
   3447 	}
   3448 	/* all was fine.. it must fit... */
   3449 	return(1);
   3450 }
   3451 
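         /*
          * Return 1 if the set has enough live components (those carrying
          * the highest mod_counter found in the set) to be configured,
          * and 0 otherwise.  RAID 1 is special-cased: a mirror pair may
          * lose one member, but not both.
          */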
   3452 int
   3453 rf_have_enough_components(RF_ConfigSet_t *cset)
   3454 {
   3455 	RF_AutoConfig_t *ac;
   3456 	RF_AutoConfig_t *auto_config;
   3457 	RF_ComponentLabel_t *clabel;
   3458 	int c;
   3459 	int num_cols;
   3460 	int num_missing;
   3461 	int mod_counter;
   3462 	int mod_counter_found;
   3463 	int even_pair_failed;
   3464 	char parity_type;
   3465 
   3466 
   3467 	/* check to see that we have enough 'live' components
   3468 	   of this set.  If so, we can configure it if necessary */
   3469 
   3470 	num_cols = cset->ac->clabel->num_columns;
   3471 	parity_type = cset->ac->clabel->parityConfig;
   3472 
   3473 	/* XXX Check for duplicate components!?!?!? */
   3474 
   3475 	/* Determine what the mod_counter is supposed to be for this set. */
   3476 
   3477 	mod_counter_found = 0;
   3478 	mod_counter = 0;
   3479 	ac = cset->ac;
   3480 	while(ac!=NULL) {
   3481 		if (mod_counter_found==0) {
   3482 			mod_counter = ac->clabel->mod_counter;
   3483 			mod_counter_found = 1;
   3484 		} else {
   3485 			if (ac->clabel->mod_counter > mod_counter) {
   3486 				mod_counter = ac->clabel->mod_counter;
   3487 			}
   3488 		}
   3489 		ac = ac->next;
   3490 	}
   3491 
   3492 	num_missing = 0;
   3493 	auto_config = cset->ac;
   3494 
   3495 	even_pair_failed = 0;
   3496 	for(c=0; c<num_cols; c++) {
   3497 		ac = auto_config;
   3498 		while(ac!=NULL) {
   3499 			if ((ac->clabel->column == c) &&
   3500 			    (ac->clabel->mod_counter == mod_counter)) {
   3501 				/* it's this one... */
   3502 #ifdef DEBUG
   3503 				printf("Found: %s at %d\n",
   3504 				       ac->devname,c);
   3505 #endif
   3506 				break;
   3507 			}
   3508 			ac=ac->next;
   3509 		}
   3510 		if (ac==NULL) {
   3511 				/* Didn't find one here! */
   3512 				/* special case for RAID 1, especially
   3513 				   where there are more than 2
   3514 				   components (where RAIDframe treats
   3515 				   things a little differently :( ) */
   3516 			if (parity_type == '1') {
   3517 				if (c%2 == 0) { /* even component */
   3518 					even_pair_failed = 1;
   3519 				} else { /* odd component.  If
   3520 					    we're failed, and
   3521 					    so is the even
   3522 					    component, it's
   3523 					    "Good Night, Charlie" */
   3524 					if (even_pair_failed == 1) {
   3525 						return(0);
   3526 					}
   3527 				}
   3528 			} else {
   3529 				/* normal accounting */
   3530 				num_missing++;
   3531 			}
   3532 		}
   3533 		if ((parity_type == '1') && (c%2 == 1)) {
    3534 				/* Just checked the second component of this
    3535 				   pair without bailing.. reset the
    3536 				   even_pair_failed flag and go on to the
					   next pair.... */
   3537 			even_pair_failed = 0;
   3538 		}
   3539 	}
   3540 
   3541 	clabel = cset->ac->clabel;
   3542 
   3543 	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
   3544 	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
   3545 	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
   3546 		/* XXX this needs to be made *much* more general */
   3547 		/* Too many failures */
   3548 		return(0);
   3549 	}
   3550 	/* otherwise, all is well, and we've got enough to take a kick
   3551 	   at autoconfiguring this set */
   3552 	return(1);
   3553 }
   3554 
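         /*
          * Build the RF_Config_t for this auto-configuration set from the
          * information stored in its component labels.
          */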
   3555 void
   3556 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3557 			RF_Raid_t *raidPtr)
   3558 {
   3559 	RF_ComponentLabel_t *clabel;
   3560 	int i;
   3561 
   3562 	clabel = ac->clabel;
   3563 
   3564 	/* 1. Fill in the common stuff */
   3565 	config->numRow = clabel->num_rows = 1;
   3566 	config->numCol = clabel->num_columns;
   3567 	config->numSpare = 0; /* XXX should this be set here? */
   3568 	config->sectPerSU = clabel->sectPerSU;
   3569 	config->SUsPerPU = clabel->SUsPerPU;
   3570 	config->SUsPerRU = clabel->SUsPerRU;
   3571 	config->parityConfig = clabel->parityConfig;
   3572 	/* XXX... */
   3573 	strcpy(config->diskQueueType,"fifo");
   3574 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3575 	config->layoutSpecificSize = 0; /* XXX ?? */
   3576 
   3577 	while(ac!=NULL) {
   3578 		/* row/col values will be in range due to the checks
   3579 		   in reasonable_label() */
   3580 		strcpy(config->devnames[0][ac->clabel->column],
   3581 		       ac->devname);
   3582 		ac = ac->next;
   3583 	}
   3584 
   3585 	for(i=0;i<RF_MAXDBGV;i++) {
   3586 		config->debugVars[i][0] = 0;
   3587 	}
   3588 }
   3589 
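         /*
          * Set the autoconfigure flag for the set, and record it in the
          * component label of every optimal component and used spare.
          */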
   3590 int
   3591 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3592 {
   3593 	RF_ComponentLabel_t *clabel;
   3594 	int column;
   3595 	int sparecol;
   3596 
   3597 	raidPtr->autoconfigure = new_value;
   3598 
   3599 	for(column=0; column<raidPtr->numCol; column++) {
   3600 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3601 			clabel = raidget_component_label(raidPtr, column);
   3602 			clabel->autoconfigure = new_value;
   3603 			raidflush_component_label(raidPtr, column);
   3604 		}
   3605 	}
   3606 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3607 		sparecol = raidPtr->numCol + column;
   3608 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3609 			clabel = raidget_component_label(raidPtr, sparecol);
   3610 			clabel->autoconfigure = new_value;
   3611 			raidflush_component_label(raidPtr, sparecol);
   3612 		}
   3613 	}
   3614 	return(new_value);
   3615 }
   3616 
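         /*
          * As rf_set_autoconfig(), but for the root_partition flag.
          */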
   3617 int
   3618 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3619 {
   3620 	RF_ComponentLabel_t *clabel;
   3621 	int column;
   3622 	int sparecol;
   3623 
   3624 	raidPtr->root_partition = new_value;
   3625 	for(column=0; column<raidPtr->numCol; column++) {
   3626 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3627 			clabel = raidget_component_label(raidPtr, column);
   3628 			clabel->root_partition = new_value;
   3629 			raidflush_component_label(raidPtr, column);
   3630 		}
   3631 	}
   3632 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3633 		sparecol = raidPtr->numCol + column;
   3634 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3635 			clabel = raidget_component_label(raidPtr, sparecol);
   3636 			clabel->root_partition = new_value;
   3637 			raidflush_component_label(raidPtr, sparecol);
   3638 		}
   3639 	}
   3640 	return(new_value);
   3641 }
   3642 
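         /*
          * Close and release the vnodes of all components in the set.
          */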
   3643 void
   3644 rf_release_all_vps(RF_ConfigSet_t *cset)
   3645 {
   3646 	RF_AutoConfig_t *ac;
   3647 
   3648 	ac = cset->ac;
   3649 	while(ac!=NULL) {
   3650 		/* Close the vp, and give it back */
   3651 		if (ac->vp) {
   3652 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3653 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
   3654 			vput(ac->vp);
   3655 			ac->vp = NULL;
   3656 		}
   3657 		ac = ac->next;
   3658 	}
   3659 }
   3660 
   3661 
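         /*
          * Free the component labels, the RF_AutoConfig_t structures, and
          * the config set itself.
          */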
   3662 void
   3663 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3664 {
   3665 	RF_AutoConfig_t *ac;
   3666 	RF_AutoConfig_t *next_ac;
   3667 
   3668 	ac = cset->ac;
   3669 	while(ac!=NULL) {
   3670 		next_ac = ac->next;
   3671 		/* nuke the label */
   3672 		free(ac->clabel, M_RAIDFRAME);
   3673 		/* cleanup the config structure */
   3674 		free(ac, M_RAIDFRAME);
   3675 		/* "next.." */
   3676 		ac = next_ac;
   3677 	}
   3678 	/* and, finally, nuke the config set */
   3679 	free(cset, M_RAIDFRAME);
   3680 }
   3681 
   3682 
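         /*
          * Initialize a component label from the current state of the
          * RAID set.
          */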
   3683 void
   3684 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   3685 {
   3686 	/* current version number */
   3687 	clabel->version = RF_COMPONENT_LABEL_VERSION;
   3688 	clabel->serial_number = raidPtr->serial_number;
   3689 	clabel->mod_counter = raidPtr->mod_counter;
   3690 
   3691 	clabel->num_rows = 1;
   3692 	clabel->num_columns = raidPtr->numCol;
   3693 	clabel->clean = RF_RAID_DIRTY; /* not clean */
   3694 	clabel->status = rf_ds_optimal; /* "It's good!" */
   3695 
   3696 	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
   3697 	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
   3698 	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
   3699 
   3700 	clabel->blockSize = raidPtr->bytesPerSector;
   3701 	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
   3702 
   3703 	/* XXX not portable */
   3704 	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
   3705 	clabel->maxOutstanding = raidPtr->maxOutstanding;
   3706 	clabel->autoconfigure = raidPtr->autoconfigure;
   3707 	clabel->root_partition = raidPtr->root_partition;
   3708 	clabel->last_unit = raidPtr->raidid;
   3709 	clabel->config_order = raidPtr->config_order;
   3710 
   3711 #ifndef RF_NO_PARITY_MAP
   3712 	rf_paritymap_init_label(raidPtr->parity_map, clabel);
   3713 #endif
   3714 }
   3715 
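         /*
          * Configure a RAID set from an auto-configuration set.  Try to use
          * the unit the set was last configured as; if that one is busy,
          * take the next free unit.  Returns the softc on success, NULL on
          * failure.
          */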
   3716 struct raid_softc *
   3717 rf_auto_config_set(RF_ConfigSet_t *cset)
   3718 {
   3719 	RF_Raid_t *raidPtr;
   3720 	RF_Config_t *config;
   3721 	int raidID;
   3722 	struct raid_softc *sc;
   3723 
   3724 #ifdef DEBUG
   3725 	printf("RAID autoconfigure\n");
   3726 #endif
   3727 
   3728 	/* 1. Create a config structure */
   3729 	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
   3730 	if (config == NULL) {
   3731 		printf("Out of mem!?!?\n");
   3732 				/* XXX do something more intelligent here. */
   3733 		return NULL;
   3734 	}
   3735 
   3736 	/*
   3737 	   2. Figure out what RAID ID this one is supposed to live at
   3738 	   See if we can get the same RAID dev that it was configured
   3739 	   on last time..
   3740 	*/
   3741 
   3742 	raidID = cset->ac->clabel->last_unit;
   3743 	for (sc = raidget(raidID); sc->sc_r.valid != 0; sc = raidget(++raidID))
   3744 		continue;
   3745 #ifdef DEBUG
   3746 	printf("Configuring raid%d:\n",raidID);
   3747 #endif
   3748 
   3749 	raidPtr = &sc->sc_r;
   3750 
   3751 	/* XXX all this stuff should be done SOMEWHERE ELSE! */
   3752 	raidPtr->softc = sc;
   3753 	raidPtr->raidid = raidID;
   3754 	raidPtr->openings = RAIDOUTSTANDING;
   3755 
   3756 	/* 3. Build the configuration structure */
   3757 	rf_create_configuration(cset->ac, config, raidPtr);
   3758 
   3759 	/* 4. Do the configuration */
   3760 	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
   3761 		raidinit(sc);
   3762 
   3763 		rf_markalldirty(raidPtr);
   3764 		raidPtr->autoconfigure = 1; /* XXX do this here? */
   3765 		if (cset->ac->clabel->root_partition==1) {
   3766 			/* everything configured just fine.  Make a note
   3767 			   that this set is eligible to be root. */
   3768 			cset->rootable = 1;
   3769 			/* XXX do this here? */
   3770 			raidPtr->root_partition = 1;
   3771 		}
   3772 	} else {
   3773 		raidput(sc);
   3774 		sc = NULL;
   3775 	}
   3776 
   3777 	/* 5. Cleanup */
   3778 	free(config, M_RAIDFRAME);
   3779 	return sc;
   3780 }
   3781 
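         /*
          * Tell the disk(9) layer that the I/O described by this access
          * descriptor has completed.
          */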
   3782 void
   3783 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
   3784 {
   3785 	struct buf *bp;
   3786 	struct raid_softc *rs;
   3787 
   3788 	bp = (struct buf *)desc->bp;
   3789 	rs = desc->raidPtr->softc;
   3790 	disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid),
   3791 	    (bp->b_flags & B_READ));
   3792 }
   3793 
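         /*
          * Initialize a pool, prime it with xmin items, and set its low and
          * high watermarks to xmin and xmax respectively.
          */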
   3794 void
   3795 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3796 	     size_t xmin, size_t xmax)
   3797 {
   3798 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3799 	pool_sethiwat(p, xmax);
   3800 	pool_prime(p, xmin);
   3801 	pool_setlowat(p, xmin);
   3802 }
   3803 
   3804 /*
    3805  * rf_buf_queue_check(RF_Raid_t *raidPtr) -- looks into the buf_queue to see
   3806  * if there is IO pending and if that IO could possibly be done for a
   3807  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3808  * otherwise.
   3809  *
   3810  */
   3811 
   3812 int
   3813 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3814 {
   3815 	struct raid_softc *rs = raidPtr->softc;
   3816 	if ((bufq_peek(rs->buf_queue) != NULL) && raidPtr->openings > 0) {
   3817 		/* there is work to do */
   3818 		return 0;
   3819 	}
   3820 	/* default is nothing to do */
   3821 	return 1;
   3822 }
   3823 
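         /*
          * Ask the underlying device for its size and sector size, and fill
          * in the RF_RaidDisk_t accordingly; numBlocks is reduced by
          * rf_protectedSectors.
          */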
   3824 int
   3825 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3826 {
   3827 	uint64_t numsecs;
   3828 	unsigned secsize;
   3829 	int error;
   3830 
   3831 	error = getdisksize(vp, &numsecs, &secsize);
   3832 	if (error == 0) {
   3833 		diskPtr->blockSize = secsize;
   3834 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3835 		diskPtr->partitionSize = numsecs;
   3836 		return 0;
   3837 	}
   3838 	return error;
   3839 }
   3840 
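         /*
          * autoconf(9) glue: raid_match() always succeeds and raid_attach()
          * does nothing, since the real work is done when a set is
          * configured.
          */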
   3841 static int
   3842 raid_match(device_t self, cfdata_t cfdata, void *aux)
   3843 {
   3844 	return 1;
   3845 }
   3846 
   3847 static void
   3848 raid_attach(device_t parent, device_t self, void *aux)
   3849 {
   3850 
   3851 }
   3852 
   3853 
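         /*
          * Detach the raid device: take the lock and let
          * raid_detach_unlocked() do the real work.
          */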
   3854 static int
   3855 raid_detach(device_t self, int flags)
   3856 {
   3857 	int error;
   3858 	struct raid_softc *rs = raidget(device_unit(self));
   3859 
   3860 	if (rs == NULL)
   3861 		return ENXIO;
   3862 
   3863 	if ((error = raidlock(rs)) != 0)
   3864 		return (error);
   3865 
   3866 	error = raid_detach_unlocked(rs);
   3867 
   3868 	raidunlock(rs);
   3869 
   3870 	/* XXXkd: raidput(rs) ??? */
   3871 
   3872 	return error;
   3873 }
   3874 
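         /*
          * Synthesize a geometry for the RAID set and register it with the
          * disk(9) layer.
          */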
   3875 static void
   3876 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3877 {
   3878 	struct disk_geom *dg = &rs->sc_dkdev.dk_geom;
   3879 
   3880 	memset(dg, 0, sizeof(*dg));
   3881 
   3882 	dg->dg_secperunit = raidPtr->totalSectors;
   3883 	dg->dg_secsize = raidPtr->bytesPerSector;
   3884 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   3885 	dg->dg_ntracks = 4 * raidPtr->numCol;
   3886 
   3887 	disk_set_info(rs->sc_dev, &rs->sc_dkdev, NULL);
   3888 }
   3889 
   3890 /*
   3891  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3892  * We end up returning whatever error was returned by the first cache flush
   3893  * that fails.
   3894  */
   3895 
   3896 int
   3897 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3898 {
   3899 	int c, sparecol;
   3900 	int e,error;
   3901 	int force = 1;
   3902 
   3903 	error = 0;
   3904 	for (c = 0; c < raidPtr->numCol; c++) {
   3905 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3906 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3907 					  &force, FWRITE, NOCRED);
   3908 			if (e) {
   3909 				if (e != ENODEV)
   3910 					printf("raid%d: cache flush to component %s failed.\n",
   3911 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3912 				if (error == 0) {
   3913 					error = e;
   3914 				}
   3915 			}
   3916 		}
   3917 	}
   3918 
   3919 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3920 		sparecol = raidPtr->numCol + c;
   3921 		/* Need to ensure that the reconstruct actually completed! */
   3922 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3923 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3924 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3925 			if (e) {
   3926 				if (e != ENODEV)
   3927 					printf("raid%d: cache flush to component %s failed.\n",
   3928 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3929 				if (error == 0) {
   3930 					error = e;
   3931 				}
   3932 			}
   3933 		}
   3934 	}
   3935 	return error;
   3936 }
   3937