Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.366
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.366 2019/02/05 17:13:37 christos Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.366 2019/02/05 17:13:37 christos Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_raid_autoconfig.h"
    108 #include "opt_compat_netbsd32.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 #include <sys/compat_stub.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #include "rf_compat50.h"
    153 #include "rf_compat80.h"
    154 
    155 #ifdef COMPAT_NETBSD32
    156 #ifdef _LP64
    157 #include "rf_compat32.h"
    158 #define RAID_COMPAT32
    159 #endif
    160 #endif
    161 
    162 #include "ioconf.h"
    163 
    164 #ifdef DEBUG
    165 int     rf_kdebug_level = 0;
    166 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    167 #else				/* DEBUG */
    168 #define db1_printf(a) { }
    169 #endif				/* DEBUG */
    170 
    171 #ifdef DEBUG_ROOT
    172 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    173 #else
    174 #define DPRINTF(a, ...)
    175 #endif
    176 
    177 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    178 static rf_declare_mutex2(rf_sparet_wait_mutex);
    179 static rf_declare_cond2(rf_sparet_wait_cv);
    180 static rf_declare_cond2(rf_sparet_resp_cv);
    181 
    182 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    183 						 * spare table */
    184 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    185 						 * installation process */
    186 #endif
    187 
    188 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    189 
    190 /* prototypes */
    191 static void KernelWakeupFunc(struct buf *);
    192 static void InitBP(struct buf *, struct vnode *, unsigned,
    193     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    194     void *, int, struct proc *);
    195 struct raid_softc;
    196 static void raidinit(struct raid_softc *);
    197 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    198 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    199 
    200 static int raid_match(device_t, cfdata_t, void *);
    201 static void raid_attach(device_t, device_t, void *);
    202 static int raid_detach(device_t, int);
    203 
    204 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    205     daddr_t, daddr_t);
    206 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    207     daddr_t, daddr_t, int);
    208 
    209 static int raidwrite_component_label(unsigned,
    210     dev_t, struct vnode *, RF_ComponentLabel_t *);
    211 static int raidread_component_label(unsigned,
    212     dev_t, struct vnode *, RF_ComponentLabel_t *);
    213 
    214 static int raid_diskstart(device_t, struct buf *bp);
    215 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    216 static int raid_lastclose(device_t);
    217 
    218 static dev_type_open(raidopen);
    219 static dev_type_close(raidclose);
    220 static dev_type_read(raidread);
    221 static dev_type_write(raidwrite);
    222 static dev_type_ioctl(raidioctl);
    223 static dev_type_strategy(raidstrategy);
    224 static dev_type_dump(raiddump);
    225 static dev_type_size(raidsize);
    226 
/* Block-device switch for the raid block device nodes. */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    237 
/* Character-device (raw) switch for the rraid device nodes. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    252 
/* Callbacks handed to the common dk(9) disk framework. */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
    262 
/* Per-unit driver state for one RAID pseudo-device. */
struct raid_softc {
	struct dk_softc sc_dksc;	/* common dk(9) disk state */
	int	sc_unit;	/* raid unit number */
	int     sc_flags;	/* flags (RAIDF_*, below) */
	int     sc_cflags;	/* configuration flags */
	kmutex_t sc_mutex;	/* interlock mutex */
	kcondvar_t sc_cv;	/* and the condvar */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	RF_Raid_t sc_r;		/* RAIDframe per-array state */
	LIST_ENTRY(raid_softc) sc_link;	/* entry on the global "raids" list */
};
/* sc_flags */
#define RAIDF_INITED		0x01	/* unit has been initialized */
#define RAIDF_SHUTDOWN		0x02	/* unit is being shutdown */
#define RAIDF_DETACH  		0x04	/* detach after final close */
#define RAIDF_WANTED		0x08	/* someone waiting to obtain a lock */
#define RAIDF_LOCKED		0x10	/* unit is locked */
#define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */
    282 
    283 #define	raidunit(x)	DISKUNIT(x)
    284 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    285 
    286 extern struct cfdriver raid_cd;
    287 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    288     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    289     DVF_DETACH_SHUTDOWN);
    290 
/* Internal representation of a rf_recon_req */
struct rf_recon_req_internal {
	RF_RowCol_t col;		/* component column the request targets */
	RF_ReconReqFlags_t flags;	/* reconstruction request flags */
	void   *raidPtr;		/* RF_Raid_t of the set being operated on */
};
    297 
    298 /*
    299  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    300  * Be aware that large numbers can allow the driver to consume a lot of
    301  * kernel memory, especially on writes, and in degraded mode reads.
    302  *
    303  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    304  * a single 64K write will typically require 64K for the old data,
    305  * 64K for the old parity, and 64K for the new parity, for a total
    306  * of 192K (if the parity buffer is not re-used immediately).
    307  * Even it if is used immediately, that's still 128K, which when multiplied
    308  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    309  *
    310  * Now in degraded mode, for example, a 64K read on the above setup may
    311  * require data reconstruction, which will require *all* of the 4 remaining
    312  * disks to participate -- 4 * 32K/disk == 128K again.
    313  */
    314 
    315 #ifndef RAIDOUTSTANDING
    316 #define RAIDOUTSTANDING   6
    317 #endif
    318 
    319 #define RAIDLABELDEV(dev)	\
    320 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    321 
    322 /* declared here, and made public, for the benefit of KVM stuff.. */
    323 
    324 static int raidlock(struct raid_softc *);
    325 static void raidunlock(struct raid_softc *);
    326 
    327 static int raid_detach_unlocked(struct raid_softc *);
    328 
    329 static void rf_markalldirty(RF_Raid_t *);
    330 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    331 
    332 void rf_ReconThread(struct rf_recon_req_internal *);
    333 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    334 void rf_CopybackThread(RF_Raid_t *raidPtr);
    335 void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    336 int rf_autoconfig(device_t);
    337 void rf_buildroothack(RF_ConfigSet_t *);
    338 
    339 RF_AutoConfig_t *rf_find_raid_components(void);
    340 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    341 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    342 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    343 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    344 int rf_set_autoconfig(RF_Raid_t *, int);
    345 int rf_set_rootpartition(RF_Raid_t *, int);
    346 void rf_release_all_vps(RF_ConfigSet_t *);
    347 void rf_cleanup_config_set(RF_ConfigSet_t *);
    348 int rf_have_enough_components(RF_ConfigSet_t *);
    349 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    350 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    351 
    352 /*
    353  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    354  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    355  * in the kernel config file.
    356  */
    357 #ifdef RAID_AUTOCONFIG
    358 int raidautoconfig = 1;
    359 #else
    360 int raidautoconfig = 0;
    361 #endif
    362 static bool raidautoconfigdone = false;
    363 
    364 struct RF_Pools_s rf_pools;
    365 
    366 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    367 static kmutex_t raid_lock;
    368 
    369 static struct raid_softc *
    370 raidcreate(int unit) {
    371 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    372 	sc->sc_unit = unit;
    373 	cv_init(&sc->sc_cv, "raidunit");
    374 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    375 	return sc;
    376 }
    377 
    378 static void
    379 raiddestroy(struct raid_softc *sc) {
    380 	cv_destroy(&sc->sc_cv);
    381 	mutex_destroy(&sc->sc_mutex);
    382 	kmem_free(sc, sizeof(*sc));
    383 }
    384 
    385 static struct raid_softc *
    386 raidget(int unit, bool create) {
    387 	struct raid_softc *sc;
    388 	if (unit < 0) {
    389 #ifdef DIAGNOSTIC
    390 		panic("%s: unit %d!", __func__, unit);
    391 #endif
    392 		return NULL;
    393 	}
    394 	mutex_enter(&raid_lock);
    395 	LIST_FOREACH(sc, &raids, sc_link) {
    396 		if (sc->sc_unit == unit) {
    397 			mutex_exit(&raid_lock);
    398 			return sc;
    399 		}
    400 	}
    401 	mutex_exit(&raid_lock);
    402 	if (!create)
    403 		return NULL;
    404 	if ((sc = raidcreate(unit)) == NULL)
    405 		return NULL;
    406 	mutex_enter(&raid_lock);
    407 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    408 	mutex_exit(&raid_lock);
    409 	return sc;
    410 }
    411 
/*
 * Unlink a raid_softc from the global list (under raid_lock) and
 * free it.  The unit must no longer be referenced by anyone else.
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    419 
/*
 * Legacy pseudo-device attach entry point.  Intentionally empty:
 * all real work moved elsewhere (see comment below).
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    429 
    430 int
    431 rf_autoconfig(device_t self)
    432 {
    433 	RF_AutoConfig_t *ac_list;
    434 	RF_ConfigSet_t *config_sets;
    435 
    436 	if (!raidautoconfig || raidautoconfigdone == true)
    437 		return (0);
    438 
    439 	/* XXX This code can only be run once. */
    440 	raidautoconfigdone = true;
    441 
    442 #ifdef __HAVE_CPU_BOOTCONF
    443 	/*
    444 	 * 0. find the boot device if needed first so we can use it later
    445 	 * this needs to be done before we autoconfigure any raid sets,
    446 	 * because if we use wedges we are not going to be able to open
    447 	 * the boot device later
    448 	 */
    449 	if (booted_device == NULL)
    450 		cpu_bootconf();
    451 #endif
    452 	/* 1. locate all RAID components on the system */
    453 	aprint_debug("Searching for RAID components...\n");
    454 	ac_list = rf_find_raid_components();
    455 
    456 	/* 2. Sort them into their respective sets. */
    457 	config_sets = rf_create_auto_sets(ac_list);
    458 
    459 	/*
    460 	 * 3. Evaluate each set and configure the valid ones.
    461 	 * This gets done in rf_buildroothack().
    462 	 */
    463 	rf_buildroothack(config_sets);
    464 
    465 	return 1;
    466 }
    467 
    468 static int
    469 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    470 	const char *bootname;
    471 	size_t len;
    472 
    473 	/* if bdv is NULL, the set can't contain it. exit early. */
    474 	if (bdv == NULL)
    475 		return 0;
    476 
    477 	bootname = device_xname(bdv);
    478 	len = strlen(bootname);
    479 
    480 	for (int col = 0; col < r->numCol; col++) {
    481 		const char *devname = r->Disks[col].devname;
    482 		devname += sizeof("/dev/") - 1;
    483 		if (strncmp(devname, "dk", 2) == 0) {
    484 			const char *parent =
    485 			    dkwedge_get_parent_name(r->Disks[col].dev);
    486 			if (parent != NULL)
    487 				devname = parent;
    488 		}
    489 		if (strncmp(devname, bootname, len) == 0) {
    490 			struct raid_softc *sc = r->softc;
    491 			aprint_debug("raid%d includes boot device %s\n",
    492 			    sc->sc_unit, devname);
    493 			return 1;
    494 		}
    495 	}
    496 	return 0;
    497 }
    498 
    499 void
    500 rf_buildroothack(RF_ConfigSet_t *config_sets)
    501 {
    502 	RF_ConfigSet_t *cset;
    503 	RF_ConfigSet_t *next_cset;
    504 	int num_root;
    505 	struct raid_softc *sc, *rsc;
    506 	struct dk_softc *dksc;
    507 
    508 	sc = rsc = NULL;
    509 	num_root = 0;
    510 	cset = config_sets;
    511 	while (cset != NULL) {
    512 		next_cset = cset->next;
    513 		if (rf_have_enough_components(cset) &&
    514 		    cset->ac->clabel->autoconfigure == 1) {
    515 			sc = rf_auto_config_set(cset);
    516 			if (sc != NULL) {
    517 				aprint_debug("raid%d: configured ok, rootable %d\n",
    518 				    sc->sc_unit, cset->rootable);
    519 				if (cset->rootable) {
    520 					rsc = sc;
    521 					num_root++;
    522 				}
    523 			} else {
    524 				/* The autoconfig didn't work :( */
    525 				aprint_debug("Autoconfig failed\n");
    526 				rf_release_all_vps(cset);
    527 			}
    528 		} else {
    529 			/* we're not autoconfiguring this set...
    530 			   release the associated resources */
    531 			rf_release_all_vps(cset);
    532 		}
    533 		/* cleanup */
    534 		rf_cleanup_config_set(cset);
    535 		cset = next_cset;
    536 	}
    537 	dksc = &rsc->sc_dksc;
    538 
    539 	/* if the user has specified what the root device should be
    540 	   then we don't touch booted_device or boothowto... */
    541 
    542 	if (rootspec != NULL) {
    543 		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
    544 		return;
    545 	}
    546 
    547 	/* we found something bootable... */
    548 
    549 	/*
    550 	 * XXX: The following code assumes that the root raid
    551 	 * is the first ('a') partition. This is about the best
    552 	 * we can do with a BSD disklabel, but we might be able
    553 	 * to do better with a GPT label, by setting a specified
    554 	 * attribute to indicate the root partition. We can then
    555 	 * stash the partition number in the r->root_partition
    556 	 * high bits (the bottom 2 bits are already used). For
    557 	 * now we just set booted_partition to 0 when we override
    558 	 * root.
    559 	 */
    560 	if (num_root == 1) {
    561 		device_t candidate_root;
    562 		if (dksc->sc_dkdev.dk_nwedges != 0) {
    563 			char cname[sizeof(cset->ac->devname)];
    564 			/* XXX: assume partition 'a' first */
    565 			snprintf(cname, sizeof(cname), "%s%c",
    566 			    device_xname(dksc->sc_dev), 'a');
    567 			candidate_root = dkwedge_find_by_wname(cname);
    568 			DPRINTF("%s: candidate wedge root=%s\n", __func__,
    569 			    cname);
    570 			if (candidate_root == NULL) {
    571 				/*
    572 				 * If that is not found, because we don't use
    573 				 * disklabel, return the first dk child
    574 				 * XXX: we can skip the 'a' check above
    575 				 * and always do this...
    576 				 */
    577 				size_t i = 0;
    578 				candidate_root = dkwedge_find_by_parent(
    579 				    device_xname(dksc->sc_dev), &i);
    580 			}
    581 			DPRINTF("%s: candidate wedge root=%p\n", __func__,
    582 			    candidate_root);
    583 		} else
    584 			candidate_root = dksc->sc_dev;
    585 		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
    586 		DPRINTF("%s: booted_device=%p root_partition=%d "
    587 			"contains_boot=%d",
    588 		    __func__, booted_device, rsc->sc_r.root_partition,
    589 			   rf_containsboot(&rsc->sc_r, booted_device));
    590 		/* XXX the check for booted_device == NULL can probably be
    591 		 * dropped, now that rf_containsboot handles that case.
    592 		 */
    593 		if (booted_device == NULL ||
    594 		    rsc->sc_r.root_partition == 1 ||
    595 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    596 			booted_device = candidate_root;
    597 			booted_method = "raidframe/single";
    598 			booted_partition = 0;	/* XXX assume 'a' */
    599 		}
    600 	} else if (num_root > 1) {
    601 		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
    602 		    booted_device);
    603 
    604 		/*
    605 		 * Maybe the MD code can help. If it cannot, then
    606 		 * setroot() will discover that we have no
    607 		 * booted_device and will ask the user if nothing was
    608 		 * hardwired in the kernel config file
    609 		 */
    610 		if (booted_device == NULL)
    611 			return;
    612 
    613 		num_root = 0;
    614 		mutex_enter(&raid_lock);
    615 		LIST_FOREACH(sc, &raids, sc_link) {
    616 			RF_Raid_t *r = &sc->sc_r;
    617 			if (r->valid == 0)
    618 				continue;
    619 
    620 			if (r->root_partition == 0)
    621 				continue;
    622 
    623 			if (rf_containsboot(r, booted_device)) {
    624 				num_root++;
    625 				rsc = sc;
    626 				dksc = &rsc->sc_dksc;
    627 			}
    628 		}
    629 		mutex_exit(&raid_lock);
    630 
    631 		if (num_root == 1) {
    632 			booted_device = dksc->sc_dev;
    633 			booted_method = "raidframe/multi";
    634 			booted_partition = 0;	/* XXX assume 'a' */
    635 		} else {
    636 			/* we can't guess.. require the user to answer... */
    637 			boothowto |= RB_ASKNAME;
    638 		}
    639 	}
    640 }
    641 
    642 static int
    643 raidsize(dev_t dev)
    644 {
    645 	struct raid_softc *rs;
    646 	struct dk_softc *dksc;
    647 	unsigned int unit;
    648 
    649 	unit = raidunit(dev);
    650 	if ((rs = raidget(unit, false)) == NULL)
    651 		return -1;
    652 	dksc = &rs->sc_dksc;
    653 
    654 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    655 		return -1;
    656 
    657 	return dk_size(dksc, dev);
    658 }
    659 
    660 static int
    661 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    662 {
    663 	unsigned int unit;
    664 	struct raid_softc *rs;
    665 	struct dk_softc *dksc;
    666 
    667 	unit = raidunit(dev);
    668 	if ((rs = raidget(unit, false)) == NULL)
    669 		return ENXIO;
    670 	dksc = &rs->sc_dksc;
    671 
    672 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    673 		return ENODEV;
    674 
    675         /*
    676            Note that blkno is relative to this particular partition.
    677            By adding adding RF_PROTECTED_SECTORS, we get a value that
    678 	   is relative to the partition used for the underlying component.
    679         */
    680 	blkno += RF_PROTECTED_SECTORS;
    681 
    682 	return dk_dump(dksc, dev, blkno, va, size);
    683 }
    684 
/*
 * Write `nblk' blocks from `va' at block `blkno' to a live component
 * of the RAID set, for crash dumps.  Only RAID 1 layouts (one data
 * column, one parity column) are supported.  Returns 0 or an errno.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	/* First pass: prefer the lowest-numbered optimal component. */
	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			/* Find which column this spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	/* Dump directly through the component's block device. */
	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    790 
/* ARGSUSED */
/*
 * Open the raid device.  Creates the softc on first reference, takes
 * the unit lock, refuses units being shut down, marks components
 * dirty on the first open of a configured set, and defers the rest
 * to dk_open().
 */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int     error = 0;
	int     part, pmask;

	/* Creating the softc here allows configuration via ioctl later. */
	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return (error);


}
    840 
    841 static int
    842 raid_lastclose(device_t self)
    843 {
    844 	struct raid_softc *rs = raidsoftc(self);
    845 
    846 	/* Last one... device is not unconfigured yet.
    847 	   Device shutdown has taken care of setting the
    848 	   clean bits if RAIDF_INITED is not set
    849 	   mark things as clean... */
    850 
    851 	rf_update_component_labels(&rs->sc_r,
    852 	    RF_FINAL_COMPONENT_UPDATE);
    853 
    854 	/* pass to unlocked code */
    855 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    856 		rs->sc_flags |= RAIDF_DETACH;
    857 
    858 	return 0;
    859 }
    860 
/* ARGSUSED */
/*
 * Close the raid device.  dk_close() ends up calling
 * raid_lastclose() on the final close, which may set RAIDF_DETACH;
 * the actual detach (or softc teardown for never-configured units
 * being shut down) is done here after the unit lock is dropped.
 */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int     error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		/* raid_lastclose() may have requested a detach. */
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	/* Detach/teardown must happen without the unit lock held. */
	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return (error);

}
    900 
/*
 * Wake the RAIDframe I/O completion thread: signal iodone_cv under
 * iodone_lock so queued/completed buffers get processed.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    908 
    909 static void
    910 raidstrategy(struct buf *bp)
    911 {
    912 	unsigned int unit;
    913 	struct raid_softc *rs;
    914 	struct dk_softc *dksc;
    915 	RF_Raid_t *raidPtr;
    916 
    917 	unit = raidunit(bp->b_dev);
    918 	if ((rs = raidget(unit, false)) == NULL) {
    919 		bp->b_error = ENXIO;
    920 		goto fail;
    921 	}
    922 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    923 		bp->b_error = ENXIO;
    924 		goto fail;
    925 	}
    926 	dksc = &rs->sc_dksc;
    927 	raidPtr = &rs->sc_r;
    928 
    929 	/* Queue IO only */
    930 	if (dk_strategy_defer(dksc, bp))
    931 		goto done;
    932 
    933 	/* schedule the IO to happen at the next convenient time */
    934 	raid_wakeup(raidPtr);
    935 
    936 done:
    937 	return;
    938 
    939 fail:
    940 	bp->b_resid = bp->b_bcount;
    941 	biodone(bp);
    942 }
    943 
    944 static int
    945 raid_diskstart(device_t dev, struct buf *bp)
    946 {
    947 	struct raid_softc *rs = raidsoftc(dev);
    948 	RF_Raid_t *raidPtr;
    949 
    950 	raidPtr = &rs->sc_r;
    951 	if (!raidPtr->valid) {
    952 		db1_printf(("raid is not valid..\n"));
    953 		return ENODEV;
    954 	}
    955 
    956 	/* XXX */
    957 	bp->b_resid = 0;
    958 
    959 	return raiddoaccess(raidPtr, bp);
    960 }
    961 
    962 void
    963 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    964 {
    965 	struct raid_softc *rs;
    966 	struct dk_softc *dksc;
    967 
    968 	rs = raidPtr->softc;
    969 	dksc = &rs->sc_dksc;
    970 
    971 	dk_done(dksc, bp);
    972 
    973 	rf_lock_mutex2(raidPtr->mutex);
    974 	raidPtr->openings++;
    975 	rf_unlock_mutex2(raidPtr->mutex);
    976 
    977 	/* schedule more IO */
    978 	raid_wakeup(raidPtr);
    979 }
    980 
    981 /* ARGSUSED */
    982 static int
    983 raidread(dev_t dev, struct uio *uio, int flags)
    984 {
    985 	int     unit = raidunit(dev);
    986 	struct raid_softc *rs;
    987 
    988 	if ((rs = raidget(unit, false)) == NULL)
    989 		return ENXIO;
    990 
    991 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    992 		return (ENXIO);
    993 
    994 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    995 
    996 }
    997 
    998 /* ARGSUSED */
    999 static int
   1000 raidwrite(dev_t dev, struct uio *uio, int flags)
   1001 {
   1002 	int     unit = raidunit(dev);
   1003 	struct raid_softc *rs;
   1004 
   1005 	if ((rs = raidget(unit, false)) == NULL)
   1006 		return ENXIO;
   1007 
   1008 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1009 		return (ENXIO);
   1010 
   1011 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
   1012 
   1013 }
   1014 
/*
 * Tear down a configured RAID set: shut down RAIDframe, drain and free
 * the buffer queue, and detach the disk/dk structures.
 *
 * Returns EBUSY if the unit is open, or a reconstruction, parity
 * rewrite, or copyback is in progress; 0 on success or if the unit was
 * never configured; otherwise the error from rf_Shutdown().
 *
 * NOTE(review): despite the name, this appears to assume the caller
 * serializes against open/close (raidlock) — confirm against callers.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse to detach while the unit is busy or being rebuilt. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Nothing to do if never configured. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1052 
   1053 static bool
   1054 rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
   1055 {
   1056 	switch (cmd) {
   1057 	case RAIDFRAME_ADD_HOT_SPARE:
   1058 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1059 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1060 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT80:
   1061 	case RAIDFRAME_CHECK_PARITY:
   1062 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1063 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1064 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT80:
   1065 	case RAIDFRAME_CHECK_RECON_STATUS:
   1066 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1067 	case RAIDFRAME_CHECK_RECON_STATUS_EXT80:
   1068 	case RAIDFRAME_COPYBACK:
   1069 	case RAIDFRAME_DELETE_COMPONENT:
   1070 	case RAIDFRAME_FAIL_DISK:
   1071 	case RAIDFRAME_FAIL_DISK80:
   1072 	case RAIDFRAME_GET_ACCTOTALS:
   1073 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1074 	case RAIDFRAME_GET_COMPONENT_LABEL80:
   1075 	case RAIDFRAME_GET_INFO:
   1076 #ifdef RAID_COMPAT32
   1077 	case RAIDFRAME_GET_INFO32:
   1078 #endif
   1079 	case RAIDFRAME_GET_INFO50:
   1080 	case RAIDFRAME_GET_INFO80:
   1081 	case RAIDFRAME_GET_SIZE:
   1082 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1083 	case RAIDFRAME_INIT_LABELS:
   1084 	case RAIDFRAME_KEEP_ACCTOTALS:
   1085 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1086 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1087 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1088 	case RAIDFRAME_PARITYMAP_STATUS:
   1089 	case RAIDFRAME_REBUILD_IN_PLACE:
   1090 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1091 	case RAIDFRAME_RESET_ACCTOTALS:
   1092 	case RAIDFRAME_REWRITEPARITY:
   1093 	case RAIDFRAME_SET_AUTOCONFIG:
   1094 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1095 	case RAIDFRAME_SET_ROOT:
   1096 		return (rs->sc_flags & RAIDF_INITED) != 0;
   1097 	}
   1098 	return false;
   1099 }
   1100 
   1101 /*
   1102  * Really this should be done as part of the default in the ioctl
   1103  * switch like other compat code, but it is too messy to do that
   1104  * right now, so we list all the compat ioctls we know about,
   1105  * and load appropriately.
   1106  *
   1107  * XXX[1] what about combinations of compat32 and compat80 ioctls?
   1108  * XXX[2] what about autoloading the compat32 code? Is there a compat32
   1109  * ioctl module? Should there be one?
   1110  */
/*
 * Dispatch compat (COMPAT_50 / COMPAT_80) RAIDframe ioctls to the
 * autoloaded compat modules.
 *
 * Returns EPASSTHROUGH when 'cmd' is not a compat ioctl (caller
 * continues with normal processing); the caller treats EAGAIN as
 * "*k_cfg has been filled in, proceed to configuration"; any other
 * value is the final ioctl result (including failure to load the
 * compat module, in which case the hook falls back to enosys()).
 */
static int
rf_handle_compat(struct raid_softc *rs, int unit, u_long cmd, void *data,
    RF_Config_t **k_cfg)
{
	RF_Raid_t *raidPtr = &rs->sc_r;
	int retcode = EPASSTHROUGH;
	switch (cmd) {
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT80:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT80:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT80:
	case RAIDFRAME_CONFIGURE80:
	case RAIDFRAME_FAIL_DISK80:
	case RAIDFRAME_GET_COMPONENT_LABEL80:
	case RAIDFRAME_GET_INFO80:
		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
		MODULE_CALL_HOOK(raidframe_ioctl_80_hook, (cmd,
		    (rs->sc_flags & RAIDF_INITED), raidPtr, unit, data, k_cfg),
		    enosys(), retcode);
		break;
	case RAIDFRAME_CONFIGURE50:
	case RAIDFRAME_GET_INFO50:
		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
		MODULE_CALL_HOOK(raidframe_ioctl_50_hook, (cmd,
		    (rs->sc_flags & RAIDF_INITED), raidPtr, unit, data, k_cfg),
		    enosys(), retcode);
		break;
	default:
		/* Not a compat ioctl; leave retcode as EPASSTHROUGH. */
		break;
	}
	return retcode;
}
   1142 
/*
 * Administratively fail the component in column rr->col and kick off
 * a reconstruction thread (rf_ReconThread) for it.
 *
 * Returns EINVAL for RAID 0 sets, bad columns, or states in which a
 * failure cannot be injected (reconstruction already running, another
 * component already failed, or the target is spared); ENOMEM if the
 * request copy cannot be allocated; otherwise the result of thread
 * creation.  The request is copied so we never depend on the caller's
 * buffer after return.
 */
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	/* State checks must be made under the array mutex. */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	/* NOTE(review): state could change between this unlock and the
	   recon thread starting — presumably benign; confirm. */
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}
   1191 
/*
 * Character-device ioctl entry point for RAIDframe.
 *
 * Handles RAIDframe-specific commands (configure/shutdown, component
 * label management, spares, rebuild/copyback/parity control, status
 * queries), forwarding compat (50/80/32) requests to the compat
 * modules first, and falling through to dk_ioctl() for generic disk
 * ioctls at the end.
 */
static int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int     unit = raidunit(dev);
	int     error = 0;
	int     part, pmask;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Config_t *k_cfg, *u_cfg;
	RF_Raid_t *raidPtr;
	RF_RaidDisk_t *diskPtr;
	RF_AccTotals_t *totals;
	RF_DeviceConfig_t *d_cfg, *ucfgp = data;
	u_char *specific_buf;
	int retcode = 0;
	int column;
/*	int raidid; */
	struct rf_recon_req_internal *rrint;
	RF_ComponentLabel_t *clabel;
	RF_ComponentLabel_t *ci_label;
	RF_SingleComponent_t *sparePtr,*componentPtr;
	RF_SingleComponent_t component;
	int d;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
	    (int) DISKPART(dev), (int) unit, cmd));

	/* Must be initialized for these... */
	if (rf_must_be_initialized(rs, cmd))
		return ENXIO;

	switch (retcode = rf_handle_compat(rs, unit, cmd, data, &k_cfg)) {
	case EPASSTHROUGH:
		/* Not compat, keep going */
		retcode = 0;
		break;
	case EAGAIN:
		/* compat handler filled in k_cfg; finish configuring */
		goto config;
	default:
		/* compat but could not handle it or load the module */
		return retcode;
	}

	switch (cmd) {
		/* configure the system */
	case RAIDFRAME_CONFIGURE:
#ifdef RAID_COMPAT32
	case RAIDFRAME_CONFIGURE32:
#endif
		if (raidPtr->valid) {
			/* There is a valid RAID set running on this unit! */
			printf("raid%d: Device already configured!\n", unit);
			return(EINVAL);
		}

		/* copy-in the configuration information */
		/* data points to a pointer to the configuration structure */

		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
		if (k_cfg == NULL) {
			return (ENOMEM);
		}
#ifdef RAID_COMPAT32
		if (cmd == RAIDFRAME_CONFIGURE32 &&
		    (l->l_proc->p_flag & PK_32) != 0)
			MODULE_CALL_HOOK(raidframe_netbsd32_config_hook,
			    (data, k_cfg), enosys(), retcode);
		else
#endif
		{
			u_cfg = *((RF_Config_t **) data);
			retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
		}
		if (retcode) {
			RF_Free(k_cfg, sizeof(RF_Config_t));
			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
				retcode));
			goto no_config;
		}
		goto config;
	config:
		/* Reached either from above or via the EAGAIN compat path;
		 * k_cfg is a kernel copy of the user configuration. */
		rs->sc_flags &= ~RAIDF_SHUTDOWN;

		/* allocate a buffer for the layout-specific data, and copy it
		 * in */
		if (k_cfg->layoutSpecificSize) {
			if (k_cfg->layoutSpecificSize > 10000) {
				/* sanity check */
				RF_Free(k_cfg, sizeof(RF_Config_t));
				retcode = EINVAL;
				goto no_config;
			}
			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
			    (u_char *));
			if (specific_buf == NULL) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				retcode = ENOMEM;
				goto no_config;
			}
			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
			    k_cfg->layoutSpecificSize);
			if (retcode) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				RF_Free(specific_buf,
					k_cfg->layoutSpecificSize);
				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
					retcode));
				goto no_config;
			}
		} else
			specific_buf = NULL;
		k_cfg->layoutSpecific = specific_buf;

		/* should do some kind of sanity check on the configuration.
		 * Store the sum of all the bytes in the last byte? */

		/* configure the system */

		/*
		 * Clear the entire RAID descriptor, just to make sure
		 *  there is no stale data left in the case of a
		 *  reconfiguration
		 */
		memset(raidPtr, 0, sizeof(*raidPtr));
		raidPtr->softc = rs;
		raidPtr->raidid = unit;

		retcode = rf_Configure(raidPtr, k_cfg, NULL);

		if (retcode == 0) {

			/* allow this many simultaneous IO's to
			   this RAID device */
			raidPtr->openings = RAIDOUTSTANDING;

			raidinit(rs);
			raid_wakeup(raidPtr);
			rf_markalldirty(raidPtr);
		}
		/* free the buffers.  No return code here. */
		if (k_cfg->layoutSpecificSize) {
			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
		}
		RF_Free(k_cfg, sizeof(RF_Config_t));

	no_config:
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		if (retcode != 0)
			rs->sc_flags |= RAIDF_SHUTDOWN;
		return (retcode);

		/* shutdown the system */
	case RAIDFRAME_SHUTDOWN:

		part = DISKPART(dev);
		pmask = (1 << part);

		if ((error = raidlock(rs)) != 0)
			return (error);

		/* Only mark for shutdown if nothing is busy or rebuilding;
		 * the actual teardown happens on last close. */
		if (DK_BUSY(dksc, pmask) ||
		    raidPtr->recon_in_progress != 0 ||
		    raidPtr->parity_rewrite_in_progress != 0 ||
		    raidPtr->copyback_in_progress != 0)
			retcode = EBUSY;
		else {
			/* detach and free on close */
			rs->sc_flags |= RAIDF_SHUTDOWN;
			retcode = 0;
		}

		raidunlock(rs);

		return (retcode);
	case RAIDFRAME_GET_COMPONENT_LABEL:
		return rf_get_component_label(raidPtr, data);

#if 0
	case RAIDFRAME_SET_COMPONENT_LABEL:
		clabel = (RF_ComponentLabel_t *) data;

		/* XXX check the label for valid stuff... */
		/* Note that some things *should not* get modified --
		   the user should be re-initing the labels instead of
		   trying to patch things.
		   */

		raidid = raidPtr->raidid;
#ifdef DEBUG
		printf("raid%d: Got component label:\n", raidid);
		printf("raid%d: Version: %d\n", raidid, clabel->version);
		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
		printf("raid%d: Column: %d\n", raidid, clabel->column);
		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
		printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
		clabel->row = 0;
		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		/* XXX this isn't allowed to do anything for now :-) */

		/* XXX and before it is, we need to fill in the rest
		   of the fields!?!?!?! */
		memcpy(raidget_component_label(raidPtr, column),
		    clabel, sizeof(*clabel));
		raidflush_component_label(raidPtr, column);
		return (0);
#endif	/* 0 */

	case RAIDFRAME_INIT_LABELS:
		clabel = (RF_ComponentLabel_t *) data;
		/*
		   we only want the serial number from
		   the above.  We get all the rest of the information
		   from the config that was used to create this RAID
		   set.
		   */

		raidPtr->serial_number = clabel->serial_number;

		for(column=0;column<raidPtr->numCol;column++) {
			diskPtr = &raidPtr->Disks[column];
			if (!RF_DEAD_DISK(diskPtr->status)) {
				ci_label = raidget_component_label(raidPtr,
				    column);
				/* Zeroing this is important. */
				memset(ci_label, 0, sizeof(*ci_label));
				raid_init_component_label(raidPtr, ci_label);
				ci_label->serial_number =
				    raidPtr->serial_number;
				ci_label->row = 0; /* we dont' pretend to support more */
				rf_component_label_set_partitionsize(ci_label,
				    diskPtr->partitionSize);
				ci_label->column = column;
				raidflush_component_label(raidPtr, column);
			}
			/* XXXjld what about the spares? */
		}

		return (retcode);
	case RAIDFRAME_SET_AUTOCONFIG:
		d = rf_set_autoconfig(raidPtr, *(int *) data);
		printf("raid%d: New autoconfig value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

	case RAIDFRAME_SET_ROOT:
		d = rf_set_rootpartition(raidPtr, *(int *) data);
		printf("raid%d: New rootpartition value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

		/* initialize all parity */
	case RAIDFRAME_REWRITEPARITY:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Parity for RAID 0 is trivially correct */
			raidPtr->parity_good = RF_RAID_CLEAN;
			return(0);
		}

		if (raidPtr->parity_rewrite_in_progress == 1) {
			/* Re-write is already in progress! */
			return(EINVAL);
		}

		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
					   rf_RewriteParityThread,
					   raidPtr,"raid_parity");
		return (retcode);


	case RAIDFRAME_ADD_HOT_SPARE:
		sparePtr = (RF_SingleComponent_t *) data;
		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
		retcode = rf_add_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REMOVE_HOT_SPARE:
		/* XXX not implemented: returns success (0) without
		 * removing anything. */
		return(retcode);

	case RAIDFRAME_DELETE_COMPONENT:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_delete_component(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_incorporate_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REBUILD_IN_PLACE:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->recon_in_progress == 1) {
			/* a reconstruct is already in progress! */
			return(EINVAL);
		}

		componentPtr = (RF_SingleComponent_t *) data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		component.row = 0; /* we don't support any more */
		column = component.column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		rf_lock_mutex2(raidPtr->mutex);
		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
		    (raidPtr->numFailures > 0)) {
			/* XXX 0 above shouldn't be constant!!! */
			/* some component other than this has failed.
			   Let's not make things worse than they already
			   are... */
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:     Col: %d   Too many failures.\n",
			       raidPtr->raidid, column);
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status ==
		    rf_ds_reconstructing) {
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);

			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status == rf_ds_spared) {
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
		if (rrint == NULL)
			return(ENOMEM);

		rrint->col = column;
		rrint->raidPtr = raidPtr;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconstructInPlaceThread,
					   rrint, "raid_reconip");
		return(retcode);

#ifdef RAID_COMPAT32
	case RAIDFRAME_GET_INFO32:
		if (!raidframe_netbsd32_config_hook.hooked)
			return ENOSYS;
		ucfgp = NETBSD32PTR64(*(netbsd32_pointer_t *)data);
		/*FALLTHROUGH*/
#endif
	case RAIDFRAME_GET_INFO:
		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
			  (RF_DeviceConfig_t *));
		if (d_cfg == NULL)
			return ENOMEM;
		retcode = rf_get_info(raidPtr, d_cfg);
		if (retcode == 0) {
		    retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
		}
		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));

		return retcode;

	case RAIDFRAME_CHECK_PARITY:
		*(int *) data = raidPtr->parity_good;
		return (0);

	case RAIDFRAME_PARITYMAP_STATUS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_status(raidPtr->parity_map,
		    (struct rf_pmstat *)data);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		if (raidPtr->parity_map == NULL)
			return ENOENT; /* ??? */
		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
			(struct rf_pmparams *)data, 1))
			return EINVAL;
		return 0;

	case RAIDFRAME_PARITYMAP_GET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		*(int *) data = rf_paritymap_get_disable(raidPtr);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_set_disable(raidPtr, *(int *)data);
		/* XXX should errors be passed up? */
		return 0;

	case RAIDFRAME_RESET_ACCTOTALS:
		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
		return (0);

	case RAIDFRAME_GET_ACCTOTALS:
		totals = (RF_AccTotals_t *) data;
		*totals = raidPtr->acc_totals;
		return 0;

	case RAIDFRAME_KEEP_ACCTOTALS:
		raidPtr->keep_acc_totals = *(int *)data;
		return 0;

	case RAIDFRAME_GET_SIZE:
		*(int *) data = raidPtr->totalSectors;
		return 0;

	case RAIDFRAME_FAIL_DISK:
		return rf_fail_disk(raidPtr, data);

		/* invoke a copyback operation after recon on whatever disk
		 * needs it, if any */
	case RAIDFRAME_COPYBACK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->copyback_in_progress == 1) {
			/* Copyback is already in progress! */
			return(EINVAL);
		}

		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
					   rf_CopybackThread,
					   raidPtr,"raid_copyback");
		return (retcode);

		/* return the percentage completion of reconstruction */
	case RAIDFRAME_CHECK_RECON_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->status != rf_rs_reconstructing)
			*(int *) data = 100;
		else {
			if (raidPtr->reconControl->numRUsTotal > 0) {
				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
			} else {
				*(int *) data = 0;
			}
		}
		return (0);
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
		rf_check_recon_status_ext(raidPtr, data);
		return (0);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->parity_rewrite_in_progress == 1) {
			*(int *) data = 100 *
				raidPtr->parity_rewrite_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
		rf_check_parityrewrite_status_ext(raidPtr, data);
		return (0);

	case RAIDFRAME_CHECK_COPYBACK_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0 */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->copyback_in_progress == 1) {
			*(int *) data = 100 * raidPtr->copyback_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
		rf_check_copyback_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_SET_LAST_UNIT:
		for (column = 0; column < raidPtr->numCol; column++)
			if (raidPtr->Disks[column].status != rf_ds_optimal)
				return EBUSY;

		for (column = 0; column < raidPtr->numCol; column++) {
			clabel = raidget_component_label(raidPtr, column);
			clabel->last_unit = *(int *)data;
			raidflush_component_label(raidPtr, column);
		}
		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
		return 0;

		/* the sparetable daemon calls this to wait for the kernel to
		 * need a spare table. this ioctl does not return until a
		 * spare table is needed. XXX -- calling mpsleep here in the
		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
		 * -- I should either compute the spare table in the kernel,
		 * or have a different -- XXX XXX -- interface (a different
		 * character device) for delivering the table     -- XXX */
#if 0
	case RAIDFRAME_SPARET_WAIT:
		rf_lock_mutex2(rf_sparet_wait_mutex);
		while (!rf_sparet_wait_queue)
			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
		waitreq = rf_sparet_wait_queue;
		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		/* structure assignment */
		*((RF_SparetWait_t *) data) = *waitreq;

		RF_Free(waitreq, sizeof(*waitreq));
		return (0);

		/* wakes up a process waiting on SPARET_WAIT and puts an error
		 * code in it that will cause the dameon to exit */
	case RAIDFRAME_ABORT_SPARET_WAIT:
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = -1;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_wait_queue;
		rf_sparet_wait_queue = waitreq;
		rf_broadcast_conf2(rf_sparet_wait_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);
		return (0);

		/* used by the spare table daemon to deliver a spare table
		 * into the kernel */
	case RAIDFRAME_SEND_SPARET:

		/* install the spare table */
		retcode = rf_SetSpareTable(raidPtr, *(void **) data);

		/* respond to the requestor.  the return status of the spare
		 * table installation is passed in the "fcol" field */
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = retcode;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_resp_queue;
		rf_sparet_resp_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_resp_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		return (retcode);
#endif

	default:
		break; /* fall through to the os-specific code below */

	}

	if (!raidPtr->valid)
		return (EINVAL);

	/*
	 * Add support for "regular" device ioctls here.
	 */

	switch (cmd) {
	case DIOCGCACHE:
		retcode = rf_get_component_caches(raidPtr, (int *)data);
		break;

	case DIOCCACHESYNC:
		retcode = rf_sync_component_caches(raidPtr);
		break;

	default:
		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
		break;
	}

	return (retcode);

}
   1816 
   1817 
   1818 /* raidinit -- complete the rest of the initialization for the
   1819    RAIDframe device.  */
   1820 
   1821 
/*
 * Complete driver-side initialization of a freshly configured set:
 * attach a pseudo-device instance, initialize the dk/disk layers,
 * allocate the buffer queue, mark the unit usable, and discover
 * wedges.  On config_attach_pseudo() failure the unit is left with
 * RAIDF_INITED clear (the error is only logged).
 */
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	dkwedge_discover(&dksc->sc_dkdev);
}
   1877 
   1878 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1879 /* wake up the daemon & tell it to get us a spare table
   1880  * XXX
   1881  * the entries in the queues should be tagged with the raidPtr
   1882  * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
   1884  * XXX
   1885  *
   1886  * XXX This code is not currently used. GO
   1887  */
/*
 * Queue a spare-table request for the userland daemon, wake the daemon,
 * and sleep until a response arrives on rf_sparet_resp_queue.  Returns
 * the installation status the daemon reported (the "fcol" field of the
 * response entry).
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	/* NOTE(review): rf_wait_cond2 is expected to re-acquire
	   rf_sparet_wait_mutex before returning, so the queue pop below
	   is protected — confirm against the rf_wait_cond2 definition. */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* pop the response off the head of the response queue */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   1911 #endif
   1912 
   1913 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1914  * bp & passes it down.
   1915  * any calls originating in the kernel must use non-blocking I/O
   1916  * do some extra sanity checking to return "appropriate" error values for
   1917  * certain conditions (to make some standard utilities work)
   1918  *
   1919  * Formerly known as: rf_DoAccessKernel
   1920  */
   1921 void
   1922 raidstart(RF_Raid_t *raidPtr)
   1923 {
   1924 	struct raid_softc *rs;
   1925 	struct dk_softc *dksc;
   1926 
   1927 	rs = raidPtr->softc;
   1928 	dksc = &rs->sc_dksc;
   1929 	/* quick check to see if anything has died recently */
   1930 	rf_lock_mutex2(raidPtr->mutex);
   1931 	if (raidPtr->numNewFailures > 0) {
   1932 		rf_unlock_mutex2(raidPtr->mutex);
   1933 		rf_update_component_labels(raidPtr,
   1934 					   RF_NORMAL_COMPONENT_UPDATE);
   1935 		rf_lock_mutex2(raidPtr->mutex);
   1936 		raidPtr->numNewFailures--;
   1937 	}
   1938 	rf_unlock_mutex2(raidPtr->mutex);
   1939 
   1940 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
   1941 		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
   1942 		return;
   1943 	}
   1944 
   1945 	dk_start(dksc, NULL);
   1946 }
   1947 
/*
 * Validate a buf against the array's geometry and hand it to
 * rf_DoAccess() as a non-blocking request.
 *
 * Returns EAGAIN when no openings are free (caller should retry),
 * ENOSPC when the request lies outside the array or is not a whole
 * number of sectors, otherwise whatever rf_DoAccess() returns.
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int     do_async;
	int rc;

	/* bail out early if the array has no request slots available */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	/* a trailing partial sector counts as one extra block (pb) */
	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	/* NOTE(review): "1 ||" makes this condition always true; the
	   db1_printf body still depends on the debug build settings. */
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* the "sum < ..." comparisons catch arithmetic wraparound */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* reject transfers that are not a whole number of sectors */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* claim an opening for this access */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 do_async, raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
   2020 
   2021 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2022 
/*
 * Dispatch one request from a RAID disk queue to the underlying
 * component device.  The queue mutex is held on entry; it is dropped
 * around bdev_strategy(), which may block, and retaken afterwards.
 * Always returns 0.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		/* complete the NOP immediately via the normal callback */
		bp->b_flags = 0;
		bp->b_private = req;

		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* aim the buf at the component device, with
		 * KernelWakeupFunc as the completion callback */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
/* this is the callback function associated with an I/O invoked from
   2097    kernel code.
   2098  */
/*
 * Biodone callback for component I/O issued by rf_DispatchKernelIO().
 * Records the I/O's error status (failing the component if it was
 * healthy and the set can tolerate the loss), appends the request to
 * the raidPtr's iodone queue, and wakes the raidio thread.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures triggers a label update in
			 * raidstart() */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2165 
   2166 
   2167 /*
   2168  * initialize a buf structure for doing an I/O in the kernel.
   2169  */
   2170 static void
   2171 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2172        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2173        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2174        struct proc *b_proc)
   2175 {
   2176 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2177 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2178 	bp->b_oflags = 0;
   2179 	bp->b_cflags = 0;
   2180 	bp->b_bcount = numSect << logBytesPerSector;
   2181 	bp->b_bufsize = bp->b_bcount;
   2182 	bp->b_error = 0;
   2183 	bp->b_dev = dev;
   2184 	bp->b_data = bf;
   2185 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2186 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2187 	if (bp->b_bcount == 0) {
   2188 		panic("bp->b_bcount is zero in InitBP!!");
   2189 	}
   2190 	bp->b_proc = b_proc;
   2191 	bp->b_iodone = cbFunc;
   2192 	bp->b_private = cbArg;
   2193 }
   2194 
   2195 /*
   2196  * Wait interruptibly for an exclusive lock.
   2197  *
   2198  * XXX
   2199  * Several drivers do this; it should be abstracted and made MP-safe.
   2200  * (Hmm... where have we seen this warning before :->  GO )
   2201  */
   2202 static int
   2203 raidlock(struct raid_softc *rs)
   2204 {
   2205 	int     error;
   2206 
   2207 	error = 0;
   2208 	mutex_enter(&rs->sc_mutex);
   2209 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2210 		rs->sc_flags |= RAIDF_WANTED;
   2211 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2212 		if (error != 0)
   2213 			goto done;
   2214 	}
   2215 	rs->sc_flags |= RAIDF_LOCKED;
   2216 done:
   2217 	mutex_exit(&rs->sc_mutex);
   2218 	return (error);
   2219 }
   2220 /*
   2221  * Unlock and wake up any waiters.
   2222  */
   2223 static void
   2224 raidunlock(struct raid_softc *rs)
   2225 {
   2226 
   2227 	mutex_enter(&rs->sc_mutex);
   2228 	rs->sc_flags &= ~RAIDF_LOCKED;
   2229 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2230 		rs->sc_flags &= ~RAIDF_WANTED;
   2231 		cv_broadcast(&rs->sc_cv);
   2232 	}
   2233 	mutex_exit(&rs->sc_mutex);
   2234 }
   2235 
   2236 
   2237 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2238 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2239 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2240 
   2241 static daddr_t
   2242 rf_component_info_offset(void)
   2243 {
   2244 
   2245 	return RF_COMPONENT_INFO_OFFSET;
   2246 }
   2247 
   2248 static daddr_t
   2249 rf_component_info_size(unsigned secsize)
   2250 {
   2251 	daddr_t info_size;
   2252 
   2253 	KASSERT(secsize);
   2254 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2255 		info_size = secsize;
   2256 	else
   2257 		info_size = RF_COMPONENT_INFO_SIZE;
   2258 
   2259 	return info_size;
   2260 }
   2261 
   2262 static daddr_t
   2263 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2264 {
   2265 	daddr_t map_offset;
   2266 
   2267 	KASSERT(raidPtr->bytesPerSector);
   2268 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2269 		map_offset = raidPtr->bytesPerSector;
   2270 	else
   2271 		map_offset = RF_COMPONENT_INFO_SIZE;
   2272 	map_offset += rf_component_info_offset();
   2273 
   2274 	return map_offset;
   2275 }
   2276 
   2277 static daddr_t
   2278 rf_parity_map_size(RF_Raid_t *raidPtr)
   2279 {
   2280 	daddr_t map_size;
   2281 
   2282 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2283 		map_size = raidPtr->bytesPerSector;
   2284 	else
   2285 		map_size = RF_PARITY_MAP_SIZE;
   2286 
   2287 	return map_size;
   2288 }
   2289 
   2290 int
   2291 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2292 {
   2293 	RF_ComponentLabel_t *clabel;
   2294 
   2295 	clabel = raidget_component_label(raidPtr, col);
   2296 	clabel->clean = RF_RAID_CLEAN;
   2297 	raidflush_component_label(raidPtr, col);
   2298 	return(0);
   2299 }
   2300 
   2301 
   2302 int
   2303 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2304 {
   2305 	RF_ComponentLabel_t *clabel;
   2306 
   2307 	clabel = raidget_component_label(raidPtr, col);
   2308 	clabel->clean = RF_RAID_DIRTY;
   2309 	raidflush_component_label(raidPtr, col);
   2310 	return(0);
   2311 }
   2312 
/*
 * Read the on-disk component label for column `col' into the in-core
 * copy in raid_cinfo[].  Returns the raidread_component_label() status.
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2322 
/*
 * Return a pointer to the in-core component label for column `col'.
 * Callers modify it in place and use raidflush_component_label() to
 * push changes to disk.
 */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2328 
   2329 int
   2330 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2331 {
   2332 	RF_ComponentLabel_t *label;
   2333 
   2334 	label = &raidPtr->raid_cinfo[col].ci_label;
   2335 	label->mod_counter = raidPtr->mod_counter;
   2336 #ifndef RF_NO_PARITY_MAP
   2337 	label->parity_map_modcount = label->mod_counter;
   2338 #endif
   2339 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2340 	    raidPtr->Disks[col].dev,
   2341 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2342 }
   2343 
   2344 
   2345 static int
   2346 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2347     RF_ComponentLabel_t *clabel)
   2348 {
   2349 	return raidread_component_area(dev, b_vp, clabel,
   2350 	    sizeof(RF_ComponentLabel_t),
   2351 	    rf_component_info_offset(),
   2352 	    rf_component_info_size(secsize));
   2353 }
   2354 
   2355 /* ARGSUSED */
/*
 * Read `dsize' bytes of component metadata starting at byte `offset'
 * on `dev', copying the first `msize' bytes into `data'.  Returns 0 on
 * success, EINVAL if the component has no vnode, or the biowait()
 * error.
 */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	bdev_strategy(bp);
	error = biowait(bp);

	/* only copy the payload out on a successful read */
	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
   2392 
   2393 
   2394 static int
   2395 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2396     RF_ComponentLabel_t *clabel)
   2397 {
   2398 	return raidwrite_component_area(dev, b_vp, clabel,
   2399 	    sizeof(RF_ComponentLabel_t),
   2400 	    rf_component_info_offset(),
   2401 	    rf_component_info_size(secsize), 0);
   2402 }
   2403 
   2404 /* ARGSUSED */
/*
 * Write `msize' bytes from `data' as a zero-padded `dsize'-byte record
 * at byte `offset' on `dev'.  With `asyncp' set, the write is issued
 * B_ASYNC and 0 is returned without waiting; otherwise the biowait()
 * status is returned.
 */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	/* zero-fill first so the tail beyond `msize' is deterministic */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	/* NOTE(review): in the async case the buffer is not brelse()'d
	   here; presumably the B_ASYNC completion path releases it —
	   verify. */
	if (asyncp)
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2438 
   2439 void
   2440 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2441 {
   2442 	int c;
   2443 
   2444 	for (c = 0; c < raidPtr->numCol; c++) {
   2445 		/* Skip dead disks. */
   2446 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2447 			continue;
   2448 		/* XXXjld: what if an error occurs here? */
   2449 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2450 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2451 		    RF_PARITYMAP_NBYTE,
   2452 		    rf_parity_map_offset(raidPtr),
   2453 		    rf_parity_map_size(raidPtr), 0);
   2454 	}
   2455 }
   2456 
   2457 void
   2458 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2459 {
   2460 	struct rf_paritymap_ondisk tmp;
   2461 	int c,first;
   2462 
   2463 	first=1;
   2464 	for (c = 0; c < raidPtr->numCol; c++) {
   2465 		/* Skip dead disks. */
   2466 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2467 			continue;
   2468 		raidread_component_area(raidPtr->Disks[c].dev,
   2469 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2470 		    RF_PARITYMAP_NBYTE,
   2471 		    rf_parity_map_offset(raidPtr),
   2472 		    rf_parity_map_size(raidPtr));
   2473 		if (first) {
   2474 			memcpy(map, &tmp, sizeof(*map));
   2475 			first = 0;
   2476 		} else {
   2477 			rf_paritymap_merge(map, &tmp);
   2478 		}
   2479 	}
   2480 }
   2481 
/*
 * Bump the mod counter and mark the component labels of all non-failed
 * components (including in-use spares) dirty on disk.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): if no column references this
			   spare, scol keeps its previous value (-1 on
			   the first iteration) — confirm that case
			   cannot occur here. */

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2541 
   2542 
/*
 * Bump the mod counter and rewrite the component labels of all optimal
 * components and in-use spares.  When `final' is
 * RF_FINAL_COMPONENT_UPDATE and parity is known good, the labels are
 * additionally marked clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2620 
   2621 void
   2622 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2623 {
   2624 
   2625 	if (vp != NULL) {
   2626 		if (auto_configured == 1) {
   2627 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2628 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2629 			vput(vp);
   2630 
   2631 		} else {
   2632 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2633 		}
   2634 	}
   2635 }
   2636 
   2637 
   2638 void
   2639 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2640 {
   2641 	int r,c;
   2642 	struct vnode *vp;
   2643 	int acd;
   2644 
   2645 
   2646 	/* We take this opportunity to close the vnodes like we should.. */
   2647 
   2648 	for (c = 0; c < raidPtr->numCol; c++) {
   2649 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2650 		acd = raidPtr->Disks[c].auto_configured;
   2651 		rf_close_component(raidPtr, vp, acd);
   2652 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2653 		raidPtr->Disks[c].auto_configured = 0;
   2654 	}
   2655 
   2656 	for (r = 0; r < raidPtr->numSpare; r++) {
   2657 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2658 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2659 		rf_close_component(raidPtr, vp, acd);
   2660 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2661 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2662 	}
   2663 }
   2664 
   2665 
   2666 void
   2667 rf_ReconThread(struct rf_recon_req_internal *req)
   2668 {
   2669 	int     s;
   2670 	RF_Raid_t *raidPtr;
   2671 
   2672 	s = splbio();
   2673 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2674 	raidPtr->recon_in_progress = 1;
   2675 
   2676 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2677 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2678 
   2679 	RF_Free(req, sizeof(*req));
   2680 
   2681 	raidPtr->recon_in_progress = 0;
   2682 	splx(s);
   2683 
   2684 	/* That's all... */
   2685 	kthread_exit(0);	/* does not return */
   2686 }
   2687 
   2688 void
   2689 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2690 {
   2691 	int retcode;
   2692 	int s;
   2693 
   2694 	raidPtr->parity_rewrite_stripes_done = 0;
   2695 	raidPtr->parity_rewrite_in_progress = 1;
   2696 	s = splbio();
   2697 	retcode = rf_RewriteParity(raidPtr);
   2698 	splx(s);
   2699 	if (retcode) {
   2700 		printf("raid%d: Error re-writing parity (%d)!\n",
   2701 		    raidPtr->raidid, retcode);
   2702 	} else {
   2703 		/* set the clean bit!  If we shutdown correctly,
   2704 		   the clean bit on each component label will get
   2705 		   set */
   2706 		raidPtr->parity_good = RF_RAID_CLEAN;
   2707 	}
   2708 	raidPtr->parity_rewrite_in_progress = 0;
   2709 
   2710 	/* Anyone waiting for us to stop?  If so, inform them... */
   2711 	if (raidPtr->waitShutdown) {
   2712 		rf_lock_mutex2(raidPtr->rad_lock);
   2713 		cv_broadcast(&raidPtr->parity_rewrite_cv);
   2714 		rf_unlock_mutex2(raidPtr->rad_lock);
   2715 	}
   2716 
   2717 	/* That's all... */
   2718 	kthread_exit(0);	/* does not return */
   2719 }
   2720 
   2721 
   2722 void
   2723 rf_CopybackThread(RF_Raid_t *raidPtr)
   2724 {
   2725 	int s;
   2726 
   2727 	raidPtr->copyback_in_progress = 1;
   2728 	s = splbio();
   2729 	rf_CopybackReconstructedData(raidPtr);
   2730 	splx(s);
   2731 	raidPtr->copyback_in_progress = 0;
   2732 
   2733 	/* That's all... */
   2734 	kthread_exit(0);	/* does not return */
   2735 }
   2736 
   2737 
   2738 void
   2739 rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
   2740 {
   2741 	int s;
   2742 	RF_Raid_t *raidPtr;
   2743 
   2744 	s = splbio();
   2745 	raidPtr = req->raidPtr;
   2746 	raidPtr->recon_in_progress = 1;
   2747 	rf_ReconstructInPlace(raidPtr, req->col);
   2748 	RF_Free(req, sizeof(*req));
   2749 	raidPtr->recon_in_progress = 0;
   2750 	splx(s);
   2751 
   2752 	/* That's all... */
   2753 	kthread_exit(0);	/* does not return */
   2754 }
   2755 
/*
 * Probe one candidate component during autoconfiguration: read its
 * component label and, if the label looks reasonable for a device of
 * this size, prepend a new RF_AutoConfig_t to `ac_list'.  Rejected
 * candidates have their vnode closed.  Returns the (possibly extended)
 * list, or NULL after freeing the entire list on allocation failure.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* out of memory: tear down everything collected so
		       far and give up on autoconfiguration */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2813 
/*
 * Scan every disk device in the system for RAIDframe components and
 * return a list of those found (NULL if none).  Each component found
 * is handed to rf_get_component(), which keeps its vnode open and
 * records it in the returned RF_AutoConfig_t list.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* only look at wedges explicitly typed as
				   RAIDframe components */
				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   3017 
   3018 
   3019 int
   3020 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3021 {
   3022 
   3023 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3024 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3025 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3026 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3027 	    clabel->row >=0 &&
   3028 	    clabel->column >= 0 &&
   3029 	    clabel->num_rows > 0 &&
   3030 	    clabel->num_columns > 0 &&
   3031 	    clabel->row < clabel->num_rows &&
   3032 	    clabel->column < clabel->num_columns &&
   3033 	    clabel->blockSize > 0 &&
   3034 	    /*
   3035 	     * numBlocksHi may contain garbage, but it is ok since
   3036 	     * the type is unsigned.  If it is really garbage,
   3037 	     * rf_fix_old_label_size() will fix it.
   3038 	     */
   3039 	    rf_component_label_numblocks(clabel) > 0) {
   3040 		/*
   3041 		 * label looks reasonable enough...
   3042 		 * let's make sure it has no old garbage.
   3043 		 */
   3044 		if (numsecs)
   3045 			rf_fix_old_label_size(clabel, numsecs);
   3046 		return(1);
   3047 	}
   3048 	return(0);
   3049 }
   3050 
   3051 
   3052 /*
   3053  * For reasons yet unknown, some old component labels have garbage in
   3054  * the newer numBlocksHi region, and this causes lossage.  Since those
   3055  * disks will also have numsecs set to less than 32 bits of sectors,
   3056  * we can determine when this corruption has occurred, and fix it.
   3057  *
   3058  * The exact same problem, with the same unknown reason, happens to
   3059  * the partitionSizeHi member as well.
   3060  */
   3061 static void
   3062 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3063 {
   3064 
   3065 	if (numsecs < ((uint64_t)1 << 32)) {
   3066 		if (clabel->numBlocksHi) {
   3067 			printf("WARNING: total sectors < 32 bits, yet "
   3068 			       "numBlocksHi set\n"
   3069 			       "WARNING: resetting numBlocksHi to zero.\n");
   3070 			clabel->numBlocksHi = 0;
   3071 		}
   3072 
   3073 		if (clabel->partitionSizeHi) {
   3074 			printf("WARNING: total sectors < 32 bits, yet "
   3075 			       "partitionSizeHi set\n"
   3076 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3077 			clabel->partitionSizeHi = 0;
   3078 		}
   3079 	}
   3080 }
   3081 
   3082 
   3083 #ifdef DEBUG
/*
 * DEBUG helper: dump the interesting fields of a component label to
 * the console.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* indexed by root_partition masked to 2 bits; 3 = out of range */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
   3115 #endif
   3116 
   3117 RF_ConfigSet_t *
   3118 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3119 {
   3120 	RF_AutoConfig_t *ac;
   3121 	RF_ConfigSet_t *config_sets;
   3122 	RF_ConfigSet_t *cset;
   3123 	RF_AutoConfig_t *ac_next;
   3124 
   3125 
   3126 	config_sets = NULL;
   3127 
   3128 	/* Go through the AutoConfig list, and figure out which components
   3129 	   belong to what sets.  */
   3130 	ac = ac_list;
   3131 	while(ac!=NULL) {
   3132 		/* we're going to putz with ac->next, so save it here
   3133 		   for use at the end of the loop */
   3134 		ac_next = ac->next;
   3135 
   3136 		if (config_sets == NULL) {
   3137 			/* will need at least this one... */
   3138 			config_sets = (RF_ConfigSet_t *)
   3139 				malloc(sizeof(RF_ConfigSet_t),
   3140 				       M_RAIDFRAME, M_NOWAIT);
   3141 			if (config_sets == NULL) {
   3142 				panic("rf_create_auto_sets: No memory!");
   3143 			}
   3144 			/* this one is easy :) */
   3145 			config_sets->ac = ac;
   3146 			config_sets->next = NULL;
   3147 			config_sets->rootable = 0;
   3148 			ac->next = NULL;
   3149 		} else {
   3150 			/* which set does this component fit into? */
   3151 			cset = config_sets;
   3152 			while(cset!=NULL) {
   3153 				if (rf_does_it_fit(cset, ac)) {
   3154 					/* looks like it matches... */
   3155 					ac->next = cset->ac;
   3156 					cset->ac = ac;
   3157 					break;
   3158 				}
   3159 				cset = cset->next;
   3160 			}
   3161 			if (cset==NULL) {
   3162 				/* didn't find a match above... new set..*/
   3163 				cset = (RF_ConfigSet_t *)
   3164 					malloc(sizeof(RF_ConfigSet_t),
   3165 					       M_RAIDFRAME, M_NOWAIT);
   3166 				if (cset == NULL) {
   3167 					panic("rf_create_auto_sets: No memory!");
   3168 				}
   3169 				cset->ac = ac;
   3170 				ac->next = NULL;
   3171 				cset->next = config_sets;
   3172 				cset->rootable = 0;
   3173 				config_sets = cset;
   3174 			}
   3175 		}
   3176 		ac = ac_next;
   3177 	}
   3178 
   3179 
   3180 	return(config_sets);
   3181 }
   3182 
   3183 static int
   3184 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3185 {
   3186 	RF_ComponentLabel_t *clabel1, *clabel2;
   3187 
   3188 	/* If this one matches the *first* one in the set, that's good
   3189 	   enough, since the other members of the set would have been
   3190 	   through here too... */
   3191 	/* note that we are not checking partitionSize here..
   3192 
   3193 	   Note that we are also not checking the mod_counters here.
   3194 	   If everything else matches except the mod_counter, that's
   3195 	   good enough for this test.  We will deal with the mod_counters
   3196 	   a little later in the autoconfiguration process.
   3197 
   3198 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3199 
   3200 	   The reason we don't check for this is that failed disks
   3201 	   will have lower modification counts.  If those disks are
   3202 	   not added to the set they used to belong to, then they will
   3203 	   form their own set, which may result in 2 different sets,
   3204 	   for example, competing to be configured at raid0, and
   3205 	   perhaps competing to be the root filesystem set.  If the
   3206 	   wrong ones get configured, or both attempt to become /,
   3207 	   weird behaviour and or serious lossage will occur.  Thus we
   3208 	   need to bring them into the fold here, and kick them out at
   3209 	   a later point.
   3210 
   3211 	*/
   3212 
   3213 	clabel1 = cset->ac->clabel;
   3214 	clabel2 = ac->clabel;
   3215 	if ((clabel1->version == clabel2->version) &&
   3216 	    (clabel1->serial_number == clabel2->serial_number) &&
   3217 	    (clabel1->num_rows == clabel2->num_rows) &&
   3218 	    (clabel1->num_columns == clabel2->num_columns) &&
   3219 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3220 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3221 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3222 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3223 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3224 	    (clabel1->blockSize == clabel2->blockSize) &&
   3225 	    rf_component_label_numblocks(clabel1) ==
   3226 	    rf_component_label_numblocks(clabel2) &&
   3227 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3228 	    (clabel1->root_partition == clabel2->root_partition) &&
   3229 	    (clabel1->last_unit == clabel2->last_unit) &&
   3230 	    (clabel1->config_order == clabel2->config_order)) {
   3231 		/* if it get's here, it almost *has* to be a match */
   3232 	} else {
   3233 		/* it's not consistent with somebody in the set..
   3234 		   punt */
   3235 		return(0);
   3236 	}
   3237 	/* all was fine.. it must fit... */
   3238 	return(1);
   3239 }
   3240 
/*
 * Decide whether the config set has enough live components to be
 * worth configuring.  Returns 1 if so, 0 if too many are missing.
 *
 * A component only counts as present in its column if it carries the
 * highest mod_counter seen in the set; stale (failed) components are
 * treated as missing.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set:
	   the maximum over all members. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* look for an up-to-date component in column c */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just finished the odd half of a
				   mirror pair without bailing out..
				   reset the even_pair_failed flag,
				   and go on to the next pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no failures; RAID 4/5 tolerate one. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3343 
   3344 void
   3345 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3346 			RF_Raid_t *raidPtr)
   3347 {
   3348 	RF_ComponentLabel_t *clabel;
   3349 	int i;
   3350 
   3351 	clabel = ac->clabel;
   3352 
   3353 	/* 1. Fill in the common stuff */
   3354 	config->numCol = clabel->num_columns;
   3355 	config->numSpare = 0; /* XXX should this be set here? */
   3356 	config->sectPerSU = clabel->sectPerSU;
   3357 	config->SUsPerPU = clabel->SUsPerPU;
   3358 	config->SUsPerRU = clabel->SUsPerRU;
   3359 	config->parityConfig = clabel->parityConfig;
   3360 	/* XXX... */
   3361 	strcpy(config->diskQueueType,"fifo");
   3362 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3363 	config->layoutSpecificSize = 0; /* XXX ?? */
   3364 
   3365 	while(ac!=NULL) {
   3366 		/* row/col values will be in range due to the checks
   3367 		   in reasonable_label() */
   3368 		strcpy(config->devnames[0][ac->clabel->column],
   3369 		       ac->devname);
   3370 		ac = ac->next;
   3371 	}
   3372 
   3373 	for(i=0;i<RF_MAXDBGV;i++) {
   3374 		config->debugVars[i][0] = 0;
   3375 	}
   3376 }
   3377 
   3378 int
   3379 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3380 {
   3381 	RF_ComponentLabel_t *clabel;
   3382 	int column;
   3383 	int sparecol;
   3384 
   3385 	raidPtr->autoconfigure = new_value;
   3386 
   3387 	for(column=0; column<raidPtr->numCol; column++) {
   3388 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3389 			clabel = raidget_component_label(raidPtr, column);
   3390 			clabel->autoconfigure = new_value;
   3391 			raidflush_component_label(raidPtr, column);
   3392 		}
   3393 	}
   3394 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3395 		sparecol = raidPtr->numCol + column;
   3396 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3397 			clabel = raidget_component_label(raidPtr, sparecol);
   3398 			clabel->autoconfigure = new_value;
   3399 			raidflush_component_label(raidPtr, sparecol);
   3400 		}
   3401 	}
   3402 	return(new_value);
   3403 }
   3404 
   3405 int
   3406 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3407 {
   3408 	RF_ComponentLabel_t *clabel;
   3409 	int column;
   3410 	int sparecol;
   3411 
   3412 	raidPtr->root_partition = new_value;
   3413 	for(column=0; column<raidPtr->numCol; column++) {
   3414 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3415 			clabel = raidget_component_label(raidPtr, column);
   3416 			clabel->root_partition = new_value;
   3417 			raidflush_component_label(raidPtr, column);
   3418 		}
   3419 	}
   3420 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3421 		sparecol = raidPtr->numCol + column;
   3422 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3423 			clabel = raidget_component_label(raidPtr, sparecol);
   3424 			clabel->root_partition = new_value;
   3425 			raidflush_component_label(raidPtr, sparecol);
   3426 		}
   3427 	}
   3428 	return(new_value);
   3429 }
   3430 
   3431 void
   3432 rf_release_all_vps(RF_ConfigSet_t *cset)
   3433 {
   3434 	RF_AutoConfig_t *ac;
   3435 
   3436 	ac = cset->ac;
   3437 	while(ac!=NULL) {
   3438 		/* Close the vp, and give it back */
   3439 		if (ac->vp) {
   3440 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3441 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3442 			vput(ac->vp);
   3443 			ac->vp = NULL;
   3444 		}
   3445 		ac = ac->next;
   3446 	}
   3447 }
   3448 
   3449 
   3450 void
   3451 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3452 {
   3453 	RF_AutoConfig_t *ac;
   3454 	RF_AutoConfig_t *next_ac;
   3455 
   3456 	ac = cset->ac;
   3457 	while(ac!=NULL) {
   3458 		next_ac = ac->next;
   3459 		/* nuke the label */
   3460 		free(ac->clabel, M_RAIDFRAME);
   3461 		/* cleanup the config structure */
   3462 		free(ac, M_RAIDFRAME);
   3463 		/* "next.." */
   3464 		ac = next_ac;
   3465 	}
   3466 	/* and, finally, nuke the config set */
   3467 	free(cset, M_RAIDFRAME);
   3468 }
   3469 
   3470 
/*
 * Fill in a component label from the current state of the RAID set.
 * Per-component fields (e.g. row/column, partition size) are not set
 * here.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* single row only */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3503 
/*
 * Configure one auto-detected RAID set.  Returns the softc on success,
 * NULL on failure.  Tries to reuse the unit number the set was last
 * configured at (from the component label); if that unit is busy,
 * probes upward for the next free one.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	raidID = cset->ac->clabel->last_unit;
	/* skip past units that already hold a valid configuration */
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: give the softc back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3587 
/*
 * Initialize one of RAIDframe's item pools: set the high water mark to
 * xmax, prime the pool with xmin preallocated items (panicking if that
 * fails), and set the low water mark to xmin.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	int error;

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	if ((error = pool_prime(p, xmin)) != 0)
		panic("%s: failed to prime pool: %d", __func__, error);
	pool_setlowat(p, xmin);
}
   3600 
   3601 /*
   3602  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3603  * to see if there is IO pending and if that IO could possibly be done
   3604  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3605  * otherwise.
   3606  *
   3607  */
   3608 int
   3609 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3610 {
   3611 	struct raid_softc *rs;
   3612 	struct dk_softc *dksc;
   3613 
   3614 	rs = raidPtr->softc;
   3615 	dksc = &rs->sc_dksc;
   3616 
   3617 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3618 		return 1;
   3619 
   3620 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3621 		/* there is work to do */
   3622 		return 0;
   3623 	}
   3624 	/* default is nothing to do */
   3625 	return 1;
   3626 }
   3627 
   3628 int
   3629 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3630 {
   3631 	uint64_t numsecs;
   3632 	unsigned secsize;
   3633 	int error;
   3634 
   3635 	error = getdisksize(vp, &numsecs, &secsize);
   3636 	if (error == 0) {
   3637 		diskPtr->blockSize = secsize;
   3638 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3639 		diskPtr->partitionSize = numsecs;
   3640 		return 0;
   3641 	}
   3642 	return error;
   3643 }
   3644 
/*
 * Autoconf match function: raid pseudo-devices always match.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3650 
/*
 * Autoconf attach function: intentionally empty; the device is set up
 * later, when a RAID set is actually configured.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3655 
   3656 
   3657 static int
   3658 raid_detach(device_t self, int flags)
   3659 {
   3660 	int error;
   3661 	struct raid_softc *rs = raidsoftc(self);
   3662 
   3663 	if (rs == NULL)
   3664 		return ENXIO;
   3665 
   3666 	if ((error = raidlock(rs)) != 0)
   3667 		return (error);
   3668 
   3669 	error = raid_detach_unlocked(rs);
   3670 
   3671 	raidunlock(rs);
   3672 
   3673 	/* XXX raid can be referenced here */
   3674 
   3675 	if (error)
   3676 		return error;
   3677 
   3678 	/* Free the softc */
   3679 	raidput(rs);
   3680 
   3681 	return 0;
   3682 }
   3683 
/*
 * Fill in the synthetic disk geometry for a configured RAID set and
 * register it with the disk(9) layer.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* fabricated track count; the geometry is synthetic anyway */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
   3699 
   3700 /*
   3701  * Get cache info for all the components (including spares).
   3702  * Returns intersection of all the cache flags of all disks, or first
   3703  * error if any encountered.
   3704  * XXXfua feature flags can change as spares are added - lock down somehow
   3705  */
   3706 static int
   3707 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
   3708 {
   3709 	int c;
   3710 	int error;
   3711 	int dkwhole = 0, dkpart;
   3712 
   3713 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
   3714 		/*
   3715 		 * Check any non-dead disk, even when currently being
   3716 		 * reconstructed.
   3717 		 */
   3718 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
   3719 		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
   3720 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
   3721 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
   3722 			if (error) {
   3723 				if (error != ENODEV) {
   3724 					printf("raid%d: get cache for component %s failed\n",
   3725 					    raidPtr->raidid,
   3726 					    raidPtr->Disks[c].devname);
   3727 				}
   3728 
   3729 				return error;
   3730 			}
   3731 
   3732 			if (c == 0)
   3733 				dkwhole = dkpart;
   3734 			else
   3735 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
   3736 		}
   3737 	}
   3738 
   3739 	*data = dkwhole;
   3740 
   3741 	return 0;
   3742 }
   3743 
   3744 /*
   3745  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3746  * We end up returning whatever error was returned by the first cache flush
   3747  * that fails.
   3748  */
   3749 
   3750 int
   3751 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3752 {
   3753 	int c, sparecol;
   3754 	int e,error;
   3755 	int force = 1;
   3756 
   3757 	error = 0;
   3758 	for (c = 0; c < raidPtr->numCol; c++) {
   3759 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3760 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3761 					  &force, FWRITE, NOCRED);
   3762 			if (e) {
   3763 				if (e != ENODEV)
   3764 					printf("raid%d: cache flush to component %s failed.\n",
   3765 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3766 				if (error == 0) {
   3767 					error = e;
   3768 				}
   3769 			}
   3770 		}
   3771 	}
   3772 
   3773 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3774 		sparecol = raidPtr->numCol + c;
   3775 		/* Need to ensure that the reconstruct actually completed! */
   3776 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3777 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3778 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3779 			if (e) {
   3780 				if (e != ENODEV)
   3781 					printf("raid%d: cache flush to component %s failed.\n",
   3782 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3783 				if (error == 0) {
   3784 					error = e;
   3785 				}
   3786 			}
   3787 		}
   3788 	}
   3789 	return error;
   3790 }
   3791 
   3792 /* Fill in info with the current status */
   3793 void
   3794 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3795 {
   3796 
   3797 	if (raidPtr->status != rf_rs_reconstructing) {
   3798 		info->total = 100;
   3799 		info->completed = 100;
   3800 	} else {
   3801 		info->total = raidPtr->reconControl->numRUsTotal;
   3802 		info->completed = raidPtr->reconControl->numRUsComplete;
   3803 	}
   3804 	info->remaining = info->total - info->completed;
   3805 }
   3806 
   3807 /* Fill in info with the current status */
   3808 void
   3809 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3810 {
   3811 
   3812 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3813 		info->total = raidPtr->Layout.numStripe;
   3814 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3815 	} else {
   3816 		info->completed = 100;
   3817 		info->total = 100;
   3818 	}
   3819 	info->remaining = info->total - info->completed;
   3820 }
   3821 
   3822 /* Fill in info with the current status */
   3823 void
   3824 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3825 {
   3826 
   3827 	if (raidPtr->copyback_in_progress == 1) {
   3828 		info->total = raidPtr->Layout.numStripe;
   3829 		info->completed = raidPtr->copyback_stripes_done;
   3830 		info->remaining = info->total - info->completed;
   3831 	} else {
   3832 		info->remaining = 0;
   3833 		info->completed = 100;
   3834 		info->total = 100;
   3835 	}
   3836 }
   3837 
   3838 /* Fill in config with the current info */
   3839 int
   3840 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3841 {
   3842 	int	d, i, j;
   3843 
   3844 	if (!raidPtr->valid)
   3845 		return (ENODEV);
   3846 	config->cols = raidPtr->numCol;
   3847 	config->ndevs = raidPtr->numCol;
   3848 	if (config->ndevs >= RF_MAX_DISKS)
   3849 		return (ENOMEM);
   3850 	config->nspares = raidPtr->numSpare;
   3851 	if (config->nspares >= RF_MAX_DISKS)
   3852 		return (ENOMEM);
   3853 	config->maxqdepth = raidPtr->maxQueueDepth;
   3854 	d = 0;
   3855 	for (j = 0; j < config->cols; j++) {
   3856 		config->devs[d] = raidPtr->Disks[j];
   3857 		d++;
   3858 	}
   3859 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3860 		config->spares[i] = raidPtr->Disks[j];
   3861 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3862 			/* XXX: raidctl(8) expects to see this as a used spare */
   3863 			config->spares[i].status = rf_ds_used_spare;
   3864 		}
   3865 	}
   3866 	return 0;
   3867 }
   3868 
   3869 int
   3870 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3871 {
   3872 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3873 	RF_ComponentLabel_t *raid_clabel;
   3874 	int column = clabel->column;
   3875 
   3876 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3877 		return EINVAL;
   3878 	raid_clabel = raidget_component_label(raidPtr, column);
   3879 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3880 
   3881 	return 0;
   3882 }
   3883 
/*
 * Module interface
 */

/* Declare the module; it depends on dk_subr and the FCFS buffer queue. */
MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");

#ifdef _MODULE
/* autoconf cfdriver for the raid pseudo-device (modular builds only) */
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

/* Forward declarations for the module command handlers below. */
static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);
   3897 
   3898 static int
   3899 raid_modcmd(modcmd_t cmd, void *data)
   3900 {
   3901 	int error;
   3902 
   3903 	error = 0;
   3904 	switch (cmd) {
   3905 	case MODULE_CMD_INIT:
   3906 		error = raid_modcmd_init();
   3907 		break;
   3908 	case MODULE_CMD_FINI:
   3909 		error = raid_modcmd_fini();
   3910 		break;
   3911 	default:
   3912 		error = ENOTTY;
   3913 		break;
   3914 	}
   3915 	return error;
   3916 }
   3917 
/*
 * One-time module initialization: attach the raid block/char devsw and
 * the autoconf glue, boot the RAIDframe core, and register a finalizer
 * that auto-configures RAID sets once device discovery is complete.
 * On any attach failure, previously attached pieces are rolled back.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Synchronization state for spare-table requests. */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 lets devsw_attach pick the device majors. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST is tolerated: the devsw may already be configured in. */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Roll back the devsw attach above. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Roll back both earlier attachments. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/*
	 * NOTE(review): error is necessarily 0 here -- every nonzero
	 * result above either returned early or was overwritten by the
	 * config_cfattach_attach() call -- so this guard is redundant.
	 */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		/* Non-fatal: the module is usable without autoconfig. */
		error = 0;
	}

	return error;
}
   3988 
/*
 * Module teardown: refuse to unload while any raid unit still exists,
 * then detach the autoconf glue and devsw in reverse order of
 * attachment.  If a detach step fails part-way, the already-detached
 * pieces are re-attached so the module remains usable.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Undo the cfattach detach so the module keeps working. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Re-attach everything detached so far, in reverse order. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* All interfaces are gone; shut down the RAIDframe core. */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	/* NOTE(review): assumes no other thread can still take raid_lock. */
	mutex_destroy(&raid_lock);

	return error;
}
   4038