Home | History | Annotate | Line # | Download | only in raidframe
rf_netbsdkintf.c revision 1.356
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.356 2018/01/23 22:42:29 pgoyette Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.356 2018/01/23 22:42:29 pgoyette Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_compat_netbsd32.h"
    109 #include "opt_raid_autoconfig.h"
    110 #endif
    111 
    112 #include <sys/param.h>
    113 #include <sys/errno.h>
    114 #include <sys/pool.h>
    115 #include <sys/proc.h>
    116 #include <sys/queue.h>
    117 #include <sys/disk.h>
    118 #include <sys/device.h>
    119 #include <sys/stat.h>
    120 #include <sys/ioctl.h>
    121 #include <sys/fcntl.h>
    122 #include <sys/systm.h>
    123 #include <sys/vnode.h>
    124 #include <sys/disklabel.h>
    125 #include <sys/conf.h>
    126 #include <sys/buf.h>
    127 #include <sys/bufq.h>
    128 #include <sys/reboot.h>
    129 #include <sys/kauth.h>
    130 #include <sys/module.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #ifdef COMPAT_50
    153 #include "rf_compat50.h"
    154 #endif
    155 
    156 #ifdef COMPAT_80
    157 #include "rf_compat80.h"
    158 #endif
    159 
    160 #ifdef COMPAT_NETBSD32
    161 #include "rf_compat32.h"
    162 #endif
    163 
    164 #include "ioconf.h"
    165 
    166 #ifdef DEBUG
    167 int     rf_kdebug_level = 0;
    168 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    169 #else				/* DEBUG */
    170 #define db1_printf(a) { }
    171 #endif				/* DEBUG */
    172 
    173 #ifdef DEBUG_ROOT
    174 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    175 #else
    176 #define DPRINTF(a, ...)
    177 #endif
    178 
    179 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    180 static rf_declare_mutex2(rf_sparet_wait_mutex);
    181 static rf_declare_cond2(rf_sparet_wait_cv);
    182 static rf_declare_cond2(rf_sparet_resp_cv);
    183 
    184 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    185 						 * spare table */
    186 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    187 						 * installation process */
    188 #endif
    189 
    190 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    191 
    192 /* prototypes */
    193 static void KernelWakeupFunc(struct buf *);
    194 static void InitBP(struct buf *, struct vnode *, unsigned,
    195     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    196     void *, int, struct proc *);
    197 struct raid_softc;
    198 static void raidinit(struct raid_softc *);
    199 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    200 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    201 
    202 static int raid_match(device_t, cfdata_t, void *);
    203 static void raid_attach(device_t, device_t, void *);
    204 static int raid_detach(device_t, int);
    205 
    206 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    207     daddr_t, daddr_t);
    208 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    209     daddr_t, daddr_t, int);
    210 
    211 static int raidwrite_component_label(unsigned,
    212     dev_t, struct vnode *, RF_ComponentLabel_t *);
    213 static int raidread_component_label(unsigned,
    214     dev_t, struct vnode *, RF_ComponentLabel_t *);
    215 
    216 static int raid_diskstart(device_t, struct buf *bp);
    217 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    218 static int raid_lastclose(device_t);
    219 
    220 static dev_type_open(raidopen);
    221 static dev_type_close(raidclose);
    222 static dev_type_read(raidread);
    223 static dev_type_write(raidwrite);
    224 static dev_type_ioctl(raidioctl);
    225 static dev_type_strategy(raidstrategy);
    226 static dev_type_dump(raiddump);
    227 static dev_type_size(raidsize);
    228 
    229 const struct bdevsw raid_bdevsw = {
    230 	.d_open = raidopen,
    231 	.d_close = raidclose,
    232 	.d_strategy = raidstrategy,
    233 	.d_ioctl = raidioctl,
    234 	.d_dump = raiddump,
    235 	.d_psize = raidsize,
    236 	.d_discard = nodiscard,
    237 	.d_flag = D_DISK
    238 };
    239 
    240 const struct cdevsw raid_cdevsw = {
    241 	.d_open = raidopen,
    242 	.d_close = raidclose,
    243 	.d_read = raidread,
    244 	.d_write = raidwrite,
    245 	.d_ioctl = raidioctl,
    246 	.d_stop = nostop,
    247 	.d_tty = notty,
    248 	.d_poll = nopoll,
    249 	.d_mmap = nommap,
    250 	.d_kqfilter = nokqfilter,
    251 	.d_discard = nodiscard,
    252 	.d_flag = D_DISK
    253 };
    254 
    255 static struct dkdriver rf_dkdriver = {
    256 	.d_open = raidopen,
    257 	.d_close = raidclose,
    258 	.d_strategy = raidstrategy,
    259 	.d_diskstart = raid_diskstart,
    260 	.d_dumpblocks = raid_dumpblocks,
    261 	.d_lastclose = raid_lastclose,
    262 	.d_minphys = minphys
    263 };
    264 
    265 struct raid_softc {
    266 	struct dk_softc sc_dksc;
    267 	int	sc_unit;
    268 	int     sc_flags;	/* flags */
    269 	int     sc_cflags;	/* configuration flags */
    270 	kmutex_t sc_mutex;	/* interlock mutex */
    271 	kcondvar_t sc_cv;	/* and the condvar */
    272 	uint64_t sc_size;	/* size of the raid device */
    273 	char    sc_xname[20];	/* XXX external name */
    274 	RF_Raid_t sc_r;
    275 	LIST_ENTRY(raid_softc) sc_link;
    276 };
    277 /* sc_flags */
    278 #define RAIDF_INITED		0x01	/* unit has been initialized */
    279 #define RAIDF_SHUTDOWN		0x02	/* unit is being shutdown */
    280 #define RAIDF_DETACH  		0x04	/* detach after final close */
    281 #define RAIDF_WANTED		0x08	/* someone waiting to obtain a lock */
    282 #define RAIDF_LOCKED		0x10	/* unit is locked */
    283 #define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */
    284 
    285 #define	raidunit(x)	DISKUNIT(x)
    286 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    287 
    288 extern struct cfdriver raid_cd;
    289 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    290     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    291     DVF_DETACH_SHUTDOWN);
    292 
    293 /* Internal representation of a rf_recon_req */
    294 struct rf_recon_req_internal {
    295 	RF_RowCol_t col;
    296 	RF_ReconReqFlags_t flags;
    297 	void   *raidPtr;
    298 };
    299 
    300 /*
    301  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    302  * Be aware that large numbers can allow the driver to consume a lot of
    303  * kernel memory, especially on writes, and in degraded mode reads.
    304  *
    305  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    306  * a single 64K write will typically require 64K for the old data,
    307  * 64K for the old parity, and 64K for the new parity, for a total
    308  * of 192K (if the parity buffer is not re-used immediately).
    309  * Even it if is used immediately, that's still 128K, which when multiplied
    310  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    311  *
    312  * Now in degraded mode, for example, a 64K read on the above setup may
    313  * require data reconstruction, which will require *all* of the 4 remaining
    314  * disks to participate -- 4 * 32K/disk == 128K again.
    315  */
    316 
    317 #ifndef RAIDOUTSTANDING
    318 #define RAIDOUTSTANDING   6
    319 #endif
    320 
    321 #define RAIDLABELDEV(dev)	\
    322 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    323 
    324 /* declared here, and made public, for the benefit of KVM stuff.. */
    325 
    326 static int raidlock(struct raid_softc *);
    327 static void raidunlock(struct raid_softc *);
    328 
    329 static int raid_detach_unlocked(struct raid_softc *);
    330 
    331 static void rf_markalldirty(RF_Raid_t *);
    332 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    333 
    334 void rf_ReconThread(struct rf_recon_req_internal *);
    335 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    336 void rf_CopybackThread(RF_Raid_t *raidPtr);
    337 void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    338 int rf_autoconfig(device_t);
    339 void rf_buildroothack(RF_ConfigSet_t *);
    340 
    341 RF_AutoConfig_t *rf_find_raid_components(void);
    342 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    343 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    344 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    345 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    346 int rf_set_autoconfig(RF_Raid_t *, int);
    347 int rf_set_rootpartition(RF_Raid_t *, int);
    348 void rf_release_all_vps(RF_ConfigSet_t *);
    349 void rf_cleanup_config_set(RF_ConfigSet_t *);
    350 int rf_have_enough_components(RF_ConfigSet_t *);
    351 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    352 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    353 
    354 /*
    355  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    356  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    357  * in the kernel config file.
    358  */
    359 #ifdef RAID_AUTOCONFIG
    360 int raidautoconfig = 1;
    361 #else
    362 int raidautoconfig = 0;
    363 #endif
    364 static bool raidautoconfigdone = false;
    365 
    366 struct RF_Pools_s rf_pools;
    367 
    368 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    369 static kmutex_t raid_lock;
    370 
    371 static struct raid_softc *
    372 raidcreate(int unit) {
    373 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    374 	sc->sc_unit = unit;
    375 	cv_init(&sc->sc_cv, "raidunit");
    376 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    377 	return sc;
    378 }
    379 
    380 static void
    381 raiddestroy(struct raid_softc *sc) {
    382 	cv_destroy(&sc->sc_cv);
    383 	mutex_destroy(&sc->sc_mutex);
    384 	kmem_free(sc, sizeof(*sc));
    385 }
    386 
    387 static struct raid_softc *
    388 raidget(int unit, bool create) {
    389 	struct raid_softc *sc;
    390 	if (unit < 0) {
    391 #ifdef DIAGNOSTIC
    392 		panic("%s: unit %d!", __func__, unit);
    393 #endif
    394 		return NULL;
    395 	}
    396 	mutex_enter(&raid_lock);
    397 	LIST_FOREACH(sc, &raids, sc_link) {
    398 		if (sc->sc_unit == unit) {
    399 			mutex_exit(&raid_lock);
    400 			return sc;
    401 		}
    402 	}
    403 	mutex_exit(&raid_lock);
    404 	if (!create)
    405 		return NULL;
    406 	if ((sc = raidcreate(unit)) == NULL)
    407 		return NULL;
    408 	mutex_enter(&raid_lock);
    409 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    410 	mutex_exit(&raid_lock);
    411 	return sc;
    412 }
    413 
/*
 * Unlink a softc from the global raid list and free it.  The caller
 * must guarantee no other references to the softc remain.
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    421 
void
raidattach(int num)
{

	/*
	 * Pseudo-device attach hook required by config glue;
	 * intentionally empty.  Device attachment and associated
	 * initialization now occurs as part of the module
	 * initialization.
	 */
}
    431 
    432 int
    433 rf_autoconfig(device_t self)
    434 {
    435 	RF_AutoConfig_t *ac_list;
    436 	RF_ConfigSet_t *config_sets;
    437 
    438 	if (!raidautoconfig || raidautoconfigdone == true)
    439 		return (0);
    440 
    441 	/* XXX This code can only be run once. */
    442 	raidautoconfigdone = true;
    443 
    444 #ifdef __HAVE_CPU_BOOTCONF
    445 	/*
    446 	 * 0. find the boot device if needed first so we can use it later
    447 	 * this needs to be done before we autoconfigure any raid sets,
    448 	 * because if we use wedges we are not going to be able to open
    449 	 * the boot device later
    450 	 */
    451 	if (booted_device == NULL)
    452 		cpu_bootconf();
    453 #endif
    454 	/* 1. locate all RAID components on the system */
    455 	aprint_debug("Searching for RAID components...\n");
    456 	ac_list = rf_find_raid_components();
    457 
    458 	/* 2. Sort them into their respective sets. */
    459 	config_sets = rf_create_auto_sets(ac_list);
    460 
    461 	/*
    462 	 * 3. Evaluate each set and configure the valid ones.
    463 	 * This gets done in rf_buildroothack().
    464 	 */
    465 	rf_buildroothack(config_sets);
    466 
    467 	return 1;
    468 }
    469 
    470 static int
    471 rf_containsboot(RF_Raid_t *r, device_t bdv) {
    472 	const char *bootname = device_xname(bdv);
    473 	size_t len = strlen(bootname);
    474 
    475 	for (int col = 0; col < r->numCol; col++) {
    476 		const char *devname = r->Disks[col].devname;
    477 		devname += sizeof("/dev/") - 1;
    478 		if (strncmp(devname, "dk", 2) == 0) {
    479 			const char *parent =
    480 			    dkwedge_get_parent_name(r->Disks[col].dev);
    481 			if (parent != NULL)
    482 				devname = parent;
    483 		}
    484 		if (strncmp(devname, bootname, len) == 0) {
    485 			struct raid_softc *sc = r->softc;
    486 			aprint_debug("raid%d includes boot device %s\n",
    487 			    sc->sc_unit, devname);
    488 			return 1;
    489 		}
    490 	}
    491 	return 0;
    492 }
    493 
    494 void
    495 rf_buildroothack(RF_ConfigSet_t *config_sets)
    496 {
    497 	RF_ConfigSet_t *cset;
    498 	RF_ConfigSet_t *next_cset;
    499 	int num_root;
    500 	struct raid_softc *sc, *rsc;
    501 	struct dk_softc *dksc;
    502 
    503 	sc = rsc = NULL;
    504 	num_root = 0;
    505 	cset = config_sets;
    506 	while (cset != NULL) {
    507 		next_cset = cset->next;
    508 		if (rf_have_enough_components(cset) &&
    509 		    cset->ac->clabel->autoconfigure == 1) {
    510 			sc = rf_auto_config_set(cset);
    511 			if (sc != NULL) {
    512 				aprint_debug("raid%d: configured ok\n",
    513 				    sc->sc_unit);
    514 				if (cset->rootable) {
    515 					rsc = sc;
    516 					num_root++;
    517 				}
    518 			} else {
    519 				/* The autoconfig didn't work :( */
    520 				aprint_debug("Autoconfig failed\n");
    521 				rf_release_all_vps(cset);
    522 			}
    523 		} else {
    524 			/* we're not autoconfiguring this set...
    525 			   release the associated resources */
    526 			rf_release_all_vps(cset);
    527 		}
    528 		/* cleanup */
    529 		rf_cleanup_config_set(cset);
    530 		cset = next_cset;
    531 	}
    532 	dksc = &rsc->sc_dksc;
    533 
    534 	/* if the user has specified what the root device should be
    535 	   then we don't touch booted_device or boothowto... */
    536 
    537 	if (rootspec != NULL)
    538 		return;
    539 
    540 	/* we found something bootable... */
    541 
    542 	/*
    543 	 * XXX: The following code assumes that the root raid
    544 	 * is the first ('a') partition. This is about the best
    545 	 * we can do with a BSD disklabel, but we might be able
    546 	 * to do better with a GPT label, by setting a specified
    547 	 * attribute to indicate the root partition. We can then
    548 	 * stash the partition number in the r->root_partition
    549 	 * high bits (the bottom 2 bits are already used). For
    550 	 * now we just set booted_partition to 0 when we override
    551 	 * root.
    552 	 */
    553 	if (num_root == 1) {
    554 		device_t candidate_root;
    555 		if (dksc->sc_dkdev.dk_nwedges != 0) {
    556 			char cname[sizeof(cset->ac->devname)];
    557 			/* XXX: assume partition 'a' first */
    558 			snprintf(cname, sizeof(cname), "%s%c",
    559 			    device_xname(dksc->sc_dev), 'a');
    560 			candidate_root = dkwedge_find_by_wname(cname);
    561 			DPRINTF("%s: candidate wedge root=%s\n", __func__,
    562 			    cname);
    563 			if (candidate_root == NULL) {
    564 				/*
    565 				 * If that is not found, because we don't use
    566 				 * disklabel, return the first dk child
    567 				 * XXX: we can skip the 'a' check above
    568 				 * and always do this...
    569 				 */
    570 				size_t i = 0;
    571 				candidate_root = dkwedge_find_by_parent(
    572 				    device_xname(dksc->sc_dev), &i);
    573 			}
    574 			DPRINTF("%s: candidate wedge root=%p\n", __func__,
    575 			    candidate_root);
    576 		} else
    577 			candidate_root = dksc->sc_dev;
    578 		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
    579 		DPRINTF("%s: booted_device=%p root_partition=%d "
    580 		   "contains_boot=%d\n", __func__, booted_device,
    581 		   rsc->sc_r.root_partition,
    582 		   rf_containsboot(&rsc->sc_r, booted_device));
    583 		if (booted_device == NULL ||
    584 		    rsc->sc_r.root_partition == 1 ||
    585 		    rf_containsboot(&rsc->sc_r, booted_device)) {
    586 			booted_device = candidate_root;
    587 			booted_method = "raidframe/single";
    588 			booted_partition = 0;	/* XXX assume 'a' */
    589 		}
    590 	} else if (num_root > 1) {
    591 		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
    592 		    booted_device);
    593 
    594 		/*
    595 		 * Maybe the MD code can help. If it cannot, then
    596 		 * setroot() will discover that we have no
    597 		 * booted_device and will ask the user if nothing was
    598 		 * hardwired in the kernel config file
    599 		 */
    600 		if (booted_device == NULL)
    601 			return;
    602 
    603 		num_root = 0;
    604 		mutex_enter(&raid_lock);
    605 		LIST_FOREACH(sc, &raids, sc_link) {
    606 			RF_Raid_t *r = &sc->sc_r;
    607 			if (r->valid == 0)
    608 				continue;
    609 
    610 			if (r->root_partition == 0)
    611 				continue;
    612 
    613 			if (rf_containsboot(r, booted_device)) {
    614 				num_root++;
    615 				rsc = sc;
    616 				dksc = &rsc->sc_dksc;
    617 			}
    618 		}
    619 		mutex_exit(&raid_lock);
    620 
    621 		if (num_root == 1) {
    622 			booted_device = dksc->sc_dev;
    623 			booted_method = "raidframe/multi";
    624 			booted_partition = 0;	/* XXX assume 'a' */
    625 		} else {
    626 			/* we can't guess.. require the user to answer... */
    627 			boothowto |= RB_ASKNAME;
    628 		}
    629 	}
    630 }
    631 
    632 static int
    633 raidsize(dev_t dev)
    634 {
    635 	struct raid_softc *rs;
    636 	struct dk_softc *dksc;
    637 	unsigned int unit;
    638 
    639 	unit = raidunit(dev);
    640 	if ((rs = raidget(unit, false)) == NULL)
    641 		return -1;
    642 	dksc = &rs->sc_dksc;
    643 
    644 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    645 		return -1;
    646 
    647 	return dk_size(dksc, dev);
    648 }
    649 
    650 static int
    651 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    652 {
    653 	unsigned int unit;
    654 	struct raid_softc *rs;
    655 	struct dk_softc *dksc;
    656 
    657 	unit = raidunit(dev);
    658 	if ((rs = raidget(unit, false)) == NULL)
    659 		return ENXIO;
    660 	dksc = &rs->sc_dksc;
    661 
    662 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    663 		return ENODEV;
    664 
    665         /*
    666            Note that blkno is relative to this particular partition.
    667            By adding adding RF_PROTECTED_SECTORS, we get a value that
    668 	   is relative to the partition used for the underlying component.
    669         */
    670 	blkno += RF_PROTECTED_SECTORS;
    671 
    672 	return dk_dump(dksc, dev, blkno, va, size);
    673 }
    674 
/*
 * dk(9) dumpblocks callback: write nblk blocks from va at blkno to a
 * live component of this set.  Only RAID 1 sets (one data column, one
 * parity column) are supported.  Returns 0 on success, EINVAL if the
 * layout is unsupported or no live component exists, ENXIO if the
 * chosen component's block device cannot be found, or a raidlock()/
 * d_dump error.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	/* first pass: any optimal (live) column wins immediately */
	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			/* find which column this spare is standing in for */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* dump directly through the component's block driver */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    780 
    781 /* ARGSUSED */
    782 static int
    783 raidopen(dev_t dev, int flags, int fmt,
    784     struct lwp *l)
    785 {
    786 	int     unit = raidunit(dev);
    787 	struct raid_softc *rs;
    788 	struct dk_softc *dksc;
    789 	int     error = 0;
    790 	int     part, pmask;
    791 
    792 	if ((rs = raidget(unit, true)) == NULL)
    793 		return ENXIO;
    794 	if ((error = raidlock(rs)) != 0)
    795 		return (error);
    796 
    797 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    798 		error = EBUSY;
    799 		goto bad;
    800 	}
    801 
    802 	dksc = &rs->sc_dksc;
    803 
    804 	part = DISKPART(dev);
    805 	pmask = (1 << part);
    806 
    807 	if (!DK_BUSY(dksc, pmask) &&
    808 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    809 		/* First one... mark things as dirty... Note that we *MUST*
    810 		 have done a configure before this.  I DO NOT WANT TO BE
    811 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    812 		 THAT THEY BELONG TOGETHER!!!!! */
    813 		/* XXX should check to see if we're only open for reading
    814 		   here... If so, we needn't do this, but then need some
    815 		   other way of keeping track of what's happened.. */
    816 
    817 		rf_markalldirty(&rs->sc_r);
    818 	}
    819 
    820 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    821 		error = dk_open(dksc, dev, flags, fmt, l);
    822 
    823 bad:
    824 	raidunlock(rs);
    825 
    826 	return (error);
    827 
    828 
    829 }
    830 
    831 static int
    832 raid_lastclose(device_t self)
    833 {
    834 	struct raid_softc *rs = raidsoftc(self);
    835 
    836 	/* Last one... device is not unconfigured yet.
    837 	   Device shutdown has taken care of setting the
    838 	   clean bits if RAIDF_INITED is not set
    839 	   mark things as clean... */
    840 
    841 	rf_update_component_labels(&rs->sc_r,
    842 	    RF_FINAL_COMPONENT_UPDATE);
    843 
    844 	/* pass to unlocked code */
    845 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    846 		rs->sc_flags |= RAIDF_DETACH;
    847 
    848 	return 0;
    849 }
    850 
/* ARGSUSED */
/*
 * Block/character device close.  Delegates to dk_close() for
 * configured units; afterwards, outside the unit lock, either
 * detaches the pseudo-device (if raid_lastclose() requested it via
 * RAIDF_DETACH) or frees the softc of a never-configured unit that
 * is shutting down.
 */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int     error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		/* raid_lastclose() may have requested a detach */
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	/* drop the lock before config_detach()/raidput() */
	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		/* unit was never configured; just release the softc */
		raidput(rs);
	}

	return (error);

}
    890 
/*
 * Wake whoever is sleeping on the RAID set's iodone condition
 * variable (the RAIDframe I/O thread) so that newly queued or
 * newly completed work is picked up.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
    898 
    899 static void
    900 raidstrategy(struct buf *bp)
    901 {
    902 	unsigned int unit;
    903 	struct raid_softc *rs;
    904 	struct dk_softc *dksc;
    905 	RF_Raid_t *raidPtr;
    906 
    907 	unit = raidunit(bp->b_dev);
    908 	if ((rs = raidget(unit, false)) == NULL) {
    909 		bp->b_error = ENXIO;
    910 		goto fail;
    911 	}
    912 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    913 		bp->b_error = ENXIO;
    914 		goto fail;
    915 	}
    916 	dksc = &rs->sc_dksc;
    917 	raidPtr = &rs->sc_r;
    918 
    919 	/* Queue IO only */
    920 	if (dk_strategy_defer(dksc, bp))
    921 		goto done;
    922 
    923 	/* schedule the IO to happen at the next convenient time */
    924 	raid_wakeup(raidPtr);
    925 
    926 done:
    927 	return;
    928 
    929 fail:
    930 	bp->b_resid = bp->b_bcount;
    931 	biodone(bp);
    932 }
    933 
    934 static int
    935 raid_diskstart(device_t dev, struct buf *bp)
    936 {
    937 	struct raid_softc *rs = raidsoftc(dev);
    938 	RF_Raid_t *raidPtr;
    939 
    940 	raidPtr = &rs->sc_r;
    941 	if (!raidPtr->valid) {
    942 		db1_printf(("raid is not valid..\n"));
    943 		return ENODEV;
    944 	}
    945 
    946 	/* XXX */
    947 	bp->b_resid = 0;
    948 
    949 	return raiddoaccess(raidPtr, bp);
    950 }
    951 
    952 void
    953 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
    954 {
    955 	struct raid_softc *rs;
    956 	struct dk_softc *dksc;
    957 
    958 	rs = raidPtr->softc;
    959 	dksc = &rs->sc_dksc;
    960 
    961 	dk_done(dksc, bp);
    962 
    963 	rf_lock_mutex2(raidPtr->mutex);
    964 	raidPtr->openings++;
    965 	rf_unlock_mutex2(raidPtr->mutex);
    966 
    967 	/* schedule more IO */
    968 	raid_wakeup(raidPtr);
    969 }
    970 
    971 /* ARGSUSED */
    972 static int
    973 raidread(dev_t dev, struct uio *uio, int flags)
    974 {
    975 	int     unit = raidunit(dev);
    976 	struct raid_softc *rs;
    977 
    978 	if ((rs = raidget(unit, false)) == NULL)
    979 		return ENXIO;
    980 
    981 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    982 		return (ENXIO);
    983 
    984 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    985 
    986 }
    987 
    988 /* ARGSUSED */
    989 static int
    990 raidwrite(dev_t dev, struct uio *uio, int flags)
    991 {
    992 	int     unit = raidunit(dev);
    993 	struct raid_softc *rs;
    994 
    995 	if ((rs = raidget(unit, false)) == NULL)
    996 		return ENXIO;
    997 
    998 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    999 		return (ENXIO);
   1000 
   1001 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
   1002 
   1003 }
   1004 
/*
 * Tear down a configured RAID set.  Must be called without the softc
 * lock held (rf_Shutdown() and the disk-detach routines may sleep).
 * Returns EBUSY while the device is open or a background operation
 * (recon, parity rewrite, copyback) is running; returns 0 if the
 * unit was never configured.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse while open or while background operations run. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	/* Shutdown is now in progress; clear the request flag. */
	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1042 
/*
 * Ioctl entry point for the RAIDframe pseudo-device.  Handles the
 * RAIDframe-private commands (configure, shutdown, rebuild, spare
 * management, status queries, ...) and falls through to dk_ioctl()
 * for the generic disk ioctls.  "data" points at kernel memory that
 * sys_ioctl() has already copied in/out per the command encoding.
 */
static int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int     unit = raidunit(dev);
	int     error = 0;
	int     part, pmask;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Config_t *k_cfg, *u_cfg;
	RF_Raid_t *raidPtr;
	RF_RaidDisk_t *diskPtr;
	RF_AccTotals_t *totals;
	RF_DeviceConfig_t *d_cfg, *ucfgp;
	u_char *specific_buf;
	int retcode = 0;
	int column;
/*	int raidid; */
	struct rf_recon_req *rr;
	struct rf_recon_req_internal *rrint;
	RF_ComponentLabel_t *clabel;
	RF_ComponentLabel_t *ci_label;
	RF_SingleComponent_t *sparePtr,*componentPtr;
	RF_SingleComponent_t component;
	int d;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
		(int) DISKPART(dev), (int) unit, cmd));

	/* Must be initialized for these... */
	switch (cmd) {
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_ROOT:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
#ifdef COMPAT_50
	case RAIDFRAME_GET_INFO50:
#endif
#ifdef COMPAT_80
	case RAIDFRAME_CHECK_RECON_STATUS_EXT80:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT80:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT80:
	case RAIDFRAME_GET_INFO80:
	case RAIDFRAME_GET_COMPONENT_LABEL80:
#endif
#ifdef COMPAT_NETBSD32
#ifdef _LP64
	case RAIDFRAME_GET_INFO32:
#endif
#endif
		if ((rs->sc_flags & RAIDF_INITED) == 0)
			return (ENXIO);
	}

	switch (cmd) {
#ifdef COMPAT_50
	case RAIDFRAME_GET_INFO50:
		return rf_get_info50(raidPtr, data);

	case RAIDFRAME_CONFIGURE50:
		/* compat shim copies in the old config, then joins the
		   common configuration path below */
		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
			return retcode;
		goto config;
#endif

#ifdef COMPAT_80
	case RAIDFRAME_CHECK_RECON_STATUS_EXT80:
		return rf_check_recon_status_ext80(raidPtr, data);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT80:
		return rf_check_parityrewrite_status_ext80(raidPtr, data);

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT80:
		return rf_check_copyback_status_ext80(raidPtr, data);

	case RAIDFRAME_GET_INFO80:
		return rf_get_info80(raidPtr, data);

	case RAIDFRAME_GET_COMPONENT_LABEL80:
		return rf_get_component_label80(raidPtr, data);

	case RAIDFRAME_CONFIGURE80:
		if ((retcode = rf_config80(raidPtr, unit, data, &k_cfg)) != 0)
			return retcode;
		goto config;
#endif

		/* configure the system */
	case RAIDFRAME_CONFIGURE:
#ifdef COMPAT_NETBSD32
#ifdef _LP64
	case RAIDFRAME_CONFIGURE32:
#endif
#endif

		if (raidPtr->valid) {
			/* There is a valid RAID set running on this unit! */
			printf("raid%d: Device already configured!\n",unit);
			return(EINVAL);
		}

		/* copy-in the configuration information */
		/* data points to a pointer to the configuration structure */

		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
		if (k_cfg == NULL) {
			return (ENOMEM);
		}
#ifdef COMPAT_NETBSD32
#ifdef _LP64
		/* 32-bit userland on a 64-bit kernel needs a layout fixup */
		if (cmd == RAIDFRAME_CONFIGURE32 &&
		    (l->l_proc->p_flag & PK_32) != 0)
			retcode = rf_config_netbsd32(data, k_cfg);
		else
#endif
#endif
		{
			u_cfg = *((RF_Config_t **) data);
			retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
		}
		if (retcode) {
			RF_Free(k_cfg, sizeof(RF_Config_t));
			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
				retcode));
			goto no_config;
		}
		goto config;
	config:
		/* common configuration path, also entered from the
		   compat CONFIGURE ioctls above */
		rs->sc_flags &= ~RAIDF_SHUTDOWN;

		/* allocate a buffer for the layout-specific data, and copy it
		 * in */
		if (k_cfg->layoutSpecificSize) {
			if (k_cfg->layoutSpecificSize > 10000) {
				/* sanity check */
				RF_Free(k_cfg, sizeof(RF_Config_t));
				retcode = EINVAL;
				goto no_config;
			}
			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
			    (u_char *));
			if (specific_buf == NULL) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				retcode = ENOMEM;
				goto no_config;
			}
			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
			    k_cfg->layoutSpecificSize);
			if (retcode) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				RF_Free(specific_buf,
					k_cfg->layoutSpecificSize);
				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
					retcode));
				goto no_config;
			}
		} else
			specific_buf = NULL;
		k_cfg->layoutSpecific = specific_buf;

		/* should do some kind of sanity check on the configuration.
		 * Store the sum of all the bytes in the last byte? */

		/* configure the system */

		/*
		 * Clear the entire RAID descriptor, just to make sure
		 *  there is no stale data left in the case of a
		 *  reconfiguration
		 */
		memset(raidPtr, 0, sizeof(*raidPtr));
		raidPtr->softc = rs;
		raidPtr->raidid = unit;

		retcode = rf_Configure(raidPtr, k_cfg, NULL);

		if (retcode == 0) {

			/* allow this many simultaneous IO's to
			   this RAID device */
			raidPtr->openings = RAIDOUTSTANDING;

			raidinit(rs);
			raid_wakeup(raidPtr);
			rf_markalldirty(raidPtr);
		}
		/* free the buffers.  No return code here. */
		if (k_cfg->layoutSpecificSize) {
			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
		}
		RF_Free(k_cfg, sizeof(RF_Config_t));

	no_config:
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		if (retcode != 0)
			rs->sc_flags |= RAIDF_SHUTDOWN;
		return (retcode);

		/* shutdown the system */
	case RAIDFRAME_SHUTDOWN:

		part = DISKPART(dev);
		pmask = (1 << part);

		if ((error = raidlock(rs)) != 0)
			return (error);

		/* shutting down is refused while open or while a
		   background operation is in flight */
		if (DK_BUSY(dksc, pmask) ||
		    raidPtr->recon_in_progress != 0 ||
		    raidPtr->parity_rewrite_in_progress != 0 ||
		    raidPtr->copyback_in_progress != 0)
			retcode = EBUSY;
		else {
			/* detach and free on close */
			rs->sc_flags |= RAIDF_SHUTDOWN;
			retcode = 0;
		}

		raidunlock(rs);

		return (retcode);
	case RAIDFRAME_GET_COMPONENT_LABEL:
		return rf_get_component_label(raidPtr, data);

#if 0
	case RAIDFRAME_SET_COMPONENT_LABEL:
		clabel = (RF_ComponentLabel_t *) data;

		/* XXX check the label for valid stuff... */
		/* Note that some things *should not* get modified --
		   the user should be re-initing the labels instead of
		   trying to patch things.
		   */

		raidid = raidPtr->raidid;
#ifdef DEBUG
		printf("raid%d: Got component label:\n", raidid);
		printf("raid%d: Version: %d\n", raidid, clabel->version);
		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
		printf("raid%d: Column: %d\n", raidid, clabel->column);
		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
		printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif
		clabel->row = 0;
		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		/* XXX this isn't allowed to do anything for now :-) */

		/* XXX and before it is, we need to fill in the rest
		   of the fields!?!?!?! */
		memcpy(raidget_component_label(raidPtr, column),
		    clabel, sizeof(*clabel));
		raidflush_component_label(raidPtr, column);
		return (0);
#endif

	case RAIDFRAME_INIT_LABELS:
		clabel = (RF_ComponentLabel_t *) data;
		/*
		   we only want the serial number from
		   the above.  We get all the rest of the information
		   from the config that was used to create this RAID
		   set.
		   */

		raidPtr->serial_number = clabel->serial_number;

		for(column=0;column<raidPtr->numCol;column++) {
			diskPtr = &raidPtr->Disks[column];
			if (!RF_DEAD_DISK(diskPtr->status)) {
				ci_label = raidget_component_label(raidPtr,
				    column);
				/* Zeroing this is important. */
				memset(ci_label, 0, sizeof(*ci_label));
				raid_init_component_label(raidPtr, ci_label);
				ci_label->serial_number =
				    raidPtr->serial_number;
				ci_label->row = 0; /* we don't pretend to support more */
				rf_component_label_set_partitionsize(ci_label,
				    diskPtr->partitionSize);
				ci_label->column = column;
				raidflush_component_label(raidPtr, column);
			}
			/* XXXjld what about the spares? */
		}

		return (retcode);
	case RAIDFRAME_SET_AUTOCONFIG:
		d = rf_set_autoconfig(raidPtr, *(int *) data);
		printf("raid%d: New autoconfig value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

	case RAIDFRAME_SET_ROOT:
		d = rf_set_rootpartition(raidPtr, *(int *) data);
		printf("raid%d: New rootpartition value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

		/* initialize all parity */
	case RAIDFRAME_REWRITEPARITY:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Parity for RAID 0 is trivially correct */
			raidPtr->parity_good = RF_RAID_CLEAN;
			return(0);
		}

		if (raidPtr->parity_rewrite_in_progress == 1) {
			/* Re-write is already in progress! */
			return(EINVAL);
		}

		/* the rewrite runs asynchronously in its own kthread */
		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
					   rf_RewriteParityThread,
					   raidPtr,"raid_parity");
		return (retcode);


	case RAIDFRAME_ADD_HOT_SPARE:
		sparePtr = (RF_SingleComponent_t *) data;
		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
		retcode = rf_add_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REMOVE_HOT_SPARE:
		/* not implemented; returns the initial retcode (0) */
		return(retcode);

	case RAIDFRAME_DELETE_COMPONENT:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_delete_component(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_incorporate_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REBUILD_IN_PLACE:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->recon_in_progress == 1) {
			/* a reconstruct is already in progress! */
			return(EINVAL);
		}

		componentPtr = (RF_SingleComponent_t *) data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		component.row = 0; /* we don't support any more */
		column = component.column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		/* validate the target disk's state under the mutex
		   before committing to a rebuild */
		rf_lock_mutex2(raidPtr->mutex);
		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
		    (raidPtr->numFailures > 0)) {
			/* XXX 0 above shouldn't be constant!!! */
			/* some component other than this has failed.
			   Let's not make things worse than they already
			   are... */
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:     Col: %d   Too many failures.\n",
			       raidPtr->raidid, column);
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status ==
		    rf_ds_reconstructing) {
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);

			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status == rf_ds_spared) {
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		/* the recon thread frees rrint when it finishes */
		RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
		if (rrint == NULL)
			return(ENOMEM);

		rrint->col = column;
		rrint->raidPtr = raidPtr;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconstructInPlaceThread,
					   rrint, "raid_reconip");
		return(retcode);

	case RAIDFRAME_GET_INFO:
#ifdef COMPAT_NETBSD32
#ifdef _LP64
	case RAIDFRAME_GET_INFO32:
#endif
#endif
		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
			  (RF_DeviceConfig_t *));
		if (d_cfg == NULL)
			return (ENOMEM);
		retcode = rf_get_info(raidPtr, d_cfg);
		if (retcode == 0) {
#ifdef COMPAT_NETBSD32
#ifdef _LP64
			/* 32-bit callers pass a 32-bit user pointer */
			if (cmd == RAIDFRAME_GET_INFO32)
				ucfgp = NETBSD32PTR64(*(netbsd32_pointer_t *)data);
			else
#endif
#endif
				ucfgp = *(RF_DeviceConfig_t **)data;
			retcode = copyout(d_cfg, ucfgp, sizeof(RF_DeviceConfig_t));
		}
		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));

		return (retcode);

	case RAIDFRAME_CHECK_PARITY:
		*(int *) data = raidPtr->parity_good;
		return (0);

	case RAIDFRAME_PARITYMAP_STATUS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_status(raidPtr->parity_map,
		    (struct rf_pmstat *)data);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		if (raidPtr->parity_map == NULL)
			return ENOENT; /* ??? */
		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
			(struct rf_pmparams *)data, 1))
			return EINVAL;
		return 0;

	case RAIDFRAME_PARITYMAP_GET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		*(int *) data = rf_paritymap_get_disable(raidPtr);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_set_disable(raidPtr, *(int *)data);
		/* XXX should errors be passed up? */
		return 0;

	case RAIDFRAME_RESET_ACCTOTALS:
		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
		return (0);

	case RAIDFRAME_GET_ACCTOTALS:
		totals = (RF_AccTotals_t *) data;
		*totals = raidPtr->acc_totals;
		return (0);

	case RAIDFRAME_KEEP_ACCTOTALS:
		raidPtr->keep_acc_totals = *(int *)data;
		return (0);

	case RAIDFRAME_GET_SIZE:
		*(int *) data = raidPtr->totalSectors;
		return (0);

		/* fail a disk & optionally start reconstruction */
	case RAIDFRAME_FAIL_DISK:
#ifdef COMPAT_80
	case RAIDFRAME_FAIL_DISK80:
#endif

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		rr = (struct rf_recon_req *) data;
		if (rr->col < 0 || rr->col >= raidPtr->numCol)
			return (EINVAL);

		rf_lock_mutex2(raidPtr->mutex);
		if (raidPtr->status == rf_rs_reconstructing) {
			/* you can't fail a disk while we're reconstructing! */
			/* XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if ((raidPtr->Disks[rr->col].status ==
		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
			/* some other component has failed.  Let's not make
			   things worse. XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
			/* Can't fail a spared disk! */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		/* make a copy of the recon request so that we don't rely on
		 * the user's buffer */
		RF_Malloc(rrint, sizeof(*rrint), (struct rf_recon_req_internal *));
		if (rrint == NULL)
			return(ENOMEM);
		rrint->col = rr->col;
		rrint->flags = rr->flags;
		rrint->raidPtr = raidPtr;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconThread,
					   rrint, "raid_recon");
		/* NOTE(review): the RF_CREATE_THREAD status in retcode is
		 * discarded here (0 is returned unconditionally), unlike
		 * the other thread-spawning cases -- confirm intentional */
		return (0);

		/* invoke a copyback operation after recon on whatever disk
		 * needs it, if any */
	case RAIDFRAME_COPYBACK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->copyback_in_progress == 1) {
			/* Copyback is already in progress! */
			return(EINVAL);
		}

		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
					   rf_CopybackThread,
					   raidPtr,"raid_copyback");
		return (retcode);

		/* return the percentage completion of reconstruction */
	case RAIDFRAME_CHECK_RECON_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->status != rf_rs_reconstructing)
			*(int *) data = 100;
		else {
			if (raidPtr->reconControl->numRUsTotal > 0) {
				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
			} else {
				*(int *) data = 0;
			}
		}
		return (0);
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
		rf_check_recon_status_ext(raidPtr, data);
		return (0);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->parity_rewrite_in_progress == 1) {
			*(int *) data = 100 *
				raidPtr->parity_rewrite_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
		rf_check_parityrewrite_status_ext(raidPtr, data);
		return (0);

	case RAIDFRAME_CHECK_COPYBACK_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0 */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->copyback_in_progress == 1) {
			*(int *) data = 100 * raidPtr->copyback_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
		rf_check_copyback_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_SET_LAST_UNIT:
		/* only allowed when every component is healthy */
		for (column = 0; column < raidPtr->numCol; column++)
			if (raidPtr->Disks[column].status != rf_ds_optimal)
				return EBUSY;

		for (column = 0; column < raidPtr->numCol; column++) {
			clabel = raidget_component_label(raidPtr, column);
			clabel->last_unit = *(int *)data;
			raidflush_component_label(raidPtr, column);
		}
		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
		return 0;

		/* the sparetable daemon calls this to wait for the kernel to
		 * need a spare table. this ioctl does not return until a
		 * spare table is needed. XXX -- calling mpsleep here in the
		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
		 * -- I should either compute the spare table in the kernel,
		 * or have a different -- XXX XXX -- interface (a different
		 * character device) for delivering the table     -- XXX */
#if 0
	case RAIDFRAME_SPARET_WAIT:
		rf_lock_mutex2(rf_sparet_wait_mutex);
		while (!rf_sparet_wait_queue)
			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
		waitreq = rf_sparet_wait_queue;
		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		/* structure assignment */
		*((RF_SparetWait_t *) data) = *waitreq;

		RF_Free(waitreq, sizeof(*waitreq));
		return (0);

		/* wakes up a process waiting on SPARET_WAIT and puts an error
		 * code in it that will cause the daemon to exit */
	case RAIDFRAME_ABORT_SPARET_WAIT:
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = -1;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_wait_queue;
		rf_sparet_wait_queue = waitreq;
		/* XXX rf_broadcast_conf2 looks like a typo for
		 * rf_broadcast_cond2; harmless while this is #if 0'd out */
		rf_broadcast_conf2(rf_sparet_wait_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);
		return (0);

		/* used by the spare table daemon to deliver a spare table
		 * into the kernel */
	case RAIDFRAME_SEND_SPARET:

		/* install the spare table */
		retcode = rf_SetSpareTable(raidPtr, *(void **) data);

		/* respond to the requestor.  the return status of the spare
		 * table installation is passed in the "fcol" field */
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = retcode;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_resp_queue;
		rf_sparet_resp_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_resp_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		return (retcode);
#endif

	default:
		break; /* fall through to the os-specific code below */

	}

	if (!raidPtr->valid)
		return (EINVAL);

	/*
	 * Add support for "regular" device ioctls here.
	 */

	switch (cmd) {
	case DIOCGCACHE:
		retcode = rf_get_component_caches(raidPtr, (int *)data);
		break;

	case DIOCCACHESYNC:
		retcode = rf_sync_component_caches(raidPtr);
		break;

	default:
		/* anything else is handled by the generic disk layer */
		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
		break;
	}

	return (retcode);

}
   1790 
   1791 
/* raidinit -- complete the rest of the initialization for the
   RAIDframe device: attach the pseudo-device, hook up the dk(4)
   and disk(9) layers, and mark the unit usable.  Called from the
   configuration ioctl after rf_Configure() succeeds.  */


static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	/* first-come-first-served queueing, sorted by raw block number */
	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* scan the new disk for wedges (GPT partitions, etc.) */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1851 
   1852 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1853 /* wake up the daemon & tell it to get us a spare table
   1854  * XXX
   1855  * the entries in the queues should be tagged with the raidPtr
   1856  * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
   1858  * XXX
   1859  *
   1860  * XXX This code is not currently used. GO
   1861  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Post the request on the wait queue and wake the daemon. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	/* Block here until the daemon queues a response. */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Pop the response off the queue; it replaces our `req'. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* The result is carried back in the response's fcol field. */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
   1885 #endif
   1886 
   1887 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1888  * bp & passes it down.
   1889  * any calls originating in the kernel must use non-blocking I/O
   1890  * do some extra sanity checking to return "appropriate" error values for
   1891  * certain conditions (to make some standard utilities work)
   1892  *
   1893  * Formerly known as: rf_DoAccessKernel
   1894  */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Drop the mutex while updating labels (that path does
		 * component I/O), then retake it for the decrement. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* Refuse to start I/O on a unit that never finished init. */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* Hand the queued bufs to the dk(4) layer for dispatch. */
	dk_start(dksc, NULL);
}
   1921 
/*
 * Convert one buf into a RAIDframe access and hand it to rf_DoAccess().
 * Returns EAGAIN when the array has no free openings (caller retries
 * later), ENOSPC for out-of-range or misaligned requests, otherwise
 * whatever rf_DoAccess() returns.
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int     do_async;
	int rc;

	/* Throttle: each in-flight access consumes one opening. */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	/* pb accounts for a trailing partial sector, if any. */
	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	/* NOTE(review): "1 ||" forces this branch unconditionally;
	 * db1_printf is presumably compiled out in non-debug builds,
	 * so this looks deliberate -- confirm before removing. */
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* Bounds check; the "sum < x" comparisons catch unsigned
	 * wrap-around in the addition above. */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* Partial-sector transfers are not supported. */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* Consume the opening reserved above. */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 do_async, raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
   1994 
   1995 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   1996 
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		/* NOTE(review): the doubled parens around the string look
		 * like a leftover from a db1_printf conversion; harmless. */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete the NOP immediately through the normal
		 * completion callback. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Set up bp for the component I/O; KernelWakeupFunc will
		 * run as the b_iodone callback on completion. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
/* this is the callback function associated with an I/O invoked from
   kernel code.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* The request was stashed in b_private by rf_DispatchKernelIO
	 * (via InitBP). */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	/* Charge the elapsed time to the access trace record. */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures triggers a label update in
			 * raidstart(). */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2139 
   2140 
   2141 /*
   2142  * initialize a buf structure for doing an I/O in the kernel.
   2143  */
   2144 static void
   2145 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2146        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2147        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2148        struct proc *b_proc)
   2149 {
   2150 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2151 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2152 	bp->b_oflags = 0;
   2153 	bp->b_cflags = 0;
   2154 	bp->b_bcount = numSect << logBytesPerSector;
   2155 	bp->b_bufsize = bp->b_bcount;
   2156 	bp->b_error = 0;
   2157 	bp->b_dev = dev;
   2158 	bp->b_data = bf;
   2159 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2160 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2161 	if (bp->b_bcount == 0) {
   2162 		panic("bp->b_bcount is zero in InitBP!!");
   2163 	}
   2164 	bp->b_proc = b_proc;
   2165 	bp->b_iodone = cbFunc;
   2166 	bp->b_private = cbArg;
   2167 }
   2168 
   2169 /*
   2170  * Wait interruptibly for an exclusive lock.
   2171  *
   2172  * XXX
   2173  * Several drivers do this; it should be abstracted and made MP-safe.
   2174  * (Hmm... where have we seen this warning before :->  GO )
   2175  */
   2176 static int
   2177 raidlock(struct raid_softc *rs)
   2178 {
   2179 	int     error;
   2180 
   2181 	error = 0;
   2182 	mutex_enter(&rs->sc_mutex);
   2183 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2184 		rs->sc_flags |= RAIDF_WANTED;
   2185 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2186 		if (error != 0)
   2187 			goto done;
   2188 	}
   2189 	rs->sc_flags |= RAIDF_LOCKED;
   2190 done:
   2191 	mutex_exit(&rs->sc_mutex);
   2192 	return (error);
   2193 }
   2194 /*
   2195  * Unlock and wake up any waiters.
   2196  */
   2197 static void
   2198 raidunlock(struct raid_softc *rs)
   2199 {
   2200 
   2201 	mutex_enter(&rs->sc_mutex);
   2202 	rs->sc_flags &= ~RAIDF_LOCKED;
   2203 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2204 		rs->sc_flags &= ~RAIDF_WANTED;
   2205 		cv_broadcast(&rs->sc_cv);
   2206 	}
   2207 	mutex_exit(&rs->sc_mutex);
   2208 }
   2209 
   2210 
   2211 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2212 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2213 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2214 
static daddr_t
rf_component_info_offset(void)
{

	/* Byte offset of the component info (label) area on a component. */
	return RF_COMPONENT_INFO_OFFSET;
}
   2221 
   2222 static daddr_t
   2223 rf_component_info_size(unsigned secsize)
   2224 {
   2225 	daddr_t info_size;
   2226 
   2227 	KASSERT(secsize);
   2228 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2229 		info_size = secsize;
   2230 	else
   2231 		info_size = RF_COMPONENT_INFO_SIZE;
   2232 
   2233 	return info_size;
   2234 }
   2235 
   2236 static daddr_t
   2237 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2238 {
   2239 	daddr_t map_offset;
   2240 
   2241 	KASSERT(raidPtr->bytesPerSector);
   2242 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2243 		map_offset = raidPtr->bytesPerSector;
   2244 	else
   2245 		map_offset = RF_COMPONENT_INFO_SIZE;
   2246 	map_offset += rf_component_info_offset();
   2247 
   2248 	return map_offset;
   2249 }
   2250 
   2251 static daddr_t
   2252 rf_parity_map_size(RF_Raid_t *raidPtr)
   2253 {
   2254 	daddr_t map_size;
   2255 
   2256 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2257 		map_size = raidPtr->bytesPerSector;
   2258 	else
   2259 		map_size = RF_PARITY_MAP_SIZE;
   2260 
   2261 	return map_size;
   2262 }
   2263 
   2264 int
   2265 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2266 {
   2267 	RF_ComponentLabel_t *clabel;
   2268 
   2269 	clabel = raidget_component_label(raidPtr, col);
   2270 	clabel->clean = RF_RAID_CLEAN;
   2271 	raidflush_component_label(raidPtr, col);
   2272 	return(0);
   2273 }
   2274 
   2275 
   2276 int
   2277 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2278 {
   2279 	RF_ComponentLabel_t *clabel;
   2280 
   2281 	clabel = raidget_component_label(raidPtr, col);
   2282 	clabel->clean = RF_RAID_DIRTY;
   2283 	raidflush_component_label(raidPtr, col);
   2284 	return(0);
   2285 }
   2286 
/* Read the on-disk label for column `col' into the in-core copy
 * (raid_cinfo[col].ci_label).  Returns the read error, if any. */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2296 
/* Return a pointer to the in-core component label for column `col'. */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2302 
   2303 int
   2304 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2305 {
   2306 	RF_ComponentLabel_t *label;
   2307 
   2308 	label = &raidPtr->raid_cinfo[col].ci_label;
   2309 	label->mod_counter = raidPtr->mod_counter;
   2310 #ifndef RF_NO_PARITY_MAP
   2311 	label->parity_map_modcount = label->mod_counter;
   2312 #endif
   2313 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2314 	    raidPtr->Disks[col].dev,
   2315 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2316 }
   2317 
   2318 
   2319 static int
   2320 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2321     RF_ComponentLabel_t *clabel)
   2322 {
   2323 	return raidread_component_area(dev, b_vp, clabel,
   2324 	    sizeof(RF_ComponentLabel_t),
   2325 	    rf_component_info_offset(),
   2326 	    rf_component_info_size(secsize));
   2327 }
   2328 
   2329 /* ARGSUSED */
   2330 static int
   2331 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2332     size_t msize, daddr_t offset, daddr_t dsize)
   2333 {
   2334 	struct buf *bp;
   2335 	int error;
   2336 
   2337 	/* XXX should probably ensure that we don't try to do this if
   2338 	   someone has changed rf_protected_sectors. */
   2339 
   2340 	if (b_vp == NULL) {
   2341 		/* For whatever reason, this component is not valid.
   2342 		   Don't try to read a component label from it. */
   2343 		return(EINVAL);
   2344 	}
   2345 
   2346 	/* get a block of the appropriate size... */
   2347 	bp = geteblk((int)dsize);
   2348 	bp->b_dev = dev;
   2349 
   2350 	/* get our ducks in a row for the read */
   2351 	bp->b_blkno = offset / DEV_BSIZE;
   2352 	bp->b_bcount = dsize;
   2353 	bp->b_flags |= B_READ;
   2354  	bp->b_resid = dsize;
   2355 
   2356 	bdev_strategy(bp);
   2357 	error = biowait(bp);
   2358 
   2359 	if (!error) {
   2360 		memcpy(data, bp->b_data, msize);
   2361 	}
   2362 
   2363 	brelse(bp, 0);
   2364 	return(error);
   2365 }
   2366 
   2367 
   2368 static int
   2369 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2370     RF_ComponentLabel_t *clabel)
   2371 {
   2372 	return raidwrite_component_area(dev, b_vp, clabel,
   2373 	    sizeof(RF_ComponentLabel_t),
   2374 	    rf_component_info_offset(),
   2375 	    rf_component_info_size(secsize), 0);
   2376 }
   2377 
   2378 /* ARGSUSED */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	/* Zero-fill so the tail past msize doesn't write kernel junk. */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	/* NOTE(review): on the async path we return before completion;
	 * the buffer is presumably released via B_ASYNC completion and
	 * any write error is silently lost -- confirm. */
	bdev_strategy(bp);
	if (asyncp)
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2412 
   2413 void
   2414 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2415 {
   2416 	int c;
   2417 
   2418 	for (c = 0; c < raidPtr->numCol; c++) {
   2419 		/* Skip dead disks. */
   2420 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2421 			continue;
   2422 		/* XXXjld: what if an error occurs here? */
   2423 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2424 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2425 		    RF_PARITYMAP_NBYTE,
   2426 		    rf_parity_map_offset(raidPtr),
   2427 		    rf_parity_map_size(raidPtr), 0);
   2428 	}
   2429 }
   2430 
   2431 void
   2432 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2433 {
   2434 	struct rf_paritymap_ondisk tmp;
   2435 	int c,first;
   2436 
   2437 	first=1;
   2438 	for (c = 0; c < raidPtr->numCol; c++) {
   2439 		/* Skip dead disks. */
   2440 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2441 			continue;
   2442 		raidread_component_area(raidPtr->Disks[c].dev,
   2443 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2444 		    RF_PARITYMAP_NBYTE,
   2445 		    rf_parity_map_offset(raidPtr),
   2446 		    rf_parity_map_size(raidPtr));
   2447 		if (first) {
   2448 			memcpy(map, &tmp, sizeof(*map));
   2449 			first = 0;
   2450 		} else {
   2451 			rf_paritymap_merge(map, &tmp);
   2452 		}
   2453 	}
   2454 }
   2455 
/* Bump the mod counter and mark every live component (and used spare)
 * dirty; called when the array starts taking writes. */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare is standing in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* NOTE(review): if no column references this spare,
			 * scol keeps its previous value (initially -1) and
			 * is written into the label below -- confirm this
			 * cannot happen for a rf_ds_used_spare disk. */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2515 
   2516 
/* Rewrite the component labels on all live components and used spares,
 * bumping the mod counter; `final' says whether this is a shutdown-time
 * (RF_FINAL_COMPONENT_UPDATE) or routine update, which controls whether
 * the clean bit may be set. */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			/* Only mark clean at final update and only when
			 * parity is known good. */
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare replaced. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2594 
   2595 void
   2596 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2597 {
   2598 
   2599 	if (vp != NULL) {
   2600 		if (auto_configured == 1) {
   2601 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2602 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2603 			vput(vp);
   2604 
   2605 		} else {
   2606 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2607 		}
   2608 	}
   2609 }
   2610 
   2611 
   2612 void
   2613 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2614 {
   2615 	int r,c;
   2616 	struct vnode *vp;
   2617 	int acd;
   2618 
   2619 
   2620 	/* We take this opportunity to close the vnodes like we should.. */
   2621 
   2622 	for (c = 0; c < raidPtr->numCol; c++) {
   2623 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2624 		acd = raidPtr->Disks[c].auto_configured;
   2625 		rf_close_component(raidPtr, vp, acd);
   2626 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2627 		raidPtr->Disks[c].auto_configured = 0;
   2628 	}
   2629 
   2630 	for (r = 0; r < raidPtr->numSpare; r++) {
   2631 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2632 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2633 		rf_close_component(raidPtr, vp, acd);
   2634 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2635 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2636 	}
   2637 }
   2638 
   2639 
/* Kernel thread body: fail a disk (optionally kicking off
 * reconstruction, per RF_FDFLAGS_RECON), then exit. */
void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	/* Let other operations see that a recon is running. */
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* This thread owns the request structure and must free it. */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2661 
/* Kernel thread body: rewrite the array's parity, then exit. */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2692 
   2693 
/* Kernel thread body: copy reconstructed data back from a spare to a
 * replaced component, then exit. */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2708 
   2709 
/* Kernel thread body: reconstruct column `req->col' in place, then
 * exit.  Owns and frees the request structure. */
void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2727 
/*
 * Try to read a RAIDframe component label from the device backing `vp';
 * if the label looks reasonable, prepend a new RF_AutoConfig_t entry
 * (which takes ownership of vp and the label) to ac_list.  Otherwise
 * the label is freed and the vnode closed.  Returns the (possibly
 * updated) list head, or NULL after freeing the whole list on OOM.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* Out of memory: tear down everything collected so far. */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup */
		/* Nothing kept the label or the vnode; release both. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2785 
/*
 * Scan every disk-class device in the system for RAIDframe components
 * and return them as a linked RF_AutoConfig_t list (entries are built
 * by rf_get_component()).  Wedges (dk devices) are scanned in a first
 * pass and all other disks in a second, so that a wedge covering a
 * disk takes precedence over that disk's raw partition.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			/* open read-only; FSILENT suppresses error noise for
			   devices that aren't really there */
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				/* NOTE(review): opened with FREAD but closed
				   with FREAD|FWRITE (here and below) — confirm
				   this asymmetry is intended per vnode(9) */
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			/* wedge pass: a dk device is a single partition; keep
			   it only if its partition type is RAIDframe */
			if (wedge) {
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component takes over the vnode */
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				/* component name, e.g. "wd0a" */
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   2989 
   2990 
   2991 int
   2992 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   2993 {
   2994 
   2995 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   2996 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   2997 	    ((clabel->clean == RF_RAID_CLEAN) ||
   2998 	     (clabel->clean == RF_RAID_DIRTY)) &&
   2999 	    clabel->row >=0 &&
   3000 	    clabel->column >= 0 &&
   3001 	    clabel->num_rows > 0 &&
   3002 	    clabel->num_columns > 0 &&
   3003 	    clabel->row < clabel->num_rows &&
   3004 	    clabel->column < clabel->num_columns &&
   3005 	    clabel->blockSize > 0 &&
   3006 	    /*
   3007 	     * numBlocksHi may contain garbage, but it is ok since
   3008 	     * the type is unsigned.  If it is really garbage,
   3009 	     * rf_fix_old_label_size() will fix it.
   3010 	     */
   3011 	    rf_component_label_numblocks(clabel) > 0) {
   3012 		/*
   3013 		 * label looks reasonable enough...
   3014 		 * let's make sure it has no old garbage.
   3015 		 */
   3016 		if (numsecs)
   3017 			rf_fix_old_label_size(clabel, numsecs);
   3018 		return(1);
   3019 	}
   3020 	return(0);
   3021 }
   3022 
   3023 
   3024 /*
   3025  * For reasons yet unknown, some old component labels have garbage in
   3026  * the newer numBlocksHi region, and this causes lossage.  Since those
   3027  * disks will also have numsecs set to less than 32 bits of sectors,
   3028  * we can determine when this corruption has occurred, and fix it.
   3029  *
   3030  * The exact same problem, with the same unknown reason, happens to
   3031  * the partitionSizeHi member as well.
   3032  */
   3033 static void
   3034 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3035 {
   3036 
   3037 	if (numsecs < ((uint64_t)1 << 32)) {
   3038 		if (clabel->numBlocksHi) {
   3039 			printf("WARNING: total sectors < 32 bits, yet "
   3040 			       "numBlocksHi set\n"
   3041 			       "WARNING: resetting numBlocksHi to zero.\n");
   3042 			clabel->numBlocksHi = 0;
   3043 		}
   3044 
   3045 		if (clabel->partitionSizeHi) {
   3046 			printf("WARNING: total sectors < 32 bits, yet "
   3047 			       "partitionSizeHi set\n"
   3048 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3049 			clabel->partitionSizeHi = 0;
   3050 		}
   3051 	}
   3052 }
   3053 
   3054 
   3055 #ifdef DEBUG
   3056 void
   3057 rf_print_component_label(RF_ComponentLabel_t *clabel)
   3058 {
   3059 	uint64_t numBlocks;
   3060 	static const char *rp[] = {
   3061 	    "No", "Force", "Soft", "*invalid*"
   3062 	};
   3063 
   3064 
   3065 	numBlocks = rf_component_label_numblocks(clabel);
   3066 
   3067 	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
   3068 	       clabel->row, clabel->column,
   3069 	       clabel->num_rows, clabel->num_columns);
   3070 	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
   3071 	       clabel->version, clabel->serial_number,
   3072 	       clabel->mod_counter);
   3073 	printf("   Clean: %s Status: %d\n",
   3074 	       clabel->clean ? "Yes" : "No", clabel->status);
   3075 	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
   3076 	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
   3077 	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
   3078 	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
   3079 	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
   3080 	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
   3081 	printf("   Last configured as: raid%d\n", clabel->last_unit);
   3082 #if 0
   3083 	   printf("   Config order: %d\n", clabel->config_order);
   3084 #endif
   3085 
   3086 }
   3087 #endif
   3088 
   3089 RF_ConfigSet_t *
   3090 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3091 {
   3092 	RF_AutoConfig_t *ac;
   3093 	RF_ConfigSet_t *config_sets;
   3094 	RF_ConfigSet_t *cset;
   3095 	RF_AutoConfig_t *ac_next;
   3096 
   3097 
   3098 	config_sets = NULL;
   3099 
   3100 	/* Go through the AutoConfig list, and figure out which components
   3101 	   belong to what sets.  */
   3102 	ac = ac_list;
   3103 	while(ac!=NULL) {
   3104 		/* we're going to putz with ac->next, so save it here
   3105 		   for use at the end of the loop */
   3106 		ac_next = ac->next;
   3107 
   3108 		if (config_sets == NULL) {
   3109 			/* will need at least this one... */
   3110 			config_sets = (RF_ConfigSet_t *)
   3111 				malloc(sizeof(RF_ConfigSet_t),
   3112 				       M_RAIDFRAME, M_NOWAIT);
   3113 			if (config_sets == NULL) {
   3114 				panic("rf_create_auto_sets: No memory!");
   3115 			}
   3116 			/* this one is easy :) */
   3117 			config_sets->ac = ac;
   3118 			config_sets->next = NULL;
   3119 			config_sets->rootable = 0;
   3120 			ac->next = NULL;
   3121 		} else {
   3122 			/* which set does this component fit into? */
   3123 			cset = config_sets;
   3124 			while(cset!=NULL) {
   3125 				if (rf_does_it_fit(cset, ac)) {
   3126 					/* looks like it matches... */
   3127 					ac->next = cset->ac;
   3128 					cset->ac = ac;
   3129 					break;
   3130 				}
   3131 				cset = cset->next;
   3132 			}
   3133 			if (cset==NULL) {
   3134 				/* didn't find a match above... new set..*/
   3135 				cset = (RF_ConfigSet_t *)
   3136 					malloc(sizeof(RF_ConfigSet_t),
   3137 					       M_RAIDFRAME, M_NOWAIT);
   3138 				if (cset == NULL) {
   3139 					panic("rf_create_auto_sets: No memory!");
   3140 				}
   3141 				cset->ac = ac;
   3142 				ac->next = NULL;
   3143 				cset->next = config_sets;
   3144 				cset->rootable = 0;
   3145 				config_sets = cset;
   3146 			}
   3147 		}
   3148 		ac = ac_next;
   3149 	}
   3150 
   3151 
   3152 	return(config_sets);
   3153 }
   3154 
   3155 static int
   3156 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3157 {
   3158 	RF_ComponentLabel_t *clabel1, *clabel2;
   3159 
   3160 	/* If this one matches the *first* one in the set, that's good
   3161 	   enough, since the other members of the set would have been
   3162 	   through here too... */
   3163 	/* note that we are not checking partitionSize here..
   3164 
   3165 	   Note that we are also not checking the mod_counters here.
   3166 	   If everything else matches except the mod_counter, that's
   3167 	   good enough for this test.  We will deal with the mod_counters
   3168 	   a little later in the autoconfiguration process.
   3169 
   3170 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3171 
   3172 	   The reason we don't check for this is that failed disks
   3173 	   will have lower modification counts.  If those disks are
   3174 	   not added to the set they used to belong to, then they will
   3175 	   form their own set, which may result in 2 different sets,
   3176 	   for example, competing to be configured at raid0, and
   3177 	   perhaps competing to be the root filesystem set.  If the
   3178 	   wrong ones get configured, or both attempt to become /,
   3179 	   weird behaviour and or serious lossage will occur.  Thus we
   3180 	   need to bring them into the fold here, and kick them out at
   3181 	   a later point.
   3182 
   3183 	*/
   3184 
   3185 	clabel1 = cset->ac->clabel;
   3186 	clabel2 = ac->clabel;
   3187 	if ((clabel1->version == clabel2->version) &&
   3188 	    (clabel1->serial_number == clabel2->serial_number) &&
   3189 	    (clabel1->num_rows == clabel2->num_rows) &&
   3190 	    (clabel1->num_columns == clabel2->num_columns) &&
   3191 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3192 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3193 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3194 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3195 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3196 	    (clabel1->blockSize == clabel2->blockSize) &&
   3197 	    rf_component_label_numblocks(clabel1) ==
   3198 	    rf_component_label_numblocks(clabel2) &&
   3199 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3200 	    (clabel1->root_partition == clabel2->root_partition) &&
   3201 	    (clabel1->last_unit == clabel2->last_unit) &&
   3202 	    (clabel1->config_order == clabel2->config_order)) {
   3203 		/* if it get's here, it almost *has* to be a match */
   3204 	} else {
   3205 		/* it's not consistent with somebody in the set..
   3206 		   punt */
   3207 		return(0);
   3208 	}
   3209 	/* all was fine.. it must fit... */
   3210 	return(1);
   3211 }
   3212 
/*
 * Decide whether config set cset has enough live components to be
 * configured.  A component counts as present when its column appears
 * with the newest mod_counter seen in the set.  RAID 1 is special:
 * components are treated as even/odd pairs, and losing both halves of
 * a pair is fatal, while other levels tolerate a fixed number of
 * missing components.  Returns 1 if configurable, 0 otherwise.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* the "right" mod_counter is the maximum over all members;
	   stale (failed) components carry lower values */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* for each column, look for a component that is both in that
	   column and up to date (matches the max mod_counter) */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* per-level tolerance: RAID 0 survives no losses, RAID 4/5
	   survive at most one */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3315 
   3316 void
   3317 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3318 			RF_Raid_t *raidPtr)
   3319 {
   3320 	RF_ComponentLabel_t *clabel;
   3321 	int i;
   3322 
   3323 	clabel = ac->clabel;
   3324 
   3325 	/* 1. Fill in the common stuff */
   3326 	config->numCol = clabel->num_columns;
   3327 	config->numSpare = 0; /* XXX should this be set here? */
   3328 	config->sectPerSU = clabel->sectPerSU;
   3329 	config->SUsPerPU = clabel->SUsPerPU;
   3330 	config->SUsPerRU = clabel->SUsPerRU;
   3331 	config->parityConfig = clabel->parityConfig;
   3332 	/* XXX... */
   3333 	strcpy(config->diskQueueType,"fifo");
   3334 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3335 	config->layoutSpecificSize = 0; /* XXX ?? */
   3336 
   3337 	while(ac!=NULL) {
   3338 		/* row/col values will be in range due to the checks
   3339 		   in reasonable_label() */
   3340 		strcpy(config->devnames[0][ac->clabel->column],
   3341 		       ac->devname);
   3342 		ac = ac->next;
   3343 	}
   3344 
   3345 	for(i=0;i<RF_MAXDBGV;i++) {
   3346 		config->debugVars[i][0] = 0;
   3347 	}
   3348 }
   3349 
   3350 int
   3351 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3352 {
   3353 	RF_ComponentLabel_t *clabel;
   3354 	int column;
   3355 	int sparecol;
   3356 
   3357 	raidPtr->autoconfigure = new_value;
   3358 
   3359 	for(column=0; column<raidPtr->numCol; column++) {
   3360 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3361 			clabel = raidget_component_label(raidPtr, column);
   3362 			clabel->autoconfigure = new_value;
   3363 			raidflush_component_label(raidPtr, column);
   3364 		}
   3365 	}
   3366 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3367 		sparecol = raidPtr->numCol + column;
   3368 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3369 			clabel = raidget_component_label(raidPtr, sparecol);
   3370 			clabel->autoconfigure = new_value;
   3371 			raidflush_component_label(raidPtr, sparecol);
   3372 		}
   3373 	}
   3374 	return(new_value);
   3375 }
   3376 
   3377 int
   3378 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3379 {
   3380 	RF_ComponentLabel_t *clabel;
   3381 	int column;
   3382 	int sparecol;
   3383 
   3384 	raidPtr->root_partition = new_value;
   3385 	for(column=0; column<raidPtr->numCol; column++) {
   3386 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3387 			clabel = raidget_component_label(raidPtr, column);
   3388 			clabel->root_partition = new_value;
   3389 			raidflush_component_label(raidPtr, column);
   3390 		}
   3391 	}
   3392 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3393 		sparecol = raidPtr->numCol + column;
   3394 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3395 			clabel = raidget_component_label(raidPtr, sparecol);
   3396 			clabel->root_partition = new_value;
   3397 			raidflush_component_label(raidPtr, sparecol);
   3398 		}
   3399 	}
   3400 	return(new_value);
   3401 }
   3402 
   3403 void
   3404 rf_release_all_vps(RF_ConfigSet_t *cset)
   3405 {
   3406 	RF_AutoConfig_t *ac;
   3407 
   3408 	ac = cset->ac;
   3409 	while(ac!=NULL) {
   3410 		/* Close the vp, and give it back */
   3411 		if (ac->vp) {
   3412 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3413 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3414 			vput(ac->vp);
   3415 			ac->vp = NULL;
   3416 		}
   3417 		ac = ac->next;
   3418 	}
   3419 }
   3420 
   3421 
   3422 void
   3423 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3424 {
   3425 	RF_AutoConfig_t *ac;
   3426 	RF_AutoConfig_t *next_ac;
   3427 
   3428 	ac = cset->ac;
   3429 	while(ac!=NULL) {
   3430 		next_ac = ac->next;
   3431 		/* nuke the label */
   3432 		free(ac->clabel, M_RAIDFRAME);
   3433 		/* cleanup the config structure */
   3434 		free(ac, M_RAIDFRAME);
   3435 		/* "next.." */
   3436 		ac = next_ac;
   3437 	}
   3438 	/* and, finally, nuke the config set */
   3439 	free(cset, M_RAIDFRAME);
   3440 }
   3441 
   3442 
/*
 * Fill in clabel from the current geometry and state of raidPtr.
 * Per-component fields (e.g. row/column) are not touched here.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	/* splits the 64-bit sector count into the lo/hi label fields */
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
   3475 
/*
 * Autoconfigure the RAID set described by cset: pick a unit number
 * (preferring the one in the label's last_unit), build an RF_Config_t
 * from the component labels, and run rf_Configure().  On success the
 * configured softc is returned and cset->rootable is set from the
 * label's root_partition; on failure NULL is returned.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* walk forward from last_unit until we hit a unit that is
	   either unallocated (sc == NULL) or allocated but not valid */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
				/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: release the unit we claimed */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3559 
   3560 void
   3561 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3562 	     size_t xmin, size_t xmax)
   3563 {
   3564 	int error;
   3565 
   3566 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3567 	pool_sethiwat(p, xmax);
   3568 	if ((error = pool_prime(p, xmin)) != 0)
   3569 		panic("%s: failed to prime pool: %d", __func__, error);
   3570 	pool_setlowat(p, xmin);
   3571 }
   3572 
   3573 /*
   3574  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3575  * to see if there is IO pending and if that IO could possibly be done
   3576  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3577  * otherwise.
   3578  *
   3579  */
   3580 int
   3581 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3582 {
   3583 	struct raid_softc *rs;
   3584 	struct dk_softc *dksc;
   3585 
   3586 	rs = raidPtr->softc;
   3587 	dksc = &rs->sc_dksc;
   3588 
   3589 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3590 		return 1;
   3591 
   3592 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3593 		/* there is work to do */
   3594 		return 0;
   3595 	}
   3596 	/* default is nothing to do */
   3597 	return 1;
   3598 }
   3599 
   3600 int
   3601 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3602 {
   3603 	uint64_t numsecs;
   3604 	unsigned secsize;
   3605 	int error;
   3606 
   3607 	error = getdisksize(vp, &numsecs, &secsize);
   3608 	if (error == 0) {
   3609 		diskPtr->blockSize = secsize;
   3610 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3611 		diskPtr->partitionSize = numsecs;
   3612 		return 0;
   3613 	}
   3614 	return error;
   3615 }
   3616 
/* Autoconf match hook: raid pseudo-devices always match. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3622 
/* Autoconf attach hook: intentionally empty; no per-attach setup needed. */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3627 
   3628 
   3629 static int
   3630 raid_detach(device_t self, int flags)
   3631 {
   3632 	int error;
   3633 	struct raid_softc *rs = raidsoftc(self);
   3634 
   3635 	if (rs == NULL)
   3636 		return ENXIO;
   3637 
   3638 	if ((error = raidlock(rs)) != 0)
   3639 		return (error);
   3640 
   3641 	error = raid_detach_unlocked(rs);
   3642 
   3643 	raidunlock(rs);
   3644 
   3645 	/* XXX raid can be referenced here */
   3646 
   3647 	if (error)
   3648 		return error;
   3649 
   3650 	/* Free the softc */
   3651 	raidput(rs);
   3652 
   3653 	return 0;
   3654 }
   3655 
   3656 static void
   3657 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3658 {
   3659 	struct dk_softc *dksc = &rs->sc_dksc;
   3660 	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
   3661 
   3662 	memset(dg, 0, sizeof(*dg));
   3663 
   3664 	dg->dg_secperunit = raidPtr->totalSectors;
   3665 	dg->dg_secsize = raidPtr->bytesPerSector;
   3666 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   3667 	dg->dg_ntracks = 4 * raidPtr->numCol;
   3668 
   3669 	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
   3670 }
   3671 
   3672 /*
   3673  * Get cache info for all the components (including spares).
   3674  * Returns intersection of all the cache flags of all disks, or first
   3675  * error if any encountered.
   3676  * XXXfua feature flags can change as spares are added - lock down somehow
   3677  */
   3678 static int
   3679 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
   3680 {
   3681 	int c;
   3682 	int error;
   3683 	int dkwhole = 0, dkpart;
   3684 
   3685 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
   3686 		/*
   3687 		 * Check any non-dead disk, even when currently being
   3688 		 * reconstructed.
   3689 		 */
   3690 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
   3691 		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
   3692 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
   3693 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
   3694 			if (error) {
   3695 				if (error != ENODEV) {
   3696 					printf("raid%d: get cache for component %s failed\n",
   3697 					    raidPtr->raidid,
   3698 					    raidPtr->Disks[c].devname);
   3699 				}
   3700 
   3701 				return error;
   3702 			}
   3703 
   3704 			if (c == 0)
   3705 				dkwhole = dkpart;
   3706 			else
   3707 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
   3708 		}
   3709 	}
   3710 
   3711 	*data = dkwhole;
   3712 
   3713 	return 0;
   3714 }
   3715 
   3716 /*
   3717  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3718  * We end up returning whatever error was returned by the first cache flush
   3719  * that fails.
   3720  */
   3721 
   3722 int
   3723 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3724 {
   3725 	int c, sparecol;
   3726 	int e,error;
   3727 	int force = 1;
   3728 
   3729 	error = 0;
   3730 	for (c = 0; c < raidPtr->numCol; c++) {
   3731 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3732 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3733 					  &force, FWRITE, NOCRED);
   3734 			if (e) {
   3735 				if (e != ENODEV)
   3736 					printf("raid%d: cache flush to component %s failed.\n",
   3737 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3738 				if (error == 0) {
   3739 					error = e;
   3740 				}
   3741 			}
   3742 		}
   3743 	}
   3744 
   3745 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3746 		sparecol = raidPtr->numCol + c;
   3747 		/* Need to ensure that the reconstruct actually completed! */
   3748 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3749 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3750 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3751 			if (e) {
   3752 				if (e != ENODEV)
   3753 					printf("raid%d: cache flush to component %s failed.\n",
   3754 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3755 				if (error == 0) {
   3756 					error = e;
   3757 				}
   3758 			}
   3759 		}
   3760 	}
   3761 	return error;
   3762 }
   3763 
   3764 /* Fill in info with the current status */
   3765 void
   3766 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3767 {
   3768 
   3769 	if (raidPtr->status != rf_rs_reconstructing) {
   3770 		info->total = 100;
   3771 		info->completed = 100;
   3772 	} else {
   3773 		info->total = raidPtr->reconControl->numRUsTotal;
   3774 		info->completed = raidPtr->reconControl->numRUsComplete;
   3775 	}
   3776 	info->remaining = info->total - info->completed;
   3777 }
   3778 
   3779 /* Fill in info with the current status */
   3780 void
   3781 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3782 {
   3783 
   3784 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3785 		info->total = raidPtr->Layout.numStripe;
   3786 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3787 	} else {
   3788 		info->completed = 100;
   3789 		info->total = 100;
   3790 	}
   3791 	info->remaining = info->total - info->completed;
   3792 }
   3793 
   3794 /* Fill in info with the current status */
   3795 void
   3796 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3797 {
   3798 
   3799 	if (raidPtr->copyback_in_progress == 1) {
   3800 		info->total = raidPtr->Layout.numStripe;
   3801 		info->completed = raidPtr->copyback_stripes_done;
   3802 		info->remaining = info->total - info->completed;
   3803 	} else {
   3804 		info->remaining = 0;
   3805 		info->completed = 100;
   3806 		info->total = 100;
   3807 	}
   3808 }
   3809 
   3810 /* Fill in config with the current info */
   3811 int
   3812 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3813 {
   3814 	int	d, i, j;
   3815 
   3816 	if (!raidPtr->valid)
   3817 		return (ENODEV);
   3818 	config->cols = raidPtr->numCol;
   3819 	config->ndevs = raidPtr->numCol;
   3820 	if (config->ndevs >= RF_MAX_DISKS)
   3821 		return (ENOMEM);
   3822 	config->nspares = raidPtr->numSpare;
   3823 	if (config->nspares >= RF_MAX_DISKS)
   3824 		return (ENOMEM);
   3825 	config->maxqdepth = raidPtr->maxQueueDepth;
   3826 	d = 0;
   3827 	for (j = 0; j < config->cols; j++) {
   3828 		config->devs[d] = raidPtr->Disks[j];
   3829 		d++;
   3830 	}
   3831 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3832 		config->spares[i] = raidPtr->Disks[j];
   3833 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3834 			/* XXX: raidctl(8) expects to see this as a used spare */
   3835 			config->spares[i].status = rf_ds_used_spare;
   3836 		}
   3837 	}
   3838 	return 0;
   3839 }
   3840 
   3841 int
   3842 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3843 {
   3844 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3845 	RF_ComponentLabel_t *raid_clabel;
   3846 	int column = clabel->column;
   3847 
   3848 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3849 		return EINVAL;
   3850 	raid_clabel = raidget_component_label(raidPtr, column);
   3851 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3852 
   3853 	return 0;
   3854 }
   3855 
   3856 /*
   3857  * Module interface
   3858  */
   3859 
/* Declare the raid module; it requires the dk_subr and bufq_fcfs modules. */
MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");

#ifdef _MODULE
/* When built as a loadable module, declare the cfdriver here as well. */
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

/* Module command dispatcher and its init/fini workers (defined below). */
static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);
   3869 
   3870 static int
   3871 raid_modcmd(modcmd_t cmd, void *data)
   3872 {
   3873 	int error;
   3874 
   3875 	error = 0;
   3876 	switch (cmd) {
   3877 	case MODULE_CMD_INIT:
   3878 		error = raid_modcmd_init();
   3879 		break;
   3880 	case MODULE_CMD_FINI:
   3881 		error = raid_modcmd_fini();
   3882 		break;
   3883 	default:
   3884 		error = ENOTTY;
   3885 		break;
   3886 	}
   3887 	return error;
   3888 }
   3889 
/*
 * Module initialization: create raid_lock, attach the device switch and
 * autoconf glue, boot the RAIDframe core, and register the autoconfig
 * finalizer.  On any attach failure, previously attached pieces are
 * detached in reverse order before returning the error.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	/* raid_lock is held across the whole attach sequence so that a
	 * concurrent fini cannot interleave with initialization. */
	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 requests dynamically allocated major numbers. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST is tolerated: the devsw may already be statically
	 * configured into the kernel. */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Unwind the devsw attach done above. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Unwind in reverse order of attachment. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* NOTE(review): error is necessarily 0 here -- every failure path
	 * above has already returned -- so this test is redundant. */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Auto-configuration is best-effort: warn but do not
		 * fail the module load. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   3960 
/*
 * Module teardown: refuse to unload while any raid unit exists, then
 * detach the autoconf glue and device switch in reverse order of
 * attachment.  If a later detach step fails, the earlier steps are
 * re-attached so the module is left in a consistent, loaded state.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Roll back: re-attach the cfattach detached above. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Roll back both earlier detach steps. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* Shut down the RAIDframe core and destroy module-global state. */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   4010