/* rf_netbsdkintf.c -- NetBSD raidframe kernel interface, revision 1.400 */
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.400 2021/08/28 16:00:52 oster Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.400 2021/08/28 16:00:52 oster Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_raid_autoconfig.h"
    108 #include "opt_compat_netbsd32.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 #include <sys/module.h>
    130 #include <sys/compat_stub.h>
    131 
    132 #include <prop/proplib.h>
    133 
    134 #include <dev/raidframe/raidframevar.h>
    135 #include <dev/raidframe/raidframeio.h>
    136 #include <dev/raidframe/rf_paritymap.h>
    137 
    138 #include "rf_raid.h"
    139 #include "rf_copyback.h"
    140 #include "rf_dag.h"
    141 #include "rf_dagflags.h"
    142 #include "rf_desc.h"
    143 #include "rf_diskqueue.h"
    144 #include "rf_etimer.h"
    145 #include "rf_general.h"
    146 #include "rf_kintf.h"
    147 #include "rf_options.h"
    148 #include "rf_driver.h"
    149 #include "rf_parityscan.h"
    150 #include "rf_threadstuff.h"
    151 
    152 #include "ioconf.h"
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #ifdef DEBUG_ROOT
    162 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
    163 #else
    164 #define DPRINTF(a, ...)
    165 #endif
    166 
    167 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    168 static rf_declare_mutex2(rf_sparet_wait_mutex);
    169 static rf_declare_cond2(rf_sparet_wait_cv);
    170 static rf_declare_cond2(rf_sparet_resp_cv);
    171 
    172 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    173 						 * spare table */
    174 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    175 						 * installation process */
    176 #endif
    177 
    178 const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);
    179 
    180 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    181 
    182 /* prototypes */
    183 static void KernelWakeupFunc(struct buf *);
    184 static void InitBP(struct buf *, struct vnode *, unsigned,
    185     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    186     void *, int);
    187 static void raidinit(struct raid_softc *);
    188 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
    189 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
    190 
    191 static int raid_match(device_t, cfdata_t, void *);
    192 static void raid_attach(device_t, device_t, void *);
    193 static int raid_detach(device_t, int);
    194 
    195 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    196     daddr_t, daddr_t);
    197 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    198     daddr_t, daddr_t, int);
    199 
    200 static int raidwrite_component_label(unsigned,
    201     dev_t, struct vnode *, RF_ComponentLabel_t *);
    202 static int raidread_component_label(unsigned,
    203     dev_t, struct vnode *, RF_ComponentLabel_t *);
    204 
    205 static int raid_diskstart(device_t, struct buf *bp);
    206 static int raid_dumpblocks(device_t, void *, daddr_t, int);
    207 static int raid_lastclose(device_t);
    208 
    209 static dev_type_open(raidopen);
    210 static dev_type_close(raidclose);
    211 static dev_type_read(raidread);
    212 static dev_type_write(raidwrite);
    213 static dev_type_ioctl(raidioctl);
    214 static dev_type_strategy(raidstrategy);
    215 static dev_type_dump(raiddump);
    216 static dev_type_size(raidsize);
    217 
/* Block-device entry points for /dev/raidN (disk semantics). */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    228 
/* Character-device (raw) entry points for /dev/rraidN. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
    243 
/* dk(9) driver hooks used by the shared disk framework (dk_softc). */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
    253 
    254 #define	raidunit(x)	DISKUNIT(x)
    255 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
    256 
    257 extern struct cfdriver raid_cd;
    258 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    259     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    260     DVF_DETACH_SHUTDOWN);
    261 
    262 /* Internal representation of a rf_recon_req */
    263 struct rf_recon_req_internal {
    264 	RF_RowCol_t col;
    265 	RF_ReconReqFlags_t flags;
    266 	void   *raidPtr;
    267 };
    268 
    269 /*
    270  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    271  * Be aware that large numbers can allow the driver to consume a lot of
    272  * kernel memory, especially on writes, and in degraded mode reads.
    273  *
    274  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    275  * a single 64K write will typically require 64K for the old data,
    276  * 64K for the old parity, and 64K for the new parity, for a total
    277  * of 192K (if the parity buffer is not re-used immediately).
    278  * Even it if is used immediately, that's still 128K, which when multiplied
    279  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    280  *
    281  * Now in degraded mode, for example, a 64K read on the above setup may
    282  * require data reconstruction, which will require *all* of the 4 remaining
    283  * disks to participate -- 4 * 32K/disk == 128K again.
    284  */
    285 
    286 #ifndef RAIDOUTSTANDING
    287 #define RAIDOUTSTANDING   6
    288 #endif
    289 
    290 #define RAIDLABELDEV(dev)	\
    291 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    292 
    293 /* declared here, and made public, for the benefit of KVM stuff.. */
    294 
    295 static int raidlock(struct raid_softc *);
    296 static void raidunlock(struct raid_softc *);
    297 
    298 static int raid_detach_unlocked(struct raid_softc *);
    299 
    300 static void rf_markalldirty(RF_Raid_t *);
    301 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    302 
    303 static void rf_ReconThread(struct rf_recon_req_internal *);
    304 static void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    305 static void rf_CopybackThread(RF_Raid_t *raidPtr);
    306 static void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
    307 static int rf_autoconfig(device_t);
    308 static int rf_rescan(void);
    309 static void rf_buildroothack(RF_ConfigSet_t *);
    310 
    311 static RF_AutoConfig_t *rf_find_raid_components(void);
    312 static RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    313 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    314 static void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    315 static int rf_set_autoconfig(RF_Raid_t *, int);
    316 static int rf_set_rootpartition(RF_Raid_t *, int);
    317 static void rf_release_all_vps(RF_ConfigSet_t *);
    318 static void rf_cleanup_config_set(RF_ConfigSet_t *);
    319 static int rf_have_enough_components(RF_ConfigSet_t *);
    320 static struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    321 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    322 
    323 /*
    324  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    325  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    326  * in the kernel config file.
    327  */
    328 #ifdef RAID_AUTOCONFIG
    329 int raidautoconfig = 1;
    330 #else
    331 int raidautoconfig = 0;
    332 #endif
    333 static bool raidautoconfigdone = false;
    334 
    335 struct pool rf_alloclist_pool;   /* AllocList */
    336 
    337 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    338 static kmutex_t raid_lock;
    339 
    340 static struct raid_softc *
    341 raidcreate(int unit) {
    342 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    343 	sc->sc_unit = unit;
    344 	cv_init(&sc->sc_cv, "raidunit");
    345 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
    346 	return sc;
    347 }
    348 
    349 static void
    350 raiddestroy(struct raid_softc *sc) {
    351 	cv_destroy(&sc->sc_cv);
    352 	mutex_destroy(&sc->sc_mutex);
    353 	kmem_free(sc, sizeof(*sc));
    354 }
    355 
    356 static struct raid_softc *
    357 raidget(int unit, bool create) {
    358 	struct raid_softc *sc;
    359 	if (unit < 0) {
    360 #ifdef DIAGNOSTIC
    361 		panic("%s: unit %d!", __func__, unit);
    362 #endif
    363 		return NULL;
    364 	}
    365 	mutex_enter(&raid_lock);
    366 	LIST_FOREACH(sc, &raids, sc_link) {
    367 		if (sc->sc_unit == unit) {
    368 			mutex_exit(&raid_lock);
    369 			return sc;
    370 		}
    371 	}
    372 	mutex_exit(&raid_lock);
    373 	if (!create)
    374 		return NULL;
    375 	sc = raidcreate(unit);
    376 	mutex_enter(&raid_lock);
    377 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    378 	mutex_exit(&raid_lock);
    379 	return sc;
    380 }
    381 
/* Unlink `sc' from the global raids list and free it. */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
    389 
/*
 * Legacy pseudo-device attach hook; intentionally empty.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
    399 
/*
 * One-shot RAID autoconfiguration: locate all RAID components on the
 * system, group them into config sets, and configure the valid sets
 * (including root-device selection) via rf_buildroothack().
 *
 * Returns 0 if autoconfig is disabled or already done, 1 after a scan
 * has been performed.
 */
static int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	/* disabled (raidautoconfig == 0) or already run once */
	if (!raidautoconfig || raidautoconfigdone == true)
		return 0;

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}
    437 
    438 int
    439 rf_inited(const struct raid_softc *rs) {
    440 	return (rs->sc_flags & RAIDF_INITED) != 0;
    441 }
    442 
/* Accessor: the embedded RF_Raid_t of a softc. */
RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}
    447 
/* Accessor: the unit number of a softc. */
int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}
    452 
/*
 * Return 1 if the RAID set `r' contains the boot device `bdv' as one
 * of its components, 0 otherwise.
 *
 * Component names are compared by prefix against the boot device's
 * xname, so e.g. boot device "wd0" matches component "/dev/wd0e".
 * NOTE(review): a pure prefix match also lets "wd1" match "wd10..." --
 * presumably acceptable in practice; confirm if unit counts grow.
 * For dk wedges the comparison is done against the wedge's parent.
 */
static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname;
	size_t len;

	/* if bdv is NULL, the set can't contain it. exit early. */
	if (bdv == NULL)
		return 0;

	bootname = device_xname(bdv);
	len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		/* skip the leading "/dev/" of the component path */
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			/* wedge: compare against the parent disk's name */
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}
    483 
    484 static int
    485 rf_rescan(void)
    486 {
    487 	RF_AutoConfig_t *ac_list;
    488 	RF_ConfigSet_t *config_sets, *cset, *next_cset;
    489 	struct raid_softc *sc;
    490 	int raid_added;
    491 
    492 	ac_list = rf_find_raid_components();
    493 	config_sets = rf_create_auto_sets(ac_list);
    494 
    495 	raid_added = 1;
    496 	while (raid_added > 0) {
    497 		raid_added = 0;
    498 		cset = config_sets;
    499 		while (cset != NULL) {
    500 			next_cset = cset->next;
    501 			if (rf_have_enough_components(cset) &&
    502 			    cset->ac->clabel->autoconfigure == 1) {
    503 				sc = rf_auto_config_set(cset);
    504 				if (sc != NULL) {
    505 					aprint_debug("raid%d: configured ok, rootable %d\n",
    506 						     sc->sc_unit, cset->rootable);
    507 					/* We added one RAID set */
    508 					raid_added++;
    509 				} else {
    510 					/* The autoconfig didn't work :( */
    511 					aprint_debug("Autoconfig failed\n");
    512 					rf_release_all_vps(cset);
    513 				}
    514 			} else {
    515 				/* we're not autoconfiguring this set...
    516 				   release the associated resources */
    517 				rf_release_all_vps(cset);
    518 			}
    519 			/* cleanup */
    520 			rf_cleanup_config_set(cset);
    521 			cset = next_cset;
    522 		}
    523 		if (raid_added > 0) {
    524 			/* We added at least one RAID set, so re-scan for recursive RAID */
    525 			ac_list = rf_find_raid_components();
    526 			config_sets = rf_create_auto_sets(ac_list);
    527 		}
    528 	}
    529 
    530 	return 0;
    531 }
    532 
    533 
/*
 * Configure all complete, autoconfigurable RAID sets (repeating the
 * scan to catch RAID-on-RAID), then decide whether a configured set
 * should become the root device.  If exactly one rootable set is
 * found, booted_device is pointed at it (wedge or disk device);
 * with several candidates we try to disambiguate via the component
 * containing the firmware boot device, otherwise fall back to
 * RB_ASKNAME so the user chooses at the setroot() prompt.
 */
static void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	int raid_added;
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;

	/* Same configure-until-stable loop as rf_rescan(), but also
	   remembering rootable sets in rsc/num_root. */
	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
						     sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
					if (cset->rootable) {
						rsc = sc;
						num_root++;
					}
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		dksc = &rsc->sc_dksc;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			DPRINTF("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			DPRINTF("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
		DPRINTF("%s: booted_device=%p root_partition=%d "
			"contains_boot=%d",
		    __func__, booted_device, rsc->sc_r.root_partition,
			   rf_containsboot(&rsc->sc_r, booted_device));
		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
			DPRINTF("%s: set booted_device=%s(%p)\n", __func__,
			    device_xname(booted_device), booted_device);
		}
	} else if (num_root > 1) {
		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* Narrow the candidates down to sets that actually
		   contain the firmware-reported boot device. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
    692 
    693 static int
    694 raidsize(dev_t dev)
    695 {
    696 	struct raid_softc *rs;
    697 	struct dk_softc *dksc;
    698 	unsigned int unit;
    699 
    700 	unit = raidunit(dev);
    701 	if ((rs = raidget(unit, false)) == NULL)
    702 		return -1;
    703 	dksc = &rs->sc_dksc;
    704 
    705 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    706 		return -1;
    707 
    708 	return dk_size(dksc, dev);
    709 }
    710 
    711 static int
    712 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    713 {
    714 	unsigned int unit;
    715 	struct raid_softc *rs;
    716 	struct dk_softc *dksc;
    717 
    718 	unit = raidunit(dev);
    719 	if ((rs = raidget(unit, false)) == NULL)
    720 		return ENXIO;
    721 	dksc = &rs->sc_dksc;
    722 
    723 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    724 		return ENODEV;
    725 
    726         /*
    727            Note that blkno is relative to this particular partition.
    728            By adding adding RF_PROTECTED_SECTORS, we get a value that
    729 	   is relative to the partition used for the underlying component.
    730         */
    731 	blkno += RF_PROTECTED_SECTORS;
    732 
    733 	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
    734 }
    735 
/*
 * dk(9) dumpblocks hook: write `nblk' blocks from `va' at `blkno'
 * directly to a single live component of the set.  Only supported for
 * RAID 1 layouts (one data + one parity column), because only there is
 * a single component a complete copy of the data.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the first component
	   2) a used_spare of the first component
	   3) the second component
	   4) a used_spare of the second component
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live component.
	   If we didn't find a live component, we now check to see
	   if there is a relevant spared component.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			/* find which column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared first
				   component!  We'll take that over
				   anything else found so far.  (We
				   couldn't have found a real first
				   component before, since this is a
				   used spare, and it's saying that
				   it's replacing the first
				   component.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the first
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared second component.
				   We'll dump to that if we haven't found
				   anything else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* hand the dump straight to the chosen component's driver */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
    841 
    842 /* ARGSUSED */
    843 static int
    844 raidopen(dev_t dev, int flags, int fmt,
    845     struct lwp *l)
    846 {
    847 	int     unit = raidunit(dev);
    848 	struct raid_softc *rs;
    849 	struct dk_softc *dksc;
    850 	int     error = 0;
    851 	int     part, pmask;
    852 
    853 	if ((rs = raidget(unit, true)) == NULL)
    854 		return ENXIO;
    855 	if ((error = raidlock(rs)) != 0)
    856 		return error;
    857 
    858 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    859 		error = EBUSY;
    860 		goto bad;
    861 	}
    862 
    863 	dksc = &rs->sc_dksc;
    864 
    865 	part = DISKPART(dev);
    866 	pmask = (1 << part);
    867 
    868 	if (!DK_BUSY(dksc, pmask) &&
    869 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    870 		/* First one... mark things as dirty... Note that we *MUST*
    871 		 have done a configure before this.  I DO NOT WANT TO BE
    872 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    873 		 THAT THEY BELONG TOGETHER!!!!! */
    874 		/* XXX should check to see if we're only open for reading
    875 		   here... If so, we needn't do this, but then need some
    876 		   other way of keeping track of what's happened.. */
    877 
    878 		rf_markalldirty(&rs->sc_r);
    879 	}
    880 
    881 	if ((rs->sc_flags & RAIDF_INITED) != 0)
    882 		error = dk_open(dksc, dev, flags, fmt, l);
    883 
    884 bad:
    885 	raidunlock(rs);
    886 
    887 	return error;
    888 
    889 
    890 }
    891 
    892 static int
    893 raid_lastclose(device_t self)
    894 {
    895 	struct raid_softc *rs = raidsoftc(self);
    896 
    897 	/* Last one... device is not unconfigured yet.
    898 	   Device shutdown has taken care of setting the
    899 	   clean bits if RAIDF_INITED is not set
    900 	   mark things as clean... */
    901 
    902 	rf_update_component_labels(&rs->sc_r,
    903 	    RF_FINAL_COMPONENT_UPDATE);
    904 
    905 	/* pass to unlocked code */
    906 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    907 		rs->sc_flags |= RAIDF_DETACH;
    908 
    909 	return 0;
    910 }
    911 
    912 /* ARGSUSED */
    913 static int
    914 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    915 {
    916 	int     unit = raidunit(dev);
    917 	struct raid_softc *rs;
    918 	struct dk_softc *dksc;
    919 	cfdata_t cf;
    920 	int     error = 0, do_detach = 0, do_put = 0;
    921 
    922 	if ((rs = raidget(unit, false)) == NULL)
    923 		return ENXIO;
    924 	dksc = &rs->sc_dksc;
    925 
    926 	if ((error = raidlock(rs)) != 0)
    927 		return error;
    928 
    929 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
    930 		error = dk_close(dksc, dev, flags, fmt, l);
    931 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
    932 			do_detach = 1;
    933 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
    934 		do_put = 1;
    935 
    936 	raidunlock(rs);
    937 
    938 	if (do_detach) {
    939 		/* free the pseudo device attach bits */
    940 		cf = device_cfdata(dksc->sc_dev);
    941 		error = config_detach(dksc->sc_dev, 0);
    942 		if (error == 0)
    943 			free(cf, M_RAIDFRAME);
    944 	} else if (do_put) {
    945 		raidput(rs);
    946 	}
    947 
    948 	return error;
    949 
    950 }
    951 
    952 static void
    953 raid_wakeup(RF_Raid_t *raidPtr)
    954 {
    955 	rf_lock_mutex2(raidPtr->iodone_lock);
    956 	rf_signal_cond2(raidPtr->iodone_cv);
    957 	rf_unlock_mutex2(raidPtr->iodone_lock);
    958 }
    959 
    960 static void
    961 raidstrategy(struct buf *bp)
    962 {
    963 	unsigned int unit;
    964 	struct raid_softc *rs;
    965 	struct dk_softc *dksc;
    966 	RF_Raid_t *raidPtr;
    967 
    968 	unit = raidunit(bp->b_dev);
    969 	if ((rs = raidget(unit, false)) == NULL) {
    970 		bp->b_error = ENXIO;
    971 		goto fail;
    972 	}
    973 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    974 		bp->b_error = ENXIO;
    975 		goto fail;
    976 	}
    977 	dksc = &rs->sc_dksc;
    978 	raidPtr = &rs->sc_r;
    979 
    980 	/* Queue IO only */
    981 	if (dk_strategy_defer(dksc, bp))
    982 		goto done;
    983 
    984 	/* schedule the IO to happen at the next convenient time */
    985 	raid_wakeup(raidPtr);
    986 
    987 done:
    988 	return;
    989 
    990 fail:
    991 	bp->b_resid = bp->b_bcount;
    992 	biodone(bp);
    993 }
    994 
    995 static int
    996 raid_diskstart(device_t dev, struct buf *bp)
    997 {
    998 	struct raid_softc *rs = raidsoftc(dev);
    999 	RF_Raid_t *raidPtr;
   1000 
   1001 	raidPtr = &rs->sc_r;
   1002 	if (!raidPtr->valid) {
   1003 		db1_printf(("raid is not valid..\n"));
   1004 		return ENODEV;
   1005 	}
   1006 
   1007 	/* XXX */
   1008 	bp->b_resid = 0;
   1009 
   1010 	return raiddoaccess(raidPtr, bp);
   1011 }
   1012 
   1013 void
   1014 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
   1015 {
   1016 	struct raid_softc *rs;
   1017 	struct dk_softc *dksc;
   1018 
   1019 	rs = raidPtr->softc;
   1020 	dksc = &rs->sc_dksc;
   1021 
   1022 	dk_done(dksc, bp);
   1023 
   1024 	rf_lock_mutex2(raidPtr->mutex);
   1025 	raidPtr->openings++;
   1026 	rf_unlock_mutex2(raidPtr->mutex);
   1027 
   1028 	/* schedule more IO */
   1029 	raid_wakeup(raidPtr);
   1030 }
   1031 
   1032 /* ARGSUSED */
   1033 static int
   1034 raidread(dev_t dev, struct uio *uio, int flags)
   1035 {
   1036 	int     unit = raidunit(dev);
   1037 	struct raid_softc *rs;
   1038 
   1039 	if ((rs = raidget(unit, false)) == NULL)
   1040 		return ENXIO;
   1041 
   1042 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1043 		return ENXIO;
   1044 
   1045 	return physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
   1046 
   1047 }
   1048 
   1049 /* ARGSUSED */
   1050 static int
   1051 raidwrite(dev_t dev, struct uio *uio, int flags)
   1052 {
   1053 	int     unit = raidunit(dev);
   1054 	struct raid_softc *rs;
   1055 
   1056 	if ((rs = raidget(unit, false)) == NULL)
   1057 		return ENXIO;
   1058 
   1059 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   1060 		return ENXIO;
   1061 
   1062 	return physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio);
   1063 
   1064 }
   1065 
/*
 * Tear down a raid unit.  Caller holds the raid lock ("unlocked"
 * refers to the RAIDframe internals, not the softc lock).
 * Returns EBUSY if the unit is open or a background operation is
 * running, 0 if there was nothing to do or teardown succeeded,
 * otherwise the error from rf_Shutdown().
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse while any partition is open or recon / parity rewrite /
	   copyback is in progress. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	/* Never configured: nothing to tear down. */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
   1103 
/*
 * Return true when `cmd' is an ioctl that requires a configured
 * (RAIDF_INITED) set but this unit is not configured; the caller
 * then rejects the ioctl with ENXIO.  Commands not listed here
 * never require initialization.
 */
static bool
rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
{
	switch (cmd) {
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_SET_ROOT:
		/* These need a configured set: true when not yet inited. */
		return (rs->sc_flags & RAIDF_INITED) == 0;
	}
	return false;
}
   1141 
/*
 * Administratively fail the component in column rr->col and start a
 * reconstruction thread.  Returns EINVAL when the request makes no
 * sense for the current array state (RAID 0, bad column, recon already
 * running, other failures present, or a spared disk), ENOMEM when the
 * request copy cannot be allocated, otherwise the thread-creation
 * result.
 */
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	/* State checks below are made under the array mutex; every
	   rejected case leaves via "out", which drops the mutex. */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}
   1190 
   1191 static int
   1192 rf_copyinspecificbuf(RF_Config_t *k_cfg)
   1193 {
   1194 	/* allocate a buffer for the layout-specific data, and copy it in */
   1195 	if (k_cfg->layoutSpecificSize == 0)
   1196 		return 0;
   1197 
   1198 	if (k_cfg->layoutSpecificSize > 10000) {
   1199 	    /* sanity check */
   1200 	    return EINVAL;
   1201 	}
   1202 
   1203 	u_char *specific_buf;
   1204 	specific_buf =  RF_Malloc(k_cfg->layoutSpecificSize);
   1205 	if (specific_buf == NULL)
   1206 		return ENOMEM;
   1207 
   1208 	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1209 	    k_cfg->layoutSpecificSize);
   1210 	if (retcode) {
   1211 		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1212 		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
   1213 		return retcode;
   1214 	}
   1215 
   1216 	k_cfg->layoutSpecific = specific_buf;
   1217 	return 0;
   1218 }
   1219 
   1220 static int
   1221 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
   1222 {
   1223 	RF_Config_t *u_cfg = *((RF_Config_t **) data);
   1224 
   1225 	if (rs->sc_r.valid) {
   1226 		/* There is a valid RAID set running on this unit! */
   1227 		printf("raid%d: Device already configured!\n", rs->sc_unit);
   1228 		return EINVAL;
   1229 	}
   1230 
   1231 	/* copy-in the configuration information */
   1232 	/* data points to a pointer to the configuration structure */
   1233 	*k_cfg = RF_Malloc(sizeof(**k_cfg));
   1234 	if (*k_cfg == NULL) {
   1235 		return ENOMEM;
   1236 	}
   1237 	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
   1238 	if (retcode == 0)
   1239 		return 0;
   1240 	RF_Free(*k_cfg, sizeof(RF_Config_t));
   1241 	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
   1242 	rs->sc_flags |= RAIDF_SHUTDOWN;
   1243 	return retcode;
   1244 }
   1245 
   1246 int
   1247 rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
   1248 {
   1249 	int retcode;
   1250 	RF_Raid_t *raidPtr = &rs->sc_r;
   1251 
   1252 	rs->sc_flags &= ~RAIDF_SHUTDOWN;
   1253 
   1254 	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
   1255 		goto out;
   1256 
   1257 	/* should do some kind of sanity check on the configuration.
   1258 	 * Store the sum of all the bytes in the last byte? */
   1259 
   1260 	/* configure the system */
   1261 
   1262 	/*
   1263 	 * Clear the entire RAID descriptor, just to make sure
   1264 	 *  there is no stale data left in the case of a
   1265 	 *  reconfiguration
   1266 	 */
   1267 	memset(raidPtr, 0, sizeof(*raidPtr));
   1268 	raidPtr->softc = rs;
   1269 	raidPtr->raidid = rs->sc_unit;
   1270 
   1271 	retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1272 
   1273 	if (retcode == 0) {
   1274 		/* allow this many simultaneous IO's to
   1275 		   this RAID device */
   1276 		raidPtr->openings = RAIDOUTSTANDING;
   1277 
   1278 		raidinit(rs);
   1279 		raid_wakeup(raidPtr);
   1280 		rf_markalldirty(raidPtr);
   1281 	}
   1282 
   1283 	/* free the buffers.  No return code here. */
   1284 	if (k_cfg->layoutSpecificSize) {
   1285 		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
   1286 	}
   1287 out:
   1288 	RF_Free(k_cfg, sizeof(RF_Config_t));
   1289 	if (retcode) {
   1290 		/*
   1291 		 * If configuration failed, set sc_flags so that we
   1292 		 * will detach the device when we close it.
   1293 		 */
   1294 		rs->sc_flags |= RAIDF_SHUTDOWN;
   1295 	}
   1296 	return retcode;
   1297 }
   1298 
   1299 #if RF_DISABLED
/*
 * Overwrite the in-core component label for clabel->column with the
 * caller-supplied label and flush it to disk.  This code is compiled
 * out (RF_DISABLED) and kept for reference; users are expected to
 * re-init labels rather than patch them.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	/* Only one row is supported. */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
   1336 #endif
   1337 
   1338 static int
   1339 rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   1340 {
   1341 	/*
   1342 	   we only want the serial number from
   1343 	   the above.  We get all the rest of the information
   1344 	   from the config that was used to create this RAID
   1345 	   set.
   1346 	   */
   1347 
   1348 	raidPtr->serial_number = clabel->serial_number;
   1349 
   1350 	for (int column = 0; column < raidPtr->numCol; column++) {
   1351 		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
   1352 		if (RF_DEAD_DISK(diskPtr->status))
   1353 			continue;
   1354 		RF_ComponentLabel_t *ci_label = raidget_component_label(
   1355 		    raidPtr, column);
   1356 		/* Zeroing this is important. */
   1357 		memset(ci_label, 0, sizeof(*ci_label));
   1358 		raid_init_component_label(raidPtr, ci_label);
   1359 		ci_label->serial_number = raidPtr->serial_number;
   1360 		ci_label->row = 0; /* we dont' pretend to support more */
   1361 		rf_component_label_set_partitionsize(ci_label,
   1362 		    diskPtr->partitionSize);
   1363 		ci_label->column = column;
   1364 		raidflush_component_label(raidPtr, column);
   1365 		/* XXXjld what about the spares? */
   1366 	}
   1367 
   1368 	return 0;
   1369 }
   1370 
/*
 * Rebuild the component described by `componentPtr' in place: spawn
 * rf_ReconstructInPlaceThread to reconstruct onto the same disk slot.
 * Returns EINVAL for RAID 0, a bad column, or an array state that
 * makes the rebuild unsafe; ENOMEM when the request record cannot be
 * allocated; otherwise the thread-creation result.
 */
static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	/* Work on a local copy so we never re-read the user's buffer. */
	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	/* Status checks are made under the array mutex; every rejected
	   case must drop it before returning. */
	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:     Col: %d   Too many failures.\n",
		       raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:    Col: %d   "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	/* NOTE(review): rrint->flags is never set here (unlike
	   rf_fail_disk) -- presumably RF_Malloc zeroes; confirm. */
	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}
   1438 
   1439 static int
   1440 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
   1441 {
   1442 	/*
   1443 	 * This makes no sense on a RAID 0, or if we are not reconstructing
   1444 	 * so tell the user it's done.
   1445 	 */
   1446 	if (raidPtr->Layout.map->faultsTolerated == 0 ||
   1447 	    raidPtr->status != rf_rs_reconstructing) {
   1448 		*data = 100;
   1449 		return 0;
   1450 	}
   1451 	if (raidPtr->reconControl->numRUsTotal == 0) {
   1452 		*data = 0;
   1453 		return 0;
   1454 	}
   1455 	*data = (raidPtr->reconControl->numRUsComplete * 100
   1456 	    / raidPtr->reconControl->numRUsTotal);
   1457 	return 0;
   1458 }
   1459 
   1460 static int
   1461 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
   1462 {
   1463 	int     unit = raidunit(dev);
   1464 	int     part, pmask;
   1465 	struct raid_softc *rs;
   1466 	struct dk_softc *dksc;
   1467 	RF_Config_t *k_cfg;
   1468 	RF_Raid_t *raidPtr;
   1469 	RF_AccTotals_t *totals;
   1470 	RF_SingleComponent_t component;
   1471 	RF_DeviceConfig_t *d_cfg, *ucfgp;
   1472 	int retcode = 0;
   1473 	int column;
   1474 	RF_ComponentLabel_t *clabel;
   1475 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1476 	int d;
   1477 
   1478 	if ((rs = raidget(unit, false)) == NULL)
   1479 		return ENXIO;
   1480 
   1481 	dksc = &rs->sc_dksc;
   1482 	raidPtr = &rs->sc_r;
   1483 
   1484 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1485 	    (int) DISKPART(dev), (int) unit, cmd));
   1486 
   1487 	/* Must be initialized for these... */
   1488 	if (rf_must_be_initialized(rs, cmd))
   1489 		return ENXIO;
   1490 
   1491 	switch (cmd) {
   1492 		/* configure the system */
   1493 	case RAIDFRAME_CONFIGURE:
   1494 		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
   1495 			return retcode;
   1496 		return rf_construct(rs, k_cfg);
   1497 
   1498 		/* shutdown the system */
   1499 	case RAIDFRAME_SHUTDOWN:
   1500 
   1501 		part = DISKPART(dev);
   1502 		pmask = (1 << part);
   1503 
   1504 		if ((retcode = raidlock(rs)) != 0)
   1505 			return retcode;
   1506 
   1507 		if (DK_BUSY(dksc, pmask) ||
   1508 		    raidPtr->recon_in_progress != 0 ||
   1509 		    raidPtr->parity_rewrite_in_progress != 0 ||
   1510 		    raidPtr->copyback_in_progress != 0)
   1511 			retcode = EBUSY;
   1512 		else {
   1513 			/* detach and free on close */
   1514 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1515 			retcode = 0;
   1516 		}
   1517 
   1518 		raidunlock(rs);
   1519 
   1520 		return retcode;
   1521 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1522 		return rf_get_component_label(raidPtr, data);
   1523 
   1524 #if RF_DISABLED
   1525 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1526 		return rf_set_component_label(raidPtr, data);
   1527 #endif
   1528 
   1529 	case RAIDFRAME_INIT_LABELS:
   1530 		return rf_init_component_label(raidPtr, data);
   1531 
   1532 	case RAIDFRAME_SET_AUTOCONFIG:
   1533 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1534 		printf("raid%d: New autoconfig value is: %d\n",
   1535 		       raidPtr->raidid, d);
   1536 		*(int *) data = d;
   1537 		return retcode;
   1538 
   1539 	case RAIDFRAME_SET_ROOT:
   1540 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1541 		printf("raid%d: New rootpartition value is: %d\n",
   1542 		       raidPtr->raidid, d);
   1543 		*(int *) data = d;
   1544 		return retcode;
   1545 
   1546 		/* initialize all parity */
   1547 	case RAIDFRAME_REWRITEPARITY:
   1548 
   1549 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1550 			/* Parity for RAID 0 is trivially correct */
   1551 			raidPtr->parity_good = RF_RAID_CLEAN;
   1552 			return 0;
   1553 		}
   1554 
   1555 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1556 			/* Re-write is already in progress! */
   1557 			return EINVAL;
   1558 		}
   1559 
   1560 		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1561 		    rf_RewriteParityThread, raidPtr,"raid_parity");
   1562 
   1563 	case RAIDFRAME_ADD_HOT_SPARE:
   1564 		sparePtr = (RF_SingleComponent_t *) data;
   1565 		memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t));
   1566 		return rf_add_hot_spare(raidPtr, &component);
   1567 
   1568 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1569 		return retcode;
   1570 
   1571 	case RAIDFRAME_DELETE_COMPONENT:
   1572 		componentPtr = (RF_SingleComponent_t *)data;
   1573 		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
   1574 		return rf_delete_component(raidPtr, &component);
   1575 
   1576 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1577 		componentPtr = (RF_SingleComponent_t *)data;
   1578 		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
   1579 		return rf_incorporate_hot_spare(raidPtr, &component);
   1580 
   1581 	case RAIDFRAME_REBUILD_IN_PLACE:
   1582 		return rf_rebuild_in_place(raidPtr, data);
   1583 
   1584 	case RAIDFRAME_GET_INFO:
   1585 		ucfgp = *(RF_DeviceConfig_t **)data;
   1586 		d_cfg = RF_Malloc(sizeof(*d_cfg));
   1587 		if (d_cfg == NULL)
   1588 			return ENOMEM;
   1589 		retcode = rf_get_info(raidPtr, d_cfg);
   1590 		if (retcode == 0) {
   1591 			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
   1592 		}
   1593 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1594 		return retcode;
   1595 
   1596 	case RAIDFRAME_CHECK_PARITY:
   1597 		*(int *) data = raidPtr->parity_good;
   1598 		return 0;
   1599 
   1600 	case RAIDFRAME_PARITYMAP_STATUS:
   1601 		if (rf_paritymap_ineligible(raidPtr))
   1602 			return EINVAL;
   1603 		rf_paritymap_status(raidPtr->parity_map, data);
   1604 		return 0;
   1605 
   1606 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1607 		if (rf_paritymap_ineligible(raidPtr))
   1608 			return EINVAL;
   1609 		if (raidPtr->parity_map == NULL)
   1610 			return ENOENT; /* ??? */
   1611 		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
   1612 			return EINVAL;
   1613 		return 0;
   1614 
   1615 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1616 		if (rf_paritymap_ineligible(raidPtr))
   1617 			return EINVAL;
   1618 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1619 		return 0;
   1620 
   1621 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1622 		if (rf_paritymap_ineligible(raidPtr))
   1623 			return EINVAL;
   1624 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1625 		/* XXX should errors be passed up? */
   1626 		return 0;
   1627 
   1628 	case RAIDFRAME_RESCAN:
   1629 		return rf_rescan();
   1630 
   1631 	case RAIDFRAME_RESET_ACCTOTALS:
   1632 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1633 		return 0;
   1634 
   1635 	case RAIDFRAME_GET_ACCTOTALS:
   1636 		totals = (RF_AccTotals_t *) data;
   1637 		*totals = raidPtr->acc_totals;
   1638 		return 0;
   1639 
   1640 	case RAIDFRAME_KEEP_ACCTOTALS:
   1641 		raidPtr->keep_acc_totals = *(int *)data;
   1642 		return 0;
   1643 
   1644 	case RAIDFRAME_GET_SIZE:
   1645 		*(int *) data = raidPtr->totalSectors;
   1646 		return 0;
   1647 
   1648 	case RAIDFRAME_FAIL_DISK:
   1649 		return rf_fail_disk(raidPtr, data);
   1650 
   1651 		/* invoke a copyback operation after recon on whatever disk
   1652 		 * needs it, if any */
   1653 	case RAIDFRAME_COPYBACK:
   1654 
   1655 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1656 			/* This makes no sense on a RAID 0!! */
   1657 			return EINVAL;
   1658 		}
   1659 
   1660 		if (raidPtr->copyback_in_progress == 1) {
   1661 			/* Copyback is already in progress! */
   1662 			return EINVAL;
   1663 		}
   1664 
   1665 		return RF_CREATE_THREAD(raidPtr->copyback_thread,
   1666 		    rf_CopybackThread, raidPtr, "raid_copyback");
   1667 
   1668 		/* return the percentage completion of reconstruction */
   1669 	case RAIDFRAME_CHECK_RECON_STATUS:
   1670 		return rf_check_recon_status(raidPtr, data);
   1671 
   1672 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1673 		rf_check_recon_status_ext(raidPtr, data);
   1674 		return 0;
   1675 
   1676 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1677 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1678 			/* This makes no sense on a RAID 0, so tell the
   1679 			   user it's done. */
   1680 			*(int *) data = 100;
   1681 			return 0;
   1682 		}
   1683 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1684 			*(int *) data = 100 *
   1685 				raidPtr->parity_rewrite_stripes_done /
   1686 				raidPtr->Layout.numStripe;
   1687 		} else {
   1688 			*(int *) data = 100;
   1689 		}
   1690 		return 0;
   1691 
   1692 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1693 		rf_check_parityrewrite_status_ext(raidPtr, data);
   1694 		return 0;
   1695 
   1696 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1697 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1698 			/* This makes no sense on a RAID 0 */
   1699 			*(int *) data = 100;
   1700 			return 0;
   1701 		}
   1702 		if (raidPtr->copyback_in_progress == 1) {
   1703 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1704 				raidPtr->Layout.numStripe;
   1705 		} else {
   1706 			*(int *) data = 100;
   1707 		}
   1708 		return 0;
   1709 
   1710 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1711 		rf_check_copyback_status_ext(raidPtr, data);
   1712 		return 0;
   1713 
   1714 	case RAIDFRAME_SET_LAST_UNIT:
   1715 		for (column = 0; column < raidPtr->numCol; column++)
   1716 			if (raidPtr->Disks[column].status != rf_ds_optimal)
   1717 				return EBUSY;
   1718 
   1719 		for (column = 0; column < raidPtr->numCol; column++) {
   1720 			clabel = raidget_component_label(raidPtr, column);
   1721 			clabel->last_unit = *(int *)data;
   1722 			raidflush_component_label(raidPtr, column);
   1723 		}
   1724 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
   1725 		return 0;
   1726 
   1727 		/* the sparetable daemon calls this to wait for the kernel to
   1728 		 * need a spare table. this ioctl does not return until a
   1729 		 * spare table is needed. XXX -- calling mpsleep here in the
   1730 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1731 		 * -- I should either compute the spare table in the kernel,
   1732 		 * or have a different -- XXX XXX -- interface (a different
   1733 		 * character device) for delivering the table     -- XXX */
   1734 #if RF_DISABLED
   1735 	case RAIDFRAME_SPARET_WAIT:
   1736 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1737 		while (!rf_sparet_wait_queue)
   1738 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1739 		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
   1740 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1741 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1742 
   1743 		/* structure assignment */
   1744 		*((RF_SparetWait_t *) data) = *waitreq;
   1745 
   1746 		RF_Free(waitreq, sizeof(*waitreq));
   1747 		return 0;
   1748 
   1749 		/* wakes up a process waiting on SPARET_WAIT and puts an error
   1750 		 * code in it that will cause the dameon to exit */
   1751 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1752 		waitreq = RF_Malloc(sizeof(*waitreq));
   1753 		waitreq->fcol = -1;
   1754 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1755 		waitreq->next = rf_sparet_wait_queue;
   1756 		rf_sparet_wait_queue = waitreq;
   1757 		rf_broadcast_cond2(rf_sparet_wait_cv);
   1758 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1759 		return 0;
   1760 
   1761 		/* used by the spare table daemon to deliver a spare table
   1762 		 * into the kernel */
   1763 	case RAIDFRAME_SEND_SPARET:
   1764 
   1765 		/* install the spare table */
   1766 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1767 
   1768 		/* respond to the requestor.  the return status of the spare
   1769 		 * table installation is passed in the "fcol" field */
   1770 		waitred = RF_Malloc(sizeof(*waitreq));
   1771 		waitreq->fcol = retcode;
   1772 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1773 		waitreq->next = rf_sparet_resp_queue;
   1774 		rf_sparet_resp_queue = waitreq;
   1775 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1776 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1777 
   1778 		return retcode;
   1779 #endif
   1780 	default:
   1781 		/*
   1782 		 * Don't bother trying to load compat modules
   1783 		 * if it is not our ioctl. This is more efficient
   1784 		 * and makes rump tests not depend on compat code
   1785 		 */
   1786 		if (IOCGROUP(cmd) != 'r')
   1787 			break;
   1788 #ifdef _LP64
   1789 		if ((l->l_proc->p_flag & PK_32) != 0) {
   1790 			module_autoload("compat_netbsd32_raid",
   1791 			    MODULE_CLASS_EXEC);
   1792 			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
   1793 			    (rs, cmd, data), enosys(), retcode);
   1794 			if (retcode != EPASSTHROUGH)
   1795 				return retcode;
   1796 		}
   1797 #endif
   1798 		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
   1799 		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
   1800 		    (rs, cmd, data), enosys(), retcode);
   1801 		if (retcode != EPASSTHROUGH)
   1802 			return retcode;
   1803 
   1804 		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
   1805 		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
   1806 		    (rs, cmd, data), enosys(), retcode);
   1807 		if (retcode != EPASSTHROUGH)
   1808 			return retcode;
   1809 		break; /* fall through to the os-specific code below */
   1810 
   1811 	}
   1812 
   1813 	if (!raidPtr->valid)
   1814 		return EINVAL;
   1815 
   1816 	/*
   1817 	 * Add support for "regular" device ioctls here.
   1818 	 */
   1819 
   1820 	switch (cmd) {
   1821 	case DIOCGCACHE:
   1822 		retcode = rf_get_component_caches(raidPtr, (int *)data);
   1823 		break;
   1824 
   1825 	case DIOCCACHESYNC:
   1826 		retcode = rf_sync_component_caches(raidPtr, *(int *)data);
   1827 		break;
   1828 
   1829 	default:
   1830 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
   1831 		break;
   1832 	}
   1833 
   1834 	return retcode;
   1835 
   1836 }
   1837 
   1838 
   1839 /* raidinit -- complete the rest of the initialization for the
   1840    RAIDframe device.  */
   1841 
   1842 
/*
 * Complete bringing up a configured RAIDframe unit: attach the
 * pseudo-device, hook it into the dk(4)/disk(9) frameworks, and mark
 * the softc as ready for I/O.  On attach failure the unit is left
 * without RAIDF_INITED set.
 */
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* Discover wedges on this new unit. */
	dkwedge_discover(&dksc->sc_dkdev);
}
   1898 
   1899 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1900 /* wake up the daemon & tell it to get us a spare table
   1901  * XXX
   1902  * the entries in the queues should be tagged with the raidPtr
   1903  * so that in the extremely rare case that two recons happen at once,
   1904  * we know for which device were requesting a spare table
   1905  * XXX
   1906  *
   1907  * XXX This code is not currently used. GO
   1908  */
/*
 * Post a spare-table request on the daemon's wait queue, sleep until a
 * response appears, and return the response's fcol value.  The response
 * object (a different allocation than the request we queued) is freed
 * here.
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	/* wake the daemon so it sees the new request */
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* pop the response off the head of the response queue */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return retcode;
}
   1932 #endif
   1933 
   1934 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   1935  * bp & passes it down.
   1936  * any calls originating in the kernel must use non-blocking I/O
   1937  * do some extra sanity checking to return "appropriate" error values for
   1938  * certain conditions (to make some standard utilities work)
   1939  *
   1940  * Formerly known as: rf_DoAccessKernel
   1941  */
   1942 void
   1943 raidstart(RF_Raid_t *raidPtr)
   1944 {
   1945 	struct raid_softc *rs;
   1946 	struct dk_softc *dksc;
   1947 
   1948 	rs = raidPtr->softc;
   1949 	dksc = &rs->sc_dksc;
   1950 	/* quick check to see if anything has died recently */
   1951 	rf_lock_mutex2(raidPtr->mutex);
   1952 	if (raidPtr->numNewFailures > 0) {
   1953 		rf_unlock_mutex2(raidPtr->mutex);
   1954 		rf_update_component_labels(raidPtr,
   1955 					   RF_NORMAL_COMPONENT_UPDATE);
   1956 		rf_lock_mutex2(raidPtr->mutex);
   1957 		raidPtr->numNewFailures--;
   1958 	}
   1959 	rf_unlock_mutex2(raidPtr->mutex);
   1960 
   1961 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
   1962 		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
   1963 		return;
   1964 	}
   1965 
   1966 	dk_start(dksc, NULL);
   1967 }
   1968 
   1969 static int
   1970 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
   1971 {
   1972 	RF_SectorCount_t num_blocks, pb, sum;
   1973 	RF_RaidAddr_t raid_addr;
   1974 	daddr_t blocknum;
   1975 	int rc;
   1976 
   1977 	rf_lock_mutex2(raidPtr->mutex);
   1978 	if (raidPtr->openings == 0) {
   1979 		rf_unlock_mutex2(raidPtr->mutex);
   1980 		return EAGAIN;
   1981 	}
   1982 	rf_unlock_mutex2(raidPtr->mutex);
   1983 
   1984 	blocknum = bp->b_rawblkno;
   1985 
   1986 	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   1987 		    (int) blocknum));
   1988 
   1989 	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   1990 	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   1991 
   1992 	/* *THIS* is where we adjust what block we're going to...
   1993 	 * but DO NOT TOUCH bp->b_blkno!!! */
   1994 	raid_addr = blocknum;
   1995 
   1996 	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   1997 	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   1998 	sum = raid_addr + num_blocks + pb;
   1999 	if (1 || rf_debugKernelAccess) {
   2000 		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   2001 			    (int) raid_addr, (int) sum, (int) num_blocks,
   2002 			    (int) pb, (int) bp->b_resid));
   2003 	}
   2004 	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   2005 	    || (sum < num_blocks) || (sum < pb)) {
   2006 		rc = ENOSPC;
   2007 		goto done;
   2008 	}
   2009 	/*
   2010 	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   2011 	 */
   2012 
   2013 	if (bp->b_bcount & raidPtr->sectorMask) {
   2014 		rc = ENOSPC;
   2015 		goto done;
   2016 	}
   2017 	db1_printf(("Calling DoAccess..\n"));
   2018 
   2019 
   2020 	rf_lock_mutex2(raidPtr->mutex);
   2021 	raidPtr->openings--;
   2022 	rf_unlock_mutex2(raidPtr->mutex);
   2023 
   2024 	/* don't ever condition on bp->b_flags & B_WRITE.
   2025 	 * always condition on B_READ instead */
   2026 
   2027 	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   2028 			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   2029 			 raid_addr, num_blocks,
   2030 			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   2031 
   2032 done:
   2033 	return rc;
   2034 }
   2035 
   2036 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2037 
/*
 * Dispatch one queued RAIDframe disk request to the underlying
 * component device.  Called with the disk queue locked; the lock is
 * dropped around bdev_strategy() since that can block.  NOP requests
 * complete immediately via KernelWakeupFunc().  Always returns 0.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	/* remember which queue this request came from, for completion */
	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf("%s: WAKEUP CALLED\n", __func__);
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* complete the NOP synchronously */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* set up the buf for the transfer to the component device */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		/* account for the in-flight request on this queue */
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return 0;
}
   2111 /* this is the callback function associated with a I/O invoked from
   2112    kernel code.
   2113  */
/*
 * Completion callback (b_iodone) for component I/O issued by
 * rf_DispatchKernelIO().  Runs under the raid's iodone_lock: records
 * the error, optionally marks the failing component dead, queues the
 * request on the "finished" list, and wakes the raidio thread.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* the request was stashed in b_private by InitBP()/dispatch */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* raidstart() notices this and refreshes labels */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
   2180 
   2181 
   2182 /*
   2183  * initialize a buf structure for doing an I/O in the kernel.
   2184  */
   2185 static void
   2186 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2187        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2188        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
   2189 {
   2190 	bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
   2191 	bp->b_oflags = 0;
   2192 	bp->b_cflags = 0;
   2193 	bp->b_bcount = numSect << logBytesPerSector;
   2194 	bp->b_bufsize = bp->b_bcount;
   2195 	bp->b_error = 0;
   2196 	bp->b_dev = dev;
   2197 	bp->b_data = bf;
   2198 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2199 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2200 	if (bp->b_bcount == 0) {
   2201 		panic("bp->b_bcount is zero in InitBP!!");
   2202 	}
   2203 	bp->b_iodone = cbFunc;
   2204 	bp->b_private = cbArg;
   2205 }
   2206 
   2207 /*
   2208  * Wait interruptibly for an exclusive lock.
   2209  *
   2210  * XXX
   2211  * Several drivers do this; it should be abstracted and made MP-safe.
   2212  * (Hmm... where have we seen this warning before :->  GO )
   2213  */
   2214 static int
   2215 raidlock(struct raid_softc *rs)
   2216 {
   2217 	int     error;
   2218 
   2219 	error = 0;
   2220 	mutex_enter(&rs->sc_mutex);
   2221 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2222 		rs->sc_flags |= RAIDF_WANTED;
   2223 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
   2224 		if (error != 0)
   2225 			goto done;
   2226 	}
   2227 	rs->sc_flags |= RAIDF_LOCKED;
   2228 done:
   2229 	mutex_exit(&rs->sc_mutex);
   2230 	return error;
   2231 }
   2232 /*
   2233  * Unlock and wake up any waiters.
   2234  */
   2235 static void
   2236 raidunlock(struct raid_softc *rs)
   2237 {
   2238 
   2239 	mutex_enter(&rs->sc_mutex);
   2240 	rs->sc_flags &= ~RAIDF_LOCKED;
   2241 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2242 		rs->sc_flags &= ~RAIDF_WANTED;
   2243 		cv_broadcast(&rs->sc_cv);
   2244 	}
   2245 	mutex_exit(&rs->sc_mutex);
   2246 }
   2247 
   2248 
   2249 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2250 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2251 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2252 
/*
 * Byte offset from the start of a component at which the component
 * label (component info) area is stored.
 */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}
   2259 
   2260 static daddr_t
   2261 rf_component_info_size(unsigned secsize)
   2262 {
   2263 	daddr_t info_size;
   2264 
   2265 	KASSERT(secsize);
   2266 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2267 		info_size = secsize;
   2268 	else
   2269 		info_size = RF_COMPONENT_INFO_SIZE;
   2270 
   2271 	return info_size;
   2272 }
   2273 
   2274 static daddr_t
   2275 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2276 {
   2277 	daddr_t map_offset;
   2278 
   2279 	KASSERT(raidPtr->bytesPerSector);
   2280 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2281 		map_offset = raidPtr->bytesPerSector;
   2282 	else
   2283 		map_offset = RF_COMPONENT_INFO_SIZE;
   2284 	map_offset += rf_component_info_offset();
   2285 
   2286 	return map_offset;
   2287 }
   2288 
   2289 static daddr_t
   2290 rf_parity_map_size(RF_Raid_t *raidPtr)
   2291 {
   2292 	daddr_t map_size;
   2293 
   2294 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2295 		map_size = raidPtr->bytesPerSector;
   2296 	else
   2297 		map_size = RF_PARITY_MAP_SIZE;
   2298 
   2299 	return map_size;
   2300 }
   2301 
   2302 int
   2303 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2304 {
   2305 	RF_ComponentLabel_t *clabel;
   2306 
   2307 	clabel = raidget_component_label(raidPtr, col);
   2308 	clabel->clean = RF_RAID_CLEAN;
   2309 	raidflush_component_label(raidPtr, col);
   2310 	return(0);
   2311 }
   2312 
   2313 
   2314 int
   2315 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2316 {
   2317 	RF_ComponentLabel_t *clabel;
   2318 
   2319 	clabel = raidget_component_label(raidPtr, col);
   2320 	clabel->clean = RF_RAID_DIRTY;
   2321 	raidflush_component_label(raidPtr, col);
   2322 	return(0);
   2323 }
   2324 
/*
 * Re-read the on-disk component label for column `col' into the
 * in-core copy (raid_cinfo[col].ci_label).  Returns 0 on success or
 * an error from raidread_component_label().
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);

	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
   2335 
/*
 * Return a pointer to the in-core (cached) component label for column
 * `col'.  Does not read from disk; use raidfetch_component_label()
 * to refresh the cache.
 */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
   2341 
   2342 int
   2343 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2344 {
   2345 	RF_ComponentLabel_t *label;
   2346 
   2347 	label = &raidPtr->raid_cinfo[col].ci_label;
   2348 	label->mod_counter = raidPtr->mod_counter;
   2349 #ifndef RF_NO_PARITY_MAP
   2350 	label->parity_map_modcount = label->mod_counter;
   2351 #endif
   2352 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2353 	    raidPtr->Disks[col].dev,
   2354 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2355 }
   2356 
   2357 /*
   2358  * Swap the label endianness.
   2359  *
   2360  * Everything in the component label is 4-byte-swapped except the version,
   2361  * which is kept in the byte-swapped version at all times, and indicates
   2362  * for the writer that a swap is necessary.
   2363  *
   2364  * For reads it is expected that out_label == clabel, but writes expect
   2365  * separate labels so only the re-swapped label is written out to disk,
   2366  * leaving the swapped-except-version internally.
   2367  *
   2368  * Only support swapping label version 2.
   2369  */
static void
rf_swap_label(RF_ComponentLabel_t *clabel, RF_ComponentLabel_t *out_label)
{
	int	*in, *out, *in_last;

	/* Only version-2 (byte-swapped) labels may be swapped. */
	KASSERT(clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION));

	/* Don't swap the label, but do copy it. */
	out_label->version = clabel->version;

	/* Swap every 32-bit word from serial_number up to (but not
	 * including) future_use2[42].
	 * NOTE(review): the [42] bound is tied to the layout of
	 * RF_ComponentLabel_t -- confirm it still marks the end of the
	 * swappable region if the label structure ever changes. */
	in = &clabel->serial_number;
	in_last = &clabel->future_use2[42];
	out = &out_label->serial_number;

	for (; in < in_last; in++, out++)
		*out = bswap32(*in);
}
   2387 
   2388 static int
   2389 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2390     RF_ComponentLabel_t *clabel)
   2391 {
   2392 	int error;
   2393 
   2394 	error = raidread_component_area(dev, b_vp, clabel,
   2395 	    sizeof(RF_ComponentLabel_t),
   2396 	    rf_component_info_offset(),
   2397 	    rf_component_info_size(secsize));
   2398 
   2399 	if (error == 0 &&
   2400 	    clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
   2401 		rf_swap_label(clabel, clabel);
   2402 	}
   2403 
   2404 	return error;
   2405 }
   2406 
   2407 /* ARGSUSED */
/* ARGSUSED */
/*
 * Read `dsize' bytes from disk offset `offset' on `dev' and copy the
 * first `msize' bytes into `data'.  Returns 0 on success, EINVAL for
 * a component with no vnode, or the error from biowait().
 * NOTE(review): assumes msize <= dsize -- callers pass a struct size
 * no larger than the on-disk area; confirm for any new caller.
 */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	/* issue the read synchronously and wait for completion */
	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	/* release the temporary buffer in all cases */
	brelse(bp, 0);
	return(error);
}
   2444 
   2445 static int
   2446 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2447     RF_ComponentLabel_t *clabel)
   2448 {
   2449 	RF_ComponentLabel_t *clabel_write = clabel;
   2450 	RF_ComponentLabel_t lclabel;
   2451 	int error;
   2452 
   2453 	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
   2454 		clabel_write = &lclabel;
   2455 		rf_swap_label(clabel, clabel_write);
   2456 	}
   2457 	error = raidwrite_component_area(dev, b_vp, clabel_write,
   2458 	    sizeof(RF_ComponentLabel_t),
   2459 	    rf_component_info_offset(),
   2460 	    rf_component_info_size(secsize), 0);
   2461 
   2462 	return error;
   2463 }
   2464 
   2465 /* ARGSUSED */
/* ARGSUSED */
/*
 * Write `msize' bytes from `data' (zero-padded up to `dsize') to disk
 * offset `offset' on `dev'.  With asyncp set the write is issued
 * B_ASYNC and 0 is returned immediately; otherwise the result of
 * biowait() is returned.
 * NOTE(review): on the async path the buffer is not brelse'd here --
 * presumably the async completion path releases it; confirm.
 */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
 	bp->b_resid = dsize;

	/* pad the tail of the area with zeros */
	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}
   2499 
   2500 void
   2501 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2502 {
   2503 	int c;
   2504 
   2505 	for (c = 0; c < raidPtr->numCol; c++) {
   2506 		/* Skip dead disks. */
   2507 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2508 			continue;
   2509 		/* XXXjld: what if an error occurs here? */
   2510 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2511 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2512 		    RF_PARITYMAP_NBYTE,
   2513 		    rf_parity_map_offset(raidPtr),
   2514 		    rf_parity_map_size(raidPtr), 0);
   2515 	}
   2516 }
   2517 
   2518 void
   2519 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2520 {
   2521 	struct rf_paritymap_ondisk tmp;
   2522 	int c,first;
   2523 
   2524 	first=1;
   2525 	for (c = 0; c < raidPtr->numCol; c++) {
   2526 		/* Skip dead disks. */
   2527 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2528 			continue;
   2529 		raidread_component_area(raidPtr->Disks[c].dev,
   2530 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2531 		    RF_PARITYMAP_NBYTE,
   2532 		    rf_parity_map_offset(raidPtr),
   2533 		    rf_parity_map_size(raidPtr));
   2534 		if (first) {
   2535 			memcpy(map, &tmp, sizeof(*map));
   2536 			first = 0;
   2537 		} else {
   2538 			rf_paritymap_merge(map, &tmp);
   2539 		}
   2540 	}
   2541 }
   2542 
/*
 * Bump the array's modification counter and mark every usable
 * component (and every in-use spare) dirty on disk.  Spared
 * components are left alone.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
   2602 
   2603 
/*
 * Push fresh component labels out to every optimal component and every
 * in-use spare, bumping the modification counter first.  When `final'
 * is RF_FINAL_COMPONENT_UPDATE and parity is known good, components
 * are additionally marked clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare replaced */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
   2681 
   2682 void
   2683 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2684 {
   2685 
   2686 	if (vp != NULL) {
   2687 		if (auto_configured == 1) {
   2688 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2689 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2690 			vput(vp);
   2691 
   2692 		} else {
   2693 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2694 		}
   2695 	}
   2696 }
   2697 
   2698 
   2699 void
   2700 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2701 {
   2702 	int r,c;
   2703 	struct vnode *vp;
   2704 	int acd;
   2705 
   2706 
   2707 	/* We take this opportunity to close the vnodes like we should.. */
   2708 
   2709 	for (c = 0; c < raidPtr->numCol; c++) {
   2710 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2711 		acd = raidPtr->Disks[c].auto_configured;
   2712 		rf_close_component(raidPtr, vp, acd);
   2713 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2714 		raidPtr->Disks[c].auto_configured = 0;
   2715 	}
   2716 
   2717 	for (r = 0; r < raidPtr->numSpare; r++) {
   2718 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2719 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2720 		rf_close_component(raidPtr, vp, acd);
   2721 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2722 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2723 	}
   2724 }
   2725 
   2726 
/*
 * Kernel-thread body that fails a component (optionally reconstructing
 * to a spare, per RF_FDFLAGS_RECON) on behalf of an ioctl request.
 * Frees `req' and exits via kthread_exit(); never returns.
 */
static void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* force-recon is only in effect for the duration of the call */
	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 1;
	}

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 0;
	}

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2756 
/*
 * Kernel-thread body that rewrites all parity on the array.  On
 * success the array's parity is marked good; a shutdown waiter (if
 * any) is notified before the thread exits.  Never returns.
 */
static void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2789 
   2790 
/*
 * Kernel-thread body that copies reconstructed data from spares back
 * to replaced components, then exits.  Never returns.
 */
static void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2805 
   2806 
/*
 * Kernel-thread body that reconstructs a component in place (back onto
 * the same column, e.g. after a disk replacement).  Frees `req' and
 * exits via kthread_exit(); never returns.
 */
static void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* force-recon is only in effect for the duration of the call */
	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 1;
	}

	rf_ReconstructInPlace(raidPtr, req->col);

	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 0;
	}

	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
   2834 
/*
 * Examine one candidate device during autoconfiguration: read its
 * component label and, if the label looks reasonable and fits the
 * device, prepend an RF_AutoConfig_t for it to `ac_list' (the entry
 * takes ownership of `vp' and the label allocation).  Otherwise the
 * label is freed and the vnode closed and released.  Returns the
 * (possibly extended) list head.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_WAITOK);
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: release the label memory and the vnode we
		   were handed, since no list entry took ownership */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
   2876 
/*
 * Scan every disk-class device in the system for RAIDframe components
 * and return a linked list of RF_AutoConfig_t describing the ones
 * found (NULL if none).  Two passes are made: wedges ("dk") first,
 * then all other disks, so that a wedge covering a whole disk is
 * preferred over that disk's raw partition.
 */
static RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CDs. */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md. */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* we don't care about spiflash */
			if (device_is_a(dv, "spiflash")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component() takes ownership of the
				   (now unlocked) vnode: it keeps it on
				   success and closes/releases it on
				   failure. */
				VOP_UNLOCK(vp);
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Not quite a 'whatever'.  In
					 * this situation we know
					 * there is a FS_RAID
					 * partition, but we can't
					 * open it.  The most likely
					 * reason is that the
					 * partition is already in
					 * use by another RAID set.
					 * So note that we've already
					 * found a partition on this
					 * disk so we don't attempt
					 * to use the raw disk later. */
					rf_part_found = 1;
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
   3100 
   3101 int
   3102 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3103 {
   3104 
   3105 	if ((clabel->version==RF_COMPONENT_LABEL_VERSION_1 ||
   3106 	     clabel->version==RF_COMPONENT_LABEL_VERSION ||
   3107 	     clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) &&
   3108 	    (clabel->clean == RF_RAID_CLEAN ||
   3109 	     clabel->clean == RF_RAID_DIRTY) &&
   3110 	    clabel->row >=0 &&
   3111 	    clabel->column >= 0 &&
   3112 	    clabel->num_rows > 0 &&
   3113 	    clabel->num_columns > 0 &&
   3114 	    clabel->row < clabel->num_rows &&
   3115 	    clabel->column < clabel->num_columns &&
   3116 	    clabel->blockSize > 0 &&
   3117 	    /*
   3118 	     * numBlocksHi may contain garbage, but it is ok since
   3119 	     * the type is unsigned.  If it is really garbage,
   3120 	     * rf_fix_old_label_size() will fix it.
   3121 	     */
   3122 	    rf_component_label_numblocks(clabel) > 0) {
   3123 		/*
   3124 		 * label looks reasonable enough...
   3125 		 * let's make sure it has no old garbage.
   3126 		 */
   3127 		if (numsecs)
   3128 			rf_fix_old_label_size(clabel, numsecs);
   3129 		return(1);
   3130 	}
   3131 	return(0);
   3132 }
   3133 
   3134 
   3135 /*
   3136  * For reasons yet unknown, some old component labels have garbage in
   3137  * the newer numBlocksHi region, and this causes lossage.  Since those
   3138  * disks will also have numsecs set to less than 32 bits of sectors,
   3139  * we can determine when this corruption has occurred, and fix it.
   3140  *
   3141  * The exact same problem, with the same unknown reason, happens to
   3142  * the partitionSizeHi member as well.
   3143  */
   3144 static void
   3145 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3146 {
   3147 
   3148 	if (numsecs < ((uint64_t)1 << 32)) {
   3149 		if (clabel->numBlocksHi) {
   3150 			printf("WARNING: total sectors < 32 bits, yet "
   3151 			       "numBlocksHi set\n"
   3152 			       "WARNING: resetting numBlocksHi to zero.\n");
   3153 			clabel->numBlocksHi = 0;
   3154 		}
   3155 
   3156 		if (clabel->partitionSizeHi) {
   3157 			printf("WARNING: total sectors < 32 bits, yet "
   3158 			       "partitionSizeHi set\n"
   3159 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3160 			clabel->partitionSizeHi = 0;
   3161 		}
   3162 	}
   3163 }
   3164 
   3165 
#ifdef DEBUG
/* Dump the interesting fields of a component label to the console. */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};
	uint64_t numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif
}
#endif
   3199 
   3200 static RF_ConfigSet_t *
   3201 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3202 {
   3203 	RF_AutoConfig_t *ac;
   3204 	RF_ConfigSet_t *config_sets;
   3205 	RF_ConfigSet_t *cset;
   3206 	RF_AutoConfig_t *ac_next;
   3207 
   3208 
   3209 	config_sets = NULL;
   3210 
   3211 	/* Go through the AutoConfig list, and figure out which components
   3212 	   belong to what sets.  */
   3213 	ac = ac_list;
   3214 	while(ac!=NULL) {
   3215 		/* we're going to putz with ac->next, so save it here
   3216 		   for use at the end of the loop */
   3217 		ac_next = ac->next;
   3218 
   3219 		if (config_sets == NULL) {
   3220 			/* will need at least this one... */
   3221 			config_sets = malloc(sizeof(RF_ConfigSet_t),
   3222 				       M_RAIDFRAME, M_WAITOK);
   3223 			/* this one is easy :) */
   3224 			config_sets->ac = ac;
   3225 			config_sets->next = NULL;
   3226 			config_sets->rootable = 0;
   3227 			ac->next = NULL;
   3228 		} else {
   3229 			/* which set does this component fit into? */
   3230 			cset = config_sets;
   3231 			while(cset!=NULL) {
   3232 				if (rf_does_it_fit(cset, ac)) {
   3233 					/* looks like it matches... */
   3234 					ac->next = cset->ac;
   3235 					cset->ac = ac;
   3236 					break;
   3237 				}
   3238 				cset = cset->next;
   3239 			}
   3240 			if (cset==NULL) {
   3241 				/* didn't find a match above... new set..*/
   3242 				cset = malloc(sizeof(RF_ConfigSet_t),
   3243 					       M_RAIDFRAME, M_WAITOK);
   3244 				cset->ac = ac;
   3245 				ac->next = NULL;
   3246 				cset->next = config_sets;
   3247 				cset->rootable = 0;
   3248 				config_sets = cset;
   3249 			}
   3250 		}
   3251 		ac = ac_next;
   3252 	}
   3253 
   3254 
   3255 	return(config_sets);
   3256 }
   3257 
   3258 static int
   3259 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3260 {
   3261 	RF_ComponentLabel_t *clabel1, *clabel2;
   3262 
   3263 	/* If this one matches the *first* one in the set, that's good
   3264 	   enough, since the other members of the set would have been
   3265 	   through here too... */
   3266 	/* note that we are not checking partitionSize here..
   3267 
   3268 	   Note that we are also not checking the mod_counters here.
   3269 	   If everything else matches except the mod_counter, that's
   3270 	   good enough for this test.  We will deal with the mod_counters
   3271 	   a little later in the autoconfiguration process.
   3272 
   3273 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3274 
   3275 	   The reason we don't check for this is that failed disks
   3276 	   will have lower modification counts.  If those disks are
   3277 	   not added to the set they used to belong to, then they will
   3278 	   form their own set, which may result in 2 different sets,
   3279 	   for example, competing to be configured at raid0, and
   3280 	   perhaps competing to be the root filesystem set.  If the
   3281 	   wrong ones get configured, or both attempt to become /,
   3282 	   weird behaviour and or serious lossage will occur.  Thus we
   3283 	   need to bring them into the fold here, and kick them out at
   3284 	   a later point.
   3285 
   3286 	*/
   3287 
   3288 	clabel1 = cset->ac->clabel;
   3289 	clabel2 = ac->clabel;
   3290 	if ((clabel1->version == clabel2->version) &&
   3291 	    (clabel1->serial_number == clabel2->serial_number) &&
   3292 	    (clabel1->num_rows == clabel2->num_rows) &&
   3293 	    (clabel1->num_columns == clabel2->num_columns) &&
   3294 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3295 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3296 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3297 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3298 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3299 	    (clabel1->blockSize == clabel2->blockSize) &&
   3300 	    rf_component_label_numblocks(clabel1) ==
   3301 	    rf_component_label_numblocks(clabel2) &&
   3302 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3303 	    (clabel1->root_partition == clabel2->root_partition) &&
   3304 	    (clabel1->last_unit == clabel2->last_unit) &&
   3305 	    (clabel1->config_order == clabel2->config_order)) {
   3306 		/* if it get's here, it almost *has* to be a match */
   3307 	} else {
   3308 		/* it's not consistent with somebody in the set..
   3309 		   punt */
   3310 		return(0);
   3311 	}
   3312 	/* all was fine.. it must fit... */
   3313 	return(1);
   3314 }
   3315 
/*
 * Decide whether config set cset has enough live components to be
 * configured.  A component counts as live when its column has a
 * member whose mod_counter equals the set's newest mod_counter.
 * Returns 1 if the set is configurable, 0 if too much is missing.
 */
static int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */
	/* (the maximum mod_counter over all members of the set) */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	/* For each column, look for a member with the current
	   mod_counter; columns with no such member are missing. */
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* Non-mirror levels tolerate at most: RAID 0 - no failures;
	   RAID 4/5 - one failure. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
   3418 
   3419 static void
   3420 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3421 			RF_Raid_t *raidPtr)
   3422 {
   3423 	RF_ComponentLabel_t *clabel;
   3424 	int i;
   3425 
   3426 	clabel = ac->clabel;
   3427 
   3428 	/* 1. Fill in the common stuff */
   3429 	config->numCol = clabel->num_columns;
   3430 	config->numSpare = 0; /* XXX should this be set here? */
   3431 	config->sectPerSU = clabel->sectPerSU;
   3432 	config->SUsPerPU = clabel->SUsPerPU;
   3433 	config->SUsPerRU = clabel->SUsPerRU;
   3434 	config->parityConfig = clabel->parityConfig;
   3435 	/* XXX... */
   3436 	strcpy(config->diskQueueType,"fifo");
   3437 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3438 	config->layoutSpecificSize = 0; /* XXX ?? */
   3439 
   3440 	while(ac!=NULL) {
   3441 		/* row/col values will be in range due to the checks
   3442 		   in reasonable_label() */
   3443 		strcpy(config->devnames[0][ac->clabel->column],
   3444 		       ac->devname);
   3445 		ac = ac->next;
   3446 	}
   3447 
   3448 	for(i=0;i<RF_MAXDBGV;i++) {
   3449 		config->debugVars[i][0] = 0;
   3450 	}
   3451 }
   3452 
   3453 static int
   3454 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3455 {
   3456 	RF_ComponentLabel_t *clabel;
   3457 	int column;
   3458 	int sparecol;
   3459 
   3460 	raidPtr->autoconfigure = new_value;
   3461 
   3462 	for(column=0; column<raidPtr->numCol; column++) {
   3463 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3464 			clabel = raidget_component_label(raidPtr, column);
   3465 			clabel->autoconfigure = new_value;
   3466 			raidflush_component_label(raidPtr, column);
   3467 		}
   3468 	}
   3469 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3470 		sparecol = raidPtr->numCol + column;
   3471 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3472 			clabel = raidget_component_label(raidPtr, sparecol);
   3473 			clabel->autoconfigure = new_value;
   3474 			raidflush_component_label(raidPtr, sparecol);
   3475 		}
   3476 	}
   3477 	return(new_value);
   3478 }
   3479 
   3480 static int
   3481 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3482 {
   3483 	RF_ComponentLabel_t *clabel;
   3484 	int column;
   3485 	int sparecol;
   3486 
   3487 	raidPtr->root_partition = new_value;
   3488 	for(column=0; column<raidPtr->numCol; column++) {
   3489 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3490 			clabel = raidget_component_label(raidPtr, column);
   3491 			clabel->root_partition = new_value;
   3492 			raidflush_component_label(raidPtr, column);
   3493 		}
   3494 	}
   3495 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3496 		sparecol = raidPtr->numCol + column;
   3497 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3498 			clabel = raidget_component_label(raidPtr, sparecol);
   3499 			clabel->root_partition = new_value;
   3500 			raidflush_component_label(raidPtr, sparecol);
   3501 		}
   3502 	}
   3503 	return(new_value);
   3504 }
   3505 
   3506 static void
   3507 rf_release_all_vps(RF_ConfigSet_t *cset)
   3508 {
   3509 	RF_AutoConfig_t *ac;
   3510 
   3511 	ac = cset->ac;
   3512 	while(ac!=NULL) {
   3513 		/* Close the vp, and give it back */
   3514 		if (ac->vp) {
   3515 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3516 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
   3517 			vput(ac->vp);
   3518 			ac->vp = NULL;
   3519 		}
   3520 		ac = ac->next;
   3521 	}
   3522 }
   3523 
   3524 
   3525 static void
   3526 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3527 {
   3528 	RF_AutoConfig_t *ac;
   3529 	RF_AutoConfig_t *next_ac;
   3530 
   3531 	ac = cset->ac;
   3532 	while(ac!=NULL) {
   3533 		next_ac = ac->next;
   3534 		/* nuke the label */
   3535 		free(ac->clabel, M_RAIDFRAME);
   3536 		/* cleanup the config structure */
   3537 		free(ac, M_RAIDFRAME);
   3538 		/* "next.." */
   3539 		ac = next_ac;
   3540 	}
   3541 	/* and, finally, nuke the config set */
   3542 	free(cset, M_RAIDFRAME);
   3543 }
   3544 
   3545 
   3546 void
   3547 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   3548 {
   3549 	/* avoid over-writing byteswapped version. */
   3550 	if (clabel->version != bswap32(RF_COMPONENT_LABEL_VERSION))
   3551 		clabel->version = RF_COMPONENT_LABEL_VERSION;
   3552 	clabel->serial_number = raidPtr->serial_number;
   3553 	clabel->mod_counter = raidPtr->mod_counter;
   3554 
   3555 	clabel->num_rows = 1;
   3556 	clabel->num_columns = raidPtr->numCol;
   3557 	clabel->clean = RF_RAID_DIRTY; /* not clean */
   3558 	clabel->status = rf_ds_optimal; /* "It's good!" */
   3559 
   3560 	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
   3561 	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
   3562 	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
   3563 
   3564 	clabel->blockSize = raidPtr->bytesPerSector;
   3565 	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
   3566 
   3567 	/* XXX not portable */
   3568 	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
   3569 	clabel->maxOutstanding = raidPtr->maxOutstanding;
   3570 	clabel->autoconfigure = raidPtr->autoconfigure;
   3571 	clabel->root_partition = raidPtr->root_partition;
   3572 	clabel->last_unit = raidPtr->raidid;
   3573 	clabel->config_order = raidPtr->config_order;
   3574 
   3575 #ifndef RF_NO_PARITY_MAP
   3576 	rf_paritymap_init_label(raidPtr->parity_map, clabel);
   3577 #endif
   3578 }
   3579 
/*
 * Configure a RAID set from an autoconfig config set: pick a unit
 * number (preferring the last_unit recorded in the component label,
 * falling back to the next free unit), build an RF_Config_t from the
 * labels, and run rf_Configure().  Returns the softc on success or
 * NULL on failure.
 */
static struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	raidID = cset->ac->clabel->last_unit;
	/* walk forward from last_unit until a unit that is not
	   already validly configured is found */
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);
	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed; release the unit again */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
   3651 
   3652 void
   3653 rf_pool_init(RF_Raid_t *raidPtr, char *w_chan, struct pool *p, size_t size, const char *pool_name,
   3654 	     size_t xmin, size_t xmax)
   3655 {
   3656 
   3657 	/* Format: raid%d_foo */
   3658 	snprintf(w_chan, RF_MAX_POOLNAMELEN, "raid%d_%s", raidPtr->raidid, pool_name);
   3659 
   3660 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3661 	pool_sethiwat(p, xmax);
   3662 	pool_prime(p, xmin);
   3663 }
   3664 
   3665 
   3666 /*
   3667  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
   3668  * to see if there is IO pending and if that IO could possibly be done
   3669  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3670  * otherwise.
   3671  *
   3672  */
   3673 int
   3674 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3675 {
   3676 	struct raid_softc *rs;
   3677 	struct dk_softc *dksc;
   3678 
   3679 	rs = raidPtr->softc;
   3680 	dksc = &rs->sc_dksc;
   3681 
   3682 	if ((rs->sc_flags & RAIDF_INITED) == 0)
   3683 		return 1;
   3684 
   3685 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
   3686 		/* there is work to do */
   3687 		return 0;
   3688 	}
   3689 	/* default is nothing to do */
   3690 	return 1;
   3691 }
   3692 
   3693 int
   3694 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3695 {
   3696 	uint64_t numsecs;
   3697 	unsigned secsize;
   3698 	int error;
   3699 
   3700 	error = getdisksize(vp, &numsecs, &secsize);
   3701 	if (error == 0) {
   3702 		diskPtr->blockSize = secsize;
   3703 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3704 		diskPtr->partitionSize = numsecs;
   3705 		return 0;
   3706 	}
   3707 	return error;
   3708 }
   3709 
/* Autoconf match function: the raid pseudo-device always matches. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
   3715 
/* Autoconf attach function: nothing to do at attach time. */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
   3720 
   3721 
   3722 static int
   3723 raid_detach(device_t self, int flags)
   3724 {
   3725 	int error;
   3726 	struct raid_softc *rs = raidsoftc(self);
   3727 
   3728 	if (rs == NULL)
   3729 		return ENXIO;
   3730 
   3731 	if ((error = raidlock(rs)) != 0)
   3732 		return error;
   3733 
   3734 	error = raid_detach_unlocked(rs);
   3735 
   3736 	raidunlock(rs);
   3737 
   3738 	/* XXX raid can be referenced here */
   3739 
   3740 	if (error)
   3741 		return error;
   3742 
   3743 	/* Free the softc */
   3744 	raidput(rs);
   3745 
   3746 	return 0;
   3747 }
   3748 
   3749 static void
   3750 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3751 {
   3752 	struct dk_softc *dksc = &rs->sc_dksc;
   3753 	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
   3754 
   3755 	memset(dg, 0, sizeof(*dg));
   3756 
   3757 	dg->dg_secperunit = raidPtr->totalSectors;
   3758 	dg->dg_secsize = raidPtr->bytesPerSector;
   3759 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   3760 	dg->dg_ntracks = 4 * raidPtr->numCol;
   3761 
   3762 	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
   3763 }
   3764 
   3765 /*
   3766  * Get cache info for all the components (including spares).
   3767  * Returns intersection of all the cache flags of all disks, or first
   3768  * error if any encountered.
   3769  * XXXfua feature flags can change as spares are added - lock down somehow
   3770  */
   3771 static int
   3772 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
   3773 {
   3774 	int c;
   3775 	int error;
   3776 	int dkwhole = 0, dkpart;
   3777 
   3778 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
   3779 		/*
   3780 		 * Check any non-dead disk, even when currently being
   3781 		 * reconstructed.
   3782 		 */
   3783 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
   3784 		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
   3785 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
   3786 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
   3787 			if (error) {
   3788 				if (error != ENODEV) {
   3789 					printf("raid%d: get cache for component %s failed\n",
   3790 					    raidPtr->raidid,
   3791 					    raidPtr->Disks[c].devname);
   3792 				}
   3793 
   3794 				return error;
   3795 			}
   3796 
   3797 			if (c == 0)
   3798 				dkwhole = dkpart;
   3799 			else
   3800 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
   3801 		}
   3802 	}
   3803 
   3804 	*data = dkwhole;
   3805 
   3806 	return 0;
   3807 }
   3808 
   3809 /*
   3810  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3811  * We end up returning whatever error was returned by the first cache flush
   3812  * that fails.
   3813  */
   3814 
   3815 static int
   3816 rf_sync_component_cache(RF_Raid_t *raidPtr, int c, int force)
   3817 {
   3818 	int e = 0;
   3819 	for (int i = 0; i < 5; i++) {
   3820 		e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3821 		    &force, FWRITE, NOCRED);
   3822 		if (!e || e == ENODEV)
   3823 			return e;
   3824 		printf("raid%d: cache flush[%d] to component %s failed (%d)\n",
   3825 		    raidPtr->raidid, i, raidPtr->Disks[c].devname, e);
   3826 	}
   3827 	return e;
   3828 }
   3829 
   3830 int
   3831 rf_sync_component_caches(RF_Raid_t *raidPtr, int force)
   3832 {
   3833 	int c, error;
   3834 
   3835 	error = 0;
   3836 	for (c = 0; c < raidPtr->numCol; c++) {
   3837 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3838 			int e = rf_sync_component_cache(raidPtr, c, force);
   3839 			if (e && !error)
   3840 				error = e;
   3841 		}
   3842 	}
   3843 
   3844 	for (c = 0; c < raidPtr->numSpare ; c++) {
   3845 		int sparecol = raidPtr->numCol + c;
   3846 		/* Need to ensure that the reconstruct actually completed! */
   3847 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3848 			int e = rf_sync_component_cache(raidPtr, sparecol,
   3849 			    force);
   3850 			if (e && !error)
   3851 				error = e;
   3852 		}
   3853 	}
   3854 	return error;
   3855 }
   3856 
   3857 /* Fill in info with the current status */
   3858 void
   3859 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3860 {
   3861 
   3862 	if (raidPtr->status != rf_rs_reconstructing) {
   3863 		info->total = 100;
   3864 		info->completed = 100;
   3865 	} else {
   3866 		info->total = raidPtr->reconControl->numRUsTotal;
   3867 		info->completed = raidPtr->reconControl->numRUsComplete;
   3868 	}
   3869 	info->remaining = info->total - info->completed;
   3870 }
   3871 
   3872 /* Fill in info with the current status */
   3873 void
   3874 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3875 {
   3876 
   3877 	if (raidPtr->parity_rewrite_in_progress == 1) {
   3878 		info->total = raidPtr->Layout.numStripe;
   3879 		info->completed = raidPtr->parity_rewrite_stripes_done;
   3880 	} else {
   3881 		info->completed = 100;
   3882 		info->total = 100;
   3883 	}
   3884 	info->remaining = info->total - info->completed;
   3885 }
   3886 
   3887 /* Fill in info with the current status */
   3888 void
   3889 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
   3890 {
   3891 
   3892 	if (raidPtr->copyback_in_progress == 1) {
   3893 		info->total = raidPtr->Layout.numStripe;
   3894 		info->completed = raidPtr->copyback_stripes_done;
   3895 		info->remaining = info->total - info->completed;
   3896 	} else {
   3897 		info->remaining = 0;
   3898 		info->completed = 100;
   3899 		info->total = 100;
   3900 	}
   3901 }
   3902 
   3903 /* Fill in config with the current info */
   3904 int
   3905 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
   3906 {
   3907 	int	d, i, j;
   3908 
   3909 	if (!raidPtr->valid)
   3910 		return ENODEV;
   3911 	config->cols = raidPtr->numCol;
   3912 	config->ndevs = raidPtr->numCol;
   3913 	if (config->ndevs >= RF_MAX_DISKS)
   3914 		return ENOMEM;
   3915 	config->nspares = raidPtr->numSpare;
   3916 	if (config->nspares >= RF_MAX_DISKS)
   3917 		return ENOMEM;
   3918 	config->maxqdepth = raidPtr->maxQueueDepth;
   3919 	d = 0;
   3920 	for (j = 0; j < config->cols; j++) {
   3921 		config->devs[d] = raidPtr->Disks[j];
   3922 		d++;
   3923 	}
   3924 	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
   3925 		config->spares[i] = raidPtr->Disks[j];
   3926 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
   3927 			/* XXX: raidctl(8) expects to see this as a used spare */
   3928 			config->spares[i].status = rf_ds_used_spare;
   3929 		}
   3930 	}
   3931 	return 0;
   3932 }
   3933 
   3934 int
   3935 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
   3936 {
   3937 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
   3938 	RF_ComponentLabel_t *raid_clabel;
   3939 	int column = clabel->column;
   3940 
   3941 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
   3942 		return EINVAL;
   3943 	raid_clabel = raidget_component_label(raidPtr, column);
   3944 	memcpy(clabel, raid_clabel, sizeof *clabel);
   3945 	/* Fix-up for userland. */
   3946 	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION))
   3947 		clabel->version = RF_COMPONENT_LABEL_VERSION;
   3948 
   3949 	return 0;
   3950 }
   3951 
   3952 /*
   3953  * Module interface
   3954  */
   3955 
   3956 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
   3957 
   3958 #ifdef _MODULE
   3959 CFDRIVER_DECL(raid, DV_DISK, NULL);
   3960 #endif
   3961 
   3962 static int raid_modcmd(modcmd_t, void *);
   3963 static int raid_modcmd_init(void);
   3964 static int raid_modcmd_fini(void);
   3965 
   3966 static int
   3967 raid_modcmd(modcmd_t cmd, void *data)
   3968 {
   3969 	int error;
   3970 
   3971 	error = 0;
   3972 	switch (cmd) {
   3973 	case MODULE_CMD_INIT:
   3974 		error = raid_modcmd_init();
   3975 		break;
   3976 	case MODULE_CMD_FINI:
   3977 		error = raid_modcmd_fini();
   3978 		break;
   3979 	default:
   3980 		error = ENOTTY;
   3981 		break;
   3982 	}
   3983 	return error;
   3984 }
   3985 
/*
 * Module initialization: set up global locks, attach the device
 * switch, cfdriver and cfattach (undoing earlier steps on failure),
 * boot the RAIDframe core, and register the autoconfig finalizer.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 lets devsw_attach pick the block/char majors. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	/* EEXIST is tolerated: the devsw may already be present. */
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* Roll back the devsw attach from above. */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* Roll back cfdriver and devsw attaches, in reverse order. */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* error is 0 here on the success path (reassigned above). */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Non-fatal: the module still works without autoconfig. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
   4056 
/*
 * Module finalization: refuse to unload while any raid device exists,
 * then detach cfattach, cfdriver and devsw — re-attaching the earlier
 * pieces if a later detach step fails, so the module stays usable.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Restore the cfattach detached above. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* Restore cfdriver and cfattach detached above. */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	/* Shut down the RAIDframe core and tear down the globals. */
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
   4106