      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.298.2.3 2013/06/23 06:20:21 tls Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
      88  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.298.2.3 2013/06/23 06:20:21 tls Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #endif
    110 
    111 #include <sys/param.h>
    112 #include <sys/errno.h>
    113 #include <sys/pool.h>
    114 #include <sys/proc.h>
    115 #include <sys/queue.h>
    116 #include <sys/disk.h>
    117 #include <sys/device.h>
    118 #include <sys/stat.h>
    119 #include <sys/ioctl.h>
    120 #include <sys/fcntl.h>
    121 #include <sys/systm.h>
    122 #include <sys/vnode.h>
    123 #include <sys/disklabel.h>
    124 #include <sys/conf.h>
    125 #include <sys/buf.h>
    126 #include <sys/bufq.h>
    127 #include <sys/reboot.h>
    128 #include <sys/kauth.h>
    129 
    130 #include <prop/proplib.h>
    131 
    132 #include <dev/raidframe/raidframevar.h>
    133 #include <dev/raidframe/raidframeio.h>
    134 #include <dev/raidframe/rf_paritymap.h>
    135 
    136 #include "rf_raid.h"
    137 #include "rf_copyback.h"
    138 #include "rf_dag.h"
    139 #include "rf_dagflags.h"
    140 #include "rf_desc.h"
    141 #include "rf_diskqueue.h"
    142 #include "rf_etimer.h"
    143 #include "rf_general.h"
    144 #include "rf_kintf.h"
    145 #include "rf_options.h"
    146 #include "rf_driver.h"
    147 #include "rf_parityscan.h"
    148 #include "rf_threadstuff.h"
    149 
    150 #ifdef COMPAT_50
    151 #include "rf_compat50.h"
    152 #endif
    153 
    154 #ifdef DEBUG
    155 int     rf_kdebug_level = 0;
    156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    157 #else				/* DEBUG */
    158 #define db1_printf(a) { }
    159 #endif				/* DEBUG */
    160 
    161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    162 static rf_declare_mutex2(rf_sparet_wait_mutex);
    163 static rf_declare_cond2(rf_sparet_wait_cv);
    164 static rf_declare_cond2(rf_sparet_resp_cv);
    165 
    166 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    167 						 * spare table */
    168 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    169 						 * installation process */
    170 #endif
    171 
    172 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    173 
    174 /* prototypes */
    175 static void KernelWakeupFunc(struct buf *);
    176 static void InitBP(struct buf *, struct vnode *, unsigned,
    177     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    178     void *, int, struct proc *);
    179 struct raid_softc;
    180 static void raidinit(struct raid_softc *);
    181 
    182 void raidattach(int);
    183 static int raid_match(device_t, cfdata_t, void *);
    184 static void raid_attach(device_t, device_t, void *);
    185 static int raid_detach(device_t, int);
    186 
    187 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    188     daddr_t, daddr_t);
    189 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t, int);
    191 
    192 static int raidwrite_component_label(unsigned,
    193     dev_t, struct vnode *, RF_ComponentLabel_t *);
    194 static int raidread_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 
    197 
    198 dev_type_open(raidopen);
    199 dev_type_close(raidclose);
    200 dev_type_read(raidread);
    201 dev_type_write(raidwrite);
    202 dev_type_ioctl(raidioctl);
    203 dev_type_strategy(raidstrategy);
    204 dev_type_dump(raiddump);
    205 dev_type_size(raidsize);
    206 
    207 const struct bdevsw raid_bdevsw = {
    208 	raidopen, raidclose, raidstrategy, raidioctl,
    209 	raiddump, raidsize, D_DISK
    210 };
    211 
    212 const struct cdevsw raid_cdevsw = {
    213 	raidopen, raidclose, raidread, raidwrite, raidioctl,
    214 	nostop, notty, nopoll, nommap, nokqfilter, D_DISK
    215 };
    216 
    217 static void	raidminphys(struct buf *);
    218 
    219 static struct dkdriver rf_dkdriver = { raidstrategy, raidminphys };
    220 
    221 struct raid_softc {
    222 	device_t sc_dev;
    223 	int	sc_unit;
    224 	int     sc_flags;	/* flags */
    225 	int     sc_cflags;	/* configuration flags */
    226 	uint64_t sc_size;	/* size of the raid device */
    227 	char    sc_xname[20];	/* XXX external name */
    228 	struct disk sc_dkdev;	/* generic disk device info */
    229 	struct bufq_state *buf_queue;	/* used for the device queue */
    230 	RF_Raid_t sc_r;
    231 	LIST_ENTRY(raid_softc) sc_link;
    232 };
    233 /* sc_flags */
    234 #define RAIDF_INITED	0x01	/* unit has been initialized */
    235 #define RAIDF_WLABEL	0x02	/* label area is writable */
    236 #define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
    237 #define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
    238 #define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
    239 #define RAIDF_LOCKED	0x80	/* unit is locked */
    240 
    241 #define	raidunit(x)	DISKUNIT(x)
    242 
    243 extern struct cfdriver raid_cd;
    244 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    245     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    246     DVF_DETACH_SHUTDOWN);
    247 
    248 /*
    249  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    250  * Be aware that large numbers can allow the driver to consume a lot of
    251  * kernel memory, especially on writes, and in degraded mode reads.
    252  *
    253  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    254  * a single 64K write will typically require 64K for the old data,
    255  * 64K for the old parity, and 64K for the new parity, for a total
    256  * of 192K (if the parity buffer is not re-used immediately).
     257  * Even if it is used immediately, that's still 128K, which when multiplied
    258  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    259  *
    260  * Now in degraded mode, for example, a 64K read on the above setup may
    261  * require data reconstruction, which will require *all* of the 4 remaining
    262  * disks to participate -- 4 * 32K/disk == 128K again.
    263  */
    264 
    265 #ifndef RAIDOUTSTANDING
    266 #define RAIDOUTSTANDING   6
    267 #endif
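/*
 * Rough arithmetic from the example above: six outstanding 64K writes at
 * roughly 192K of buffers each is about 1.1MB of kernel memory, on top of
 * the 384K of incoming data.  Since the define is guarded by #ifndef, a
 * different default can be supplied at build time if that is too much (or
 * too little) for a given machine.
 */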
    268 
    269 #define RAIDLABELDEV(dev)	\
    270 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
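/*
 * Illustration: RAIDLABELDEV() maps any partition of a raid unit onto that
 * unit's raw partition (RAW_PART, the "c" or "d" partition depending on the
 * port), which is the device node used when reading and writing the
 * disklabel.
 */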
    271 
    272 /* declared here, and made public, for the benefit of KVM stuff.. */
    273 
    274 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
    275 				     struct disklabel *);
    276 static void raidgetdisklabel(dev_t);
    277 static void raidmakedisklabel(struct raid_softc *);
    278 
    279 static int raidlock(struct raid_softc *);
    280 static void raidunlock(struct raid_softc *);
    281 
    282 static int raid_detach_unlocked(struct raid_softc *);
    283 
    284 static void rf_markalldirty(RF_Raid_t *);
    285 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
    286 
    287 void rf_ReconThread(struct rf_recon_req *);
    288 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    289 void rf_CopybackThread(RF_Raid_t *raidPtr);
    290 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    291 int rf_autoconfig(device_t);
    292 void rf_buildroothack(RF_ConfigSet_t *);
    293 
    294 RF_AutoConfig_t *rf_find_raid_components(void);
    295 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    296 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    297 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    298 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    299 int rf_set_autoconfig(RF_Raid_t *, int);
    300 int rf_set_rootpartition(RF_Raid_t *, int);
    301 void rf_release_all_vps(RF_ConfigSet_t *);
    302 void rf_cleanup_config_set(RF_ConfigSet_t *);
    303 int rf_have_enough_components(RF_ConfigSet_t *);
    304 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    305 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    306 
    307 /*
    308  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    309  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    310  * in the kernel config file.
    311  */
    312 #ifdef RAID_AUTOCONFIG
    313 int raidautoconfig = 1;
    314 #else
    315 int raidautoconfig = 0;
    316 #endif
    317 static bool raidautoconfigdone = false;
    318 
    319 struct RF_Pools_s rf_pools;
    320 
    321 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    322 static kmutex_t raid_lock;
    323 
    324 static struct raid_softc *
    325 raidcreate(int unit) {
    326 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    327 	if (sc == NULL) {
    328 #ifdef DIAGNOSTIC
    329 		printf("%s: out of memory\n", __func__);
    330 #endif
    331 		return NULL;
    332 	}
    333 	sc->sc_unit = unit;
    334 	bufq_alloc(&sc->buf_queue, BUFQ_DISK_DEFAULT_STRAT, BUFQ_SORT_RAWBLOCK);
    335 	return sc;
    336 }
    337 
    338 static void
    339 raiddestroy(struct raid_softc *sc) {
    340 	bufq_free(sc->buf_queue);
    341 	kmem_free(sc, sizeof(*sc));
    342 }
    343 
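/*
 * raidget() looks up the softc for a unit, allocating one on first
 * reference and linking it onto the global list; raidput() removes it from
 * the list and frees it.  Both serialize list access with raid_lock.
 */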
    344 static struct raid_softc *
    345 raidget(int unit) {
    346 	struct raid_softc *sc;
    347 	if (unit < 0) {
    348 #ifdef DIAGNOSTIC
    349 		panic("%s: unit %d!", __func__, unit);
    350 #endif
    351 		return NULL;
    352 	}
    353 	mutex_enter(&raid_lock);
    354 	LIST_FOREACH(sc, &raids, sc_link) {
    355 		if (sc->sc_unit == unit) {
    356 			mutex_exit(&raid_lock);
    357 			return sc;
    358 		}
    359 	}
    360 	mutex_exit(&raid_lock);
    361 	if ((sc = raidcreate(unit)) == NULL)
    362 		return NULL;
    363 	mutex_enter(&raid_lock);
    364 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    365 	mutex_exit(&raid_lock);
    366 	return sc;
    367 }
    368 
    369 static void
    370 raidput(struct raid_softc *sc) {
    371 	mutex_enter(&raid_lock);
    372 	LIST_REMOVE(sc, sc_link);
    373 	mutex_exit(&raid_lock);
    374 	raiddestroy(sc);
    375 }
    376 
    377 void
    378 raidattach(int num)
    379 {
    380 	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
    381 	/* This is where all the initialization stuff gets done. */
    382 
    383 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    384 	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
    385 	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
    386 	rf_init_cond2(rf_sparet_resp_cv, "rfgst");
    387 
    388 	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
    389 #endif
    390 
    391 	if (rf_BootRaidframe() == 0)
    392 		aprint_verbose("Kernelized RAIDframe activated\n");
    393 	else
    394 		panic("Serious error booting RAID!!");
    395 
    396 	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
    397 		aprint_error("raidattach: config_cfattach_attach failed?\n");
    398 	}
    399 
    400 	raidautoconfigdone = false;
    401 
    402 	/*
    403 	 * Register a finalizer which will be used to auto-config RAID
    404 	 * sets once all real hardware devices have been found.
    405 	 */
    406 	if (config_finalize_register(NULL, rf_autoconfig) != 0)
    407 		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
    408 }
    409 
    410 int
    411 rf_autoconfig(device_t self)
    412 {
    413 	RF_AutoConfig_t *ac_list;
    414 	RF_ConfigSet_t *config_sets;
    415 
    416 	if (!raidautoconfig || raidautoconfigdone == true)
    417 		return (0);
    418 
    419 	/* XXX This code can only be run once. */
    420 	raidautoconfigdone = true;
    421 
    422 	/* 1. locate all RAID components on the system */
    423 	aprint_debug("Searching for RAID components...\n");
    424 	ac_list = rf_find_raid_components();
    425 
    426 	/* 2. Sort them into their respective sets. */
    427 	config_sets = rf_create_auto_sets(ac_list);
    428 
    429 	/*
    430 	 * 3. Evaluate each set and configure the valid ones.
    431 	 * This gets done in rf_buildroothack().
    432 	 */
    433 	rf_buildroothack(config_sets);
    434 
    435 	return 1;
    436 }
    437 
    438 void
    439 rf_buildroothack(RF_ConfigSet_t *config_sets)
    440 {
    441 	RF_ConfigSet_t *cset;
    442 	RF_ConfigSet_t *next_cset;
    443 	int col;
    444 	int num_root;
    445 	char *devname;
    446 	struct raid_softc *sc, *rsc;
    447 
    448 	sc = rsc = NULL;
    449 	num_root = 0;
    450 	cset = config_sets;
    451 	while (cset != NULL) {
    452 		next_cset = cset->next;
    453 		if (rf_have_enough_components(cset) &&
    454 		    cset->ac->clabel->autoconfigure == 1) {
    455 			sc = rf_auto_config_set(cset);
    456 			if (sc != NULL) {
    457 				aprint_debug("raid%d: configured ok\n",
    458 				    sc->sc_unit);
    459 				if (cset->rootable) {
    460 					rsc = sc;
    461 					num_root++;
    462 				}
    463 			} else {
    464 				/* The autoconfig didn't work :( */
    465 				aprint_debug("Autoconfig failed\n");
    466 				rf_release_all_vps(cset);
    467 			}
    468 		} else {
    469 			/* we're not autoconfiguring this set...
    470 			   release the associated resources */
    471 			rf_release_all_vps(cset);
    472 		}
    473 		/* cleanup */
    474 		rf_cleanup_config_set(cset);
    475 		cset = next_cset;
    476 	}
    477 
    478 	/* if the user has specified what the root device should be
    479 	   then we don't touch booted_device or boothowto... */
    480 
    481 	if (rootspec != NULL)
    482 		return;
    483 
    484 	/* we found something bootable... */
    485 
    486 	if (num_root == 1) {
    487 		if (rsc->sc_dkdev.dk_nwedges != 0) {
    488 			/* XXX: How do we find the real root partition? */
    489 			char cname[sizeof(cset->ac->devname)];
    490 			snprintf(cname, sizeof(cname), "%s%c",
    491 			    device_xname(rsc->sc_dev), 'a');
    492 			booted_device = dkwedge_find_by_wname(cname);
    493 		} else
    494 			booted_device = rsc->sc_dev;
    495 	} else if (num_root > 1) {
    496 
    497 		/*
    498 		 * Maybe the MD code can help. If it cannot, then
    499 		 * setroot() will discover that we have no
    500 		 * booted_device and will ask the user if nothing was
    501 		 * hardwired in the kernel config file
    502 		 */
    503 
    504 		if (booted_device == NULL)
    505 			cpu_rootconf();
    506 		if (booted_device == NULL)
    507 			return;
    508 
    509 		num_root = 0;
    510 		mutex_enter(&raid_lock);
    511 		LIST_FOREACH(sc, &raids, sc_link) {
    512 			RF_Raid_t *r = &sc->sc_r;
    513 			if (r->valid == 0)
    514 				continue;
    515 
    516 			if (r->root_partition == 0)
    517 				continue;
    518 
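			/*
			 * Match each component's device name, with the
			 * "/dev/" prefix stripped (e.g. "/dev/wd0a" ->
			 * "wd0a"), against the booted device's name
			 * (e.g. "wd0"); a prefix match means this set
			 * contains the disk we booted from.
			 */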
    519 			for (col = 0; col < r->numCol; col++) {
    520 				devname = r->Disks[col].devname;
    521 				devname += sizeof("/dev/") - 1;
    522 				if (strncmp(devname, device_xname(booted_device),
    523 					    strlen(device_xname(booted_device))) != 0)
    524 					continue;
    525 				aprint_debug("raid%d includes boot device %s\n",
    526 				       sc->sc_unit, devname);
    527 				num_root++;
    528 				rsc = sc;
    529 			}
    530 		}
    531 		mutex_exit(&raid_lock);
    532 
    533 		if (num_root == 1) {
    534 			booted_device = rsc->sc_dev;
    535 		} else {
    536 			/* we can't guess.. require the user to answer... */
    537 			boothowto |= RB_ASKNAME;
    538 		}
    539 	}
    540 }
    541 
    542 
    543 int
    544 raidsize(dev_t dev)
    545 {
    546 	struct raid_softc *rs;
    547 	struct disklabel *lp;
    548 	int     part, unit, omask, size;
    549 
    550 	unit = raidunit(dev);
    551 	if ((rs = raidget(unit)) == NULL)
    552 		return -1;
    553 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    554 		return (-1);
    555 
    556 	part = DISKPART(dev);
    557 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
    558 	lp = rs->sc_dkdev.dk_label;
    559 
    560 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
    561 		return (-1);
    562 
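	/*
	 * Only FS_SWAP partitions are considered valid dump targets; the
	 * size is reported in DEV_BSIZE (512-byte) blocks, hence the
	 * d_secsize / DEV_BSIZE scaling below.
	 */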
    563 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
    564 		size = -1;
    565 	else
    566 		size = lp->d_partitions[part].p_size *
    567 		    (lp->d_secsize / DEV_BSIZE);
    568 
    569 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
    570 		return (-1);
    571 
    572 	return (size);
    573 
    574 }
    575 
    576 int
    577 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    578 {
    579 	int     unit = raidunit(dev);
    580 	struct raid_softc *rs;
    581 	const struct bdevsw *bdev;
    582 	struct disklabel *lp;
    583 	RF_Raid_t *raidPtr;
    584 	daddr_t offset;
    585 	int     part, c, sparecol, j, scol, dumpto;
    586 	int     error = 0;
    587 
    588 	if ((rs = raidget(unit)) == NULL)
    589 		return ENXIO;
    590 
    591 	raidPtr = &rs->sc_r;
    592 
    593 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    594 		return ENXIO;
    595 
    596 	/* we only support dumping to RAID 1 sets */
    597 	if (raidPtr->Layout.numDataCol != 1 ||
    598 	    raidPtr->Layout.numParityCol != 1)
    599 		return EINVAL;
    600 
    601 
    602 	if ((error = raidlock(rs)) != 0)
    603 		return error;
    604 
    605 	if (size % DEV_BSIZE != 0) {
    606 		error = EINVAL;
    607 		goto out;
    608 	}
    609 
    610 	if (blkno + size / DEV_BSIZE > rs->sc_size) {
    611 		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
    612 		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
    613 		    size / DEV_BSIZE, rs->sc_size);
    614 		error = EINVAL;
    615 		goto out;
    616 	}
    617 
    618 	part = DISKPART(dev);
    619 	lp = rs->sc_dkdev.dk_label;
    620 	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;
    621 
    622 	/* figure out what device is alive.. */
    623 
    624 	/*
    625 	   Look for a component to dump to.  The preference for the
    626 	   component to dump to is as follows:
    627 	   1) the master
    628 	   2) a used_spare of the master
    629 	   3) the slave
    630 	   4) a used_spare of the slave
    631 	*/
    632 
    633 	dumpto = -1;
    634 	for (c = 0; c < raidPtr->numCol; c++) {
    635 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
    636 			/* this might be the one */
    637 			dumpto = c;
    638 			break;
    639 		}
    640 	}
    641 
    642 	/*
    643 	   At this point we have possibly selected a live master or a
    644 	   live slave.  We now check to see if there is a spared
    645 	   master (or a spared slave), if we didn't find a live master
    646 	   or a live slave.
    647 	*/
    648 
    649 	for (c = 0; c < raidPtr->numSpare; c++) {
    650 		sparecol = raidPtr->numCol + c;
    651 		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
    652 			/* How about this one? */
    653 			scol = -1;
    654 			for(j=0;j<raidPtr->numCol;j++) {
    655 				if (raidPtr->Disks[j].spareCol == sparecol) {
    656 					scol = j;
    657 					break;
    658 				}
    659 			}
    660 			if (scol == 0) {
    661 				/*
    662 				   We must have found a spared master!
    663 				   We'll take that over anything else
    664 				   found so far.  (We couldn't have
    665 				   found a real master before, since
    666 				   this is a used spare, and it's
    667 				   saying that it's replacing the
    668 				   master.)  On reboot (with
    669 				   autoconfiguration turned on)
    670 				   sparecol will become the 1st
    671 				   component (component0) of this set.
    672 				*/
    673 				dumpto = sparecol;
    674 				break;
    675 			} else if (scol != -1) {
    676 				/*
    677 				   Must be a spared slave.  We'll dump
     678 				   to that if we haven't found anything
    679 				   else so far.
    680 				*/
    681 				if (dumpto == -1)
    682 					dumpto = sparecol;
    683 			}
    684 		}
    685 	}
    686 
    687 	if (dumpto == -1) {
    688 		/* we couldn't find any live components to dump to!?!?
    689 		 */
    690 		error = EINVAL;
    691 		goto out;
    692 	}
    693 
    694 	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
    695 
    696 	/*
    697 	   Note that blkno is relative to this particular partition.
    698 	   By adding the offset of this partition in the RAID
    699 	   set, and also adding RF_PROTECTED_SECTORS, we get a
    700 	   value that is relative to the partition used for the
    701 	   underlying component.
    702 	*/
    703 
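	/* i.e. component block = blkno + p_offset + RF_PROTECTED_SECTORS */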
    704 	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
    705 				blkno + offset, va, size);
    706 
    707 out:
    708 	raidunlock(rs);
    709 
    710 	return error;
    711 }
    712 /* ARGSUSED */
    713 int
    714 raidopen(dev_t dev, int flags, int fmt,
    715     struct lwp *l)
    716 {
    717 	int     unit = raidunit(dev);
    718 	struct raid_softc *rs;
    719 	struct disklabel *lp;
    720 	int     part, pmask;
    721 	int     error = 0;
    722 
    723 	if ((rs = raidget(unit)) == NULL)
    724 		return ENXIO;
    725 	if ((error = raidlock(rs)) != 0)
    726 		return (error);
    727 
    728 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    729 		error = EBUSY;
    730 		goto bad;
    731 	}
    732 
    733 	lp = rs->sc_dkdev.dk_label;
    734 
    735 	part = DISKPART(dev);
    736 
    737 	/*
    738 	 * If there are wedges, and this is not RAW_PART, then we
    739 	 * need to fail.
    740 	 */
    741 	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
    742 		error = EBUSY;
    743 		goto bad;
    744 	}
    745 	pmask = (1 << part);
    746 
    747 	if ((rs->sc_flags & RAIDF_INITED) &&
    748 	    (rs->sc_dkdev.dk_openmask == 0))
    749 		raidgetdisklabel(dev);
    750 
    751 	/* make sure that this partition exists */
    752 
    753 	if (part != RAW_PART) {
    754 		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
    755 		    ((part >= lp->d_npartitions) ||
    756 			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
    757 			error = ENXIO;
    758 			goto bad;
    759 		}
    760 	}
    761 	/* Prevent this unit from being unconfigured while open. */
    762 	switch (fmt) {
    763 	case S_IFCHR:
    764 		rs->sc_dkdev.dk_copenmask |= pmask;
    765 		break;
    766 
    767 	case S_IFBLK:
    768 		rs->sc_dkdev.dk_bopenmask |= pmask;
    769 		break;
    770 	}
    771 
    772 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    773 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    774 		/* First one... mark things as dirty... Note that we *MUST*
    775 		 have done a configure before this.  I DO NOT WANT TO BE
    776 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    777 		 THAT THEY BELONG TOGETHER!!!!! */
    778 		/* XXX should check to see if we're only open for reading
    779 		   here... If so, we needn't do this, but then need some
    780 		   other way of keeping track of what's happened.. */
    781 
    782 		rf_markalldirty(&rs->sc_r);
    783 	}
    784 
    785 
    786 	rs->sc_dkdev.dk_openmask =
    787 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    788 
    789 bad:
    790 	raidunlock(rs);
    791 
    792 	return (error);
    793 
    794 
    795 }
    796 /* ARGSUSED */
    797 int
    798 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    799 {
    800 	int     unit = raidunit(dev);
    801 	struct raid_softc *rs;
    802 	int     error = 0;
    803 	int     part;
    804 
    805 	if ((rs = raidget(unit)) == NULL)
    806 		return ENXIO;
    807 
    808 	if ((error = raidlock(rs)) != 0)
    809 		return (error);
    810 
    811 	part = DISKPART(dev);
    812 
    813 	/* ...that much closer to allowing unconfiguration... */
    814 	switch (fmt) {
    815 	case S_IFCHR:
    816 		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
    817 		break;
    818 
    819 	case S_IFBLK:
    820 		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
    821 		break;
    822 	}
    823 	rs->sc_dkdev.dk_openmask =
    824 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    825 
    826 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    827 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
     828 		/* Last one... device is not unconfigured yet.
     829 		   If RAIDF_INITED is not set, device shutdown has
     830 		   already taken care of setting the clean bits;
     831 		   otherwise mark things as clean here... */
    832 
    833 		rf_update_component_labels(&rs->sc_r,
    834 						 RF_FINAL_COMPONENT_UPDATE);
    835 
    836 		/* If the kernel is shutting down, it will detach
    837 		 * this RAID set soon enough.
    838 		 */
    839 	}
    840 
    841 	raidunlock(rs);
    842 	return (0);
    843 
    844 }
    845 
    846 void
    847 raidstrategy(struct buf *bp)
    848 {
    849 	unsigned int unit = raidunit(bp->b_dev);
    850 	RF_Raid_t *raidPtr;
    851 	int     wlabel;
    852 	struct raid_softc *rs;
    853 
    854 	if ((rs = raidget(unit)) == NULL) {
    855 		bp->b_error = ENXIO;
    856 		goto done;
    857 	}
    858 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    859 		bp->b_error = ENXIO;
    860 		goto done;
    861 	}
    862 	raidPtr = &rs->sc_r;
    863 	if (!raidPtr->valid) {
    864 		bp->b_error = ENODEV;
    865 		goto done;
    866 	}
    867 	if (bp->b_bcount == 0) {
    868 		db1_printf(("b_bcount is zero..\n"));
    869 		goto done;
    870 	}
    871 
    872 	/*
    873 	 * Do bounds checking and adjust transfer.  If there's an
    874 	 * error, the bounds check will flag that for us.
    875 	 */
    876 
    877 	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
    878 	if (DISKPART(bp->b_dev) == RAW_PART) {
    879 		uint64_t size; /* device size in DEV_BSIZE unit */
    880 
    881 		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
    882 			size = raidPtr->totalSectors <<
    883 			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
    884 		} else {
    885 			size = raidPtr->totalSectors >>
    886 			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
    887 		}
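		/*
		 * For example, with 512-byte sectors (logBytesPerSector ==
		 * DEV_BSHIFT) size is simply totalSectors; with 4096-byte
		 * sectors it is totalSectors << 3.
		 */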
    888 		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
    889 			goto done;
    890 		}
    891 	} else {
    892 		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
    893 			db1_printf(("Bounds check failed!!:%d %d\n",
    894 				(int) bp->b_blkno, (int) wlabel));
    895 			goto done;
    896 		}
    897 	}
    898 
    899 	rf_lock_mutex2(raidPtr->iodone_lock);
    900 
    901 	bp->b_resid = 0;
    902 
    903 	/* stuff it onto our queue */
    904 	bufq_put(rs->buf_queue, bp);
    905 
     906 	/* schedule the IO to happen at the next convenient time */
    907 	rf_signal_cond2(raidPtr->iodone_cv);
    908 	rf_unlock_mutex2(raidPtr->iodone_lock);
    909 
    910 	return;
    911 
    912 done:
    913 	bp->b_resid = bp->b_bcount;
    914 	biodone(bp);
    915 }
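/*
 * raidread() and raidwrite() below are thin character-device wrappers:
 * physio(9) breaks the uio into struct bufs (bounded by raidminphys())
 * and feeds them through raidstrategy() above.
 */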
    916 /* ARGSUSED */
    917 int
    918 raidread(dev_t dev, struct uio *uio, int flags)
    919 {
    920 	int     unit = raidunit(dev);
    921 	struct raid_softc *rs;
    922 
    923 	if ((rs = raidget(unit)) == NULL)
    924 		return ENXIO;
    925 
    926 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    927 		return (ENXIO);
    928 
    929 	return (physio(raidstrategy, NULL, dev, B_READ, raidminphys, uio));
    930 
    931 }
    932 /* ARGSUSED */
    933 int
    934 raidwrite(dev_t dev, struct uio *uio, int flags)
    935 {
    936 	int     unit = raidunit(dev);
    937 	struct raid_softc *rs;
    938 
    939 	if ((rs = raidget(unit)) == NULL)
    940 		return ENXIO;
    941 
    942 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    943 		return (ENXIO);
    944 
    945 	return (physio(raidstrategy, NULL, dev, B_WRITE, raidminphys, uio));
    946 
    947 }
    948 
    949 static int
    950 raid_detach_unlocked(struct raid_softc *rs)
    951 {
    952 	int error;
    953 	RF_Raid_t *raidPtr;
    954 
    955 	raidPtr = &rs->sc_r;
    956 
    957 	/*
    958 	 * If somebody has a partition mounted, we shouldn't
    959 	 * shutdown.
    960 	 */
    961 	if (rs->sc_dkdev.dk_openmask != 0)
    962 		return EBUSY;
    963 
    964 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    965 		;	/* not initialized: nothing to do */
    966 	else if ((error = rf_Shutdown(raidPtr)) != 0)
    967 		return error;
    968 	else
    969 		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
    970 
    971 	/* Detach the disk. */
    972 	dkwedge_delall(&rs->sc_dkdev);
    973 	disk_detach(&rs->sc_dkdev);
    974 	disk_destroy(&rs->sc_dkdev);
    975 
    976 	aprint_normal_dev(rs->sc_dev, "detached\n");
    977 
    978 	return 0;
    979 }
    980 
    981 int
    982 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
    983 {
    984 	int     unit = raidunit(dev);
    985 	int     error = 0;
    986 	int     part, pmask, s;
    987 	cfdata_t cf;
    988 	struct raid_softc *rs;
    989 	RF_Config_t *k_cfg, *u_cfg;
    990 	RF_Raid_t *raidPtr;
    991 	RF_RaidDisk_t *diskPtr;
    992 	RF_AccTotals_t *totals;
    993 	RF_DeviceConfig_t *d_cfg, **ucfgp;
    994 	u_char *specific_buf;
    995 	int retcode = 0;
    996 	int column;
    997 /*	int raidid; */
    998 	struct rf_recon_req *rrcopy, *rr;
    999 	RF_ComponentLabel_t *clabel;
   1000 	RF_ComponentLabel_t *ci_label;
   1001 	RF_ComponentLabel_t **clabel_ptr;
   1002 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1003 	RF_SingleComponent_t component;
   1004 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
   1005 	int i, j, d;
   1006 #ifdef __HAVE_OLD_DISKLABEL
   1007 	struct disklabel newlabel;
   1008 #endif
   1009 	struct dkwedge_info *dkw;
   1010 
   1011 	if ((rs = raidget(unit)) == NULL)
   1012 		return ENXIO;
   1013 	raidPtr = &rs->sc_r;
   1014 
   1015 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1016 		(int) DISKPART(dev), (int) unit, cmd));
   1017 
   1018 	/* Must be open for writes for these commands... */
   1019 	switch (cmd) {
   1020 #ifdef DIOCGSECTORSIZE
   1021 	case DIOCGSECTORSIZE:
   1022 		*(u_int *)data = raidPtr->bytesPerSector;
   1023 		return 0;
   1024 	case DIOCGMEDIASIZE:
   1025 		*(off_t *)data =
   1026 		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
   1027 		return 0;
   1028 #endif
   1029 	case DIOCSDINFO:
   1030 	case DIOCWDINFO:
   1031 #ifdef __HAVE_OLD_DISKLABEL
   1032 	case ODIOCWDINFO:
   1033 	case ODIOCSDINFO:
   1034 #endif
   1035 	case DIOCWLABEL:
   1036 	case DIOCAWEDGE:
   1037 	case DIOCDWEDGE:
   1038 	case DIOCSSTRATEGY:
   1039 		if ((flag & FWRITE) == 0)
   1040 			return (EBADF);
   1041 	}
   1042 
   1043 	/* Must be initialized for these... */
   1044 	switch (cmd) {
   1045 	case DIOCGDINFO:
   1046 	case DIOCSDINFO:
   1047 	case DIOCWDINFO:
   1048 #ifdef __HAVE_OLD_DISKLABEL
   1049 	case ODIOCGDINFO:
   1050 	case ODIOCWDINFO:
   1051 	case ODIOCSDINFO:
   1052 	case ODIOCGDEFLABEL:
   1053 #endif
   1054 	case DIOCGPART:
   1055 	case DIOCWLABEL:
   1056 	case DIOCGDEFLABEL:
   1057 	case DIOCAWEDGE:
   1058 	case DIOCDWEDGE:
   1059 	case DIOCLWEDGES:
   1060 	case DIOCCACHESYNC:
   1061 	case RAIDFRAME_SHUTDOWN:
   1062 	case RAIDFRAME_REWRITEPARITY:
   1063 	case RAIDFRAME_GET_INFO:
   1064 	case RAIDFRAME_RESET_ACCTOTALS:
   1065 	case RAIDFRAME_GET_ACCTOTALS:
   1066 	case RAIDFRAME_KEEP_ACCTOTALS:
   1067 	case RAIDFRAME_GET_SIZE:
   1068 	case RAIDFRAME_FAIL_DISK:
   1069 	case RAIDFRAME_COPYBACK:
   1070 	case RAIDFRAME_CHECK_RECON_STATUS:
   1071 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1072 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1073 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1074 	case RAIDFRAME_ADD_HOT_SPARE:
   1075 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1076 	case RAIDFRAME_INIT_LABELS:
   1077 	case RAIDFRAME_REBUILD_IN_PLACE:
   1078 	case RAIDFRAME_CHECK_PARITY:
   1079 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1080 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1081 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1082 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1083 	case RAIDFRAME_SET_AUTOCONFIG:
   1084 	case RAIDFRAME_SET_ROOT:
   1085 	case RAIDFRAME_DELETE_COMPONENT:
   1086 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1087 	case RAIDFRAME_PARITYMAP_STATUS:
   1088 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1089 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1090 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1091 	case DIOCGSTRATEGY:
   1092 	case DIOCSSTRATEGY:
   1093 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1094 			return (ENXIO);
   1095 	}
   1096 
   1097 	switch (cmd) {
   1098 #ifdef COMPAT_50
   1099 	case RAIDFRAME_GET_INFO50:
   1100 		return rf_get_info50(raidPtr, data);
   1101 
   1102 	case RAIDFRAME_CONFIGURE50:
   1103 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1104 			return retcode;
   1105 		goto config;
   1106 #endif
   1107 		/* configure the system */
   1108 	case RAIDFRAME_CONFIGURE:
   1109 
   1110 		if (raidPtr->valid) {
   1111 			/* There is a valid RAID set running on this unit! */
   1112 			printf("raid%d: Device already configured!\n",unit);
   1113 			return(EINVAL);
   1114 		}
   1115 
   1116 		/* copy-in the configuration information */
   1117 		/* data points to a pointer to the configuration structure */
   1118 
   1119 		u_cfg = *((RF_Config_t **) data);
   1120 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1121 		if (k_cfg == NULL) {
   1122 			return (ENOMEM);
   1123 		}
   1124 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1125 		if (retcode) {
   1126 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1127 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1128 				retcode));
   1129 			return (retcode);
   1130 		}
   1131 		goto config;
   1132 	config:
   1133 		/* allocate a buffer for the layout-specific data, and copy it
   1134 		 * in */
   1135 		if (k_cfg->layoutSpecificSize) {
   1136 			if (k_cfg->layoutSpecificSize > 10000) {
   1137 				/* sanity check */
   1138 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1139 				return (EINVAL);
   1140 			}
   1141 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1142 			    (u_char *));
   1143 			if (specific_buf == NULL) {
   1144 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1145 				return (ENOMEM);
   1146 			}
   1147 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1148 			    k_cfg->layoutSpecificSize);
   1149 			if (retcode) {
   1150 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1151 				RF_Free(specific_buf,
   1152 					k_cfg->layoutSpecificSize);
   1153 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1154 					retcode));
   1155 				return (retcode);
   1156 			}
   1157 		} else
   1158 			specific_buf = NULL;
   1159 		k_cfg->layoutSpecific = specific_buf;
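		/*
		 * From here on k_cfg->layoutSpecific points at the kernel
		 * copy (or NULL), so the configuration code never follows
		 * the original user-space pointer.
		 */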
   1160 
   1161 		/* should do some kind of sanity check on the configuration.
   1162 		 * Store the sum of all the bytes in the last byte? */
   1163 
   1164 		/* configure the system */
   1165 
   1166 		/*
   1167 		 * Clear the entire RAID descriptor, just to make sure
   1168 		 *  there is no stale data left in the case of a
   1169 		 *  reconfiguration
   1170 		 */
   1171 		memset(raidPtr, 0, sizeof(*raidPtr));
   1172 		raidPtr->softc = rs;
   1173 		raidPtr->raidid = unit;
   1174 
   1175 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1176 
   1177 		if (retcode == 0) {
   1178 
   1179 			/* allow this many simultaneous IO's to
   1180 			   this RAID device */
   1181 			raidPtr->openings = RAIDOUTSTANDING;
   1182 
   1183 			raidinit(rs);
   1184 			rf_markalldirty(raidPtr);
   1185 		}
   1186 		/* free the buffers.  No return code here. */
   1187 		if (k_cfg->layoutSpecificSize) {
   1188 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1189 		}
   1190 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1191 
   1192 		return (retcode);
   1193 
   1194 		/* shutdown the system */
   1195 	case RAIDFRAME_SHUTDOWN:
   1196 
   1197 		part = DISKPART(dev);
   1198 		pmask = (1 << part);
   1199 
   1200 		if ((error = raidlock(rs)) != 0)
   1201 			return (error);
   1202 
   1203 		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
   1204 		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
   1205 			(rs->sc_dkdev.dk_copenmask & pmask)))
   1206 			retcode = EBUSY;
   1207 		else {
   1208 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1209 			rs->sc_dkdev.dk_copenmask &= ~pmask;
   1210 			rs->sc_dkdev.dk_bopenmask &= ~pmask;
   1211 			rs->sc_dkdev.dk_openmask &= ~pmask;
   1212 			retcode = 0;
   1213 		}
   1214 
   1215 		raidunlock(rs);
   1216 
   1217 		if (retcode != 0)
   1218 			return retcode;
   1219 
   1220 		/* free the pseudo device attach bits */
   1221 
   1222 		cf = device_cfdata(rs->sc_dev);
   1223 		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
   1224 			free(cf, M_RAIDFRAME);
   1225 
   1226 		return (retcode);
   1227 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1228 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1229 		/* need to read the component label for the disk indicated
   1230 		   by row,column in clabel */
   1231 
   1232 		/*
   1233 		 * Perhaps there should be an option to skip the in-core
   1234 		 * copy and hit the disk, as with disklabel(8).
   1235 		 */
   1236 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1237 
   1238 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1239 
   1240 		if (retcode) {
   1241 			RF_Free(clabel, sizeof(*clabel));
   1242 			return retcode;
   1243 		}
   1244 
   1245 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1246 
   1247 		column = clabel->column;
   1248 
   1249 		if ((column < 0) || (column >= raidPtr->numCol +
   1250 		    raidPtr->numSpare)) {
   1251 			RF_Free(clabel, sizeof(*clabel));
   1252 			return EINVAL;
   1253 		}
   1254 
   1255 		RF_Free(clabel, sizeof(*clabel));
   1256 
   1257 		clabel = raidget_component_label(raidPtr, column);
   1258 
   1259 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1260 
   1261 #if 0
   1262 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1263 		clabel = (RF_ComponentLabel_t *) data;
   1264 
   1265 		/* XXX check the label for valid stuff... */
   1266 		/* Note that some things *should not* get modified --
   1267 		   the user should be re-initing the labels instead of
   1268 		   trying to patch things.
   1269 		   */
   1270 
   1271 		raidid = raidPtr->raidid;
   1272 #ifdef DEBUG
   1273 		printf("raid%d: Got component label:\n", raidid);
   1274 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1275 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1276 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1277 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1278 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1279 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1280 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1281 #endif
   1282 		clabel->row = 0;
   1283 		column = clabel->column;
   1284 
   1285 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1286 			return(EINVAL);
   1287 		}
   1288 
   1289 		/* XXX this isn't allowed to do anything for now :-) */
   1290 
   1291 		/* XXX and before it is, we need to fill in the rest
   1292 		   of the fields!?!?!?! */
   1293 		memcpy(raidget_component_label(raidPtr, column),
   1294 		    clabel, sizeof(*clabel));
   1295 		raidflush_component_label(raidPtr, column);
   1296 		return (0);
   1297 #endif
   1298 
   1299 	case RAIDFRAME_INIT_LABELS:
   1300 		clabel = (RF_ComponentLabel_t *) data;
   1301 		/*
   1302 		   we only want the serial number from
   1303 		   the above.  We get all the rest of the information
   1304 		   from the config that was used to create this RAID
   1305 		   set.
   1306 		   */
   1307 
   1308 		raidPtr->serial_number = clabel->serial_number;
   1309 
   1310 		for(column=0;column<raidPtr->numCol;column++) {
   1311 			diskPtr = &raidPtr->Disks[column];
   1312 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1313 				ci_label = raidget_component_label(raidPtr,
   1314 				    column);
   1315 				/* Zeroing this is important. */
   1316 				memset(ci_label, 0, sizeof(*ci_label));
   1317 				raid_init_component_label(raidPtr, ci_label);
   1318 				ci_label->serial_number =
   1319 				    raidPtr->serial_number;
    1320 				ci_label->row = 0; /* we don't pretend to support more */
   1321 				rf_component_label_set_partitionsize(ci_label,
   1322 				    diskPtr->partitionSize);
   1323 				ci_label->column = column;
   1324 				raidflush_component_label(raidPtr, column);
   1325 			}
   1326 			/* XXXjld what about the spares? */
   1327 		}
   1328 
   1329 		return (retcode);
   1330 	case RAIDFRAME_SET_AUTOCONFIG:
   1331 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1332 		printf("raid%d: New autoconfig value is: %d\n",
   1333 		       raidPtr->raidid, d);
   1334 		*(int *) data = d;
   1335 		return (retcode);
   1336 
   1337 	case RAIDFRAME_SET_ROOT:
   1338 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1339 		printf("raid%d: New rootpartition value is: %d\n",
   1340 		       raidPtr->raidid, d);
   1341 		*(int *) data = d;
   1342 		return (retcode);
   1343 
   1344 		/* initialize all parity */
   1345 	case RAIDFRAME_REWRITEPARITY:
   1346 
   1347 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1348 			/* Parity for RAID 0 is trivially correct */
   1349 			raidPtr->parity_good = RF_RAID_CLEAN;
   1350 			return(0);
   1351 		}
   1352 
   1353 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1354 			/* Re-write is already in progress! */
   1355 			return(EINVAL);
   1356 		}
   1357 
   1358 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1359 					   rf_RewriteParityThread,
   1360 					   raidPtr,"raid_parity");
   1361 		return (retcode);
   1362 
   1363 
   1364 	case RAIDFRAME_ADD_HOT_SPARE:
   1365 		sparePtr = (RF_SingleComponent_t *) data;
   1366 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1367 		retcode = rf_add_hot_spare(raidPtr, &component);
   1368 		return(retcode);
   1369 
   1370 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1371 		return(retcode);
   1372 
   1373 	case RAIDFRAME_DELETE_COMPONENT:
   1374 		componentPtr = (RF_SingleComponent_t *)data;
   1375 		memcpy( &component, componentPtr,
   1376 			sizeof(RF_SingleComponent_t));
   1377 		retcode = rf_delete_component(raidPtr, &component);
   1378 		return(retcode);
   1379 
   1380 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1381 		componentPtr = (RF_SingleComponent_t *)data;
   1382 		memcpy( &component, componentPtr,
   1383 			sizeof(RF_SingleComponent_t));
   1384 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1385 		return(retcode);
   1386 
   1387 	case RAIDFRAME_REBUILD_IN_PLACE:
   1388 
   1389 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1390 			/* Can't do this on a RAID 0!! */
   1391 			return(EINVAL);
   1392 		}
   1393 
   1394 		if (raidPtr->recon_in_progress == 1) {
   1395 			/* a reconstruct is already in progress! */
   1396 			return(EINVAL);
   1397 		}
   1398 
   1399 		componentPtr = (RF_SingleComponent_t *) data;
   1400 		memcpy( &component, componentPtr,
   1401 			sizeof(RF_SingleComponent_t));
   1402 		component.row = 0; /* we don't support any more */
   1403 		column = component.column;
   1404 
   1405 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1406 			return(EINVAL);
   1407 		}
   1408 
   1409 		rf_lock_mutex2(raidPtr->mutex);
   1410 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1411 		    (raidPtr->numFailures > 0)) {
   1412 			/* XXX 0 above shouldn't be constant!!! */
   1413 			/* some component other than this has failed.
   1414 			   Let's not make things worse than they already
   1415 			   are... */
   1416 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1417 			       raidPtr->raidid);
   1418 			printf("raid%d:     Col: %d   Too many failures.\n",
   1419 			       raidPtr->raidid, column);
   1420 			rf_unlock_mutex2(raidPtr->mutex);
   1421 			return (EINVAL);
   1422 		}
   1423 		if (raidPtr->Disks[column].status ==
   1424 		    rf_ds_reconstructing) {
   1425 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1426 			       raidPtr->raidid);
   1427 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1428 
   1429 			rf_unlock_mutex2(raidPtr->mutex);
   1430 			return (EINVAL);
   1431 		}
   1432 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1433 			rf_unlock_mutex2(raidPtr->mutex);
   1434 			return (EINVAL);
   1435 		}
   1436 		rf_unlock_mutex2(raidPtr->mutex);
   1437 
   1438 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1439 		if (rrcopy == NULL)
   1440 			return(ENOMEM);
   1441 
   1442 		rrcopy->raidPtr = (void *) raidPtr;
   1443 		rrcopy->col = column;
   1444 
   1445 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1446 					   rf_ReconstructInPlaceThread,
   1447 					   rrcopy,"raid_reconip");
   1448 		return(retcode);
   1449 
   1450 	case RAIDFRAME_GET_INFO:
   1451 		if (!raidPtr->valid)
   1452 			return (ENODEV);
   1453 		ucfgp = (RF_DeviceConfig_t **) data;
   1454 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1455 			  (RF_DeviceConfig_t *));
   1456 		if (d_cfg == NULL)
   1457 			return (ENOMEM);
   1458 		d_cfg->rows = 1; /* there is only 1 row now */
   1459 		d_cfg->cols = raidPtr->numCol;
   1460 		d_cfg->ndevs = raidPtr->numCol;
   1461 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1462 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1463 			return (ENOMEM);
   1464 		}
   1465 		d_cfg->nspares = raidPtr->numSpare;
   1466 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1467 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1468 			return (ENOMEM);
   1469 		}
   1470 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1471 		d = 0;
   1472 		for (j = 0; j < d_cfg->cols; j++) {
   1473 			d_cfg->devs[d] = raidPtr->Disks[j];
   1474 			d++;
   1475 		}
   1476 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1477 			d_cfg->spares[i] = raidPtr->Disks[j];
   1478 		}
   1479 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1480 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1481 
   1482 		return (retcode);
   1483 
   1484 	case RAIDFRAME_CHECK_PARITY:
   1485 		*(int *) data = raidPtr->parity_good;
   1486 		return (0);
   1487 
   1488 	case RAIDFRAME_PARITYMAP_STATUS:
   1489 		if (rf_paritymap_ineligible(raidPtr))
   1490 			return EINVAL;
   1491 		rf_paritymap_status(raidPtr->parity_map,
   1492 		    (struct rf_pmstat *)data);
   1493 		return 0;
   1494 
   1495 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1496 		if (rf_paritymap_ineligible(raidPtr))
   1497 			return EINVAL;
   1498 		if (raidPtr->parity_map == NULL)
   1499 			return ENOENT; /* ??? */
   1500 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1501 			(struct rf_pmparams *)data, 1))
   1502 			return EINVAL;
   1503 		return 0;
   1504 
   1505 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1506 		if (rf_paritymap_ineligible(raidPtr))
   1507 			return EINVAL;
   1508 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1509 		return 0;
   1510 
   1511 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1512 		if (rf_paritymap_ineligible(raidPtr))
   1513 			return EINVAL;
   1514 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1515 		/* XXX should errors be passed up? */
   1516 		return 0;
   1517 
   1518 	case RAIDFRAME_RESET_ACCTOTALS:
   1519 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1520 		return (0);
   1521 
   1522 	case RAIDFRAME_GET_ACCTOTALS:
   1523 		totals = (RF_AccTotals_t *) data;
   1524 		*totals = raidPtr->acc_totals;
   1525 		return (0);
   1526 
   1527 	case RAIDFRAME_KEEP_ACCTOTALS:
   1528 		raidPtr->keep_acc_totals = *(int *)data;
   1529 		return (0);
   1530 
   1531 	case RAIDFRAME_GET_SIZE:
   1532 		*(int *) data = raidPtr->totalSectors;
   1533 		return (0);
   1534 
   1535 		/* fail a disk & optionally start reconstruction */
   1536 	case RAIDFRAME_FAIL_DISK:
   1537 
   1538 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1539 			/* Can't do this on a RAID 0!! */
   1540 			return(EINVAL);
   1541 		}
   1542 
   1543 		rr = (struct rf_recon_req *) data;
   1544 		rr->row = 0;
   1545 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1546 			return (EINVAL);
   1547 
   1548 
   1549 		rf_lock_mutex2(raidPtr->mutex);
   1550 		if (raidPtr->status == rf_rs_reconstructing) {
   1551 			/* you can't fail a disk while we're reconstructing! */
   1552 			/* XXX wrong for RAID6 */
   1553 			rf_unlock_mutex2(raidPtr->mutex);
   1554 			return (EINVAL);
   1555 		}
   1556 		if ((raidPtr->Disks[rr->col].status ==
   1557 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1558 			/* some other component has failed.  Let's not make
   1559 			   things worse. XXX wrong for RAID6 */
   1560 			rf_unlock_mutex2(raidPtr->mutex);
   1561 			return (EINVAL);
   1562 		}
   1563 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1564 			/* Can't fail a spared disk! */
   1565 			rf_unlock_mutex2(raidPtr->mutex);
   1566 			return (EINVAL);
   1567 		}
   1568 		rf_unlock_mutex2(raidPtr->mutex);
   1569 
   1570 		/* make a copy of the recon request so that we don't rely on
   1571 		 * the user's buffer */
   1572 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1573 		if (rrcopy == NULL)
   1574 			return(ENOMEM);
   1575 		memcpy(rrcopy, rr, sizeof(*rr));
   1576 		rrcopy->raidPtr = (void *) raidPtr;
   1577 
   1578 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1579 					   rf_ReconThread,
   1580 					   rrcopy,"raid_recon");
   1581 		return (0);
   1582 
   1583 		/* invoke a copyback operation after recon on whatever disk
   1584 		 * needs it, if any */
   1585 	case RAIDFRAME_COPYBACK:
   1586 
   1587 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1588 			/* This makes no sense on a RAID 0!! */
   1589 			return(EINVAL);
   1590 		}
   1591 
   1592 		if (raidPtr->copyback_in_progress == 1) {
   1593 			/* Copyback is already in progress! */
   1594 			return(EINVAL);
   1595 		}
   1596 
   1597 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1598 					   rf_CopybackThread,
   1599 					   raidPtr,"raid_copyback");
   1600 		return (retcode);
   1601 
   1602 		/* return the percentage completion of reconstruction */
   1603 	case RAIDFRAME_CHECK_RECON_STATUS:
   1604 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1605 			/* This makes no sense on a RAID 0, so tell the
   1606 			   user it's done. */
   1607 			*(int *) data = 100;
   1608 			return(0);
   1609 		}
   1610 		if (raidPtr->status != rf_rs_reconstructing)
   1611 			*(int *) data = 100;
   1612 		else {
   1613 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1614 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1615 			} else {
   1616 				*(int *) data = 0;
   1617 			}
   1618 		}
   1619 		return (0);
   1620 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1621 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1622 		if (raidPtr->status != rf_rs_reconstructing) {
   1623 			progressInfo.remaining = 0;
   1624 			progressInfo.completed = 100;
   1625 			progressInfo.total = 100;
   1626 		} else {
   1627 			progressInfo.total =
   1628 				raidPtr->reconControl->numRUsTotal;
   1629 			progressInfo.completed =
   1630 				raidPtr->reconControl->numRUsComplete;
   1631 			progressInfo.remaining = progressInfo.total -
   1632 				progressInfo.completed;
   1633 		}
   1634 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1635 				  sizeof(RF_ProgressInfo_t));
   1636 		return (retcode);
   1637 
   1638 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1639 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1640 			/* This makes no sense on a RAID 0, so tell the
   1641 			   user it's done. */
   1642 			*(int *) data = 100;
   1643 			return(0);
   1644 		}
   1645 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1646 			*(int *) data = 100 *
   1647 				raidPtr->parity_rewrite_stripes_done /
   1648 				raidPtr->Layout.numStripe;
   1649 		} else {
   1650 			*(int *) data = 100;
   1651 		}
   1652 		return (0);
   1653 
   1654 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1655 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1656 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1657 			progressInfo.total = raidPtr->Layout.numStripe;
   1658 			progressInfo.completed =
   1659 				raidPtr->parity_rewrite_stripes_done;
   1660 			progressInfo.remaining = progressInfo.total -
   1661 				progressInfo.completed;
   1662 		} else {
   1663 			progressInfo.remaining = 0;
   1664 			progressInfo.completed = 100;
   1665 			progressInfo.total = 100;
   1666 		}
   1667 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1668 				  sizeof(RF_ProgressInfo_t));
   1669 		return (retcode);
   1670 
   1671 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1672 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1673 			/* This makes no sense on a RAID 0 */
   1674 			*(int *) data = 100;
   1675 			return(0);
   1676 		}
   1677 		if (raidPtr->copyback_in_progress == 1) {
   1678 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1679 				raidPtr->Layout.numStripe;
   1680 		} else {
   1681 			*(int *) data = 100;
   1682 		}
   1683 		return (0);
   1684 
   1685 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1686 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1687 		if (raidPtr->copyback_in_progress == 1) {
   1688 			progressInfo.total = raidPtr->Layout.numStripe;
   1689 			progressInfo.completed =
   1690 				raidPtr->copyback_stripes_done;
   1691 			progressInfo.remaining = progressInfo.total -
   1692 				progressInfo.completed;
   1693 		} else {
   1694 			progressInfo.remaining = 0;
   1695 			progressInfo.completed = 100;
   1696 			progressInfo.total = 100;
   1697 		}
   1698 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1699 				  sizeof(RF_ProgressInfo_t));
   1700 		return (retcode);
   1701 
   1702 		/* the sparetable daemon calls this to wait for the kernel to
   1703 		 * need a spare table. this ioctl does not return until a
   1704 		 * spare table is needed. XXX -- calling mpsleep here in the
   1705 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1706 		 * -- I should either compute the spare table in the kernel,
   1707 		 * or have a different -- XXX XXX -- interface (a different
   1708 		 * character device) for delivering the table     -- XXX */
   1709 #if 0
   1710 	case RAIDFRAME_SPARET_WAIT:
   1711 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1712 		while (!rf_sparet_wait_queue)
   1713 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1714 		waitreq = rf_sparet_wait_queue;
   1715 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1716 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1717 
   1718 		/* structure assignment */
   1719 		*((RF_SparetWait_t *) data) = *waitreq;
   1720 
   1721 		RF_Free(waitreq, sizeof(*waitreq));
   1722 		return (0);
   1723 
   1724 		/* wakes up a process waiting on SPARET_WAIT and puts an error
    1725 		 * code in it that will cause the daemon to exit */
   1726 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1727 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1728 		waitreq->fcol = -1;
   1729 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1730 		waitreq->next = rf_sparet_wait_queue;
   1731 		rf_sparet_wait_queue = waitreq;
    1732 		rf_broadcast_cond2(rf_sparet_wait_cv);
   1733 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1734 		return (0);
   1735 
   1736 		/* used by the spare table daemon to deliver a spare table
   1737 		 * into the kernel */
   1738 	case RAIDFRAME_SEND_SPARET:
   1739 
   1740 		/* install the spare table */
   1741 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1742 
   1743 		/* respond to the requestor.  the return status of the spare
   1744 		 * table installation is passed in the "fcol" field */
   1745 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1746 		waitreq->fcol = retcode;
   1747 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1748 		waitreq->next = rf_sparet_resp_queue;
   1749 		rf_sparet_resp_queue = waitreq;
   1750 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1751 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1752 
   1753 		return (retcode);
   1754 #endif
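         	/*
         	 * Illustrative only -- a rough sketch of the userland daemon loop
         	 * that the (currently disabled) spare-table ioctls above were
         	 * designed for.  compute_sparetable() is a hypothetical helper;
         	 * the fcol == -1 check corresponds to RAIDFRAME_ABORT_SPARET_WAIT:
         	 *
         	 *	RF_SparetWait_t w;
         	 *	void *table;
         	 *
         	 *	for (;;) {
         	 *		if (ioctl(fd, RAIDFRAME_SPARET_WAIT, &w) != 0 ||
         	 *		    w.fcol == -1)
         	 *			break;
         	 *		table = compute_sparetable(&w);
         	 *		(void) ioctl(fd, RAIDFRAME_SEND_SPARET, &table);
         	 *	}
         	 */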
   1755 
   1756 	default:
   1757 		break; /* fall through to the os-specific code below */
   1758 
   1759 	}
   1760 
   1761 	if (!raidPtr->valid)
   1762 		return (EINVAL);
   1763 
   1764 	/*
   1765 	 * Add support for "regular" device ioctls here.
   1766 	 */
   1767 
   1768 	error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
   1769 	if (error != EPASSTHROUGH)
   1770 		return (error);
   1771 
   1772 	switch (cmd) {
   1773 	case DIOCGDINFO:
   1774 		*(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
   1775 		break;
   1776 #ifdef __HAVE_OLD_DISKLABEL
   1777 	case ODIOCGDINFO:
   1778 		newlabel = *(rs->sc_dkdev.dk_label);
   1779 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1780 			return ENOTTY;
   1781 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1782 		break;
   1783 #endif
   1784 
   1785 	case DIOCGPART:
   1786 		((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
   1787 		((struct partinfo *) data)->part =
   1788 		    &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
   1789 		break;
   1790 
   1791 	case DIOCWDINFO:
   1792 	case DIOCSDINFO:
   1793 #ifdef __HAVE_OLD_DISKLABEL
   1794 	case ODIOCWDINFO:
   1795 	case ODIOCSDINFO:
   1796 #endif
   1797 	{
   1798 		struct disklabel *lp;
   1799 #ifdef __HAVE_OLD_DISKLABEL
   1800 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
   1801 			memset(&newlabel, 0, sizeof newlabel);
   1802 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
   1803 			lp = &newlabel;
   1804 		} else
   1805 #endif
   1806 		lp = (struct disklabel *)data;
   1807 
   1808 		if ((error = raidlock(rs)) != 0)
   1809 			return (error);
   1810 
   1811 		rs->sc_flags |= RAIDF_LABELLING;
   1812 
   1813 		error = setdisklabel(rs->sc_dkdev.dk_label,
   1814 		    lp, 0, rs->sc_dkdev.dk_cpulabel);
   1815 		if (error == 0) {
   1816 			if (cmd == DIOCWDINFO
   1817 #ifdef __HAVE_OLD_DISKLABEL
   1818 			    || cmd == ODIOCWDINFO
   1819 #endif
   1820 			   )
   1821 				error = writedisklabel(RAIDLABELDEV(dev),
   1822 				    raidstrategy, rs->sc_dkdev.dk_label,
   1823 				    rs->sc_dkdev.dk_cpulabel);
   1824 		}
   1825 		rs->sc_flags &= ~RAIDF_LABELLING;
   1826 
   1827 		raidunlock(rs);
   1828 
   1829 		if (error)
   1830 			return (error);
   1831 		break;
   1832 	}
   1833 
   1834 	case DIOCWLABEL:
   1835 		if (*(int *) data != 0)
   1836 			rs->sc_flags |= RAIDF_WLABEL;
   1837 		else
   1838 			rs->sc_flags &= ~RAIDF_WLABEL;
   1839 		break;
   1840 
   1841 	case DIOCGDEFLABEL:
   1842 		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
   1843 		break;
   1844 
   1845 #ifdef __HAVE_OLD_DISKLABEL
   1846 	case ODIOCGDEFLABEL:
   1847 		raidgetdefaultlabel(raidPtr, rs, &newlabel);
   1848 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1849 			return ENOTTY;
   1850 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1851 		break;
   1852 #endif
   1853 
   1854 	case DIOCAWEDGE:
   1855 	case DIOCDWEDGE:
   1856 	    	dkw = (void *)data;
   1857 
   1858 		/* If the ioctl happens here, the parent is us. */
   1859 		(void)strcpy(dkw->dkw_parent, rs->sc_xname);
   1860 		return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
   1861 
   1862 	case DIOCLWEDGES:
   1863 		return dkwedge_list(&rs->sc_dkdev,
   1864 		    (struct dkwedge_list *)data, l);
   1865 	case DIOCCACHESYNC:
   1866 		return rf_sync_component_caches(raidPtr);
   1867 
   1868 	case DIOCGSTRATEGY:
   1869 	    {
   1870 		struct disk_strategy *dks = (void *)data;
   1871 
   1872 		s = splbio();
   1873 		strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
   1874 		    sizeof(dks->dks_name));
   1875 		splx(s);
   1876 		dks->dks_paramlen = 0;
   1877 
   1878 		return 0;
   1879 	    }
   1880 
   1881 	case DIOCSSTRATEGY:
   1882 	    {
   1883 		struct disk_strategy *dks = (void *)data;
   1884 		struct bufq_state *new;
   1885 		struct bufq_state *old;
   1886 
   1887 		if (dks->dks_param != NULL) {
   1888 			return EINVAL;
   1889 		}
   1890 		dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
   1891 		error = bufq_alloc(&new, dks->dks_name,
   1892 		    BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
   1893 		if (error) {
   1894 			return error;
   1895 		}
   1896 		s = splbio();
   1897 		old = rs->buf_queue;
   1898 		bufq_move(new, old);
   1899 		rs->buf_queue = new;
   1900 		splx(s);
   1901 		bufq_free(old);
   1902 
   1903 		return 0;
   1904 	    }
   1905 
   1906 	default:
   1907 		retcode = ENOTTY;
   1908 	}
   1909 	return (retcode);
   1910 
   1911 }
   1912 
   1913 
   1914 /* raidinit -- complete the rest of the initialization for the
   1915    RAIDframe device.  */
   1916 
   1917 
   1918 static void
   1919 raidinit(struct raid_softc *rs)
   1920 {
   1921 	cfdata_t cf;
   1922 	int     unit;
   1923 	RF_Raid_t *raidPtr = &rs->sc_r;
   1924 
   1925 	unit = raidPtr->raidid;
   1926 
   1927 
   1928 	/* XXX should check return code first... */
   1929 	rs->sc_flags |= RAIDF_INITED;
   1930 
   1931 	/* XXX doesn't check bounds. */
   1932 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
   1933 
   1934 	/* attach the pseudo device */
   1935 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
   1936 	cf->cf_name = raid_cd.cd_name;
   1937 	cf->cf_atname = raid_cd.cd_name;
   1938 	cf->cf_unit = unit;
   1939 	cf->cf_fstate = FSTATE_STAR;
   1940 
   1941 	rs->sc_dev = config_attach_pseudo(cf);
   1942 
   1943 	if (rs->sc_dev == NULL) {
   1944 		printf("raid%d: config_attach_pseudo failed\n",
   1945 		    raidPtr->raidid);
   1946 		rs->sc_flags &= ~RAIDF_INITED;
   1947 		free(cf, M_RAIDFRAME);
   1948 		return;
   1949 	}
   1950 
   1951 	/* disk_attach actually creates space for the CPU disklabel, among
   1952 	 * other things, so it's critical to call this *BEFORE* we try putzing
   1953 	 * with disklabels. */
   1954 
   1955 	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
   1956 	disk_attach(&rs->sc_dkdev);
   1957 	disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);
   1958 
   1959 	/* XXX There may be a weird interaction here between this, and
   1960 	 * protectedSectors, as used in RAIDframe.  */
   1961 
   1962 	rs->sc_size = raidPtr->totalSectors;
   1963 
   1964 	dkwedge_discover(&rs->sc_dkdev);
   1965 
   1966 	rf_set_geometry(rs, raidPtr);
   1967 
   1968 }
   1969 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1970 /* wake up the daemon & tell it to get us a spare table
   1971  * XXX
   1972  * the entries in the queues should be tagged with the raidPtr
   1973  * so that in the extremely rare case that two recons happen at once,
    1974  * we know for which device we're requesting a spare table
   1975  * XXX
   1976  *
   1977  * XXX This code is not currently used. GO
   1978  */
   1979 int
   1980 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
   1981 {
   1982 	int     retcode;
   1983 
   1984 	rf_lock_mutex2(rf_sparet_wait_mutex);
   1985 	req->next = rf_sparet_wait_queue;
   1986 	rf_sparet_wait_queue = req;
   1987 	rf_broadcast_cond2(rf_sparet_wait_cv);
   1988 
   1989 	/* mpsleep unlocks the mutex */
   1990 	while (!rf_sparet_resp_queue) {
   1991 		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
   1992 	}
   1993 	req = rf_sparet_resp_queue;
   1994 	rf_sparet_resp_queue = req->next;
   1995 	rf_unlock_mutex2(rf_sparet_wait_mutex);
   1996 
   1997 	retcode = req->fcol;
   1998 	RF_Free(req, sizeof(*req));	/* this is not the same req as we
   1999 					 * alloc'd */
   2000 	return (retcode);
   2001 }
   2002 #endif
   2003 
   2004 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   2005  * bp & passes it down.
    2006  * any calls originating in the kernel must use non-blocking I/O.
    2007  * We do some extra sanity checking to return "appropriate" error values
    2008  * for certain conditions (to make some standard utilities work).
   2009  *
   2010  * Formerly known as: rf_DoAccessKernel
   2011  */
   2012 void
   2013 raidstart(RF_Raid_t *raidPtr)
   2014 {
   2015 	RF_SectorCount_t num_blocks, pb, sum;
   2016 	RF_RaidAddr_t raid_addr;
   2017 	struct partition *pp;
   2018 	daddr_t blocknum;
   2019 	struct raid_softc *rs;
   2020 	int     do_async;
   2021 	struct buf *bp;
   2022 	int rc;
   2023 
   2024 	rs = raidPtr->softc;
   2025 	/* quick check to see if anything has died recently */
   2026 	rf_lock_mutex2(raidPtr->mutex);
   2027 	if (raidPtr->numNewFailures > 0) {
   2028 		rf_unlock_mutex2(raidPtr->mutex);
   2029 		rf_update_component_labels(raidPtr,
   2030 					   RF_NORMAL_COMPONENT_UPDATE);
   2031 		rf_lock_mutex2(raidPtr->mutex);
   2032 		raidPtr->numNewFailures--;
   2033 	}
   2034 
   2035 	/* Check to see if we're at the limit... */
   2036 	while (raidPtr->openings > 0) {
   2037 		rf_unlock_mutex2(raidPtr->mutex);
   2038 
   2039 		/* get the next item, if any, from the queue */
   2040 		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
   2041 			/* nothing more to do */
   2042 			return;
   2043 		}
   2044 
   2045 		/* Ok, for the bp we have here, bp->b_blkno is relative to the
   2046 		 * partition.. Need to make it absolute to the underlying
   2047 		 * device.. */
   2048 
   2049 		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
   2050 		if (DISKPART(bp->b_dev) != RAW_PART) {
   2051 			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
   2052 			blocknum += pp->p_offset;
   2053 		}
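         		/*
         		 * Worked example (illustrative numbers only): with 512-byte
         		 * sectors (logBytesPerSector == 9) and DEV_BSHIFT == 9 the
         		 * two shifts cancel, so a request at b_blkno 128 on a
         		 * partition whose p_offset is 1024 maps to absolute sector
         		 * 1152 on the RAID device.
         		 */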
   2054 
   2055 		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   2056 			    (int) blocknum));
   2057 
   2058 		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   2059 		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   2060 
   2061 		/* *THIS* is where we adjust what block we're going to...
   2062 		 * but DO NOT TOUCH bp->b_blkno!!! */
   2063 		raid_addr = blocknum;
   2064 
   2065 		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   2066 		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   2067 		sum = raid_addr + num_blocks + pb;
   2068 		if (1 || rf_debugKernelAccess) {
   2069 			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   2070 				    (int) raid_addr, (int) sum, (int) num_blocks,
   2071 				    (int) pb, (int) bp->b_resid));
   2072 		}
   2073 		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   2074 		    || (sum < num_blocks) || (sum < pb)) {
   2075 			bp->b_error = ENOSPC;
   2076 			bp->b_resid = bp->b_bcount;
   2077 			biodone(bp);
   2078 			rf_lock_mutex2(raidPtr->mutex);
   2079 			continue;
   2080 		}
   2081 		/*
   2082 		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   2083 		 */
   2084 
   2085 		if (bp->b_bcount & raidPtr->sectorMask) {
   2086 			bp->b_error = EINVAL;
   2087 			bp->b_resid = bp->b_bcount;
   2088 			biodone(bp);
   2089 			rf_lock_mutex2(raidPtr->mutex);
   2090 			continue;
   2091 
   2092 		}
   2093 		db1_printf(("Calling DoAccess..\n"));
   2094 
   2095 
   2096 		rf_lock_mutex2(raidPtr->mutex);
   2097 		raidPtr->openings--;
   2098 		rf_unlock_mutex2(raidPtr->mutex);
   2099 
   2100 		/*
   2101 		 * Everything is async.
   2102 		 */
   2103 		do_async = 1;
   2104 
   2105 		disk_busy(&rs->sc_dkdev);
   2106 
   2107 		/* XXX we're still at splbio() here... do we *really*
   2108 		   need to be? */
   2109 
   2110 		/* don't ever condition on bp->b_flags & B_WRITE.
   2111 		 * always condition on B_READ instead */
   2112 
   2113 		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   2114 				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   2115 				 do_async, raid_addr, num_blocks,
   2116 				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   2117 
   2118 		if (rc) {
   2119 			bp->b_error = rc;
   2120 			bp->b_resid = bp->b_bcount;
   2121 			biodone(bp);
   2122 			/* continue loop */
   2123 		}
   2124 
   2125 		rf_lock_mutex2(raidPtr->mutex);
   2126 	}
   2127 	rf_unlock_mutex2(raidPtr->mutex);
   2128 }
   2129 
   2130 
   2131 
   2132 
   2133 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2134 
   2135 int
   2136 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2137 {
   2138 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2139 	struct buf *bp;
   2140 
   2141 	req->queue = queue;
   2142 	bp = req->bp;
   2143 
   2144 	switch (req->type) {
   2145 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2146 		/* XXX need to do something extra here.. */
   2147 		/* I'm leaving this in, as I've never actually seen it used,
   2148 		 * and I'd like folks to report it... GO */
    2149 		printf("WAKEUP CALLED\n");
   2150 		queue->numOutstanding++;
   2151 
   2152 		bp->b_flags = 0;
   2153 		bp->b_private = req;
   2154 
   2155 		KernelWakeupFunc(bp);
   2156 		break;
   2157 
   2158 	case RF_IO_TYPE_READ:
   2159 	case RF_IO_TYPE_WRITE:
   2160 #if RF_ACC_TRACE > 0
   2161 		if (req->tracerec) {
   2162 			RF_ETIMER_START(req->tracerec->timer);
   2163 		}
   2164 #endif
   2165 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2166 		    op, queue->rf_cinfo->ci_dev,
   2167 		    req->sectorOffset, req->numSector,
   2168 		    req->buf, KernelWakeupFunc, (void *) req,
   2169 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2170 
   2171 		if (rf_debugKernelAccess) {
   2172 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2173 				(long) bp->b_blkno));
   2174 		}
   2175 		queue->numOutstanding++;
   2176 		queue->last_deq_sector = req->sectorOffset;
   2177 		/* acc wouldn't have been let in if there were any pending
   2178 		 * reqs at any other priority */
   2179 		queue->curPriority = req->priority;
   2180 
   2181 		db1_printf(("Going for %c to unit %d col %d\n",
   2182 			    req->type, queue->raidPtr->raidid,
   2183 			    queue->col));
   2184 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2185 			(int) req->sectorOffset, (int) req->numSector,
   2186 			(int) (req->numSector <<
   2187 			    queue->raidPtr->logBytesPerSector),
   2188 			(int) queue->raidPtr->logBytesPerSector));
   2189 
   2190 		/*
   2191 		 * XXX: drop lock here since this can block at
   2192 		 * least with backing SCSI devices.  Retake it
   2193 		 * to minimize fuss with calling interfaces.
   2194 		 */
   2195 
   2196 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2197 		bdev_strategy(bp);
   2198 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2199 		break;
   2200 
   2201 	default:
   2202 		panic("bad req->type in rf_DispatchKernelIO");
   2203 	}
   2204 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2205 
   2206 	return (0);
   2207 }
    2208 /* this is the callback function associated with an I/O invoked from
   2209    kernel code.
   2210  */
   2211 static void
   2212 KernelWakeupFunc(struct buf *bp)
   2213 {
   2214 	RF_DiskQueueData_t *req = NULL;
   2215 	RF_DiskQueue_t *queue;
   2216 
   2217 	db1_printf(("recovering the request queue:\n"));
   2218 
   2219 	req = bp->b_private;
   2220 
   2221 	queue = (RF_DiskQueue_t *) req->queue;
   2222 
   2223 	rf_lock_mutex2(queue->raidPtr->iodone_lock);
   2224 
   2225 #if RF_ACC_TRACE > 0
   2226 	if (req->tracerec) {
   2227 		RF_ETIMER_STOP(req->tracerec->timer);
   2228 		RF_ETIMER_EVAL(req->tracerec->timer);
   2229 		rf_lock_mutex2(rf_tracing_mutex);
   2230 		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2231 		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2232 		req->tracerec->num_phys_ios++;
   2233 		rf_unlock_mutex2(rf_tracing_mutex);
   2234 	}
   2235 #endif
   2236 
   2237 	/* XXX Ok, let's get aggressive... If b_error is set, let's go
   2238 	 * ballistic, and mark the component as hosed... */
   2239 
   2240 	if (bp->b_error != 0) {
   2241 		/* Mark the disk as dead */
   2242 		/* but only mark it once... */
   2243 		/* and only if it wouldn't leave this RAID set
   2244 		   completely broken */
   2245 		if (((queue->raidPtr->Disks[queue->col].status ==
   2246 		      rf_ds_optimal) ||
   2247 		     (queue->raidPtr->Disks[queue->col].status ==
   2248 		      rf_ds_used_spare)) &&
   2249 		     (queue->raidPtr->numFailures <
   2250 		      queue->raidPtr->Layout.map->faultsTolerated)) {
   2251 			printf("raid%d: IO Error.  Marking %s as failed.\n",
   2252 			       queue->raidPtr->raidid,
   2253 			       queue->raidPtr->Disks[queue->col].devname);
   2254 			queue->raidPtr->Disks[queue->col].status =
   2255 			    rf_ds_failed;
   2256 			queue->raidPtr->status = rf_rs_degraded;
   2257 			queue->raidPtr->numFailures++;
   2258 			queue->raidPtr->numNewFailures++;
   2259 		} else {	/* Disk is already dead... */
   2260 			/* printf("Disk already marked as dead!\n"); */
   2261 		}
   2262 
   2263 	}
   2264 
   2265 	/* Fill in the error value */
   2266 	req->error = bp->b_error;
   2267 
   2268 	/* Drop this one on the "finished" queue... */
   2269 	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
   2270 
   2271 	/* Let the raidio thread know there is work to be done. */
   2272 	rf_signal_cond2(queue->raidPtr->iodone_cv);
   2273 
   2274 	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
   2275 }
   2276 
   2277 
   2278 /*
   2279  * initialize a buf structure for doing an I/O in the kernel.
   2280  */
   2281 static void
   2282 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2283        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2284        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2285        struct proc *b_proc)
   2286 {
   2287 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2288 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2289 	bp->b_oflags = 0;
   2290 	bp->b_cflags = 0;
   2291 	bp->b_bcount = numSect << logBytesPerSector;
   2292 	bp->b_bufsize = bp->b_bcount;
   2293 	bp->b_error = 0;
   2294 	bp->b_dev = dev;
   2295 	bp->b_data = bf;
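         	/*
         	 * Convert the RAIDframe sector number into the DEV_BSIZE units
         	 * b_blkno expects: scale up by the component sector size, then
         	 * down by DEV_BSIZE.  (With 512-byte sectors the two shifts
         	 * cancel and b_blkno equals startSect.)
         	 */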
   2296 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2297 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2298 	if (bp->b_bcount == 0) {
   2299 		panic("bp->b_bcount is zero in InitBP!!");
   2300 	}
   2301 	bp->b_proc = b_proc;
   2302 	bp->b_iodone = cbFunc;
   2303 	bp->b_private = cbArg;
   2304 }
   2305 
   2306 static void
   2307 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
   2308 		    struct disklabel *lp)
   2309 {
   2310 	memset(lp, 0, sizeof(*lp));
   2311 
   2312 	/* fabricate a label... */
   2313 	lp->d_secperunit = raidPtr->totalSectors;
   2314 	lp->d_secsize = raidPtr->bytesPerSector;
   2315 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   2316 	lp->d_ntracks = 4 * raidPtr->numCol;
   2317 	lp->d_ncylinders = raidPtr->totalSectors /
   2318 		(lp->d_nsectors * lp->d_ntracks);
   2319 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
   2320 
   2321 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
   2322 	lp->d_type = DTYPE_RAID;
   2323 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
   2324 	lp->d_rpm = 3600;
   2325 	lp->d_interleave = 1;
   2326 	lp->d_flags = 0;
   2327 
   2328 	lp->d_partitions[RAW_PART].p_offset = 0;
   2329 	lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
   2330 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
   2331 	lp->d_npartitions = RAW_PART + 1;
   2332 
   2333 	lp->d_magic = DISKMAGIC;
   2334 	lp->d_magic2 = DISKMAGIC;
   2335 	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
   2336 
   2337 }
   2338 /*
   2339  * Read the disklabel from the raid device.  If one is not present, fake one
   2340  * up.
   2341  */
   2342 static void
   2343 raidgetdisklabel(dev_t dev)
   2344 {
   2345 	int     unit = raidunit(dev);
   2346 	struct raid_softc *rs;
   2347 	const char   *errstring;
   2348 	struct disklabel *lp;
   2349 	struct cpu_disklabel *clp;
   2350 	RF_Raid_t *raidPtr;
   2351 
   2352 	if ((rs = raidget(unit)) == NULL)
   2353 		return;
   2354 
   2355 	lp = rs->sc_dkdev.dk_label;
   2356 	clp = rs->sc_dkdev.dk_cpulabel;
   2357 
   2358 	db1_printf(("Getting the disklabel...\n"));
   2359 
   2360 	memset(clp, 0, sizeof(*clp));
   2361 
   2362 	raidPtr = &rs->sc_r;
   2363 
   2364 	raidgetdefaultlabel(raidPtr, rs, lp);
   2365 
   2366 	/*
   2367 	 * Call the generic disklabel extraction routine.
   2368 	 */
   2369 	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
   2370 	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
   2371 	if (errstring)
   2372 		raidmakedisklabel(rs);
   2373 	else {
   2374 		int     i;
   2375 		struct partition *pp;
   2376 
   2377 		/*
   2378 		 * Sanity check whether the found disklabel is valid.
   2379 		 *
    2380 		 * This is necessary since the total size of the raid device
    2381 		 * may vary when the interleave is changed even though exactly
    2382 		 * the same components are used, and an old disklabel may be
    2383 		 * used if one is found.
   2384 		 */
   2385 		if (lp->d_secperunit != rs->sc_size)
   2386 			printf("raid%d: WARNING: %s: "
   2387 			    "total sector size in disklabel (%" PRIu32 ") != "
   2388 			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
   2389 			    lp->d_secperunit, rs->sc_size);
   2390 		for (i = 0; i < lp->d_npartitions; i++) {
   2391 			pp = &lp->d_partitions[i];
   2392 			if (pp->p_offset + pp->p_size > rs->sc_size)
   2393 				printf("raid%d: WARNING: %s: end of partition `%c' "
   2394 				       "exceeds the size of raid (%" PRIu64 ")\n",
   2395 				       unit, rs->sc_xname, 'a' + i, rs->sc_size);
   2396 		}
   2397 	}
   2398 
   2399 }
   2400 /*
   2401  * Take care of things one might want to take care of in the event
   2402  * that a disklabel isn't present.
   2403  */
   2404 static void
   2405 raidmakedisklabel(struct raid_softc *rs)
   2406 {
   2407 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2408 	db1_printf(("Making a label..\n"));
   2409 
   2410 	/*
   2411 	 * For historical reasons, if there's no disklabel present
   2412 	 * the raw partition must be marked FS_BSDFFS.
   2413 	 */
   2414 
   2415 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
   2416 
   2417 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
   2418 
   2419 	lp->d_checksum = dkcksum(lp);
   2420 }
   2421 /*
   2422  * Wait interruptibly for an exclusive lock.
   2423  *
   2424  * XXX
   2425  * Several drivers do this; it should be abstracted and made MP-safe.
   2426  * (Hmm... where have we seen this warning before :->  GO )
   2427  */
   2428 static int
   2429 raidlock(struct raid_softc *rs)
   2430 {
   2431 	int     error;
   2432 
   2433 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2434 		rs->sc_flags |= RAIDF_WANTED;
   2435 		if ((error =
   2436 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
   2437 			return (error);
   2438 	}
   2439 	rs->sc_flags |= RAIDF_LOCKED;
   2440 	return (0);
   2441 }
   2442 /*
   2443  * Unlock and wake up any waiters.
   2444  */
   2445 static void
   2446 raidunlock(struct raid_softc *rs)
   2447 {
   2448 
   2449 	rs->sc_flags &= ~RAIDF_LOCKED;
   2450 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2451 		rs->sc_flags &= ~RAIDF_WANTED;
   2452 		wakeup(rs);
   2453 	}
   2454 }
   2455 
   2456 
   2457 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2458 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2459 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2460 
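         /*
          * Layout of the per-component metadata area implemented by the
          * helpers below: the component label lives RF_COMPONENT_INFO_OFFSET
          * bytes into the component and is padded to the larger of one sector
          * and RF_COMPONENT_INFO_SIZE bytes; the parity map starts immediately
          * after it.  For example, with 512-byte sectors the label sits at
          * byte 16384 and the parity map at byte 16384 + 1024 = 17408.
          */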
   2461 static daddr_t
   2462 rf_component_info_offset(void)
   2463 {
   2464 
   2465 	return RF_COMPONENT_INFO_OFFSET;
   2466 }
   2467 
   2468 static daddr_t
   2469 rf_component_info_size(unsigned secsize)
   2470 {
   2471 	daddr_t info_size;
   2472 
   2473 	KASSERT(secsize);
   2474 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2475 		info_size = secsize;
   2476 	else
   2477 		info_size = RF_COMPONENT_INFO_SIZE;
   2478 
   2479 	return info_size;
   2480 }
   2481 
   2482 static daddr_t
   2483 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2484 {
   2485 	daddr_t map_offset;
   2486 
   2487 	KASSERT(raidPtr->bytesPerSector);
   2488 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2489 		map_offset = raidPtr->bytesPerSector;
   2490 	else
   2491 		map_offset = RF_COMPONENT_INFO_SIZE;
   2492 	map_offset += rf_component_info_offset();
   2493 
   2494 	return map_offset;
   2495 }
   2496 
   2497 static daddr_t
   2498 rf_parity_map_size(RF_Raid_t *raidPtr)
   2499 {
   2500 	daddr_t map_size;
   2501 
   2502 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2503 		map_size = raidPtr->bytesPerSector;
   2504 	else
   2505 		map_size = RF_PARITY_MAP_SIZE;
   2506 
   2507 	return map_size;
   2508 }
   2509 
   2510 int
   2511 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2512 {
   2513 	RF_ComponentLabel_t *clabel;
   2514 
   2515 	clabel = raidget_component_label(raidPtr, col);
   2516 	clabel->clean = RF_RAID_CLEAN;
   2517 	raidflush_component_label(raidPtr, col);
   2518 	return(0);
   2519 }
   2520 
   2521 
   2522 int
   2523 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2524 {
   2525 	RF_ComponentLabel_t *clabel;
   2526 
   2527 	clabel = raidget_component_label(raidPtr, col);
   2528 	clabel->clean = RF_RAID_DIRTY;
   2529 	raidflush_component_label(raidPtr, col);
   2530 	return(0);
   2531 }
   2532 
   2533 int
   2534 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2535 {
   2536 	KASSERT(raidPtr->bytesPerSector);
   2537 	return raidread_component_label(raidPtr->bytesPerSector,
   2538 	    raidPtr->Disks[col].dev,
   2539 	    raidPtr->raid_cinfo[col].ci_vp,
   2540 	    &raidPtr->raid_cinfo[col].ci_label);
   2541 }
   2542 
   2543 RF_ComponentLabel_t *
   2544 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2545 {
   2546 	return &raidPtr->raid_cinfo[col].ci_label;
   2547 }
   2548 
   2549 int
   2550 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2551 {
   2552 	RF_ComponentLabel_t *label;
   2553 
   2554 	label = &raidPtr->raid_cinfo[col].ci_label;
   2555 	label->mod_counter = raidPtr->mod_counter;
   2556 #ifndef RF_NO_PARITY_MAP
   2557 	label->parity_map_modcount = label->mod_counter;
   2558 #endif
   2559 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2560 	    raidPtr->Disks[col].dev,
   2561 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2562 }
   2563 
   2564 
   2565 static int
   2566 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2567     RF_ComponentLabel_t *clabel)
   2568 {
   2569 	return raidread_component_area(dev, b_vp, clabel,
   2570 	    sizeof(RF_ComponentLabel_t),
   2571 	    rf_component_info_offset(),
   2572 	    rf_component_info_size(secsize));
   2573 }
   2574 
   2575 /* ARGSUSED */
   2576 static int
   2577 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2578     size_t msize, daddr_t offset, daddr_t dsize)
   2579 {
   2580 	struct buf *bp;
   2581 	const struct bdevsw *bdev;
   2582 	int error;
   2583 
   2584 	/* XXX should probably ensure that we don't try to do this if
   2585 	   someone has changed rf_protected_sectors. */
   2586 
   2587 	if (b_vp == NULL) {
   2588 		/* For whatever reason, this component is not valid.
   2589 		   Don't try to read a component label from it. */
   2590 		return(EINVAL);
   2591 	}
   2592 
   2593 	/* get a block of the appropriate size... */
   2594 	bp = geteblk((int)dsize);
   2595 	bp->b_dev = dev;
   2596 
   2597 	/* get our ducks in a row for the read */
   2598 	bp->b_blkno = offset / DEV_BSIZE;
   2599 	bp->b_bcount = dsize;
   2600 	bp->b_flags |= B_READ;
   2601  	bp->b_resid = dsize;
   2602 
   2603 	bdev = bdevsw_lookup(bp->b_dev);
   2604 	if (bdev == NULL)
   2605 		return (ENXIO);
   2606 	(*bdev->d_strategy)(bp);
   2607 
   2608 	error = biowait(bp);
   2609 
   2610 	if (!error) {
   2611 		memcpy(data, bp->b_data, msize);
   2612 	}
   2613 
   2614 	brelse(bp, 0);
   2615 	return(error);
   2616 }
   2617 
   2618 
   2619 static int
   2620 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2621     RF_ComponentLabel_t *clabel)
   2622 {
   2623 	return raidwrite_component_area(dev, b_vp, clabel,
   2624 	    sizeof(RF_ComponentLabel_t),
   2625 	    rf_component_info_offset(),
   2626 	    rf_component_info_size(secsize), 0);
   2627 }
   2628 
   2629 /* ARGSUSED */
   2630 static int
   2631 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2632     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2633 {
   2634 	struct buf *bp;
   2635 	const struct bdevsw *bdev;
   2636 	int error;
   2637 
   2638 	/* get a block of the appropriate size... */
   2639 	bp = geteblk((int)dsize);
   2640 	bp->b_dev = dev;
   2641 
   2642 	/* get our ducks in a row for the write */
   2643 	bp->b_blkno = offset / DEV_BSIZE;
   2644 	bp->b_bcount = dsize;
   2645 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2646  	bp->b_resid = dsize;
   2647 
   2648 	memset(bp->b_data, 0, dsize);
   2649 	memcpy(bp->b_data, data, msize);
   2650 
   2651 	bdev = bdevsw_lookup(bp->b_dev);
   2652 	if (bdev == NULL)
   2653 		return (ENXIO);
   2654 	(*bdev->d_strategy)(bp);
   2655 	if (asyncp)
   2656 		return 0;
   2657 	error = biowait(bp);
   2658 	brelse(bp, 0);
   2659 	if (error) {
   2660 #if 1
   2661 		printf("Failed to write RAID component info!\n");
   2662 #endif
   2663 	}
   2664 
   2665 	return(error);
   2666 }
   2667 
   2668 void
   2669 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2670 {
   2671 	int c;
   2672 
   2673 	for (c = 0; c < raidPtr->numCol; c++) {
   2674 		/* Skip dead disks. */
   2675 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2676 			continue;
   2677 		/* XXXjld: what if an error occurs here? */
   2678 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2679 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2680 		    RF_PARITYMAP_NBYTE,
   2681 		    rf_parity_map_offset(raidPtr),
   2682 		    rf_parity_map_size(raidPtr), 0);
   2683 	}
   2684 }
   2685 
   2686 void
   2687 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2688 {
   2689 	struct rf_paritymap_ondisk tmp;
   2690 	int c,first;
   2691 
   2692 	first=1;
   2693 	for (c = 0; c < raidPtr->numCol; c++) {
   2694 		/* Skip dead disks. */
   2695 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2696 			continue;
   2697 		raidread_component_area(raidPtr->Disks[c].dev,
   2698 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2699 		    RF_PARITYMAP_NBYTE,
   2700 		    rf_parity_map_offset(raidPtr),
   2701 		    rf_parity_map_size(raidPtr));
   2702 		if (first) {
   2703 			memcpy(map, &tmp, sizeof(*map));
   2704 			first = 0;
   2705 		} else {
   2706 			rf_paritymap_merge(map, &tmp);
   2707 		}
   2708 	}
   2709 }
   2710 
   2711 void
   2712 rf_markalldirty(RF_Raid_t *raidPtr)
   2713 {
   2714 	RF_ComponentLabel_t *clabel;
   2715 	int sparecol;
   2716 	int c;
   2717 	int j;
   2718 	int scol = -1;
   2719 
   2720 	raidPtr->mod_counter++;
   2721 	for (c = 0; c < raidPtr->numCol; c++) {
   2722 		/* we don't want to touch (at all) a disk that has
   2723 		   failed */
   2724 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
   2725 			clabel = raidget_component_label(raidPtr, c);
   2726 			if (clabel->status == rf_ds_spared) {
   2727 				/* XXX do something special...
   2728 				   but whatever you do, don't
   2729 				   try to access it!! */
   2730 			} else {
   2731 				raidmarkdirty(raidPtr, c);
   2732 			}
   2733 		}
   2734 	}
   2735 
   2736 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2737 		sparecol = raidPtr->numCol + c;
   2738 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2739 			/*
   2740 
   2741 			   we claim this disk is "optimal" if it's
   2742 			   rf_ds_used_spare, as that means it should be
   2743 			   directly substitutable for the disk it replaced.
   2744 			   We note that too...
   2745 
   2746 			 */
   2747 
   2748 			for(j=0;j<raidPtr->numCol;j++) {
   2749 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2750 					scol = j;
   2751 					break;
   2752 				}
   2753 			}
   2754 
   2755 			clabel = raidget_component_label(raidPtr, sparecol);
   2756 			/* make sure status is noted */
   2757 
   2758 			raid_init_component_label(raidPtr, clabel);
   2759 
   2760 			clabel->row = 0;
   2761 			clabel->column = scol;
   2762 			/* Note: we *don't* change status from rf_ds_used_spare
   2763 			   to rf_ds_optimal */
   2764 			/* clabel.status = rf_ds_optimal; */
   2765 
   2766 			raidmarkdirty(raidPtr, sparecol);
   2767 		}
   2768 	}
   2769 }
   2770 
   2771 
   2772 void
   2773 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
   2774 {
   2775 	RF_ComponentLabel_t *clabel;
   2776 	int sparecol;
   2777 	int c;
   2778 	int j;
   2779 	int scol;
   2780 
   2781 	scol = -1;
   2782 
   2783 	/* XXX should do extra checks to make sure things really are clean,
   2784 	   rather than blindly setting the clean bit... */
   2785 
   2786 	raidPtr->mod_counter++;
   2787 
   2788 	for (c = 0; c < raidPtr->numCol; c++) {
   2789 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   2790 			clabel = raidget_component_label(raidPtr, c);
   2791 			/* make sure status is noted */
   2792 			clabel->status = rf_ds_optimal;
   2793 
   2794 			/* note what unit we are configured as */
   2795 			clabel->last_unit = raidPtr->raidid;
   2796 
   2797 			raidflush_component_label(raidPtr, c);
   2798 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2799 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2800 					raidmarkclean(raidPtr, c);
   2801 				}
   2802 			}
   2803 		}
   2804 		/* else we don't touch it.. */
   2805 	}
   2806 
   2807 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2808 		sparecol = raidPtr->numCol + c;
   2809 		/* Need to ensure that the reconstruct actually completed! */
   2810 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2811 			/*
   2812 
   2813 			   we claim this disk is "optimal" if it's
   2814 			   rf_ds_used_spare, as that means it should be
   2815 			   directly substitutable for the disk it replaced.
   2816 			   We note that too...
   2817 
   2818 			 */
   2819 
   2820 			for(j=0;j<raidPtr->numCol;j++) {
   2821 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2822 					scol = j;
   2823 					break;
   2824 				}
   2825 			}
   2826 
   2827 			/* XXX shouldn't *really* need this... */
   2828 			clabel = raidget_component_label(raidPtr, sparecol);
   2829 			/* make sure status is noted */
   2830 
   2831 			raid_init_component_label(raidPtr, clabel);
   2832 
   2833 			clabel->column = scol;
   2834 			clabel->status = rf_ds_optimal;
   2835 			clabel->last_unit = raidPtr->raidid;
   2836 
   2837 			raidflush_component_label(raidPtr, sparecol);
   2838 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2839 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2840 					raidmarkclean(raidPtr, sparecol);
   2841 				}
   2842 			}
   2843 		}
   2844 	}
   2845 }
   2846 
   2847 void
   2848 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2849 {
   2850 
   2851 	if (vp != NULL) {
   2852 		if (auto_configured == 1) {
   2853 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2854 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2855 			vput(vp);
   2856 
   2857 		} else {
   2858 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2859 		}
   2860 	}
   2861 }
   2862 
   2863 
   2864 void
   2865 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2866 {
   2867 	int r,c;
   2868 	struct vnode *vp;
   2869 	int acd;
   2870 
   2871 
   2872 	/* We take this opportunity to close the vnodes like we should.. */
   2873 
   2874 	for (c = 0; c < raidPtr->numCol; c++) {
   2875 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2876 		acd = raidPtr->Disks[c].auto_configured;
   2877 		rf_close_component(raidPtr, vp, acd);
   2878 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2879 		raidPtr->Disks[c].auto_configured = 0;
   2880 	}
   2881 
   2882 	for (r = 0; r < raidPtr->numSpare; r++) {
   2883 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2884 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2885 		rf_close_component(raidPtr, vp, acd);
   2886 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2887 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2888 	}
   2889 }
   2890 
   2891 
   2892 void
   2893 rf_ReconThread(struct rf_recon_req *req)
   2894 {
   2895 	int     s;
   2896 	RF_Raid_t *raidPtr;
   2897 
   2898 	s = splbio();
   2899 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2900 	raidPtr->recon_in_progress = 1;
   2901 
   2902 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2903 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2904 
   2905 	RF_Free(req, sizeof(*req));
   2906 
   2907 	raidPtr->recon_in_progress = 0;
   2908 	splx(s);
   2909 
   2910 	/* That's all... */
   2911 	kthread_exit(0);	/* does not return */
   2912 }
   2913 
   2914 void
   2915 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2916 {
   2917 	int retcode;
   2918 	int s;
   2919 
   2920 	raidPtr->parity_rewrite_stripes_done = 0;
   2921 	raidPtr->parity_rewrite_in_progress = 1;
   2922 	s = splbio();
   2923 	retcode = rf_RewriteParity(raidPtr);
   2924 	splx(s);
   2925 	if (retcode) {
   2926 		printf("raid%d: Error re-writing parity (%d)!\n",
   2927 		    raidPtr->raidid, retcode);
   2928 	} else {
    2929 		/* set the clean bit!  If we shut down correctly,
   2930 		   the clean bit on each component label will get
   2931 		   set */
   2932 		raidPtr->parity_good = RF_RAID_CLEAN;
   2933 	}
   2934 	raidPtr->parity_rewrite_in_progress = 0;
   2935 
   2936 	/* Anyone waiting for us to stop?  If so, inform them... */
   2937 	if (raidPtr->waitShutdown) {
   2938 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2939 	}
   2940 
   2941 	/* That's all... */
   2942 	kthread_exit(0);	/* does not return */
   2943 }
   2944 
   2945 
   2946 void
   2947 rf_CopybackThread(RF_Raid_t *raidPtr)
   2948 {
   2949 	int s;
   2950 
   2951 	raidPtr->copyback_in_progress = 1;
   2952 	s = splbio();
   2953 	rf_CopybackReconstructedData(raidPtr);
   2954 	splx(s);
   2955 	raidPtr->copyback_in_progress = 0;
   2956 
   2957 	/* That's all... */
   2958 	kthread_exit(0);	/* does not return */
   2959 }
   2960 
   2961 
   2962 void
   2963 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
   2964 {
   2965 	int s;
   2966 	RF_Raid_t *raidPtr;
   2967 
   2968 	s = splbio();
   2969 	raidPtr = req->raidPtr;
   2970 	raidPtr->recon_in_progress = 1;
   2971 	rf_ReconstructInPlace(raidPtr, req->col);
   2972 	RF_Free(req, sizeof(*req));
   2973 	raidPtr->recon_in_progress = 0;
   2974 	splx(s);
   2975 
   2976 	/* That's all... */
   2977 	kthread_exit(0);	/* does not return */
   2978 }
   2979 
   2980 static RF_AutoConfig_t *
   2981 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
   2982     const char *cname, RF_SectorCount_t size, uint64_t numsecs,
   2983     unsigned secsize)
   2984 {
   2985 	int good_one = 0;
   2986 	RF_ComponentLabel_t *clabel;
   2987 	RF_AutoConfig_t *ac;
   2988 
   2989 	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
   2990 	if (clabel == NULL) {
   2991 oomem:
   2992 		    while(ac_list) {
   2993 			    ac = ac_list;
   2994 			    if (ac->clabel)
   2995 				    free(ac->clabel, M_RAIDFRAME);
   2996 			    ac_list = ac_list->next;
   2997 			    free(ac, M_RAIDFRAME);
   2998 		    }
   2999 		    printf("RAID auto config: out of memory!\n");
   3000 		    return NULL; /* XXX probably should panic? */
   3001 	}
   3002 
   3003 	if (!raidread_component_label(secsize, dev, vp, clabel)) {
   3004 		/* Got the label.  Does it look reasonable? */
   3005 		if (rf_reasonable_label(clabel, numsecs) &&
   3006 		    (rf_component_label_partitionsize(clabel) <= size)) {
   3007 #ifdef DEBUG
   3008 			printf("Component on: %s: %llu\n",
   3009 				cname, (unsigned long long)size);
   3010 			rf_print_component_label(clabel);
   3011 #endif
   3012 			/* if it's reasonable, add it, else ignore it. */
   3013 			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
   3014 				M_NOWAIT);
   3015 			if (ac == NULL) {
   3016 				free(clabel, M_RAIDFRAME);
   3017 				goto oomem;
   3018 			}
   3019 			strlcpy(ac->devname, cname, sizeof(ac->devname));
   3020 			ac->dev = dev;
   3021 			ac->vp = vp;
   3022 			ac->clabel = clabel;
   3023 			ac->next = ac_list;
   3024 			ac_list = ac;
   3025 			good_one = 1;
   3026 		}
   3027 	}
   3028 	if (!good_one) {
   3029 		/* cleanup */
   3030 		free(clabel, M_RAIDFRAME);
   3031 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3032 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3033 		vput(vp);
   3034 	}
   3035 	return ac_list;
   3036 }
   3037 
   3038 RF_AutoConfig_t *
   3039 rf_find_raid_components(void)
   3040 {
   3041 	struct vnode *vp;
   3042 	struct disklabel label;
   3043 	device_t dv;
   3044 	deviter_t di;
   3045 	dev_t dev;
   3046 	int bmajor, bminor, wedge, rf_part_found;
   3047 	int error;
   3048 	int i;
   3049 	RF_AutoConfig_t *ac_list;
   3050 	uint64_t numsecs;
   3051 	unsigned secsize;
   3052 
   3053 	/* initialize the AutoConfig list */
   3054 	ac_list = NULL;
   3055 
   3056 	/* we begin by trolling through *all* the devices on the system */
   3057 
   3058 	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
   3059 	     dv = deviter_next(&di)) {
   3060 
   3061 		/* we are only interested in disks... */
   3062 		if (device_class(dv) != DV_DISK)
   3063 			continue;
   3064 
   3065 		/* we don't care about floppies... */
   3066 		if (device_is_a(dv, "fd")) {
   3067 			continue;
   3068 		}
   3069 
   3070 		/* we don't care about CD's... */
   3071 		if (device_is_a(dv, "cd")) {
   3072 			continue;
   3073 		}
   3074 
   3075 		/* we don't care about md's... */
   3076 		if (device_is_a(dv, "md")) {
   3077 			continue;
   3078 		}
   3079 
   3080 		/* hdfd is the Atari/Hades floppy driver */
   3081 		if (device_is_a(dv, "hdfd")) {
   3082 			continue;
   3083 		}
   3084 
   3085 		/* fdisa is the Atari/Milan floppy driver */
   3086 		if (device_is_a(dv, "fdisa")) {
   3087 			continue;
   3088 		}
   3089 
   3090 		/* need to find the device_name_to_block_device_major stuff */
   3091 		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
   3092 
   3093 		rf_part_found = 0; /*No raid partition as yet*/
   3094 
   3095 		/* get a vnode for the raw partition of this disk */
   3096 
   3097 		wedge = device_is_a(dv, "dk");
   3098 		bminor = minor(device_unit(dv));
   3099 		dev = wedge ? makedev(bmajor, bminor) :
   3100 		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
   3101 		if (bdevvp(dev, &vp))
   3102 			panic("RAID can't alloc vnode");
   3103 
   3104 		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
   3105 
   3106 		if (error) {
   3107 			/* "Who cares."  Continue looking
    3108 			   for something that exists */
   3109 			vput(vp);
   3110 			continue;
   3111 		}
   3112 
   3113 		error = getdisksize(vp, &numsecs, &secsize);
   3114 		if (error) {
   3115 			vput(vp);
   3116 			continue;
   3117 		}
   3118 		if (wedge) {
   3119 			struct dkwedge_info dkw;
   3120 			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
   3121 			    NOCRED);
   3122 			if (error) {
   3123 				printf("RAIDframe: can't get wedge info for "
   3124 				    "dev %s (%d)\n", device_xname(dv), error);
   3125 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3126 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3127 				vput(vp);
   3128 				continue;
   3129 			}
   3130 
   3131 			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
   3132 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3133 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3134 				vput(vp);
   3135 				continue;
   3136 			}
   3137 
   3138 			ac_list = rf_get_component(ac_list, dev, vp,
   3139 			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
   3140 			rf_part_found = 1; /*There is a raid component on this disk*/
   3141 			continue;
   3142 		}
   3143 
   3144 		/* Ok, the disk exists.  Go get the disklabel. */
   3145 		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
   3146 		if (error) {
   3147 			/*
   3148 			 * XXX can't happen - open() would
   3149 			 * have errored out (or faked up one)
   3150 			 */
   3151 			if (error != ENOTTY)
   3152 				printf("RAIDframe: can't get label for dev "
   3153 				    "%s (%d)\n", device_xname(dv), error);
   3154 		}
   3155 
   3156 		/* don't need this any more.  We'll allocate it again
   3157 		   a little later if we really do... */
   3158 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3159 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3160 		vput(vp);
   3161 
   3162 		if (error)
   3163 			continue;
   3164 
   3165 		rf_part_found = 0; /*No raid partitions yet*/
   3166 		for (i = 0; i < label.d_npartitions; i++) {
   3167 			char cname[sizeof(ac_list->devname)];
   3168 
   3169 			/* We only support partitions marked as RAID */
   3170 			if (label.d_partitions[i].p_fstype != FS_RAID)
   3171 				continue;
   3172 
   3173 			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
   3174 			if (bdevvp(dev, &vp))
   3175 				panic("RAID can't alloc vnode");
   3176 
   3177 			error = VOP_OPEN(vp, FREAD, NOCRED);
   3178 			if (error) {
   3179 				/* Whatever... */
   3180 				vput(vp);
   3181 				continue;
   3182 			}
   3183 			snprintf(cname, sizeof(cname), "%s%c",
   3184 			    device_xname(dv), 'a' + i);
   3185 			ac_list = rf_get_component(ac_list, dev, vp, cname,
   3186 				label.d_partitions[i].p_size, numsecs, secsize);
    3187 			rf_part_found = 1; /*There is at least one raid partition on this disk*/
   3188 		}
   3189 
   3190 		/*
    3191 		 * If there is no raid component on this disk, either in a
    3192 		 * disklabel or inside a wedge, check the raw partition as well,
    3193 		 * as it is possible to configure raid components on raw disk
    3194 		 * devices.
   3195 		 */
   3196 
   3197 		if (!rf_part_found) {
   3198 			char cname[sizeof(ac_list->devname)];
   3199 
   3200 			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
   3201 			if (bdevvp(dev, &vp))
   3202 				panic("RAID can't alloc vnode");
   3203 
   3204 			error = VOP_OPEN(vp, FREAD, NOCRED);
   3205 			if (error) {
   3206 				/* Whatever... */
   3207 				vput(vp);
   3208 				continue;
   3209 			}
   3210 			snprintf(cname, sizeof(cname), "%s%c",
   3211 			    device_xname(dv), 'a' + RAW_PART);
   3212 			ac_list = rf_get_component(ac_list, dev, vp, cname,
   3213 				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
   3214 		}
   3215 	}
   3216 	deviter_release(&di);
   3217 	return ac_list;
   3218 }
   3219 
   3220 
   3221 int
   3222 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3223 {
   3224 
   3225 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3226 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3227 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3228 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3229 	    clabel->row >=0 &&
   3230 	    clabel->column >= 0 &&
   3231 	    clabel->num_rows > 0 &&
   3232 	    clabel->num_columns > 0 &&
   3233 	    clabel->row < clabel->num_rows &&
   3234 	    clabel->column < clabel->num_columns &&
   3235 	    clabel->blockSize > 0 &&
   3236 	    /*
   3237 	     * numBlocksHi may contain garbage, but it is ok since
   3238 	     * the type is unsigned.  If it is really garbage,
   3239 	     * rf_fix_old_label_size() will fix it.
   3240 	     */
   3241 	    rf_component_label_numblocks(clabel) > 0) {
   3242 		/*
   3243 		 * label looks reasonable enough...
   3244 		 * let's make sure it has no old garbage.
   3245 		 */
   3246 		if (numsecs)
   3247 			rf_fix_old_label_size(clabel, numsecs);
   3248 		return(1);
   3249 	}
   3250 	return(0);
   3251 }
   3252 
   3253 
   3254 /*
   3255  * For reasons yet unknown, some old component labels have garbage in
   3256  * the newer numBlocksHi region, and this causes lossage.  Since those
   3257  * disks will also have numsecs set to less than 32 bits of sectors,
   3258  * we can determine when this corruption has occurred, and fix it.
   3259  *
   3260  * The exact same problem, with the same unknown reason, happens to
   3261  * the partitionSizeHi member as well.
   3262  */
   3263 static void
   3264 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3265 {
   3266 
   3267 	if (numsecs < ((uint64_t)1 << 32)) {
   3268 		if (clabel->numBlocksHi) {
   3269 			printf("WARNING: total sectors < 32 bits, yet "
   3270 			       "numBlocksHi set\n"
   3271 			       "WARNING: resetting numBlocksHi to zero.\n");
   3272 			clabel->numBlocksHi = 0;
   3273 		}
   3274 
   3275 		if (clabel->partitionSizeHi) {
   3276 			printf("WARNING: total sectors < 32 bits, yet "
   3277 			       "partitionSizeHi set\n"
   3278 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3279 			clabel->partitionSizeHi = 0;
   3280 		}
   3281 	}
   3282 }
   3283 
   3284 
   3285 #ifdef DEBUG
   3286 void
   3287 rf_print_component_label(RF_ComponentLabel_t *clabel)
   3288 {
   3289 	uint64_t numBlocks;
   3290 
   3291 	numBlocks = rf_component_label_numblocks(clabel);
   3292 
   3293 	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
   3294 	       clabel->row, clabel->column,
   3295 	       clabel->num_rows, clabel->num_columns);
   3296 	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
   3297 	       clabel->version, clabel->serial_number,
   3298 	       clabel->mod_counter);
   3299 	printf("   Clean: %s Status: %d\n",
   3300 	       clabel->clean ? "Yes" : "No", clabel->status);
   3301 	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
   3302 	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
   3303 	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
   3304 	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
   3305 	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
   3306 	printf("   Contains root partition: %s\n",
   3307 	       clabel->root_partition ? "Yes" : "No");
   3308 	printf("   Last configured as: raid%d\n", clabel->last_unit);
   3309 #if 0
   3310 	   printf("   Config order: %d\n", clabel->config_order);
   3311 #endif
   3312 
   3313 }
   3314 #endif
   3315 
   3316 RF_ConfigSet_t *
   3317 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3318 {
   3319 	RF_AutoConfig_t *ac;
   3320 	RF_ConfigSet_t *config_sets;
   3321 	RF_ConfigSet_t *cset;
   3322 	RF_AutoConfig_t *ac_next;
   3323 
   3324 
   3325 	config_sets = NULL;
   3326 
   3327 	/* Go through the AutoConfig list, and figure out which components
   3328 	   belong to what sets.  */
   3329 	ac = ac_list;
   3330 	while(ac!=NULL) {
   3331 		/* we're going to putz with ac->next, so save it here
   3332 		   for use at the end of the loop */
   3333 		ac_next = ac->next;
   3334 
   3335 		if (config_sets == NULL) {
   3336 			/* will need at least this one... */
   3337 			config_sets = (RF_ConfigSet_t *)
   3338 				malloc(sizeof(RF_ConfigSet_t),
   3339 				       M_RAIDFRAME, M_NOWAIT);
   3340 			if (config_sets == NULL) {
   3341 				panic("rf_create_auto_sets: No memory!");
   3342 			}
   3343 			/* this one is easy :) */
   3344 			config_sets->ac = ac;
   3345 			config_sets->next = NULL;
   3346 			config_sets->rootable = 0;
   3347 			ac->next = NULL;
   3348 		} else {
   3349 			/* which set does this component fit into? */
   3350 			cset = config_sets;
   3351 			while(cset!=NULL) {
   3352 				if (rf_does_it_fit(cset, ac)) {
   3353 					/* looks like it matches... */
   3354 					ac->next = cset->ac;
   3355 					cset->ac = ac;
   3356 					break;
   3357 				}
   3358 				cset = cset->next;
   3359 			}
   3360 			if (cset==NULL) {
    3361 				/* didn't find a match above... start a new set */
   3362 				cset = (RF_ConfigSet_t *)
   3363 					malloc(sizeof(RF_ConfigSet_t),
   3364 					       M_RAIDFRAME, M_NOWAIT);
   3365 				if (cset == NULL) {
   3366 					panic("rf_create_auto_sets: No memory!");
   3367 				}
   3368 				cset->ac = ac;
   3369 				ac->next = NULL;
   3370 				cset->next = config_sets;
   3371 				cset->rootable = 0;
   3372 				config_sets = cset;
   3373 			}
   3374 		}
   3375 		ac = ac_next;
   3376 	}
   3377 
   3378 
   3379 	return(config_sets);
   3380 }
   3381 
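/*
 * Decide whether component 'ac' belongs to configuration set 'cset' by
 * comparing its component label against the label of the first member
 * of the set.  Returns 1 on a match, 0 otherwise.
 */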
   3382 static int
   3383 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3384 {
   3385 	RF_ComponentLabel_t *clabel1, *clabel2;
   3386 
   3387 	/* If this one matches the *first* one in the set, that's good
   3388 	   enough, since the other members of the set would have been
   3389 	   through here too... */
   3390 	/* note that we are not checking partitionSize here..
   3391 
   3392 	   Note that we are also not checking the mod_counters here.
   3393 	   If everything else matches except the mod_counter, that's
   3394 	   good enough for this test.  We will deal with the mod_counters
   3395 	   a little later in the autoconfiguration process.
   3396 
   3397 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3398 
   3399 	   The reason we don't check for this is that failed disks
   3400 	   will have lower modification counts.  If those disks are
   3401 	   not added to the set they used to belong to, then they will
   3402 	   form their own set, which may result in 2 different sets,
   3403 	   for example, competing to be configured at raid0, and
   3404 	   perhaps competing to be the root filesystem set.  If the
   3405 	   wrong ones get configured, or both attempt to become /,
    3406 	   weird behaviour and/or serious lossage will occur.  Thus we
   3407 	   need to bring them into the fold here, and kick them out at
   3408 	   a later point.
   3409 
   3410 	*/
   3411 
   3412 	clabel1 = cset->ac->clabel;
   3413 	clabel2 = ac->clabel;
   3414 	if ((clabel1->version == clabel2->version) &&
   3415 	    (clabel1->serial_number == clabel2->serial_number) &&
   3416 	    (clabel1->num_rows == clabel2->num_rows) &&
   3417 	    (clabel1->num_columns == clabel2->num_columns) &&
   3418 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3419 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3420 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3421 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3422 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3423 	    (clabel1->blockSize == clabel2->blockSize) &&
   3424 	    rf_component_label_numblocks(clabel1) ==
   3425 	    rf_component_label_numblocks(clabel2) &&
   3426 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3427 	    (clabel1->root_partition == clabel2->root_partition) &&
   3428 	    (clabel1->last_unit == clabel2->last_unit) &&
   3429 	    (clabel1->config_order == clabel2->config_order)) {
    3430 		/* if it gets here, it almost *has* to be a match */
   3431 	} else {
   3432 		/* it's not consistent with somebody in the set..
   3433 		   punt */
   3434 		return(0);
   3435 	}
   3436 	/* all was fine.. it must fit... */
   3437 	return(1);
   3438 }
   3439 
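/*
 * Check whether a configuration set still has enough working components
 * (at the newest mod_counter) to be configured.  Returns 1 if the set
 * is usable, 0 if too many components are missing.
 */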
   3440 int
   3441 rf_have_enough_components(RF_ConfigSet_t *cset)
   3442 {
   3443 	RF_AutoConfig_t *ac;
   3444 	RF_AutoConfig_t *auto_config;
   3445 	RF_ComponentLabel_t *clabel;
   3446 	int c;
   3447 	int num_cols;
   3448 	int num_missing;
   3449 	int mod_counter;
   3450 	int mod_counter_found;
   3451 	int even_pair_failed;
   3452 	char parity_type;
   3453 
   3454 
   3455 	/* check to see that we have enough 'live' components
   3456 	   of this set.  If so, we can configure it if necessary */
   3457 
   3458 	num_cols = cset->ac->clabel->num_columns;
   3459 	parity_type = cset->ac->clabel->parityConfig;
   3460 
   3461 	/* XXX Check for duplicate components!?!?!? */
   3462 
   3463 	/* Determine what the mod_counter is supposed to be for this set. */
   3464 
   3465 	mod_counter_found = 0;
   3466 	mod_counter = 0;
   3467 	ac = cset->ac;
   3468 	while(ac!=NULL) {
   3469 		if (mod_counter_found==0) {
   3470 			mod_counter = ac->clabel->mod_counter;
   3471 			mod_counter_found = 1;
   3472 		} else {
   3473 			if (ac->clabel->mod_counter > mod_counter) {
   3474 				mod_counter = ac->clabel->mod_counter;
   3475 			}
   3476 		}
   3477 		ac = ac->next;
   3478 	}
   3479 
   3480 	num_missing = 0;
   3481 	auto_config = cset->ac;
   3482 
   3483 	even_pair_failed = 0;
   3484 	for(c=0; c<num_cols; c++) {
   3485 		ac = auto_config;
   3486 		while(ac!=NULL) {
   3487 			if ((ac->clabel->column == c) &&
   3488 			    (ac->clabel->mod_counter == mod_counter)) {
   3489 				/* it's this one... */
   3490 #ifdef DEBUG
   3491 				printf("Found: %s at %d\n",
   3492 				       ac->devname,c);
   3493 #endif
   3494 				break;
   3495 			}
   3496 			ac=ac->next;
   3497 		}
   3498 		if (ac==NULL) {
   3499 				/* Didn't find one here! */
   3500 				/* special case for RAID 1, especially
   3501 				   where there are more than 2
   3502 				   components (where RAIDframe treats
   3503 				   things a little differently :( ) */
   3504 			if (parity_type == '1') {
   3505 				if (c%2 == 0) { /* even component */
   3506 					even_pair_failed = 1;
   3507 				} else { /* odd component.  If
   3508 					    we're failed, and
   3509 					    so is the even
   3510 					    component, it's
   3511 					    "Good Night, Charlie" */
   3512 					if (even_pair_failed == 1) {
   3513 						return(0);
   3514 					}
   3515 				}
   3516 			} else {
   3517 				/* normal accounting */
   3518 				num_missing++;
   3519 			}
   3520 		}
   3521 		if ((parity_type == '1') && (c%2 == 1)) {
    3522 				/* Just finished the odd component of the
    3523 				   pair without bailing.. reset the
    3524 				   even_pair_failed flag, and go on to the next pair.... */
   3525 			even_pair_failed = 0;
   3526 		}
   3527 	}
   3528 
   3529 	clabel = cset->ac->clabel;
   3530 
   3531 	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
   3532 	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
   3533 	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
   3534 		/* XXX this needs to be made *much* more general */
   3535 		/* Too many failures */
   3536 		return(0);
   3537 	}
   3538 	/* otherwise, all is well, and we've got enough to take a kick
   3539 	   at autoconfiguring this set */
   3540 	return(1);
   3541 }
   3542 
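/*
 * Build an RF_Config_t for an auto-configured set, using the values
 * stored in the component labels (geometry, parity type, device names).
 */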
   3543 void
   3544 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3545 			RF_Raid_t *raidPtr)
   3546 {
   3547 	RF_ComponentLabel_t *clabel;
   3548 	int i;
   3549 
   3550 	clabel = ac->clabel;
   3551 
   3552 	/* 1. Fill in the common stuff */
   3553 	config->numRow = clabel->num_rows = 1;
   3554 	config->numCol = clabel->num_columns;
   3555 	config->numSpare = 0; /* XXX should this be set here? */
   3556 	config->sectPerSU = clabel->sectPerSU;
   3557 	config->SUsPerPU = clabel->SUsPerPU;
   3558 	config->SUsPerRU = clabel->SUsPerRU;
   3559 	config->parityConfig = clabel->parityConfig;
   3560 	/* XXX... */
   3561 	strcpy(config->diskQueueType,"fifo");
   3562 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3563 	config->layoutSpecificSize = 0; /* XXX ?? */
   3564 
   3565 	while(ac!=NULL) {
   3566 		/* row/col values will be in range due to the checks
   3567 		   in reasonable_label() */
   3568 		strcpy(config->devnames[0][ac->clabel->column],
   3569 		       ac->devname);
   3570 		ac = ac->next;
   3571 	}
   3572 
   3573 	for(i=0;i<RF_MAXDBGV;i++) {
   3574 		config->debugVars[i][0] = 0;
   3575 	}
   3576 }
   3577 
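/*
 * Set the autoconfigure flag on the RAID set and write it back into the
 * component label of every optimal component and every used spare.
 */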
   3578 int
   3579 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3580 {
   3581 	RF_ComponentLabel_t *clabel;
   3582 	int column;
   3583 	int sparecol;
   3584 
   3585 	raidPtr->autoconfigure = new_value;
   3586 
   3587 	for(column=0; column<raidPtr->numCol; column++) {
   3588 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3589 			clabel = raidget_component_label(raidPtr, column);
   3590 			clabel->autoconfigure = new_value;
   3591 			raidflush_component_label(raidPtr, column);
   3592 		}
   3593 	}
   3594 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3595 		sparecol = raidPtr->numCol + column;
   3596 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3597 			clabel = raidget_component_label(raidPtr, sparecol);
   3598 			clabel->autoconfigure = new_value;
   3599 			raidflush_component_label(raidPtr, sparecol);
   3600 		}
   3601 	}
   3602 	return(new_value);
   3603 }
   3604 
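/*
 * Set the root_partition flag on the RAID set and write it back into
 * the component label of every optimal component and every used spare.
 */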
   3605 int
   3606 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3607 {
   3608 	RF_ComponentLabel_t *clabel;
   3609 	int column;
   3610 	int sparecol;
   3611 
   3612 	raidPtr->root_partition = new_value;
   3613 	for(column=0; column<raidPtr->numCol; column++) {
   3614 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3615 			clabel = raidget_component_label(raidPtr, column);
   3616 			clabel->root_partition = new_value;
   3617 			raidflush_component_label(raidPtr, column);
   3618 		}
   3619 	}
   3620 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3621 		sparecol = raidPtr->numCol + column;
   3622 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3623 			clabel = raidget_component_label(raidPtr, sparecol);
   3624 			clabel->root_partition = new_value;
   3625 			raidflush_component_label(raidPtr, sparecol);
   3626 		}
   3627 	}
   3628 	return(new_value);
   3629 }
   3630 
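/*
 * Close and release the vnodes of all components in a configuration set.
 */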
   3631 void
   3632 rf_release_all_vps(RF_ConfigSet_t *cset)
   3633 {
   3634 	RF_AutoConfig_t *ac;
   3635 
   3636 	ac = cset->ac;
   3637 	while(ac!=NULL) {
   3638 		/* Close the vp, and give it back */
   3639 		if (ac->vp) {
   3640 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3641 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
   3642 			vput(ac->vp);
   3643 			ac->vp = NULL;
   3644 		}
   3645 		ac = ac->next;
   3646 	}
   3647 }
   3648 
   3649 
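/*
 * Free the component labels, the autoconfig entries, and finally the
 * configuration set itself.
 */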
   3650 void
   3651 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3652 {
   3653 	RF_AutoConfig_t *ac;
   3654 	RF_AutoConfig_t *next_ac;
   3655 
   3656 	ac = cset->ac;
   3657 	while(ac!=NULL) {
   3658 		next_ac = ac->next;
   3659 		/* nuke the label */
   3660 		free(ac->clabel, M_RAIDFRAME);
   3661 		/* cleanup the config structure */
   3662 		free(ac, M_RAIDFRAME);
   3663 		/* "next.." */
   3664 		ac = next_ac;
   3665 	}
   3666 	/* and, finally, nuke the config set */
   3667 	free(cset, M_RAIDFRAME);
   3668 }
   3669 
   3670 
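/*
 * Initialize a component label from the current state of the RAID set.
 */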
   3671 void
   3672 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   3673 {
   3674 	/* current version number */
   3675 	clabel->version = RF_COMPONENT_LABEL_VERSION;
   3676 	clabel->serial_number = raidPtr->serial_number;
   3677 	clabel->mod_counter = raidPtr->mod_counter;
   3678 
   3679 	clabel->num_rows = 1;
   3680 	clabel->num_columns = raidPtr->numCol;
   3681 	clabel->clean = RF_RAID_DIRTY; /* not clean */
   3682 	clabel->status = rf_ds_optimal; /* "It's good!" */
   3683 
   3684 	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
   3685 	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
   3686 	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
   3687 
   3688 	clabel->blockSize = raidPtr->bytesPerSector;
   3689 	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
   3690 
   3691 	/* XXX not portable */
   3692 	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
   3693 	clabel->maxOutstanding = raidPtr->maxOutstanding;
   3694 	clabel->autoconfigure = raidPtr->autoconfigure;
   3695 	clabel->root_partition = raidPtr->root_partition;
   3696 	clabel->last_unit = raidPtr->raidid;
   3697 	clabel->config_order = raidPtr->config_order;
   3698 
   3699 #ifndef RF_NO_PARITY_MAP
   3700 	rf_paritymap_init_label(raidPtr->parity_map, clabel);
   3701 #endif
   3702 }
   3703 
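/*
 * Auto-configure a set: build an RF_Config_t from the component labels,
 * find a free raid unit (preferring the one the set was last configured
 * as), and bring the set up via rf_Configure().  Returns the softc on
 * success, or NULL on failure.
 */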
   3704 struct raid_softc *
   3705 rf_auto_config_set(RF_ConfigSet_t *cset)
   3706 {
   3707 	RF_Raid_t *raidPtr;
   3708 	RF_Config_t *config;
   3709 	int raidID;
   3710 	struct raid_softc *sc;
   3711 
   3712 #ifdef DEBUG
   3713 	printf("RAID autoconfigure\n");
   3714 #endif
   3715 
   3716 	/* 1. Create a config structure */
   3717 	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
   3718 	if (config == NULL) {
   3719 		printf("Out of mem!?!?\n");
   3720 				/* XXX do something more intelligent here. */
   3721 		return NULL;
   3722 	}
   3723 
   3724 	/*
   3725 	   2. Figure out what RAID ID this one is supposed to live at
   3726 	   See if we can get the same RAID dev that it was configured
   3727 	   on last time..
   3728 	*/
   3729 
   3730 	raidID = cset->ac->clabel->last_unit;
   3731 	for (sc = raidget(raidID); sc->sc_r.valid != 0; sc = raidget(++raidID))
   3732 		continue;
   3733 #ifdef DEBUG
   3734 	printf("Configuring raid%d:\n",raidID);
   3735 #endif
   3736 
   3737 	raidPtr = &sc->sc_r;
   3738 
   3739 	/* XXX all this stuff should be done SOMEWHERE ELSE! */
   3740 	raidPtr->softc = sc;
   3741 	raidPtr->raidid = raidID;
   3742 	raidPtr->openings = RAIDOUTSTANDING;
   3743 
   3744 	/* 3. Build the configuration structure */
   3745 	rf_create_configuration(cset->ac, config, raidPtr);
   3746 
   3747 	/* 4. Do the configuration */
   3748 	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
   3749 		raidinit(sc);
   3750 
   3751 		rf_markalldirty(raidPtr);
   3752 		raidPtr->autoconfigure = 1; /* XXX do this here? */
   3753 		if (cset->ac->clabel->root_partition==1) {
   3754 			/* everything configured just fine.  Make a note
   3755 			   that this set is eligible to be root. */
   3756 			cset->rootable = 1;
   3757 			/* XXX do this here? */
   3758 			raidPtr->root_partition = 1;
   3759 		}
   3760 	} else {
   3761 		raidput(sc);
   3762 		sc = NULL;
   3763 	}
   3764 
   3765 	/* 5. Cleanup */
   3766 	free(config, M_RAIDFRAME);
   3767 	return sc;
   3768 }
   3769 
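/*
 * Account a completed transfer against the disk statistics of the RAID
 * device associated with this access descriptor.
 */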
   3770 void
   3771 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
   3772 {
   3773 	struct buf *bp;
   3774 	struct raid_softc *rs;
   3775 
   3776 	bp = (struct buf *)desc->bp;
   3777 	rs = desc->raidPtr->softc;
   3778 	disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid),
   3779 	    (bp->b_flags & B_READ));
   3780 }
   3781 
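/*
 * Initialize a RAIDframe resource pool at IPL_BIO, pre-prime it with
 * xmin items, and set its low/high water marks.
 */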
   3782 void
   3783 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3784 	     size_t xmin, size_t xmax)
   3785 {
   3786 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3787 	pool_sethiwat(p, xmax);
   3788 	pool_prime(p, xmin);
   3789 	pool_setlowat(p, xmin);
   3790 }
   3791 
   3792 /*
    3793  * rf_buf_queue_check(RF_Raid_t *raidPtr) -- looks into the buf_queue to see
   3794  * if there is IO pending and if that IO could possibly be done for a
   3795  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3796  * otherwise.
   3797  *
   3798  */
   3799 
   3800 int
   3801 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3802 {
   3803 	struct raid_softc *rs = raidPtr->softc;
   3804 	if ((bufq_peek(rs->buf_queue) != NULL) && raidPtr->openings > 0) {
   3805 		/* there is work to do */
   3806 		return 0;
   3807 	}
   3808 	/* default is nothing to do */
   3809 	return 1;
   3810 }
   3811 
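/*
 * Obtain the size and sector size of a component via getdisksize() and
 * fill in the RF_RaidDisk_t fields; numBlocks excludes the
 * rf_protectedSectors reserved by RAIDframe.
 */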
   3812 int
   3813 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3814 {
   3815 	uint64_t numsecs;
   3816 	unsigned secsize;
   3817 	int error;
   3818 
   3819 	error = getdisksize(vp, &numsecs, &secsize);
   3820 	if (error == 0) {
   3821 		diskPtr->blockSize = secsize;
   3822 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3823 		diskPtr->partitionSize = numsecs;
   3824 		return 0;
   3825 	}
   3826 	return error;
   3827 }
   3828 
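/*
 * Autoconfiguration glue for the raid pseudo-device: matching always
 * succeeds, and attach has nothing to do here.
 */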
   3829 static int
   3830 raid_match(device_t self, cfdata_t cfdata, void *aux)
   3831 {
   3832 	return 1;
   3833 }
   3834 
   3835 static void
   3836 raid_attach(device_t parent, device_t self, void *aux)
   3837 {
   3838 
   3839 }
   3840 
   3841 
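/*
 * Detach a raid unit: take the softc lock, tear the unit down via
 * raid_detach_unlocked(), and release the lock.
 */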
   3842 static int
   3843 raid_detach(device_t self, int flags)
   3844 {
   3845 	int error;
   3846 	struct raid_softc *rs = raidget(device_unit(self));
   3847 
   3848 	if (rs == NULL)
   3849 		return ENXIO;
   3850 
   3851 	if ((error = raidlock(rs)) != 0)
   3852 		return (error);
   3853 
   3854 	error = raid_detach_unlocked(rs);
   3855 
   3856 	raidunlock(rs);
   3857 
   3858 	/* XXXkd: raidput(rs) ??? */
   3859 
   3860 	return error;
   3861 }
   3862 
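/*
 * Publish a synthetic geometry for the RAID device based on the array
 * parameters and register it via disk_set_info().
 */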
   3863 static void
   3864 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3865 {
   3866 	struct disk_geom *dg = &rs->sc_dkdev.dk_geom;
   3867 
   3868 	memset(dg, 0, sizeof(*dg));
   3869 
   3870 	dg->dg_secperunit = raidPtr->totalSectors;
   3871 	dg->dg_secsize = raidPtr->bytesPerSector;
   3872 	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   3873 	dg->dg_ntracks = 4 * raidPtr->numCol;
   3874 
   3875 	disk_set_info(rs->sc_dev, &rs->sc_dkdev, NULL);
   3876 }
   3877 
   3878 /*
   3879  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3880  * We end up returning whatever error was returned by the first cache flush
   3881  * that fails.
   3882  */
   3883 
   3884 int
   3885 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3886 {
   3887 	int c, sparecol;
   3888 	int e,error;
   3889 	int force = 1;
   3890 
   3891 	error = 0;
   3892 	for (c = 0; c < raidPtr->numCol; c++) {
   3893 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3894 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3895 					  &force, FWRITE, NOCRED);
   3896 			if (e) {
   3897 				if (e != ENODEV)
   3898 					printf("raid%d: cache flush to component %s failed.\n",
   3899 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3900 				if (error == 0) {
   3901 					error = e;
   3902 				}
   3903 			}
   3904 		}
   3905 	}
   3906 
   3907 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3908 		sparecol = raidPtr->numCol + c;
   3909 		/* Need to ensure that the reconstruct actually completed! */
   3910 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3911 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3912 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3913 			if (e) {
   3914 				if (e != ENODEV)
   3915 					printf("raid%d: cache flush to component %s failed.\n",
   3916 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3917 				if (error == 0) {
   3918 					error = e;
   3919 				}
   3920 			}
   3921 		}
   3922 	}
   3923 	return error;
   3924 }
   3925 
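/*
 * minphys routine for the raid device: clamp transfers to
 * numDataCol * MAXPHYS bytes.
 */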
   3926 static void
   3927 raidminphys(struct buf *bp)
   3928 {
   3929 	dev_t dev;
   3930 	int unit;
   3931 	struct raid_softc *rs;
   3932 	RF_Raid_t *raidPtr;
   3933 	long xmax;
   3934 
   3935 	dev = bp->b_dev;
   3936 	unit = raidunit(dev);
   3937 	rs = raidget(unit);
   3938 	raidPtr = &(rs->sc_r);
   3939 
   3940 	xmax = raidPtr->Layout.numDataCol * MAXPHYS;
   3941 
   3942 	if (bp->b_bcount > xmax) {
   3943 		bp->b_bcount = xmax;
   3944 	}
   3945 }
   3946