      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.300 2013/04/27 21:18:42 christos Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
      88  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.300 2013/04/27 21:18:42 christos Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #include "raid.h"
    110 #endif
    111 
    112 #include <sys/param.h>
    113 #include <sys/errno.h>
    114 #include <sys/pool.h>
    115 #include <sys/proc.h>
    116 #include <sys/queue.h>
    117 #include <sys/disk.h>
    118 #include <sys/device.h>
    119 #include <sys/stat.h>
    120 #include <sys/ioctl.h>
    121 #include <sys/fcntl.h>
    122 #include <sys/systm.h>
    123 #include <sys/vnode.h>
    124 #include <sys/disklabel.h>
    125 #include <sys/conf.h>
    126 #include <sys/buf.h>
    127 #include <sys/bufq.h>
    128 #include <sys/reboot.h>
    129 #include <sys/kauth.h>
    130 
    131 #include <prop/proplib.h>
    132 
    133 #include <dev/raidframe/raidframevar.h>
    134 #include <dev/raidframe/raidframeio.h>
    135 #include <dev/raidframe/rf_paritymap.h>
    136 
    137 #include "rf_raid.h"
    138 #include "rf_copyback.h"
    139 #include "rf_dag.h"
    140 #include "rf_dagflags.h"
    141 #include "rf_desc.h"
    142 #include "rf_diskqueue.h"
    143 #include "rf_etimer.h"
    144 #include "rf_general.h"
    145 #include "rf_kintf.h"
    146 #include "rf_options.h"
    147 #include "rf_driver.h"
    148 #include "rf_parityscan.h"
    149 #include "rf_threadstuff.h"
    150 
    151 #ifdef COMPAT_50
    152 #include "rf_compat50.h"
    153 #endif
    154 
    155 #ifdef DEBUG
    156 int     rf_kdebug_level = 0;
    157 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    158 #else				/* DEBUG */
    159 #define db1_printf(a) { }
    160 #endif				/* DEBUG */
    161 
    162 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    163 static rf_declare_mutex2(rf_sparet_wait_mutex);
    164 static rf_declare_cond2(rf_sparet_wait_cv);
    165 static rf_declare_cond2(rf_sparet_resp_cv);
    166 
    167 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    168 						 * spare table */
    169 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    170 						 * installation process */
    171 #endif
    172 
    173 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    174 
    175 /* prototypes */
    176 static void KernelWakeupFunc(struct buf *);
    177 static void InitBP(struct buf *, struct vnode *, unsigned,
    178     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    179     void *, int, struct proc *);
    180 struct raid_softc;
    181 static void raidinit(struct raid_softc *);
    182 
    183 void raidattach(int);
    184 static int raid_match(device_t, cfdata_t, void *);
    185 static void raid_attach(device_t, device_t, void *);
    186 static int raid_detach(device_t, int);
    187 
    188 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    189     daddr_t, daddr_t);
    190 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    191     daddr_t, daddr_t, int);
    192 
    193 static int raidwrite_component_label(unsigned,
    194     dev_t, struct vnode *, RF_ComponentLabel_t *);
    195 static int raidread_component_label(unsigned,
    196     dev_t, struct vnode *, RF_ComponentLabel_t *);
    197 
    198 
    199 dev_type_open(raidopen);
    200 dev_type_close(raidclose);
    201 dev_type_read(raidread);
    202 dev_type_write(raidwrite);
    203 dev_type_ioctl(raidioctl);
    204 dev_type_strategy(raidstrategy);
    205 dev_type_dump(raiddump);
    206 dev_type_size(raidsize);
    207 
    208 const struct bdevsw raid_bdevsw = {
    209 	raidopen, raidclose, raidstrategy, raidioctl,
    210 	raiddump, raidsize, D_DISK
    211 };
    212 
    213 const struct cdevsw raid_cdevsw = {
    214 	raidopen, raidclose, raidread, raidwrite, raidioctl,
    215 	nostop, notty, nopoll, nommap, nokqfilter, D_DISK
    216 };
    217 
    218 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
    219 
    220 struct raid_softc {
    221 	device_t sc_dev;
    222 	int	sc_unit;
    223 	int     sc_flags;	/* flags */
    224 	int     sc_cflags;	/* configuration flags */
    225 	uint64_t sc_size;	/* size of the raid device */
    226 	char    sc_xname[20];	/* XXX external name */
    227 	struct disk sc_dkdev;	/* generic disk device info */
    228 	struct bufq_state *buf_queue;	/* used for the device queue */
    229 	RF_Raid_t sc_r;
    230 	LIST_ENTRY(raid_softc) sc_link;
    231 };
    232 /* sc_flags */
    233 #define RAIDF_INITED	0x01	/* unit has been initialized */
    234 #define RAIDF_WLABEL	0x02	/* label area is writable */
    235 #define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
     236 #define RAIDF_SHUTDOWN	0x08	/* unit is being shut down */
    237 #define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
    238 #define RAIDF_LOCKED	0x80	/* unit is locked */
    239 
    240 #define	raidunit(x)	DISKUNIT(x)
    241 
    242 extern struct cfdriver raid_cd;
    243 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    244     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    245     DVF_DETACH_SHUTDOWN);
    246 
    247 /*
    248  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    249  * Be aware that large numbers can allow the driver to consume a lot of
    250  * kernel memory, especially on writes, and in degraded mode reads.
    251  *
    252  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    253  * a single 64K write will typically require 64K for the old data,
    254  * 64K for the old parity, and 64K for the new parity, for a total
    255  * of 192K (if the parity buffer is not re-used immediately).
     256  * Even if it is used immediately, that's still 128K, which when multiplied
    257  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    258  *
    259  * Now in degraded mode, for example, a 64K read on the above setup may
    260  * require data reconstruction, which will require *all* of the 4 remaining
    261  * disks to participate -- 4 * 32K/disk == 128K again.
    262  */
    263 
    264 #ifndef RAIDOUTSTANDING
    265 #define RAIDOUTSTANDING   6
    266 #endif
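
/*
 * With the default of 6 and the 5-disk, 32k-stripe example above, a
 * rough worst case (illustrative only, not a hard bound) is
 * 6 * 192K = 1152K of old-data/old-parity/new-parity buffers, on top
 * of 6 * 64K = 384K of incoming write data.
 */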
    267 
    268 #define RAIDLABELDEV(dev)	\
    269 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
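/*
 * i.e. the dev_t of the raw partition on the same unit; the on-disk
 * label is always accessed through RAW_PART ('c' or 'd', depending on
 * the port), whichever partition the caller actually opened.
 */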
    270 
    271 /* declared here, and made public, for the benefit of KVM stuff.. */
    272 
    273 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
    274 				     struct disklabel *);
    275 static void raidgetdisklabel(dev_t);
    276 static void raidmakedisklabel(struct raid_softc *);
    277 
    278 static int raidlock(struct raid_softc *);
    279 static void raidunlock(struct raid_softc *);
    280 
    281 static int raid_detach_unlocked(struct raid_softc *);
    282 
    283 static void rf_markalldirty(RF_Raid_t *);
    284 static void rf_set_properties(struct raid_softc *, RF_Raid_t *);
    285 
    286 void rf_ReconThread(struct rf_recon_req *);
    287 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    288 void rf_CopybackThread(RF_Raid_t *raidPtr);
    289 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    290 int rf_autoconfig(device_t);
    291 void rf_buildroothack(RF_ConfigSet_t *);
    292 
    293 RF_AutoConfig_t *rf_find_raid_components(void);
    294 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    295 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    296 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    297 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    298 int rf_set_autoconfig(RF_Raid_t *, int);
    299 int rf_set_rootpartition(RF_Raid_t *, int);
    300 void rf_release_all_vps(RF_ConfigSet_t *);
    301 void rf_cleanup_config_set(RF_ConfigSet_t *);
    302 int rf_have_enough_components(RF_ConfigSet_t *);
    303 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
    304 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    305 
    306 /*
    307  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    308  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    309  * in the kernel config file.
    310  */
    311 #ifdef RAID_AUTOCONFIG
    312 int raidautoconfig = 1;
    313 #else
    314 int raidautoconfig = 0;
    315 #endif
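/*
 * For example, a kernel built with
 *
 *	options 	RAID_AUTOCONFIG
 *
 * will look for and assemble autoconfigurable sets at boot; otherwise
 * raidautoconfig stays 0 and sets have to be configured explicitly
 * with raidctl(8).
 */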
    316 static bool raidautoconfigdone = false;
    317 
    318 struct RF_Pools_s rf_pools;
    319 
    320 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
    321 static kmutex_t raid_lock;
    322 
    323 static struct raid_softc *
    324 raidcreate(int unit) {
    325 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
    326 	if (sc == NULL) {
    327 #ifdef DIAGNOSTIC
    328 		printf("%s: out of memory\n", __func__);
    329 #endif
    330 		return NULL;
    331 	}
    332 	sc->sc_unit = unit;
    333 	sc->sc_r.softc = sc;
    334 	bufq_alloc(&sc->buf_queue, BUFQ_DISK_DEFAULT_STRAT, BUFQ_SORT_RAWBLOCK);
    335 	return sc;
    336 }
    337 
    338 static void
    339 raiddestroy(struct raid_softc *sc) {
    340 	bufq_free(sc->buf_queue);
    341 	kmem_free(sc, sizeof(*sc));
    342 }
    343 
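/*
 * Look up the softc for `unit', creating and linking a new one on
 * first use.  The list is protected by raid_lock; the allocation
 * itself is done unlocked since it may sleep.
 */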
    344 static struct raid_softc *
    345 raidget(int unit) {
    346 	struct raid_softc *sc;
    347 	if (unit < 0) {
    348 #ifdef DIAGNOSTIC
    349 		panic("%s: unit %d!", __func__, unit);
    350 #endif
    351 		return NULL;
    352 	}
    353 	mutex_enter(&raid_lock);
    354 	LIST_FOREACH(sc, &raids, sc_link) {
    355 		if (sc->sc_unit == unit) {
    356 			mutex_exit(&raid_lock);
    357 			return sc;
    358 		}
    359 	}
    360 	mutex_exit(&raid_lock);
    361 	if ((sc = raidcreate(unit)) == NULL)
    362 		return NULL;
    363 	mutex_enter(&raid_lock);
    364 	LIST_INSERT_HEAD(&raids, sc, sc_link);
    365 	mutex_exit(&raid_lock);
    366 	return sc;
    367 }
    368 
    369 static void
    370 raidput(struct raid_softc *sc) {
    371 	mutex_enter(&raid_lock);
    372 	LIST_REMOVE(sc, sc_link);
    373 	mutex_exit(&raid_lock);
    374 	raiddestroy(sc);
    375 }
    376 
    377 void
    378 raidattach(int num)
    379 {
    380 	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
    381 	/* This is where all the initialization stuff gets done. */
    382 
    383 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    384 	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
    385 	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
    386 	rf_init_cond2(rf_sparet_resp_cv, "rfgst");
    387 
    388 	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
    389 #endif
    390 
    391 	if (rf_BootRaidframe() == 0)
    392 		aprint_verbose("Kernelized RAIDframe activated\n");
    393 	else
    394 		panic("Serious error booting RAID!!");
    395 
    396 	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
    397 		aprint_error("raidattach: config_cfattach_attach failed?\n");
    398 	}
    399 
    400 	raidautoconfigdone = false;
    401 
    402 	/*
    403 	 * Register a finalizer which will be used to auto-config RAID
    404 	 * sets once all real hardware devices have been found.
    405 	 */
    406 	if (config_finalize_register(NULL, rf_autoconfig) != 0)
    407 		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
    408 }
    409 
    410 int
    411 rf_autoconfig(device_t self)
    412 {
    413 	RF_AutoConfig_t *ac_list;
    414 	RF_ConfigSet_t *config_sets;
    415 
    416 	if (!raidautoconfig || raidautoconfigdone == true)
    417 		return (0);
    418 
    419 	/* XXX This code can only be run once. */
    420 	raidautoconfigdone = true;
    421 
    422 	/* 1. locate all RAID components on the system */
    423 	aprint_debug("Searching for RAID components...\n");
    424 	ac_list = rf_find_raid_components();
    425 
    426 	/* 2. Sort them into their respective sets. */
    427 	config_sets = rf_create_auto_sets(ac_list);
    428 
    429 	/*
    430 	 * 3. Evaluate each set and configure the valid ones.
    431 	 * This gets done in rf_buildroothack().
    432 	 */
    433 	rf_buildroothack(config_sets);
    434 
    435 	return 1;
    436 }
    437 
    438 void
    439 rf_buildroothack(RF_ConfigSet_t *config_sets)
    440 {
    441 	RF_ConfigSet_t *cset;
    442 	RF_ConfigSet_t *next_cset;
    443 	int col;
    444 	int num_root;
    445 	char *devname;
    446 	struct raid_softc *sc, *rsc;
    447 
    448 	sc = rsc = NULL;
    449 	num_root = 0;
    450 	cset = config_sets;
    451 	while (cset != NULL) {
    452 		next_cset = cset->next;
    453 		if (rf_have_enough_components(cset) &&
    454 		    cset->ac->clabel->autoconfigure == 1) {
    455 			sc = rf_auto_config_set(cset);
    456 			if (sc != NULL) {
    457 				aprint_debug("raid%d: configured ok\n",
    458 				    sc->sc_unit);
    459 				if (cset->rootable) {
    460 					rsc = sc;
    461 					num_root++;
    462 				}
    463 			} else {
    464 				/* The autoconfig didn't work :( */
    465 				aprint_debug("Autoconfig failed\n");
    466 				rf_release_all_vps(cset);
    467 			}
    468 		} else {
    469 			/* we're not autoconfiguring this set...
    470 			   release the associated resources */
    471 			rf_release_all_vps(cset);
    472 		}
    473 		/* cleanup */
    474 		rf_cleanup_config_set(cset);
    475 		cset = next_cset;
    476 	}
    477 
    478 	/* if the user has specified what the root device should be
    479 	   then we don't touch booted_device or boothowto... */
    480 
    481 	if (rootspec != NULL)
    482 		return;
    483 
    484 	/* we found something bootable... */
    485 
    486 	if (num_root == 1) {
    487 		if (rsc->sc_dkdev.dk_nwedges != 0) {
    488 			/* XXX: How do we find the real root partition? */
    489 			char cname[sizeof(cset->ac->devname)];
    490 			snprintf(cname, sizeof(cname), "%s%c",
    491 			    device_xname(rsc->sc_dev), 'a');
    492 			booted_device = dkwedge_find_by_wname(cname);
    493 		} else
    494 			booted_device = rsc->sc_dev;
    495 	} else if (num_root > 1) {
    496 
    497 		/*
    498 		 * Maybe the MD code can help. If it cannot, then
    499 		 * setroot() will discover that we have no
    500 		 * booted_device and will ask the user if nothing was
    501 		 * hardwired in the kernel config file
    502 		 */
    503 
    504 		if (booted_device == NULL)
    505 			cpu_rootconf();
    506 		if (booted_device == NULL)
    507 			return;
    508 
    509 		num_root = 0;
    510 		mutex_enter(&raid_lock);
    511 		LIST_FOREACH(sc, &raids, sc_link) {
    512 			RF_Raid_t *r = &sc->sc_r;
    513 			if (r->valid == 0)
    514 				continue;
    515 
    516 			if (r->root_partition == 0)
    517 				continue;
    518 
    519 			for (col = 0; col < r->numCol; col++) {
    520 				devname = r->Disks[col].devname;
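				/* skip the leading "/dev/" so we compare
				   bare device names with device_xname() */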
    521 				devname += sizeof("/dev/") - 1;
    522 				if (strncmp(devname, device_xname(booted_device),
    523 					    strlen(device_xname(booted_device))) != 0)
    524 					continue;
    525 				aprint_debug("raid%d includes boot device %s\n",
    526 				       sc->sc_unit, devname);
    527 				num_root++;
    528 				rsc = sc;
    529 			}
    530 		}
    531 		mutex_exit(&raid_lock);
    532 
    533 		if (num_root == 1) {
    534 			booted_device = rsc->sc_dev;
    535 		} else {
    536 			/* we can't guess.. require the user to answer... */
    537 			boothowto |= RB_ASKNAME;
    538 		}
    539 	}
    540 }
    541 
    542 
    543 int
    544 raidsize(dev_t dev)
    545 {
    546 	struct raid_softc *rs;
    547 	struct disklabel *lp;
    548 	int     part, unit, omask, size;
    549 
    550 	unit = raidunit(dev);
    551 	if ((rs = raidget(unit)) == NULL)
    552 		return -1;
    553 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    554 		return (-1);
    555 
    556 	part = DISKPART(dev);
    557 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
    558 	lp = rs->sc_dkdev.dk_label;
    559 
    560 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
    561 		return (-1);
    562 
    563 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
    564 		size = -1;
    565 	else
    566 		size = lp->d_partitions[part].p_size *
    567 		    (lp->d_secsize / DEV_BSIZE);
    568 
    569 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
    570 		return (-1);
    571 
    572 	return (size);
    573 
    574 }
    575 
    576 int
    577 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    578 {
    579 	int     unit = raidunit(dev);
    580 	struct raid_softc *rs;
    581 	const struct bdevsw *bdev;
    582 	struct disklabel *lp;
    583 	RF_Raid_t *raidPtr;
    584 	daddr_t offset;
    585 	int     part, c, sparecol, j, scol, dumpto;
    586 	int     error = 0;
    587 
    588 	if ((rs = raidget(unit)) == NULL)
    589 		return ENXIO;
    590 
    591 	raidPtr = &rs->sc_r;
    592 
    593 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    594 		return ENXIO;
    595 
    596 	/* we only support dumping to RAID 1 sets */
    597 	if (raidPtr->Layout.numDataCol != 1 ||
    598 	    raidPtr->Layout.numParityCol != 1)
    599 		return EINVAL;
    600 
    601 
    602 	if ((error = raidlock(rs)) != 0)
    603 		return error;
    604 
    605 	if (size % DEV_BSIZE != 0) {
    606 		error = EINVAL;
    607 		goto out;
    608 	}
    609 
    610 	if (blkno + size / DEV_BSIZE > rs->sc_size) {
    611 		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
    612 		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
    613 		    size / DEV_BSIZE, rs->sc_size);
    614 		error = EINVAL;
    615 		goto out;
    616 	}
    617 
    618 	part = DISKPART(dev);
    619 	lp = rs->sc_dkdev.dk_label;
    620 	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;
    621 
    622 	/* figure out what device is alive.. */
    623 
    624 	/*
    625 	   Look for a component to dump to.  The preference for the
    626 	   component to dump to is as follows:
    627 	   1) the master
    628 	   2) a used_spare of the master
    629 	   3) the slave
    630 	   4) a used_spare of the slave
    631 	*/
    632 
    633 	dumpto = -1;
    634 	for (c = 0; c < raidPtr->numCol; c++) {
    635 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
    636 			/* this might be the one */
    637 			dumpto = c;
    638 			break;
    639 		}
    640 	}
    641 
    642 	/*
    643 	   At this point we have possibly selected a live master or a
    644 	   live slave.  We now check to see if there is a spared
    645 	   master (or a spared slave), if we didn't find a live master
    646 	   or a live slave.
    647 	*/
    648 
    649 	for (c = 0; c < raidPtr->numSpare; c++) {
    650 		sparecol = raidPtr->numCol + c;
    651 		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
    652 			/* How about this one? */
    653 			scol = -1;
    654 			for(j=0;j<raidPtr->numCol;j++) {
    655 				if (raidPtr->Disks[j].spareCol == sparecol) {
    656 					scol = j;
    657 					break;
    658 				}
    659 			}
    660 			if (scol == 0) {
    661 				/*
    662 				   We must have found a spared master!
    663 				   We'll take that over anything else
    664 				   found so far.  (We couldn't have
    665 				   found a real master before, since
    666 				   this is a used spare, and it's
    667 				   saying that it's replacing the
    668 				   master.)  On reboot (with
    669 				   autoconfiguration turned on)
    670 				   sparecol will become the 1st
    671 				   component (component0) of this set.
    672 				*/
    673 				dumpto = sparecol;
    674 				break;
    675 			} else if (scol != -1) {
    676 				/*
    677 				   Must be a spared slave.  We'll dump
     678 				   to that if we haven't found anything
    679 				   else so far.
    680 				*/
    681 				if (dumpto == -1)
    682 					dumpto = sparecol;
    683 			}
    684 		}
    685 	}
    686 
    687 	if (dumpto == -1) {
    688 		/* we couldn't find any live components to dump to!?!?
    689 		 */
    690 		error = EINVAL;
    691 		goto out;
    692 	}
    693 
    694 	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
    695 
    696 	/*
    697 	   Note that blkno is relative to this particular partition.
    698 	   By adding the offset of this partition in the RAID
    699 	   set, and also adding RF_PROTECTED_SECTORS, we get a
    700 	   value that is relative to the partition used for the
    701 	   underlying component.
    702 	*/
    703 
    704 	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
    705 				blkno + offset, va, size);
    706 
    707 out:
    708 	raidunlock(rs);
    709 
    710 	return error;
    711 }
    712 /* ARGSUSED */
    713 int
    714 raidopen(dev_t dev, int flags, int fmt,
    715     struct lwp *l)
    716 {
    717 	int     unit = raidunit(dev);
    718 	struct raid_softc *rs;
    719 	struct disklabel *lp;
    720 	int     part, pmask;
    721 	int     error = 0;
    722 
    723 	if ((rs = raidget(unit)) == NULL)
    724 		return ENXIO;
    725 	if ((error = raidlock(rs)) != 0)
    726 		return (error);
    727 
    728 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    729 		error = EBUSY;
    730 		goto bad;
    731 	}
    732 
    733 	lp = rs->sc_dkdev.dk_label;
    734 
    735 	part = DISKPART(dev);
    736 
    737 	/*
    738 	 * If there are wedges, and this is not RAW_PART, then we
    739 	 * need to fail.
    740 	 */
    741 	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
    742 		error = EBUSY;
    743 		goto bad;
    744 	}
    745 	pmask = (1 << part);
    746 
    747 	if ((rs->sc_flags & RAIDF_INITED) &&
    748 	    (rs->sc_dkdev.dk_openmask == 0))
    749 		raidgetdisklabel(dev);
    750 
    751 	/* make sure that this partition exists */
    752 
    753 	if (part != RAW_PART) {
    754 		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
    755 		    ((part >= lp->d_npartitions) ||
    756 			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
    757 			error = ENXIO;
    758 			goto bad;
    759 		}
    760 	}
    761 	/* Prevent this unit from being unconfigured while open. */
    762 	switch (fmt) {
    763 	case S_IFCHR:
    764 		rs->sc_dkdev.dk_copenmask |= pmask;
    765 		break;
    766 
    767 	case S_IFBLK:
    768 		rs->sc_dkdev.dk_bopenmask |= pmask;
    769 		break;
    770 	}
    771 
    772 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    773 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    774 		/* First one... mark things as dirty... Note that we *MUST*
    775 		 have done a configure before this.  I DO NOT WANT TO BE
    776 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    777 		 THAT THEY BELONG TOGETHER!!!!! */
    778 		/* XXX should check to see if we're only open for reading
    779 		   here... If so, we needn't do this, but then need some
    780 		   other way of keeping track of what's happened.. */
    781 
    782 		rf_markalldirty(&rs->sc_r);
    783 	}
    784 
    785 
    786 	rs->sc_dkdev.dk_openmask =
    787 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    788 
    789 bad:
    790 	raidunlock(rs);
    791 
    792 	return (error);
    793 
    794 
    795 }
    796 /* ARGSUSED */
    797 int
    798 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    799 {
    800 	int     unit = raidunit(dev);
    801 	struct raid_softc *rs;
    802 	int     error = 0;
    803 	int     part;
    804 
    805 	if ((rs = raidget(unit)) == NULL)
    806 		return ENXIO;
    807 
    808 	if ((error = raidlock(rs)) != 0)
    809 		return (error);
    810 
    811 	part = DISKPART(dev);
    812 
    813 	/* ...that much closer to allowing unconfiguration... */
    814 	switch (fmt) {
    815 	case S_IFCHR:
    816 		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
    817 		break;
    818 
    819 	case S_IFBLK:
    820 		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
    821 		break;
    822 	}
    823 	rs->sc_dkdev.dk_openmask =
    824 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    825 
    826 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    827 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
     828 		/* Last one... the device is not unconfigured yet.
     829 		   (If RAIDF_INITED is not set, device shutdown has
     830 		   already taken care of setting the clean bits.)
     831 		   Mark things as clean... */
    832 
    833 		rf_update_component_labels(&rs->sc_r,
    834 						 RF_FINAL_COMPONENT_UPDATE);
    835 
    836 		/* If the kernel is shutting down, it will detach
    837 		 * this RAID set soon enough.
    838 		 */
    839 	}
    840 
    841 	raidunlock(rs);
    842 	return (0);
    843 
    844 }
    845 
    846 void
    847 raidstrategy(struct buf *bp)
    848 {
    849 	unsigned int unit = raidunit(bp->b_dev);
    850 	RF_Raid_t *raidPtr;
    851 	int     wlabel;
    852 	struct raid_softc *rs;
    853 
    854 	if ((rs = raidget(unit)) == NULL) {
    855 		bp->b_error = ENXIO;
    856 		goto done;
    857 	}
    858 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    859 		bp->b_error = ENXIO;
    860 		goto done;
    861 	}
    862 	raidPtr = &rs->sc_r;
    863 	if (!raidPtr->valid) {
    864 		bp->b_error = ENODEV;
    865 		goto done;
    866 	}
    867 	if (bp->b_bcount == 0) {
    868 		db1_printf(("b_bcount is zero..\n"));
    869 		goto done;
    870 	}
    871 
    872 	/*
    873 	 * Do bounds checking and adjust transfer.  If there's an
    874 	 * error, the bounds check will flag that for us.
    875 	 */
    876 
    877 	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
    878 	if (DISKPART(bp->b_dev) == RAW_PART) {
    879 		uint64_t size; /* device size in DEV_BSIZE unit */
    880 
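		/*
		 * Convert totalSectors (native sectors) into DEV_BSIZE
		 * units: e.g. with 512-byte sectors (logBytesPerSector ==
		 * DEV_BSHIFT) size is just totalSectors, while 2048-byte
		 * sectors give size = totalSectors << 2.
		 */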
    881 		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
    882 			size = raidPtr->totalSectors <<
    883 			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
    884 		} else {
    885 			size = raidPtr->totalSectors >>
    886 			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
    887 		}
    888 		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
    889 			goto done;
    890 		}
    891 	} else {
    892 		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
    893 			db1_printf(("Bounds check failed!!:%d %d\n",
    894 				(int) bp->b_blkno, (int) wlabel));
    895 			goto done;
    896 		}
    897 	}
    898 
    899 	rf_lock_mutex2(raidPtr->iodone_lock);
    900 
    901 	bp->b_resid = 0;
    902 
    903 	/* stuff it onto our queue */
    904 	bufq_put(rs->buf_queue, bp);
    905 
     906 	/* schedule the IO to happen at the next convenient time */
    907 	rf_signal_cond2(raidPtr->iodone_cv);
    908 	rf_unlock_mutex2(raidPtr->iodone_lock);
    909 
    910 	return;
    911 
    912 done:
    913 	bp->b_resid = bp->b_bcount;
    914 	biodone(bp);
    915 }
    916 /* ARGSUSED */
    917 int
    918 raidread(dev_t dev, struct uio *uio, int flags)
    919 {
    920 	int     unit = raidunit(dev);
    921 	struct raid_softc *rs;
    922 
    923 	if ((rs = raidget(unit)) == NULL)
    924 		return ENXIO;
    925 
    926 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    927 		return (ENXIO);
    928 
    929 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    930 
    931 }
    932 /* ARGSUSED */
    933 int
    934 raidwrite(dev_t dev, struct uio *uio, int flags)
    935 {
    936 	int     unit = raidunit(dev);
    937 	struct raid_softc *rs;
    938 
    939 	if ((rs = raidget(unit)) == NULL)
    940 		return ENXIO;
    941 
    942 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    943 		return (ENXIO);
    944 
    945 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    946 
    947 }
    948 
    949 static int
    950 raid_detach_unlocked(struct raid_softc *rs)
    951 {
    952 	int error;
    953 	RF_Raid_t *raidPtr;
    954 
    955 	raidPtr = &rs->sc_r;
    956 
    957 	/*
     958 	 * If somebody has a partition mounted, we shouldn't
     959 	 * shut down.
    960 	 */
    961 	if (rs->sc_dkdev.dk_openmask != 0)
    962 		return EBUSY;
    963 
    964 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    965 		;	/* not initialized: nothing to do */
    966 	else if ((error = rf_Shutdown(raidPtr)) != 0)
    967 		return error;
    968 	else
    969 		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
    970 
    971 	/* Detach the disk. */
    972 	dkwedge_delall(&rs->sc_dkdev);
    973 	disk_detach(&rs->sc_dkdev);
    974 	disk_destroy(&rs->sc_dkdev);
    975 
    976 	aprint_normal_dev(rs->sc_dev, "detached\n");
    977 
    978 	return 0;
    979 }
    980 
    981 int
    982 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
    983 {
    984 	int     unit = raidunit(dev);
    985 	int     error = 0;
    986 	int     part, pmask, s;
    987 	cfdata_t cf;
    988 	struct raid_softc *rs;
    989 	RF_Config_t *k_cfg, *u_cfg;
    990 	RF_Raid_t *raidPtr;
    991 	RF_RaidDisk_t *diskPtr;
    992 	RF_AccTotals_t *totals;
    993 	RF_DeviceConfig_t *d_cfg, **ucfgp;
    994 	u_char *specific_buf;
    995 	int retcode = 0;
    996 	int column;
    997 /*	int raidid; */
    998 	struct rf_recon_req *rrcopy, *rr;
    999 	RF_ComponentLabel_t *clabel;
   1000 	RF_ComponentLabel_t *ci_label;
   1001 	RF_ComponentLabel_t **clabel_ptr;
   1002 	RF_SingleComponent_t *sparePtr,*componentPtr;
   1003 	RF_SingleComponent_t component;
   1004 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
   1005 	int i, j, d;
   1006 #ifdef __HAVE_OLD_DISKLABEL
   1007 	struct disklabel newlabel;
   1008 #endif
   1009 	struct dkwedge_info *dkw;
   1010 
   1011 	if ((rs = raidget(unit)) == NULL)
   1012 		return ENXIO;
   1013 	raidPtr = &rs->sc_r;
   1014 
   1015 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1016 		(int) DISKPART(dev), (int) unit, cmd));
   1017 
   1018 	/* Must be open for writes for these commands... */
   1019 	switch (cmd) {
   1020 #ifdef DIOCGSECTORSIZE
   1021 	case DIOCGSECTORSIZE:
   1022 		*(u_int *)data = raidPtr->bytesPerSector;
   1023 		return 0;
   1024 	case DIOCGMEDIASIZE:
   1025 		*(off_t *)data =
   1026 		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
   1027 		return 0;
   1028 #endif
   1029 	case DIOCSDINFO:
   1030 	case DIOCWDINFO:
   1031 #ifdef __HAVE_OLD_DISKLABEL
   1032 	case ODIOCWDINFO:
   1033 	case ODIOCSDINFO:
   1034 #endif
   1035 	case DIOCWLABEL:
   1036 	case DIOCAWEDGE:
   1037 	case DIOCDWEDGE:
   1038 	case DIOCSSTRATEGY:
   1039 		if ((flag & FWRITE) == 0)
   1040 			return (EBADF);
   1041 	}
   1042 
   1043 	/* Must be initialized for these... */
   1044 	switch (cmd) {
   1045 	case DIOCGDINFO:
   1046 	case DIOCSDINFO:
   1047 	case DIOCWDINFO:
   1048 #ifdef __HAVE_OLD_DISKLABEL
   1049 	case ODIOCGDINFO:
   1050 	case ODIOCWDINFO:
   1051 	case ODIOCSDINFO:
   1052 	case ODIOCGDEFLABEL:
   1053 #endif
   1054 	case DIOCGPART:
   1055 	case DIOCWLABEL:
   1056 	case DIOCGDEFLABEL:
   1057 	case DIOCAWEDGE:
   1058 	case DIOCDWEDGE:
   1059 	case DIOCLWEDGES:
   1060 	case DIOCCACHESYNC:
   1061 	case RAIDFRAME_SHUTDOWN:
   1062 	case RAIDFRAME_REWRITEPARITY:
   1063 	case RAIDFRAME_GET_INFO:
   1064 	case RAIDFRAME_RESET_ACCTOTALS:
   1065 	case RAIDFRAME_GET_ACCTOTALS:
   1066 	case RAIDFRAME_KEEP_ACCTOTALS:
   1067 	case RAIDFRAME_GET_SIZE:
   1068 	case RAIDFRAME_FAIL_DISK:
   1069 	case RAIDFRAME_COPYBACK:
   1070 	case RAIDFRAME_CHECK_RECON_STATUS:
   1071 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1072 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1073 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1074 	case RAIDFRAME_ADD_HOT_SPARE:
   1075 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1076 	case RAIDFRAME_INIT_LABELS:
   1077 	case RAIDFRAME_REBUILD_IN_PLACE:
   1078 	case RAIDFRAME_CHECK_PARITY:
   1079 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1080 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1081 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1082 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1083 	case RAIDFRAME_SET_AUTOCONFIG:
   1084 	case RAIDFRAME_SET_ROOT:
   1085 	case RAIDFRAME_DELETE_COMPONENT:
   1086 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1087 	case RAIDFRAME_PARITYMAP_STATUS:
   1088 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1089 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1090 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1091 	case DIOCGSTRATEGY:
   1092 	case DIOCSSTRATEGY:
   1093 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1094 			return (ENXIO);
   1095 	}
   1096 
   1097 	switch (cmd) {
   1098 #ifdef COMPAT_50
   1099 	case RAIDFRAME_GET_INFO50:
   1100 		return rf_get_info50(raidPtr, data);
   1101 
   1102 	case RAIDFRAME_CONFIGURE50:
   1103 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1104 			return retcode;
   1105 		goto config;
   1106 #endif
   1107 		/* configure the system */
   1108 	case RAIDFRAME_CONFIGURE:
   1109 
   1110 		if (raidPtr->valid) {
   1111 			/* There is a valid RAID set running on this unit! */
   1112 			printf("raid%d: Device already configured!\n",unit);
   1113 			return(EINVAL);
   1114 		}
   1115 
   1116 		/* copy-in the configuration information */
   1117 		/* data points to a pointer to the configuration structure */
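		/*
		 * Roughly what raidctl(8) does on the other side once it
		 * has parsed the config file (a hedged sketch; names are
		 * illustrative):
		 *
		 *	RF_Config_t cfg;	 	(filled in from raidN.conf)
		 *	RF_Config_t *cfgp = &cfg;
		 *	ioctl(fd, RAIDFRAME_CONFIGURE, &cfgp);
		 *
		 * so "data" holds a user-space pointer to the RF_Config_t.
		 */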
   1118 
   1119 		u_cfg = *((RF_Config_t **) data);
   1120 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1121 		if (k_cfg == NULL) {
   1122 			return (ENOMEM);
   1123 		}
   1124 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1125 		if (retcode) {
   1126 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1127 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1128 				retcode));
   1129 			return (retcode);
   1130 		}
   1131 		goto config;
   1132 	config:
   1133 		/* allocate a buffer for the layout-specific data, and copy it
   1134 		 * in */
   1135 		if (k_cfg->layoutSpecificSize) {
   1136 			if (k_cfg->layoutSpecificSize > 10000) {
   1137 				/* sanity check */
   1138 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1139 				return (EINVAL);
   1140 			}
   1141 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1142 			    (u_char *));
   1143 			if (specific_buf == NULL) {
   1144 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1145 				return (ENOMEM);
   1146 			}
   1147 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1148 			    k_cfg->layoutSpecificSize);
   1149 			if (retcode) {
   1150 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1151 				RF_Free(specific_buf,
   1152 					k_cfg->layoutSpecificSize);
   1153 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1154 					retcode));
   1155 				return (retcode);
   1156 			}
   1157 		} else
   1158 			specific_buf = NULL;
   1159 		k_cfg->layoutSpecific = specific_buf;
   1160 
   1161 		/* should do some kind of sanity check on the configuration.
   1162 		 * Store the sum of all the bytes in the last byte? */
   1163 
   1164 		/* configure the system */
   1165 
   1166 		/*
   1167 		 * Clear the entire RAID descriptor, just to make sure
   1168 		 *  there is no stale data left in the case of a
   1169 		 *  reconfiguration
   1170 		 */
   1171 		memset(raidPtr, 0, sizeof(*raidPtr));
   1172 		raidPtr->raidid = unit;
   1173 
   1174 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1175 
   1176 		if (retcode == 0) {
   1177 
   1178 			/* allow this many simultaneous IO's to
   1179 			   this RAID device */
   1180 			raidPtr->openings = RAIDOUTSTANDING;
   1181 
   1182 			raidinit(rs);
   1183 			rf_markalldirty(raidPtr);
   1184 		}
   1185 		/* free the buffers.  No return code here. */
   1186 		if (k_cfg->layoutSpecificSize) {
   1187 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1188 		}
   1189 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1190 
   1191 		return (retcode);
   1192 
   1193 		/* shutdown the system */
   1194 	case RAIDFRAME_SHUTDOWN:
   1195 
   1196 		part = DISKPART(dev);
   1197 		pmask = (1 << part);
   1198 
   1199 		if ((error = raidlock(rs)) != 0)
   1200 			return (error);
   1201 
   1202 		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
   1203 		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
   1204 			(rs->sc_dkdev.dk_copenmask & pmask)))
   1205 			retcode = EBUSY;
   1206 		else {
   1207 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1208 			rs->sc_dkdev.dk_copenmask &= ~pmask;
   1209 			rs->sc_dkdev.dk_bopenmask &= ~pmask;
   1210 			rs->sc_dkdev.dk_openmask &= ~pmask;
   1211 			retcode = 0;
   1212 		}
   1213 
   1214 		raidunlock(rs);
   1215 
   1216 		if (retcode != 0)
   1217 			return retcode;
   1218 
   1219 		/* free the pseudo device attach bits */
   1220 
   1221 		cf = device_cfdata(rs->sc_dev);
   1222 		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
   1223 			free(cf, M_RAIDFRAME);
   1224 
   1225 		return (retcode);
   1226 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1227 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1228 		/* need to read the component label for the disk indicated
   1229 		   by row,column in clabel */
   1230 
   1231 		/*
   1232 		 * Perhaps there should be an option to skip the in-core
   1233 		 * copy and hit the disk, as with disklabel(8).
   1234 		 */
   1235 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1236 
   1237 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1238 
   1239 		if (retcode) {
   1240 			RF_Free(clabel, sizeof(*clabel));
   1241 			return retcode;
   1242 		}
   1243 
   1244 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1245 
   1246 		column = clabel->column;
   1247 
   1248 		if ((column < 0) || (column >= raidPtr->numCol +
   1249 		    raidPtr->numSpare)) {
   1250 			RF_Free(clabel, sizeof(*clabel));
   1251 			return EINVAL;
   1252 		}
   1253 
   1254 		RF_Free(clabel, sizeof(*clabel));
   1255 
   1256 		clabel = raidget_component_label(raidPtr, column);
   1257 
   1258 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1259 
   1260 #if 0
   1261 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1262 		clabel = (RF_ComponentLabel_t *) data;
   1263 
   1264 		/* XXX check the label for valid stuff... */
   1265 		/* Note that some things *should not* get modified --
   1266 		   the user should be re-initing the labels instead of
   1267 		   trying to patch things.
   1268 		   */
   1269 
   1270 		raidid = raidPtr->raidid;
   1271 #ifdef DEBUG
   1272 		printf("raid%d: Got component label:\n", raidid);
   1273 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1274 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1275 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1276 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1277 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1278 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1279 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1280 #endif
   1281 		clabel->row = 0;
   1282 		column = clabel->column;
   1283 
   1284 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1285 			return(EINVAL);
   1286 		}
   1287 
   1288 		/* XXX this isn't allowed to do anything for now :-) */
   1289 
   1290 		/* XXX and before it is, we need to fill in the rest
   1291 		   of the fields!?!?!?! */
   1292 		memcpy(raidget_component_label(raidPtr, column),
   1293 		    clabel, sizeof(*clabel));
   1294 		raidflush_component_label(raidPtr, column);
   1295 		return (0);
   1296 #endif
   1297 
   1298 	case RAIDFRAME_INIT_LABELS:
   1299 		clabel = (RF_ComponentLabel_t *) data;
   1300 		/*
   1301 		   we only want the serial number from
   1302 		   the above.  We get all the rest of the information
   1303 		   from the config that was used to create this RAID
   1304 		   set.
   1305 		   */
   1306 
   1307 		raidPtr->serial_number = clabel->serial_number;
   1308 
   1309 		for(column=0;column<raidPtr->numCol;column++) {
   1310 			diskPtr = &raidPtr->Disks[column];
   1311 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1312 				ci_label = raidget_component_label(raidPtr,
   1313 				    column);
   1314 				/* Zeroing this is important. */
   1315 				memset(ci_label, 0, sizeof(*ci_label));
   1316 				raid_init_component_label(raidPtr, ci_label);
   1317 				ci_label->serial_number =
   1318 				    raidPtr->serial_number;
    1319 				ci_label->row = 0; /* we don't pretend to support more */
   1320 				rf_component_label_set_partitionsize(ci_label,
   1321 				    diskPtr->partitionSize);
   1322 				ci_label->column = column;
   1323 				raidflush_component_label(raidPtr, column);
   1324 			}
   1325 			/* XXXjld what about the spares? */
   1326 		}
   1327 
   1328 		return (retcode);
   1329 	case RAIDFRAME_SET_AUTOCONFIG:
   1330 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1331 		printf("raid%d: New autoconfig value is: %d\n",
   1332 		       raidPtr->raidid, d);
   1333 		*(int *) data = d;
   1334 		return (retcode);
   1335 
   1336 	case RAIDFRAME_SET_ROOT:
   1337 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1338 		printf("raid%d: New rootpartition value is: %d\n",
   1339 		       raidPtr->raidid, d);
   1340 		*(int *) data = d;
   1341 		return (retcode);
   1342 
   1343 		/* initialize all parity */
   1344 	case RAIDFRAME_REWRITEPARITY:
   1345 
   1346 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1347 			/* Parity for RAID 0 is trivially correct */
   1348 			raidPtr->parity_good = RF_RAID_CLEAN;
   1349 			return(0);
   1350 		}
   1351 
   1352 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1353 			/* Re-write is already in progress! */
   1354 			return(EINVAL);
   1355 		}
   1356 
   1357 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1358 					   rf_RewriteParityThread,
   1359 					   raidPtr,"raid_parity");
   1360 		return (retcode);
   1361 
   1362 
   1363 	case RAIDFRAME_ADD_HOT_SPARE:
   1364 		sparePtr = (RF_SingleComponent_t *) data;
   1365 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1366 		retcode = rf_add_hot_spare(raidPtr, &component);
   1367 		return(retcode);
   1368 
   1369 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1370 		return(retcode);
   1371 
   1372 	case RAIDFRAME_DELETE_COMPONENT:
   1373 		componentPtr = (RF_SingleComponent_t *)data;
   1374 		memcpy( &component, componentPtr,
   1375 			sizeof(RF_SingleComponent_t));
   1376 		retcode = rf_delete_component(raidPtr, &component);
   1377 		return(retcode);
   1378 
   1379 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1380 		componentPtr = (RF_SingleComponent_t *)data;
   1381 		memcpy( &component, componentPtr,
   1382 			sizeof(RF_SingleComponent_t));
   1383 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1384 		return(retcode);
   1385 
   1386 	case RAIDFRAME_REBUILD_IN_PLACE:
   1387 
   1388 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1389 			/* Can't do this on a RAID 0!! */
   1390 			return(EINVAL);
   1391 		}
   1392 
   1393 		if (raidPtr->recon_in_progress == 1) {
   1394 			/* a reconstruct is already in progress! */
   1395 			return(EINVAL);
   1396 		}
   1397 
   1398 		componentPtr = (RF_SingleComponent_t *) data;
   1399 		memcpy( &component, componentPtr,
   1400 			sizeof(RF_SingleComponent_t));
   1401 		component.row = 0; /* we don't support any more */
   1402 		column = component.column;
   1403 
   1404 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1405 			return(EINVAL);
   1406 		}
   1407 
   1408 		rf_lock_mutex2(raidPtr->mutex);
   1409 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1410 		    (raidPtr->numFailures > 0)) {
   1411 			/* XXX 0 above shouldn't be constant!!! */
   1412 			/* some component other than this has failed.
   1413 			   Let's not make things worse than they already
   1414 			   are... */
   1415 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1416 			       raidPtr->raidid);
   1417 			printf("raid%d:     Col: %d   Too many failures.\n",
   1418 			       raidPtr->raidid, column);
   1419 			rf_unlock_mutex2(raidPtr->mutex);
   1420 			return (EINVAL);
   1421 		}
   1422 		if (raidPtr->Disks[column].status ==
   1423 		    rf_ds_reconstructing) {
   1424 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1425 			       raidPtr->raidid);
   1426 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1427 
   1428 			rf_unlock_mutex2(raidPtr->mutex);
   1429 			return (EINVAL);
   1430 		}
   1431 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1432 			rf_unlock_mutex2(raidPtr->mutex);
   1433 			return (EINVAL);
   1434 		}
   1435 		rf_unlock_mutex2(raidPtr->mutex);
   1436 
   1437 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1438 		if (rrcopy == NULL)
   1439 			return(ENOMEM);
   1440 
   1441 		rrcopy->raidPtr = (void *) raidPtr;
   1442 		rrcopy->col = column;
   1443 
   1444 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1445 					   rf_ReconstructInPlaceThread,
   1446 					   rrcopy,"raid_reconip");
   1447 		return(retcode);
   1448 
   1449 	case RAIDFRAME_GET_INFO:
   1450 		if (!raidPtr->valid)
   1451 			return (ENODEV);
   1452 		ucfgp = (RF_DeviceConfig_t **) data;
   1453 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1454 			  (RF_DeviceConfig_t *));
   1455 		if (d_cfg == NULL)
   1456 			return (ENOMEM);
   1457 		d_cfg->rows = 1; /* there is only 1 row now */
   1458 		d_cfg->cols = raidPtr->numCol;
   1459 		d_cfg->ndevs = raidPtr->numCol;
   1460 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1461 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1462 			return (ENOMEM);
   1463 		}
   1464 		d_cfg->nspares = raidPtr->numSpare;
   1465 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1466 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1467 			return (ENOMEM);
   1468 		}
   1469 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1470 		d = 0;
   1471 		for (j = 0; j < d_cfg->cols; j++) {
   1472 			d_cfg->devs[d] = raidPtr->Disks[j];
   1473 			d++;
   1474 		}
   1475 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1476 			d_cfg->spares[i] = raidPtr->Disks[j];
   1477 		}
   1478 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1479 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1480 
   1481 		return (retcode);
   1482 
   1483 	case RAIDFRAME_CHECK_PARITY:
   1484 		*(int *) data = raidPtr->parity_good;
   1485 		return (0);
   1486 
   1487 	case RAIDFRAME_PARITYMAP_STATUS:
   1488 		if (rf_paritymap_ineligible(raidPtr))
   1489 			return EINVAL;
   1490 		rf_paritymap_status(raidPtr->parity_map,
   1491 		    (struct rf_pmstat *)data);
   1492 		return 0;
   1493 
   1494 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1495 		if (rf_paritymap_ineligible(raidPtr))
   1496 			return EINVAL;
   1497 		if (raidPtr->parity_map == NULL)
   1498 			return ENOENT; /* ??? */
   1499 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1500 			(struct rf_pmparams *)data, 1))
   1501 			return EINVAL;
   1502 		return 0;
   1503 
   1504 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1505 		if (rf_paritymap_ineligible(raidPtr))
   1506 			return EINVAL;
   1507 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1508 		return 0;
   1509 
   1510 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1511 		if (rf_paritymap_ineligible(raidPtr))
   1512 			return EINVAL;
   1513 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1514 		/* XXX should errors be passed up? */
   1515 		return 0;
   1516 
   1517 	case RAIDFRAME_RESET_ACCTOTALS:
   1518 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1519 		return (0);
   1520 
   1521 	case RAIDFRAME_GET_ACCTOTALS:
   1522 		totals = (RF_AccTotals_t *) data;
   1523 		*totals = raidPtr->acc_totals;
   1524 		return (0);
   1525 
   1526 	case RAIDFRAME_KEEP_ACCTOTALS:
   1527 		raidPtr->keep_acc_totals = *(int *)data;
   1528 		return (0);
   1529 
   1530 	case RAIDFRAME_GET_SIZE:
   1531 		*(int *) data = raidPtr->totalSectors;
   1532 		return (0);
   1533 
   1534 		/* fail a disk & optionally start reconstruction */
   1535 	case RAIDFRAME_FAIL_DISK:
   1536 
   1537 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1538 			/* Can't do this on a RAID 0!! */
   1539 			return(EINVAL);
   1540 		}
   1541 
   1542 		rr = (struct rf_recon_req *) data;
   1543 		rr->row = 0;
   1544 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1545 			return (EINVAL);
   1546 
   1547 
   1548 		rf_lock_mutex2(raidPtr->mutex);
   1549 		if (raidPtr->status == rf_rs_reconstructing) {
   1550 			/* you can't fail a disk while we're reconstructing! */
   1551 			/* XXX wrong for RAID6 */
   1552 			rf_unlock_mutex2(raidPtr->mutex);
   1553 			return (EINVAL);
   1554 		}
   1555 		if ((raidPtr->Disks[rr->col].status ==
   1556 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1557 			/* some other component has failed.  Let's not make
   1558 			   things worse. XXX wrong for RAID6 */
   1559 			rf_unlock_mutex2(raidPtr->mutex);
   1560 			return (EINVAL);
   1561 		}
   1562 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1563 			/* Can't fail a spared disk! */
   1564 			rf_unlock_mutex2(raidPtr->mutex);
   1565 			return (EINVAL);
   1566 		}
   1567 		rf_unlock_mutex2(raidPtr->mutex);
   1568 
   1569 		/* make a copy of the recon request so that we don't rely on
   1570 		 * the user's buffer */
   1571 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1572 		if (rrcopy == NULL)
   1573 			return(ENOMEM);
   1574 		memcpy(rrcopy, rr, sizeof(*rr));
   1575 		rrcopy->raidPtr = (void *) raidPtr;
   1576 
   1577 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1578 					   rf_ReconThread,
   1579 					   rrcopy,"raid_recon");
   1580 		return (0);
   1581 
   1582 		/* invoke a copyback operation after recon on whatever disk
   1583 		 * needs it, if any */
   1584 	case RAIDFRAME_COPYBACK:
   1585 
   1586 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1587 			/* This makes no sense on a RAID 0!! */
   1588 			return(EINVAL);
   1589 		}
   1590 
   1591 		if (raidPtr->copyback_in_progress == 1) {
   1592 			/* Copyback is already in progress! */
   1593 			return(EINVAL);
   1594 		}
   1595 
   1596 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1597 					   rf_CopybackThread,
   1598 					   raidPtr,"raid_copyback");
   1599 		return (retcode);
   1600 
   1601 		/* return the percentage completion of reconstruction */
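		/*
		 * From userland this is a plain int ioctl; a hedged sketch
		 * (fd is an open raid device node):
		 *
		 *	int pct;
		 *	if (ioctl(fd, RAIDFRAME_CHECK_RECON_STATUS, &pct) == 0)
		 *		printf("reconstruction %d%% done\n", pct);
		 */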
   1602 	case RAIDFRAME_CHECK_RECON_STATUS:
   1603 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1604 			/* This makes no sense on a RAID 0, so tell the
   1605 			   user it's done. */
   1606 			*(int *) data = 100;
   1607 			return(0);
   1608 		}
   1609 		if (raidPtr->status != rf_rs_reconstructing)
   1610 			*(int *) data = 100;
   1611 		else {
   1612 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1613 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1614 			} else {
   1615 				*(int *) data = 0;
   1616 			}
   1617 		}
   1618 		return (0);
   1619 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1620 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1621 		if (raidPtr->status != rf_rs_reconstructing) {
   1622 			progressInfo.remaining = 0;
   1623 			progressInfo.completed = 100;
   1624 			progressInfo.total = 100;
   1625 		} else {
   1626 			progressInfo.total =
   1627 				raidPtr->reconControl->numRUsTotal;
   1628 			progressInfo.completed =
   1629 				raidPtr->reconControl->numRUsComplete;
   1630 			progressInfo.remaining = progressInfo.total -
   1631 				progressInfo.completed;
   1632 		}
   1633 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1634 				  sizeof(RF_ProgressInfo_t));
   1635 		return (retcode);
   1636 
   1637 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1638 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1639 			/* This makes no sense on a RAID 0, so tell the
   1640 			   user it's done. */
   1641 			*(int *) data = 100;
   1642 			return(0);
   1643 		}
   1644 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1645 			*(int *) data = 100 *
   1646 				raidPtr->parity_rewrite_stripes_done /
   1647 				raidPtr->Layout.numStripe;
   1648 		} else {
   1649 			*(int *) data = 100;
   1650 		}
   1651 		return (0);
   1652 
   1653 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1654 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1655 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1656 			progressInfo.total = raidPtr->Layout.numStripe;
   1657 			progressInfo.completed =
   1658 				raidPtr->parity_rewrite_stripes_done;
   1659 			progressInfo.remaining = progressInfo.total -
   1660 				progressInfo.completed;
   1661 		} else {
   1662 			progressInfo.remaining = 0;
   1663 			progressInfo.completed = 100;
   1664 			progressInfo.total = 100;
   1665 		}
   1666 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1667 				  sizeof(RF_ProgressInfo_t));
   1668 		return (retcode);
   1669 
   1670 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1671 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1672 			/* This makes no sense on a RAID 0 */
   1673 			*(int *) data = 100;
   1674 			return(0);
   1675 		}
   1676 		if (raidPtr->copyback_in_progress == 1) {
   1677 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1678 				raidPtr->Layout.numStripe;
   1679 		} else {
   1680 			*(int *) data = 100;
   1681 		}
   1682 		return (0);
   1683 
   1684 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1685 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1686 		if (raidPtr->copyback_in_progress == 1) {
   1687 			progressInfo.total = raidPtr->Layout.numStripe;
   1688 			progressInfo.completed =
   1689 				raidPtr->copyback_stripes_done;
   1690 			progressInfo.remaining = progressInfo.total -
   1691 				progressInfo.completed;
   1692 		} else {
   1693 			progressInfo.remaining = 0;
   1694 			progressInfo.completed = 100;
   1695 			progressInfo.total = 100;
   1696 		}
   1697 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1698 				  sizeof(RF_ProgressInfo_t));
   1699 		return (retcode);
   1700 
   1701 		/* the sparetable daemon calls this to wait for the kernel to
   1702 		 * need a spare table. this ioctl does not return until a
   1703 		 * spare table is needed. XXX -- calling mpsleep here in the
   1704 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1705 		 * -- I should either compute the spare table in the kernel,
   1706 		 * or have a different -- XXX XXX -- interface (a different
   1707 		 * character device) for delivering the table     -- XXX */
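         		/* In outline (the code below is currently compiled out): a
         		 * userland spare-table daemon blocks in RAIDFRAME_SPARET_WAIT
         		 * until the kernel needs a table, computes it, and delivers it
         		 * with RAIDFRAME_SEND_SPARET; the installation status comes
         		 * back via the "fcol" field on the response queue, and
         		 * RAIDFRAME_ABORT_SPARET_WAIT queues a request with fcol == -1
         		 * so the daemon knows to exit.  (Descriptive sketch of the
         		 * #if 0'd cases that follow.) */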
   1708 #if 0
   1709 	case RAIDFRAME_SPARET_WAIT:
   1710 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1711 		while (!rf_sparet_wait_queue)
   1712 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1713 		waitreq = rf_sparet_wait_queue;
   1714 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1715 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1716 
   1717 		/* structure assignment */
   1718 		*((RF_SparetWait_t *) data) = *waitreq;
   1719 
   1720 		RF_Free(waitreq, sizeof(*waitreq));
   1721 		return (0);
   1722 
   1723 		/* wakes up a process waiting on SPARET_WAIT and puts an error
    1724 		 * code in it that will cause the daemon to exit */
   1725 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1726 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1727 		waitreq->fcol = -1;
   1728 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1729 		waitreq->next = rf_sparet_wait_queue;
   1730 		rf_sparet_wait_queue = waitreq;
    1731 		rf_broadcast_cond2(rf_sparet_wait_cv);
   1732 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1733 		return (0);
   1734 
   1735 		/* used by the spare table daemon to deliver a spare table
   1736 		 * into the kernel */
   1737 	case RAIDFRAME_SEND_SPARET:
   1738 
   1739 		/* install the spare table */
   1740 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1741 
   1742 		/* respond to the requestor.  the return status of the spare
   1743 		 * table installation is passed in the "fcol" field */
   1744 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1745 		waitreq->fcol = retcode;
   1746 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1747 		waitreq->next = rf_sparet_resp_queue;
   1748 		rf_sparet_resp_queue = waitreq;
   1749 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1750 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1751 
   1752 		return (retcode);
   1753 #endif
   1754 
   1755 	default:
   1756 		break; /* fall through to the os-specific code below */
   1757 
   1758 	}
   1759 
   1760 	if (!raidPtr->valid)
   1761 		return (EINVAL);
   1762 
   1763 	/*
   1764 	 * Add support for "regular" device ioctls here.
   1765 	 */
   1766 
   1767 	error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
   1768 	if (error != EPASSTHROUGH)
   1769 		return (error);
   1770 
   1771 	switch (cmd) {
   1772 	case DIOCGDINFO:
   1773 		*(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
   1774 		break;
   1775 #ifdef __HAVE_OLD_DISKLABEL
   1776 	case ODIOCGDINFO:
   1777 		newlabel = *(rs->sc_dkdev.dk_label);
   1778 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1779 			return ENOTTY;
   1780 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1781 		break;
   1782 #endif
   1783 
   1784 	case DIOCGPART:
   1785 		((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
   1786 		((struct partinfo *) data)->part =
   1787 		    &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
   1788 		break;
   1789 
   1790 	case DIOCWDINFO:
   1791 	case DIOCSDINFO:
   1792 #ifdef __HAVE_OLD_DISKLABEL
   1793 	case ODIOCWDINFO:
   1794 	case ODIOCSDINFO:
   1795 #endif
   1796 	{
   1797 		struct disklabel *lp;
   1798 #ifdef __HAVE_OLD_DISKLABEL
   1799 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
   1800 			memset(&newlabel, 0, sizeof newlabel);
   1801 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
   1802 			lp = &newlabel;
   1803 		} else
   1804 #endif
   1805 		lp = (struct disklabel *)data;
   1806 
   1807 		if ((error = raidlock(rs)) != 0)
   1808 			return (error);
   1809 
   1810 		rs->sc_flags |= RAIDF_LABELLING;
   1811 
   1812 		error = setdisklabel(rs->sc_dkdev.dk_label,
   1813 		    lp, 0, rs->sc_dkdev.dk_cpulabel);
   1814 		if (error == 0) {
   1815 			if (cmd == DIOCWDINFO
   1816 #ifdef __HAVE_OLD_DISKLABEL
   1817 			    || cmd == ODIOCWDINFO
   1818 #endif
   1819 			   )
   1820 				error = writedisklabel(RAIDLABELDEV(dev),
   1821 				    raidstrategy, rs->sc_dkdev.dk_label,
   1822 				    rs->sc_dkdev.dk_cpulabel);
   1823 		}
   1824 		rs->sc_flags &= ~RAIDF_LABELLING;
   1825 
   1826 		raidunlock(rs);
   1827 
   1828 		if (error)
   1829 			return (error);
   1830 		break;
   1831 	}
   1832 
   1833 	case DIOCWLABEL:
   1834 		if (*(int *) data != 0)
   1835 			rs->sc_flags |= RAIDF_WLABEL;
   1836 		else
   1837 			rs->sc_flags &= ~RAIDF_WLABEL;
   1838 		break;
   1839 
   1840 	case DIOCGDEFLABEL:
   1841 		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
   1842 		break;
   1843 
   1844 #ifdef __HAVE_OLD_DISKLABEL
   1845 	case ODIOCGDEFLABEL:
   1846 		raidgetdefaultlabel(raidPtr, rs, &newlabel);
   1847 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1848 			return ENOTTY;
   1849 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1850 		break;
   1851 #endif
   1852 
   1853 	case DIOCAWEDGE:
   1854 	case DIOCDWEDGE:
   1855 	    	dkw = (void *)data;
   1856 
   1857 		/* If the ioctl happens here, the parent is us. */
   1858 		(void)strcpy(dkw->dkw_parent, rs->sc_xname);
   1859 		return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
   1860 
   1861 	case DIOCLWEDGES:
   1862 		return dkwedge_list(&rs->sc_dkdev,
   1863 		    (struct dkwedge_list *)data, l);
   1864 	case DIOCCACHESYNC:
   1865 		return rf_sync_component_caches(raidPtr);
   1866 
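         	/* Get/set the buffer queueing strategy for this RAID unit.
         	 * DIOCGSTRATEGY reports the current bufq strategy name;
         	 * DIOCSSTRATEGY allocates the named bufq, migrates any queued
         	 * buffers onto it, and frees the old queue.  (Note: userland
         	 * would typically reach these through something like dkctl(8)'s
         	 * "strategy" command -- an assumption; any caller issuing these
         	 * ioctls works.) */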
   1867 	case DIOCGSTRATEGY:
   1868 	    {
   1869 		struct disk_strategy *dks = (void *)data;
   1870 
   1871 		s = splbio();
   1872 		strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
   1873 		    sizeof(dks->dks_name));
   1874 		splx(s);
   1875 		dks->dks_paramlen = 0;
   1876 
   1877 		return 0;
   1878 	    }
   1879 
   1880 	case DIOCSSTRATEGY:
   1881 	    {
   1882 		struct disk_strategy *dks = (void *)data;
   1883 		struct bufq_state *new;
   1884 		struct bufq_state *old;
   1885 
   1886 		if (dks->dks_param != NULL) {
   1887 			return EINVAL;
   1888 		}
   1889 		dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
   1890 		error = bufq_alloc(&new, dks->dks_name,
   1891 		    BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
   1892 		if (error) {
   1893 			return error;
   1894 		}
   1895 		s = splbio();
   1896 		old = rs->buf_queue;
   1897 		bufq_move(new, old);
   1898 		rs->buf_queue = new;
   1899 		splx(s);
   1900 		bufq_free(old);
   1901 
   1902 		return 0;
   1903 	    }
   1904 
   1905 	default:
   1906 		retcode = ENOTTY;
   1907 	}
   1908 	return (retcode);
   1909 
   1910 }
   1911 
   1912 
   1913 /* raidinit -- complete the rest of the initialization for the
   1914    RAIDframe device.  */
   1915 
   1916 
   1917 static void
   1918 raidinit(struct raid_softc *rs)
   1919 {
   1920 	cfdata_t cf;
   1921 	int     unit;
   1922 	RF_Raid_t *raidPtr = &rs->sc_r;
   1923 
   1924 	unit = raidPtr->raidid;
   1925 
   1926 
   1927 	/* XXX should check return code first... */
   1928 	rs->sc_flags |= RAIDF_INITED;
   1929 
   1930 	/* XXX doesn't check bounds. */
   1931 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
   1932 
   1933 	/* attach the pseudo device */
   1934 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
   1935 	cf->cf_name = raid_cd.cd_name;
   1936 	cf->cf_atname = raid_cd.cd_name;
   1937 	cf->cf_unit = unit;
   1938 	cf->cf_fstate = FSTATE_STAR;
   1939 
   1940 	rs->sc_dev = config_attach_pseudo(cf);
   1941 
   1942 	if (rs->sc_dev == NULL) {
   1943 		printf("raid%d: config_attach_pseudo failed\n",
   1944 		    raidPtr->raidid);
   1945 		rs->sc_flags &= ~RAIDF_INITED;
   1946 		free(cf, M_RAIDFRAME);
   1947 		return;
   1948 	}
   1949 
   1950 	/* disk_attach actually creates space for the CPU disklabel, among
   1951 	 * other things, so it's critical to call this *BEFORE* we try putzing
   1952 	 * with disklabels. */
   1953 
   1954 	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
   1955 	disk_attach(&rs->sc_dkdev);
   1956 	disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);
   1957 
   1958 	/* XXX There may be a weird interaction here between this, and
   1959 	 * protectedSectors, as used in RAIDframe.  */
   1960 
   1961 	rs->sc_size = raidPtr->totalSectors;
   1962 
   1963 	dkwedge_discover(&rs->sc_dkdev);
   1964 
   1965 	rf_set_properties(rs, raidPtr);
   1966 
   1967 }
   1968 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1969 /* wake up the daemon & tell it to get us a spare table
   1970  * XXX
   1971  * the entries in the queues should be tagged with the raidPtr
   1972  * so that in the extremely rare case that two recons happen at once,
    1973  * we know for which device we're requesting a spare table
   1974  * XXX
   1975  *
   1976  * XXX This code is not currently used. GO
   1977  */
   1978 int
   1979 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
   1980 {
   1981 	int     retcode;
   1982 
   1983 	rf_lock_mutex2(rf_sparet_wait_mutex);
   1984 	req->next = rf_sparet_wait_queue;
   1985 	rf_sparet_wait_queue = req;
   1986 	rf_broadcast_cond2(rf_sparet_wait_cv);
   1987 
   1988 	/* mpsleep unlocks the mutex */
   1989 	while (!rf_sparet_resp_queue) {
   1990 		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
   1991 	}
   1992 	req = rf_sparet_resp_queue;
   1993 	rf_sparet_resp_queue = req->next;
   1994 	rf_unlock_mutex2(rf_sparet_wait_mutex);
   1995 
   1996 	retcode = req->fcol;
   1997 	RF_Free(req, sizeof(*req));	/* this is not the same req as we
   1998 					 * alloc'd */
   1999 	return (retcode);
   2000 }
   2001 #endif
   2002 
   2003 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   2004  * bp & passes it down.
    2005  * any calls originating in the kernel must use non-blocking I/O.
    2006  * We do some extra sanity checking to return "appropriate" error values for
   2007  * certain conditions (to make some standard utilities work)
   2008  *
   2009  * Formerly known as: rf_DoAccessKernel
   2010  */
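         /* (Flow note: raidPtr->openings throttles the number of concurrent
          * accesses.  The loop below drains rs->buf_queue while openings remain,
          * translating each buf's partition-relative block number and dispatching
          * it asynchronously through rf_DoAccess(); bufs that fail the sanity
          * checks are completed immediately with biodone().) */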
   2011 void
   2012 raidstart(RF_Raid_t *raidPtr)
   2013 {
   2014 	RF_SectorCount_t num_blocks, pb, sum;
   2015 	RF_RaidAddr_t raid_addr;
   2016 	struct partition *pp;
   2017 	daddr_t blocknum;
   2018 	struct raid_softc *rs;
   2019 	int     do_async;
   2020 	struct buf *bp;
   2021 	int rc;
   2022 
   2023 	rs = raidPtr->softc;
   2024 	/* quick check to see if anything has died recently */
   2025 	rf_lock_mutex2(raidPtr->mutex);
   2026 	if (raidPtr->numNewFailures > 0) {
   2027 		rf_unlock_mutex2(raidPtr->mutex);
   2028 		rf_update_component_labels(raidPtr,
   2029 					   RF_NORMAL_COMPONENT_UPDATE);
   2030 		rf_lock_mutex2(raidPtr->mutex);
   2031 		raidPtr->numNewFailures--;
   2032 	}
   2033 
   2034 	/* Check to see if we're at the limit... */
   2035 	while (raidPtr->openings > 0) {
   2036 		rf_unlock_mutex2(raidPtr->mutex);
   2037 
   2038 		/* get the next item, if any, from the queue */
   2039 		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
   2040 			/* nothing more to do */
   2041 			return;
   2042 		}
   2043 
   2044 		/* Ok, for the bp we have here, bp->b_blkno is relative to the
   2045 		 * partition.. Need to make it absolute to the underlying
   2046 		 * device.. */
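         		/* b_blkno is in DEV_BSIZE units; the shift pair below
         		 * converts it to RAIDframe sectors.  E.g. with 512-byte
         		 * sectors (logBytesPerSector == 9) it is an identity, and
         		 * with 4K sectors (logBytesPerSector == 12) it divides by 8.
         		 * (Example assumes the usual DEV_BSHIFT of 9.) */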
   2047 
   2048 		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
   2049 		if (DISKPART(bp->b_dev) != RAW_PART) {
   2050 			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
   2051 			blocknum += pp->p_offset;
   2052 		}
   2053 
   2054 		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   2055 			    (int) blocknum));
   2056 
   2057 		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   2058 		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   2059 
   2060 		/* *THIS* is where we adjust what block we're going to...
   2061 		 * but DO NOT TOUCH bp->b_blkno!!! */
   2062 		raid_addr = blocknum;
   2063 
   2064 		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   2065 		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   2066 		sum = raid_addr + num_blocks + pb;
   2067 		if (1 || rf_debugKernelAccess) {
   2068 			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   2069 				    (int) raid_addr, (int) sum, (int) num_blocks,
   2070 				    (int) pb, (int) bp->b_resid));
   2071 		}
   2072 		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   2073 		    || (sum < num_blocks) || (sum < pb)) {
   2074 			bp->b_error = ENOSPC;
   2075 			bp->b_resid = bp->b_bcount;
   2076 			biodone(bp);
   2077 			rf_lock_mutex2(raidPtr->mutex);
   2078 			continue;
   2079 		}
   2080 		/*
   2081 		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   2082 		 */
   2083 
   2084 		if (bp->b_bcount & raidPtr->sectorMask) {
   2085 			bp->b_error = EINVAL;
   2086 			bp->b_resid = bp->b_bcount;
   2087 			biodone(bp);
   2088 			rf_lock_mutex2(raidPtr->mutex);
   2089 			continue;
   2090 
   2091 		}
   2092 		db1_printf(("Calling DoAccess..\n"));
   2093 
   2094 
   2095 		rf_lock_mutex2(raidPtr->mutex);
   2096 		raidPtr->openings--;
   2097 		rf_unlock_mutex2(raidPtr->mutex);
   2098 
   2099 		/*
   2100 		 * Everything is async.
   2101 		 */
   2102 		do_async = 1;
   2103 
   2104 		disk_busy(&rs->sc_dkdev);
   2105 
   2106 		/* XXX we're still at splbio() here... do we *really*
   2107 		   need to be? */
   2108 
   2109 		/* don't ever condition on bp->b_flags & B_WRITE.
   2110 		 * always condition on B_READ instead */
   2111 
   2112 		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   2113 				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   2114 				 do_async, raid_addr, num_blocks,
   2115 				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   2116 
   2117 		if (rc) {
   2118 			bp->b_error = rc;
   2119 			bp->b_resid = bp->b_bcount;
   2120 			biodone(bp);
   2121 			/* continue loop */
   2122 		}
   2123 
   2124 		rf_lock_mutex2(raidPtr->mutex);
   2125 	}
   2126 	rf_unlock_mutex2(raidPtr->mutex);
   2127 }
   2128 
   2129 
   2130 
   2131 
   2132 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2133 
   2134 int
   2135 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2136 {
   2137 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2138 	struct buf *bp;
   2139 
   2140 	req->queue = queue;
   2141 	bp = req->bp;
   2142 
   2143 	switch (req->type) {
   2144 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2145 		/* XXX need to do something extra here.. */
   2146 		/* I'm leaving this in, as I've never actually seen it used,
   2147 		 * and I'd like folks to report it... GO */
    2148 		printf("WAKEUP CALLED\n");
   2149 		queue->numOutstanding++;
   2150 
   2151 		bp->b_flags = 0;
   2152 		bp->b_private = req;
   2153 
   2154 		KernelWakeupFunc(bp);
   2155 		break;
   2156 
   2157 	case RF_IO_TYPE_READ:
   2158 	case RF_IO_TYPE_WRITE:
   2159 #if RF_ACC_TRACE > 0
   2160 		if (req->tracerec) {
   2161 			RF_ETIMER_START(req->tracerec->timer);
   2162 		}
   2163 #endif
   2164 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2165 		    op, queue->rf_cinfo->ci_dev,
   2166 		    req->sectorOffset, req->numSector,
   2167 		    req->buf, KernelWakeupFunc, (void *) req,
   2168 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2169 
   2170 		if (rf_debugKernelAccess) {
   2171 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2172 				(long) bp->b_blkno));
   2173 		}
   2174 		queue->numOutstanding++;
   2175 		queue->last_deq_sector = req->sectorOffset;
   2176 		/* acc wouldn't have been let in if there were any pending
   2177 		 * reqs at any other priority */
   2178 		queue->curPriority = req->priority;
   2179 
   2180 		db1_printf(("Going for %c to unit %d col %d\n",
   2181 			    req->type, queue->raidPtr->raidid,
   2182 			    queue->col));
   2183 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2184 			(int) req->sectorOffset, (int) req->numSector,
   2185 			(int) (req->numSector <<
   2186 			    queue->raidPtr->logBytesPerSector),
   2187 			(int) queue->raidPtr->logBytesPerSector));
   2188 
   2189 		/*
   2190 		 * XXX: drop lock here since this can block at
   2191 		 * least with backing SCSI devices.  Retake it
   2192 		 * to minimize fuss with calling interfaces.
   2193 		 */
   2194 
   2195 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2196 		bdev_strategy(bp);
   2197 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2198 		break;
   2199 
   2200 	default:
   2201 		panic("bad req->type in rf_DispatchKernelIO");
   2202 	}
   2203 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2204 
   2205 	return (0);
   2206 }
    2207 /* this is the callback function associated with an I/O invoked from
   2208    kernel code.
   2209  */
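         /* Rough flow (descriptive note): biodone() invokes this when a component
          * I/O completes; on error the component may be marked failed (see below),
          * and the request is then placed on raidPtr->iodone and the raidio
          * thread signalled to finish processing it. */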
   2210 static void
   2211 KernelWakeupFunc(struct buf *bp)
   2212 {
   2213 	RF_DiskQueueData_t *req = NULL;
   2214 	RF_DiskQueue_t *queue;
   2215 
   2216 	db1_printf(("recovering the request queue:\n"));
   2217 
   2218 	req = bp->b_private;
   2219 
   2220 	queue = (RF_DiskQueue_t *) req->queue;
   2221 
   2222 	rf_lock_mutex2(queue->raidPtr->iodone_lock);
   2223 
   2224 #if RF_ACC_TRACE > 0
   2225 	if (req->tracerec) {
   2226 		RF_ETIMER_STOP(req->tracerec->timer);
   2227 		RF_ETIMER_EVAL(req->tracerec->timer);
   2228 		rf_lock_mutex2(rf_tracing_mutex);
   2229 		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2230 		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2231 		req->tracerec->num_phys_ios++;
   2232 		rf_unlock_mutex2(rf_tracing_mutex);
   2233 	}
   2234 #endif
   2235 
   2236 	/* XXX Ok, let's get aggressive... If b_error is set, let's go
   2237 	 * ballistic, and mark the component as hosed... */
   2238 
   2239 	if (bp->b_error != 0) {
   2240 		/* Mark the disk as dead */
   2241 		/* but only mark it once... */
   2242 		/* and only if it wouldn't leave this RAID set
   2243 		   completely broken */
   2244 		if (((queue->raidPtr->Disks[queue->col].status ==
   2245 		      rf_ds_optimal) ||
   2246 		     (queue->raidPtr->Disks[queue->col].status ==
   2247 		      rf_ds_used_spare)) &&
   2248 		     (queue->raidPtr->numFailures <
   2249 		      queue->raidPtr->Layout.map->faultsTolerated)) {
   2250 			printf("raid%d: IO Error.  Marking %s as failed.\n",
   2251 			       queue->raidPtr->raidid,
   2252 			       queue->raidPtr->Disks[queue->col].devname);
   2253 			queue->raidPtr->Disks[queue->col].status =
   2254 			    rf_ds_failed;
   2255 			queue->raidPtr->status = rf_rs_degraded;
   2256 			queue->raidPtr->numFailures++;
   2257 			queue->raidPtr->numNewFailures++;
   2258 		} else {	/* Disk is already dead... */
   2259 			/* printf("Disk already marked as dead!\n"); */
   2260 		}
   2261 
   2262 	}
   2263 
   2264 	/* Fill in the error value */
   2265 	req->error = bp->b_error;
   2266 
   2267 	/* Drop this one on the "finished" queue... */
   2268 	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
   2269 
   2270 	/* Let the raidio thread know there is work to be done. */
   2271 	rf_signal_cond2(queue->raidPtr->iodone_cv);
   2272 
   2273 	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
   2274 }
   2275 
   2276 
   2277 /*
   2278  * initialize a buf structure for doing an I/O in the kernel.
   2279  */
   2280 static void
   2281 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2282        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2283        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2284        struct proc *b_proc)
   2285 {
   2286 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2287 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2288 	bp->b_oflags = 0;
   2289 	bp->b_cflags = 0;
   2290 	bp->b_bcount = numSect << logBytesPerSector;
   2291 	bp->b_bufsize = bp->b_bcount;
   2292 	bp->b_error = 0;
   2293 	bp->b_dev = dev;
   2294 	bp->b_data = bf;
   2295 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2296 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2297 	if (bp->b_bcount == 0) {
   2298 		panic("bp->b_bcount is zero in InitBP!!");
   2299 	}
   2300 	bp->b_proc = b_proc;
   2301 	bp->b_iodone = cbFunc;
   2302 	bp->b_private = cbArg;
   2303 }
   2304 
   2305 static void
   2306 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
   2307 		    struct disklabel *lp)
   2308 {
   2309 	memset(lp, 0, sizeof(*lp));
   2310 
   2311 	/* fabricate a label... */
   2312 	lp->d_secperunit = raidPtr->totalSectors;
   2313 	lp->d_secsize = raidPtr->bytesPerSector;
   2314 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   2315 	lp->d_ntracks = 4 * raidPtr->numCol;
   2316 	lp->d_ncylinders = raidPtr->totalSectors /
   2317 		(lp->d_nsectors * lp->d_ntracks);
   2318 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
   2319 
   2320 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
   2321 	lp->d_type = DTYPE_RAID;
   2322 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
   2323 	lp->d_rpm = 3600;
   2324 	lp->d_interleave = 1;
   2325 	lp->d_flags = 0;
   2326 
   2327 	lp->d_partitions[RAW_PART].p_offset = 0;
   2328 	lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
   2329 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
   2330 	lp->d_npartitions = RAW_PART + 1;
   2331 
   2332 	lp->d_magic = DISKMAGIC;
   2333 	lp->d_magic2 = DISKMAGIC;
   2334 	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
   2335 
   2336 }
   2337 /*
   2338  * Read the disklabel from the raid device.  If one is not present, fake one
   2339  * up.
   2340  */
   2341 static void
   2342 raidgetdisklabel(dev_t dev)
   2343 {
   2344 	int     unit = raidunit(dev);
   2345 	struct raid_softc *rs;
   2346 	const char   *errstring;
   2347 	struct disklabel *lp;
   2348 	struct cpu_disklabel *clp;
   2349 	RF_Raid_t *raidPtr;
   2350 
   2351 	if ((rs = raidget(unit)) == NULL)
   2352 		return;
   2353 
   2354 	lp = rs->sc_dkdev.dk_label;
   2355 	clp = rs->sc_dkdev.dk_cpulabel;
   2356 
   2357 	db1_printf(("Getting the disklabel...\n"));
   2358 
   2359 	memset(clp, 0, sizeof(*clp));
   2360 
   2361 	raidPtr = &rs->sc_r;
   2362 
   2363 	raidgetdefaultlabel(raidPtr, rs, lp);
   2364 
   2365 	/*
   2366 	 * Call the generic disklabel extraction routine.
   2367 	 */
   2368 	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
   2369 	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
   2370 	if (errstring)
   2371 		raidmakedisklabel(rs);
   2372 	else {
   2373 		int     i;
   2374 		struct partition *pp;
   2375 
   2376 		/*
   2377 		 * Sanity check whether the found disklabel is valid.
   2378 		 *
   2379 		 * This is necessary since total size of the raid device
   2380 		 * may vary when an interleave is changed even though exactly
    2381 		 * the same components are used, and an old disklabel may be used
   2382 		 * if that is found.
   2383 		 */
   2384 		if (lp->d_secperunit != rs->sc_size)
   2385 			printf("raid%d: WARNING: %s: "
   2386 			    "total sector size in disklabel (%" PRIu32 ") != "
   2387 			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
   2388 			    lp->d_secperunit, rs->sc_size);
   2389 		for (i = 0; i < lp->d_npartitions; i++) {
   2390 			pp = &lp->d_partitions[i];
   2391 			if (pp->p_offset + pp->p_size > rs->sc_size)
   2392 				printf("raid%d: WARNING: %s: end of partition `%c' "
   2393 				       "exceeds the size of raid (%" PRIu64 ")\n",
   2394 				       unit, rs->sc_xname, 'a' + i, rs->sc_size);
   2395 		}
   2396 	}
   2397 
   2398 }
   2399 /*
   2400  * Take care of things one might want to take care of in the event
   2401  * that a disklabel isn't present.
   2402  */
   2403 static void
   2404 raidmakedisklabel(struct raid_softc *rs)
   2405 {
   2406 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2407 	db1_printf(("Making a label..\n"));
   2408 
   2409 	/*
   2410 	 * For historical reasons, if there's no disklabel present
   2411 	 * the raw partition must be marked FS_BSDFFS.
   2412 	 */
   2413 
   2414 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
   2415 
   2416 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
   2417 
   2418 	lp->d_checksum = dkcksum(lp);
   2419 }
   2420 /*
   2421  * Wait interruptibly for an exclusive lock.
   2422  *
   2423  * XXX
   2424  * Several drivers do this; it should be abstracted and made MP-safe.
   2425  * (Hmm... where have we seen this warning before :->  GO )
   2426  */
   2427 static int
   2428 raidlock(struct raid_softc *rs)
   2429 {
   2430 	int     error;
   2431 
   2432 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2433 		rs->sc_flags |= RAIDF_WANTED;
   2434 		if ((error =
   2435 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
   2436 			return (error);
   2437 	}
   2438 	rs->sc_flags |= RAIDF_LOCKED;
   2439 	return (0);
   2440 }
   2441 /*
   2442  * Unlock and wake up any waiters.
   2443  */
   2444 static void
   2445 raidunlock(struct raid_softc *rs)
   2446 {
   2447 
   2448 	rs->sc_flags &= ~RAIDF_LOCKED;
   2449 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2450 		rs->sc_flags &= ~RAIDF_WANTED;
   2451 		wakeup(rs);
   2452 	}
   2453 }
   2454 
   2455 
   2456 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2457 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2458 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
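         /* On-disk layout implied by the helpers below: the component label lives
          * RF_COMPONENT_INFO_OFFSET bytes into each component, in an area of
          * max(sector size, RF_COMPONENT_INFO_SIZE) bytes, and the parity map
          * starts right after that area, again rounded up to at least one sector.
          * (Summary of rf_component_info_*() and rf_parity_map_*().) */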
   2459 
   2460 static daddr_t
   2461 rf_component_info_offset(void)
   2462 {
   2463 
   2464 	return RF_COMPONENT_INFO_OFFSET;
   2465 }
   2466 
   2467 static daddr_t
   2468 rf_component_info_size(unsigned secsize)
   2469 {
   2470 	daddr_t info_size;
   2471 
   2472 	KASSERT(secsize);
   2473 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2474 		info_size = secsize;
   2475 	else
   2476 		info_size = RF_COMPONENT_INFO_SIZE;
   2477 
   2478 	return info_size;
   2479 }
   2480 
   2481 static daddr_t
   2482 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2483 {
   2484 	daddr_t map_offset;
   2485 
   2486 	KASSERT(raidPtr->bytesPerSector);
   2487 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2488 		map_offset = raidPtr->bytesPerSector;
   2489 	else
   2490 		map_offset = RF_COMPONENT_INFO_SIZE;
   2491 	map_offset += rf_component_info_offset();
   2492 
   2493 	return map_offset;
   2494 }
   2495 
   2496 static daddr_t
   2497 rf_parity_map_size(RF_Raid_t *raidPtr)
   2498 {
   2499 	daddr_t map_size;
   2500 
   2501 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2502 		map_size = raidPtr->bytesPerSector;
   2503 	else
   2504 		map_size = RF_PARITY_MAP_SIZE;
   2505 
   2506 	return map_size;
   2507 }
   2508 
   2509 int
   2510 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2511 {
   2512 	RF_ComponentLabel_t *clabel;
   2513 
   2514 	clabel = raidget_component_label(raidPtr, col);
   2515 	clabel->clean = RF_RAID_CLEAN;
   2516 	raidflush_component_label(raidPtr, col);
   2517 	return(0);
   2518 }
   2519 
   2520 
   2521 int
   2522 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2523 {
   2524 	RF_ComponentLabel_t *clabel;
   2525 
   2526 	clabel = raidget_component_label(raidPtr, col);
   2527 	clabel->clean = RF_RAID_DIRTY;
   2528 	raidflush_component_label(raidPtr, col);
   2529 	return(0);
   2530 }
   2531 
   2532 int
   2533 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2534 {
   2535 	KASSERT(raidPtr->bytesPerSector);
   2536 	return raidread_component_label(raidPtr->bytesPerSector,
   2537 	    raidPtr->Disks[col].dev,
   2538 	    raidPtr->raid_cinfo[col].ci_vp,
   2539 	    &raidPtr->raid_cinfo[col].ci_label);
   2540 }
   2541 
   2542 RF_ComponentLabel_t *
   2543 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2544 {
   2545 	return &raidPtr->raid_cinfo[col].ci_label;
   2546 }
   2547 
   2548 int
   2549 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2550 {
   2551 	RF_ComponentLabel_t *label;
   2552 
   2553 	label = &raidPtr->raid_cinfo[col].ci_label;
   2554 	label->mod_counter = raidPtr->mod_counter;
   2555 #ifndef RF_NO_PARITY_MAP
   2556 	label->parity_map_modcount = label->mod_counter;
   2557 #endif
   2558 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2559 	    raidPtr->Disks[col].dev,
   2560 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2561 }
   2562 
   2563 
   2564 static int
   2565 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2566     RF_ComponentLabel_t *clabel)
   2567 {
   2568 	return raidread_component_area(dev, b_vp, clabel,
   2569 	    sizeof(RF_ComponentLabel_t),
   2570 	    rf_component_info_offset(),
   2571 	    rf_component_info_size(secsize));
   2572 }
   2573 
   2574 /* ARGSUSED */
   2575 static int
   2576 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2577     size_t msize, daddr_t offset, daddr_t dsize)
   2578 {
   2579 	struct buf *bp;
   2580 	const struct bdevsw *bdev;
   2581 	int error;
   2582 
   2583 	/* XXX should probably ensure that we don't try to do this if
   2584 	   someone has changed rf_protected_sectors. */
   2585 
   2586 	if (b_vp == NULL) {
   2587 		/* For whatever reason, this component is not valid.
   2588 		   Don't try to read a component label from it. */
   2589 		return(EINVAL);
   2590 	}
   2591 
   2592 	/* get a block of the appropriate size... */
   2593 	bp = geteblk((int)dsize);
   2594 	bp->b_dev = dev;
   2595 
   2596 	/* get our ducks in a row for the read */
   2597 	bp->b_blkno = offset / DEV_BSIZE;
   2598 	bp->b_bcount = dsize;
   2599 	bp->b_flags |= B_READ;
   2600  	bp->b_resid = dsize;
   2601 
   2602 	bdev = bdevsw_lookup(bp->b_dev);
   2603 	if (bdev == NULL)
   2604 		return (ENXIO);
   2605 	(*bdev->d_strategy)(bp);
   2606 
   2607 	error = biowait(bp);
   2608 
   2609 	if (!error) {
   2610 		memcpy(data, bp->b_data, msize);
   2611 	}
   2612 
   2613 	brelse(bp, 0);
   2614 	return(error);
   2615 }
   2616 
   2617 
   2618 static int
   2619 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2620     RF_ComponentLabel_t *clabel)
   2621 {
   2622 	return raidwrite_component_area(dev, b_vp, clabel,
   2623 	    sizeof(RF_ComponentLabel_t),
   2624 	    rf_component_info_offset(),
   2625 	    rf_component_info_size(secsize), 0);
   2626 }
   2627 
   2628 /* ARGSUSED */
   2629 static int
   2630 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2631     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2632 {
   2633 	struct buf *bp;
   2634 	const struct bdevsw *bdev;
   2635 	int error;
   2636 
   2637 	/* get a block of the appropriate size... */
   2638 	bp = geteblk((int)dsize);
   2639 	bp->b_dev = dev;
   2640 
   2641 	/* get our ducks in a row for the write */
   2642 	bp->b_blkno = offset / DEV_BSIZE;
   2643 	bp->b_bcount = dsize;
   2644 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2645  	bp->b_resid = dsize;
   2646 
   2647 	memset(bp->b_data, 0, dsize);
   2648 	memcpy(bp->b_data, data, msize);
   2649 
   2650 	bdev = bdevsw_lookup(bp->b_dev);
   2651 	if (bdev == NULL)
   2652 		return (ENXIO);
   2653 	(*bdev->d_strategy)(bp);
   2654 	if (asyncp)
   2655 		return 0;
   2656 	error = biowait(bp);
   2657 	brelse(bp, 0);
   2658 	if (error) {
   2659 #if 1
   2660 		printf("Failed to write RAID component info!\n");
   2661 #endif
   2662 	}
   2663 
   2664 	return(error);
   2665 }
   2666 
   2667 void
   2668 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2669 {
   2670 	int c;
   2671 
   2672 	for (c = 0; c < raidPtr->numCol; c++) {
   2673 		/* Skip dead disks. */
   2674 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2675 			continue;
   2676 		/* XXXjld: what if an error occurs here? */
   2677 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2678 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2679 		    RF_PARITYMAP_NBYTE,
   2680 		    rf_parity_map_offset(raidPtr),
   2681 		    rf_parity_map_size(raidPtr), 0);
   2682 	}
   2683 }
   2684 
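         /* Read the on-disk parity map from every live component and combine the
          * copies with rf_paritymap_merge(); presumably the merged result is
          * conservative, i.e. a region counts as dirty if any surviving copy
          * says so. */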
   2685 void
   2686 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2687 {
   2688 	struct rf_paritymap_ondisk tmp;
   2689 	int c,first;
   2690 
   2691 	first=1;
   2692 	for (c = 0; c < raidPtr->numCol; c++) {
   2693 		/* Skip dead disks. */
   2694 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2695 			continue;
   2696 		raidread_component_area(raidPtr->Disks[c].dev,
   2697 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2698 		    RF_PARITYMAP_NBYTE,
   2699 		    rf_parity_map_offset(raidPtr),
   2700 		    rf_parity_map_size(raidPtr));
   2701 		if (first) {
   2702 			memcpy(map, &tmp, sizeof(*map));
   2703 			first = 0;
   2704 		} else {
   2705 			rf_paritymap_merge(map, &tmp);
   2706 		}
   2707 	}
   2708 }
   2709 
   2710 void
   2711 rf_markalldirty(RF_Raid_t *raidPtr)
   2712 {
   2713 	RF_ComponentLabel_t *clabel;
   2714 	int sparecol;
   2715 	int c;
   2716 	int j;
   2717 	int scol = -1;
   2718 
   2719 	raidPtr->mod_counter++;
   2720 	for (c = 0; c < raidPtr->numCol; c++) {
   2721 		/* we don't want to touch (at all) a disk that has
   2722 		   failed */
   2723 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
   2724 			clabel = raidget_component_label(raidPtr, c);
   2725 			if (clabel->status == rf_ds_spared) {
   2726 				/* XXX do something special...
   2727 				   but whatever you do, don't
   2728 				   try to access it!! */
   2729 			} else {
   2730 				raidmarkdirty(raidPtr, c);
   2731 			}
   2732 		}
   2733 	}
   2734 
   2735 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2736 		sparecol = raidPtr->numCol + c;
   2737 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2738 			/*
   2739 
   2740 			   we claim this disk is "optimal" if it's
   2741 			   rf_ds_used_spare, as that means it should be
   2742 			   directly substitutable for the disk it replaced.
   2743 			   We note that too...
   2744 
   2745 			 */
   2746 
   2747 			for(j=0;j<raidPtr->numCol;j++) {
   2748 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2749 					scol = j;
   2750 					break;
   2751 				}
   2752 			}
   2753 
   2754 			clabel = raidget_component_label(raidPtr, sparecol);
   2755 			/* make sure status is noted */
   2756 
   2757 			raid_init_component_label(raidPtr, clabel);
   2758 
   2759 			clabel->row = 0;
   2760 			clabel->column = scol;
   2761 			/* Note: we *don't* change status from rf_ds_used_spare
   2762 			   to rf_ds_optimal */
   2763 			/* clabel.status = rf_ds_optimal; */
   2764 
   2765 			raidmarkdirty(raidPtr, sparecol);
   2766 		}
   2767 	}
   2768 }
   2769 
   2770 
   2771 void
   2772 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
   2773 {
   2774 	RF_ComponentLabel_t *clabel;
   2775 	int sparecol;
   2776 	int c;
   2777 	int j;
   2778 	int scol;
   2779 
   2780 	scol = -1;
   2781 
   2782 	/* XXX should do extra checks to make sure things really are clean,
   2783 	   rather than blindly setting the clean bit... */
   2784 
   2785 	raidPtr->mod_counter++;
   2786 
   2787 	for (c = 0; c < raidPtr->numCol; c++) {
   2788 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   2789 			clabel = raidget_component_label(raidPtr, c);
   2790 			/* make sure status is noted */
   2791 			clabel->status = rf_ds_optimal;
   2792 
   2793 			/* note what unit we are configured as */
   2794 			clabel->last_unit = raidPtr->raidid;
   2795 
   2796 			raidflush_component_label(raidPtr, c);
   2797 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2798 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2799 					raidmarkclean(raidPtr, c);
   2800 				}
   2801 			}
   2802 		}
   2803 		/* else we don't touch it.. */
   2804 	}
   2805 
   2806 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2807 		sparecol = raidPtr->numCol + c;
   2808 		/* Need to ensure that the reconstruct actually completed! */
   2809 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2810 			/*
   2811 
   2812 			   we claim this disk is "optimal" if it's
   2813 			   rf_ds_used_spare, as that means it should be
   2814 			   directly substitutable for the disk it replaced.
   2815 			   We note that too...
   2816 
   2817 			 */
   2818 
   2819 			for(j=0;j<raidPtr->numCol;j++) {
   2820 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2821 					scol = j;
   2822 					break;
   2823 				}
   2824 			}
   2825 
   2826 			/* XXX shouldn't *really* need this... */
   2827 			clabel = raidget_component_label(raidPtr, sparecol);
   2828 			/* make sure status is noted */
   2829 
   2830 			raid_init_component_label(raidPtr, clabel);
   2831 
   2832 			clabel->column = scol;
   2833 			clabel->status = rf_ds_optimal;
   2834 			clabel->last_unit = raidPtr->raidid;
   2835 
   2836 			raidflush_component_label(raidPtr, sparecol);
   2837 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2838 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2839 					raidmarkclean(raidPtr, sparecol);
   2840 				}
   2841 			}
   2842 		}
   2843 	}
   2844 }
   2845 
   2846 void
   2847 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2848 {
   2849 
   2850 	if (vp != NULL) {
   2851 		if (auto_configured == 1) {
   2852 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2853 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2854 			vput(vp);
   2855 
   2856 		} else {
   2857 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2858 		}
   2859 	}
   2860 }
   2861 
   2862 
   2863 void
   2864 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2865 {
   2866 	int r,c;
   2867 	struct vnode *vp;
   2868 	int acd;
   2869 
   2870 
   2871 	/* We take this opportunity to close the vnodes like we should.. */
   2872 
   2873 	for (c = 0; c < raidPtr->numCol; c++) {
   2874 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2875 		acd = raidPtr->Disks[c].auto_configured;
   2876 		rf_close_component(raidPtr, vp, acd);
   2877 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2878 		raidPtr->Disks[c].auto_configured = 0;
   2879 	}
   2880 
   2881 	for (r = 0; r < raidPtr->numSpare; r++) {
   2882 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2883 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2884 		rf_close_component(raidPtr, vp, acd);
   2885 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2886 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2887 	}
   2888 }
   2889 
   2890 
   2891 void
   2892 rf_ReconThread(struct rf_recon_req *req)
   2893 {
   2894 	int     s;
   2895 	RF_Raid_t *raidPtr;
   2896 
   2897 	s = splbio();
   2898 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2899 	raidPtr->recon_in_progress = 1;
   2900 
   2901 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2902 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2903 
   2904 	RF_Free(req, sizeof(*req));
   2905 
   2906 	raidPtr->recon_in_progress = 0;
   2907 	splx(s);
   2908 
   2909 	/* That's all... */
   2910 	kthread_exit(0);	/* does not return */
   2911 }
   2912 
   2913 void
   2914 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2915 {
   2916 	int retcode;
   2917 	int s;
   2918 
   2919 	raidPtr->parity_rewrite_stripes_done = 0;
   2920 	raidPtr->parity_rewrite_in_progress = 1;
   2921 	s = splbio();
   2922 	retcode = rf_RewriteParity(raidPtr);
   2923 	splx(s);
   2924 	if (retcode) {
   2925 		printf("raid%d: Error re-writing parity (%d)!\n",
   2926 		    raidPtr->raidid, retcode);
   2927 	} else {
   2928 		/* set the clean bit!  If we shutdown correctly,
   2929 		   the clean bit on each component label will get
   2930 		   set */
   2931 		raidPtr->parity_good = RF_RAID_CLEAN;
   2932 	}
   2933 	raidPtr->parity_rewrite_in_progress = 0;
   2934 
   2935 	/* Anyone waiting for us to stop?  If so, inform them... */
   2936 	if (raidPtr->waitShutdown) {
   2937 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2938 	}
   2939 
   2940 	/* That's all... */
   2941 	kthread_exit(0);	/* does not return */
   2942 }
   2943 
   2944 
   2945 void
   2946 rf_CopybackThread(RF_Raid_t *raidPtr)
   2947 {
   2948 	int s;
   2949 
   2950 	raidPtr->copyback_in_progress = 1;
   2951 	s = splbio();
   2952 	rf_CopybackReconstructedData(raidPtr);
   2953 	splx(s);
   2954 	raidPtr->copyback_in_progress = 0;
   2955 
   2956 	/* That's all... */
   2957 	kthread_exit(0);	/* does not return */
   2958 }
   2959 
   2960 
   2961 void
   2962 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
   2963 {
   2964 	int s;
   2965 	RF_Raid_t *raidPtr;
   2966 
   2967 	s = splbio();
   2968 	raidPtr = req->raidPtr;
   2969 	raidPtr->recon_in_progress = 1;
   2970 	rf_ReconstructInPlace(raidPtr, req->col);
   2971 	RF_Free(req, sizeof(*req));
   2972 	raidPtr->recon_in_progress = 0;
   2973 	splx(s);
   2974 
   2975 	/* That's all... */
   2976 	kthread_exit(0);	/* does not return */
   2977 }
   2978 
   2979 static RF_AutoConfig_t *
   2980 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
   2981     const char *cname, RF_SectorCount_t size, uint64_t numsecs,
   2982     unsigned secsize)
   2983 {
   2984 	int good_one = 0;
   2985 	RF_ComponentLabel_t *clabel;
   2986 	RF_AutoConfig_t *ac;
   2987 
   2988 	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
   2989 	if (clabel == NULL) {
   2990 oomem:
   2991 		    while(ac_list) {
   2992 			    ac = ac_list;
   2993 			    if (ac->clabel)
   2994 				    free(ac->clabel, M_RAIDFRAME);
   2995 			    ac_list = ac_list->next;
   2996 			    free(ac, M_RAIDFRAME);
   2997 		    }
   2998 		    printf("RAID auto config: out of memory!\n");
   2999 		    return NULL; /* XXX probably should panic? */
   3000 	}
   3001 
   3002 	if (!raidread_component_label(secsize, dev, vp, clabel)) {
   3003 		/* Got the label.  Does it look reasonable? */
   3004 		if (rf_reasonable_label(clabel, numsecs) &&
   3005 		    (rf_component_label_partitionsize(clabel) <= size)) {
   3006 #ifdef DEBUG
   3007 			printf("Component on: %s: %llu\n",
   3008 				cname, (unsigned long long)size);
   3009 			rf_print_component_label(clabel);
   3010 #endif
   3011 			/* if it's reasonable, add it, else ignore it. */
   3012 			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
   3013 				M_NOWAIT);
   3014 			if (ac == NULL) {
   3015 				free(clabel, M_RAIDFRAME);
   3016 				goto oomem;
   3017 			}
   3018 			strlcpy(ac->devname, cname, sizeof(ac->devname));
   3019 			ac->dev = dev;
   3020 			ac->vp = vp;
   3021 			ac->clabel = clabel;
   3022 			ac->next = ac_list;
   3023 			ac_list = ac;
   3024 			good_one = 1;
   3025 		}
   3026 	}
   3027 	if (!good_one) {
   3028 		/* cleanup */
   3029 		free(clabel, M_RAIDFRAME);
   3030 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3031 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3032 		vput(vp);
   3033 	}
   3034 	return ac_list;
   3035 }
   3036 
   3037 RF_AutoConfig_t *
   3038 rf_find_raid_components(void)
   3039 {
   3040 	struct vnode *vp;
   3041 	struct disklabel label;
   3042 	device_t dv;
   3043 	deviter_t di;
   3044 	dev_t dev;
   3045 	int bmajor, bminor, wedge, rf_part_found;
   3046 	int error;
   3047 	int i;
   3048 	RF_AutoConfig_t *ac_list;
   3049 	uint64_t numsecs;
   3050 	unsigned secsize;
   3051 
   3052 	/* initialize the AutoConfig list */
   3053 	ac_list = NULL;
   3054 
   3055 	/* we begin by trolling through *all* the devices on the system */
   3056 
   3057 	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
   3058 	     dv = deviter_next(&di)) {
   3059 
   3060 		/* we are only interested in disks... */
   3061 		if (device_class(dv) != DV_DISK)
   3062 			continue;
   3063 
   3064 		/* we don't care about floppies... */
   3065 		if (device_is_a(dv, "fd")) {
   3066 			continue;
   3067 		}
   3068 
   3069 		/* we don't care about CD's... */
   3070 		if (device_is_a(dv, "cd")) {
   3071 			continue;
   3072 		}
   3073 
   3074 		/* we don't care about md's... */
   3075 		if (device_is_a(dv, "md")) {
   3076 			continue;
   3077 		}
   3078 
   3079 		/* hdfd is the Atari/Hades floppy driver */
   3080 		if (device_is_a(dv, "hdfd")) {
   3081 			continue;
   3082 		}
   3083 
   3084 		/* fdisa is the Atari/Milan floppy driver */
   3085 		if (device_is_a(dv, "fdisa")) {
   3086 			continue;
   3087 		}
   3088 
   3089 		/* need to find the device_name_to_block_device_major stuff */
   3090 		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
   3091 
   3092 		rf_part_found = 0; /*No raid partition as yet*/
   3093 
   3094 		/* get a vnode for the raw partition of this disk */
   3095 
   3096 		wedge = device_is_a(dv, "dk");
   3097 		bminor = minor(device_unit(dv));
   3098 		dev = wedge ? makedev(bmajor, bminor) :
   3099 		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
   3100 		if (bdevvp(dev, &vp))
   3101 			panic("RAID can't alloc vnode");
   3102 
   3103 		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
   3104 
   3105 		if (error) {
   3106 			/* "Who cares."  Continue looking
   3107 			   for something that exists*/
   3108 			vput(vp);
   3109 			continue;
   3110 		}
   3111 
   3112 		error = getdisksize(vp, &numsecs, &secsize);
   3113 		if (error) {
   3114 			vput(vp);
   3115 			continue;
   3116 		}
   3117 		if (wedge) {
   3118 			struct dkwedge_info dkw;
   3119 			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
   3120 			    NOCRED);
   3121 			if (error) {
   3122 				printf("RAIDframe: can't get wedge info for "
   3123 				    "dev %s (%d)\n", device_xname(dv), error);
   3124 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3125 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3126 				vput(vp);
   3127 				continue;
   3128 			}
   3129 
   3130 			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
   3131 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3132 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3133 				vput(vp);
   3134 				continue;
   3135 			}
   3136 
   3137 			ac_list = rf_get_component(ac_list, dev, vp,
   3138 			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
   3139 			rf_part_found = 1; /*There is a raid component on this disk*/
   3140 			continue;
   3141 		}
   3142 
   3143 		/* Ok, the disk exists.  Go get the disklabel. */
   3144 		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
   3145 		if (error) {
   3146 			/*
   3147 			 * XXX can't happen - open() would
   3148 			 * have errored out (or faked up one)
   3149 			 */
   3150 			if (error != ENOTTY)
   3151 				printf("RAIDframe: can't get label for dev "
   3152 				    "%s (%d)\n", device_xname(dv), error);
   3153 		}
   3154 
   3155 		/* don't need this any more.  We'll allocate it again
   3156 		   a little later if we really do... */
   3157 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3158 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3159 		vput(vp);
   3160 
   3161 		if (error)
   3162 			continue;
   3163 
   3164 		rf_part_found = 0; /*No raid partitions yet*/
   3165 		for (i = 0; i < label.d_npartitions; i++) {
   3166 			char cname[sizeof(ac_list->devname)];
   3167 
   3168 			/* We only support partitions marked as RAID */
   3169 			if (label.d_partitions[i].p_fstype != FS_RAID)
   3170 				continue;
   3171 
   3172 			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
   3173 			if (bdevvp(dev, &vp))
   3174 				panic("RAID can't alloc vnode");
   3175 
   3176 			error = VOP_OPEN(vp, FREAD, NOCRED);
   3177 			if (error) {
   3178 				/* Whatever... */
   3179 				vput(vp);
   3180 				continue;
   3181 			}
   3182 			snprintf(cname, sizeof(cname), "%s%c",
   3183 			    device_xname(dv), 'a' + i);
   3184 			ac_list = rf_get_component(ac_list, dev, vp, cname,
   3185 				label.d_partitions[i].p_size, numsecs, secsize);
    3186 			rf_part_found = 1; /* There is at least one raid partition on this disk */
   3187 		}
   3188 
   3189 		/*
    3190 		 * If there is no raid component on this disk, either in a
    3191 		 * disklabel or inside a wedge, check the raw partition as well,
    3192 		 * as it is possible to configure raid components on raw disk
    3193 		 * devices.
   3194 		 */
   3195 
   3196 		if (!rf_part_found) {
   3197 			char cname[sizeof(ac_list->devname)];
   3198 
   3199 			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
   3200 			if (bdevvp(dev, &vp))
   3201 				panic("RAID can't alloc vnode");
   3202 
   3203 			error = VOP_OPEN(vp, FREAD, NOCRED);
   3204 			if (error) {
   3205 				/* Whatever... */
   3206 				vput(vp);
   3207 				continue;
   3208 			}
   3209 			snprintf(cname, sizeof(cname), "%s%c",
   3210 			    device_xname(dv), 'a' + RAW_PART);
   3211 			ac_list = rf_get_component(ac_list, dev, vp, cname,
   3212 				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
   3213 		}
   3214 	}
   3215 	deviter_release(&di);
   3216 	return ac_list;
   3217 }
   3218 
   3219 
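         /* Sanity-check an on-disk component label before trusting it for
          * autoconfiguration: the version and clean flag must have known values,
          * row/column must fall inside the claimed geometry, and the block size
          * and block count must be non-zero.  (Descriptive summary of the test
          * below.) */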
   3220 int
   3221 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3222 {
   3223 
   3224 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3225 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3226 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3227 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3228 	    clabel->row >=0 &&
   3229 	    clabel->column >= 0 &&
   3230 	    clabel->num_rows > 0 &&
   3231 	    clabel->num_columns > 0 &&
   3232 	    clabel->row < clabel->num_rows &&
   3233 	    clabel->column < clabel->num_columns &&
   3234 	    clabel->blockSize > 0 &&
   3235 	    /*
   3236 	     * numBlocksHi may contain garbage, but it is ok since
   3237 	     * the type is unsigned.  If it is really garbage,
   3238 	     * rf_fix_old_label_size() will fix it.
   3239 	     */
   3240 	    rf_component_label_numblocks(clabel) > 0) {
   3241 		/*
   3242 		 * label looks reasonable enough...
   3243 		 * let's make sure it has no old garbage.
   3244 		 */
   3245 		if (numsecs)
   3246 			rf_fix_old_label_size(clabel, numsecs);
   3247 		return(1);
   3248 	}
   3249 	return(0);
   3250 }
   3251 
   3252 
   3253 /*
   3254  * For reasons yet unknown, some old component labels have garbage in
   3255  * the newer numBlocksHi region, and this causes lossage.  Since those
   3256  * disks will also have numsecs set to less than 32 bits of sectors,
   3257  * we can determine when this corruption has occurred, and fix it.
   3258  *
   3259  * The exact same problem, with the same unknown reason, happens to
   3260  * the partitionSizeHi member as well.
   3261  */
   3262 static void
   3263 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3264 {
   3265 
   3266 	if (numsecs < ((uint64_t)1 << 32)) {
   3267 		if (clabel->numBlocksHi) {
   3268 			printf("WARNING: total sectors < 32 bits, yet "
   3269 			       "numBlocksHi set\n"
   3270 			       "WARNING: resetting numBlocksHi to zero.\n");
   3271 			clabel->numBlocksHi = 0;
   3272 		}
   3273 
   3274 		if (clabel->partitionSizeHi) {
   3275 			printf("WARNING: total sectors < 32 bits, yet "
   3276 			       "partitionSizeHi set\n"
   3277 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3278 			clabel->partitionSizeHi = 0;
   3279 		}
   3280 	}
   3281 }
   3282 
   3283 
   3284 #ifdef DEBUG
   3285 void
   3286 rf_print_component_label(RF_ComponentLabel_t *clabel)
   3287 {
   3288 	uint64_t numBlocks;
   3289 
   3290 	numBlocks = rf_component_label_numblocks(clabel);
   3291 
   3292 	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
   3293 	       clabel->row, clabel->column,
   3294 	       clabel->num_rows, clabel->num_columns);
   3295 	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
   3296 	       clabel->version, clabel->serial_number,
   3297 	       clabel->mod_counter);
   3298 	printf("   Clean: %s Status: %d\n",
   3299 	       clabel->clean ? "Yes" : "No", clabel->status);
   3300 	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
   3301 	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
   3302 	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
   3303 	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
   3304 	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
   3305 	printf("   Contains root partition: %s\n",
   3306 	       clabel->root_partition ? "Yes" : "No");
   3307 	printf("   Last configured as: raid%d\n", clabel->last_unit);
   3308 #if 0
   3309 	   printf("   Config order: %d\n", clabel->config_order);
   3310 #endif
   3311 
   3312 }
   3313 #endif
   3314 
   3315 RF_ConfigSet_t *
   3316 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3317 {
   3318 	RF_AutoConfig_t *ac;
   3319 	RF_ConfigSet_t *config_sets;
   3320 	RF_ConfigSet_t *cset;
   3321 	RF_AutoConfig_t *ac_next;
   3322 
   3323 
   3324 	config_sets = NULL;
   3325 
   3326 	/* Go through the AutoConfig list, and figure out which components
   3327 	   belong to what sets.  */
   3328 	ac = ac_list;
   3329 	while(ac!=NULL) {
   3330 		/* we're going to putz with ac->next, so save it here
   3331 		   for use at the end of the loop */
   3332 		ac_next = ac->next;
   3333 
   3334 		if (config_sets == NULL) {
   3335 			/* will need at least this one... */
   3336 			config_sets = (RF_ConfigSet_t *)
   3337 				malloc(sizeof(RF_ConfigSet_t),
   3338 				       M_RAIDFRAME, M_NOWAIT);
   3339 			if (config_sets == NULL) {
   3340 				panic("rf_create_auto_sets: No memory!");
   3341 			}
   3342 			/* this one is easy :) */
   3343 			config_sets->ac = ac;
   3344 			config_sets->next = NULL;
   3345 			config_sets->rootable = 0;
   3346 			ac->next = NULL;
   3347 		} else {
   3348 			/* which set does this component fit into? */
   3349 			cset = config_sets;
   3350 			while(cset!=NULL) {
   3351 				if (rf_does_it_fit(cset, ac)) {
   3352 					/* looks like it matches... */
   3353 					ac->next = cset->ac;
   3354 					cset->ac = ac;
   3355 					break;
   3356 				}
   3357 				cset = cset->next;
   3358 			}
   3359 			if (cset==NULL) {
   3360 				/* didn't find a match above; start a new set */
   3361 				cset = (RF_ConfigSet_t *)
   3362 					malloc(sizeof(RF_ConfigSet_t),
   3363 					       M_RAIDFRAME, M_NOWAIT);
   3364 				if (cset == NULL) {
   3365 					panic("rf_create_auto_sets: No memory!");
   3366 				}
   3367 				cset->ac = ac;
   3368 				ac->next = NULL;
   3369 				cset->next = config_sets;
   3370 				cset->rootable = 0;
   3371 				config_sets = cset;
   3372 			}
   3373 		}
   3374 		ac = ac_next;
   3375 	}
   3376 
   3377 
   3378 	return(config_sets);
   3379 }
   3380 
   3381 static int
   3382 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3383 {
   3384 	RF_ComponentLabel_t *clabel1, *clabel2;
   3385 
   3386 	/* If this one matches the *first* one in the set, that's good
   3387 	   enough, since the other members of the set would have been
   3388 	   through here too... */
   3389 	/* note that we are not checking partitionSize here..
   3390 
   3391 	   Note that we are also not checking the mod_counters here.
   3392 	   If everything else matches except the mod_counter, that's
   3393 	   good enough for this test.  We will deal with the mod_counters
   3394 	   a little later in the autoconfiguration process.  That is,
   3395 	   we deliberately omit the check:
   3396 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3397 
   3398 	   The reason we don't check for this is that failed disks
   3399 	   will have lower modification counts.  If those disks are
   3400 	   not added to the set they used to belong to, then they will
   3401 	   form their own set, which may result in 2 different sets,
   3402 	   for example, competing to be configured at raid0, and
   3403 	   perhaps competing to be the root filesystem set.  If the
   3404 	   wrong ones get configured, or both attempt to become /,
   3405 	   weird behaviour and/or serious lossage will occur.  Thus we
   3406 	   need to bring them into the fold here, and kick them out at
   3407 	   a later point.
   3408 
   3409 	*/
   3410 
   3411 	clabel1 = cset->ac->clabel;
   3412 	clabel2 = ac->clabel;
   3413 	if ((clabel1->version == clabel2->version) &&
   3414 	    (clabel1->serial_number == clabel2->serial_number) &&
   3415 	    (clabel1->num_rows == clabel2->num_rows) &&
   3416 	    (clabel1->num_columns == clabel2->num_columns) &&
   3417 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3418 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3419 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3420 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3421 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3422 	    (clabel1->blockSize == clabel2->blockSize) &&
   3423 	    rf_component_label_numblocks(clabel1) ==
   3424 	    rf_component_label_numblocks(clabel2) &&
   3425 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3426 	    (clabel1->root_partition == clabel2->root_partition) &&
   3427 	    (clabel1->last_unit == clabel2->last_unit) &&
   3428 	    (clabel1->config_order == clabel2->config_order)) {
   3429 	/* if it gets here, it almost *has* to be a match */
   3430 	} else {
   3431 		/* it's not consistent with somebody in the set..
   3432 		   punt */
   3433 		return(0);
   3434 	}
   3435 	/* all was fine.. it must fit... */
   3436 	return(1);
   3437 }
   3438 
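/*
 * Decide whether this configuration set has enough live components
 * (at the newest mod_counter) to be configured: RAID 0 tolerates no
 * missing components, RAID 4/5 tolerate at most one, and RAID 1 needs
 * at least one surviving member of each even/odd component pair.
 */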
   3439 int
   3440 rf_have_enough_components(RF_ConfigSet_t *cset)
   3441 {
   3442 	RF_AutoConfig_t *ac;
   3443 	RF_AutoConfig_t *auto_config;
   3444 	RF_ComponentLabel_t *clabel;
   3445 	int c;
   3446 	int num_cols;
   3447 	int num_missing;
   3448 	int mod_counter;
   3449 	int mod_counter_found;
   3450 	int even_pair_failed;
   3451 	char parity_type;
   3452 
   3453 
   3454 	/* check to see that we have enough 'live' components
   3455 	   of this set.  If so, we can configure it if necessary */
   3456 
   3457 	num_cols = cset->ac->clabel->num_columns;
   3458 	parity_type = cset->ac->clabel->parityConfig;
   3459 
   3460 	/* XXX Check for duplicate components!?!?!? */
   3461 
   3462 	/* Determine what the mod_counter is supposed to be for this set. */
   3463 
   3464 	mod_counter_found = 0;
   3465 	mod_counter = 0;
   3466 	ac = cset->ac;
   3467 	while(ac!=NULL) {
   3468 		if (mod_counter_found==0) {
   3469 			mod_counter = ac->clabel->mod_counter;
   3470 			mod_counter_found = 1;
   3471 		} else {
   3472 			if (ac->clabel->mod_counter > mod_counter) {
   3473 				mod_counter = ac->clabel->mod_counter;
   3474 			}
   3475 		}
   3476 		ac = ac->next;
   3477 	}
   3478 
   3479 	num_missing = 0;
   3480 	auto_config = cset->ac;
   3481 
   3482 	even_pair_failed = 0;
   3483 	for(c=0; c<num_cols; c++) {
   3484 		ac = auto_config;
   3485 		while(ac!=NULL) {
   3486 			if ((ac->clabel->column == c) &&
   3487 			    (ac->clabel->mod_counter == mod_counter)) {
   3488 				/* it's this one... */
   3489 #ifdef DEBUG
   3490 				printf("Found: %s at %d\n",
   3491 				       ac->devname,c);
   3492 #endif
   3493 				break;
   3494 			}
   3495 			ac=ac->next;
   3496 		}
   3497 		if (ac==NULL) {
   3498 				/* Didn't find one here! */
   3499 				/* special case for RAID 1, especially
   3500 				   where there are more than 2
   3501 				   components (where RAIDframe treats
   3502 				   things a little differently :( ) */
   3503 			if (parity_type == '1') {
   3504 				if (c%2 == 0) { /* even component */
   3505 					even_pair_failed = 1;
   3506 				} else { /* odd component.  If
   3507 					    we're failed, and
   3508 					    so is the even
   3509 					    component, it's
   3510 					    "Good Night, Charlie" */
   3511 					if (even_pair_failed == 1) {
   3512 						return(0);
   3513 					}
   3514 				}
   3515 			} else {
   3516 				/* normal accounting */
   3517 				num_missing++;
   3518 			}
   3519 		}
   3520 		if ((parity_type == '1') && (c%2 == 1)) {
   3521 				/* Just finished an even/odd pair without
   3522 				   bailing.. reset the even_pair_failed flag,
   3523 				   and go on to the next pair.... */
   3524 			even_pair_failed = 0;
   3525 		}
   3526 	}
   3527 
   3528 	clabel = cset->ac->clabel;
   3529 
   3530 	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
   3531 	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
   3532 	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
   3533 		/* XXX this needs to be made *much* more general */
   3534 		/* Too many failures */
   3535 		return(0);
   3536 	}
   3537 	/* otherwise, all is well, and we've got enough to take a kick
   3538 	   at autoconfiguring this set */
   3539 	return(1);
   3540 }
   3541 
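/*
 * Build an RF_Config_t from the component labels of an autoconfigured
 * set, filling in the geometry, queueing parameters, and device names
 * needed by rf_Configure().
 */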
   3542 void
   3543 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3544 			RF_Raid_t *raidPtr)
   3545 {
   3546 	RF_ComponentLabel_t *clabel;
   3547 	int i;
   3548 
   3549 	clabel = ac->clabel;
   3550 
   3551 	/* 1. Fill in the common stuff */
   3552 	config->numRow = clabel->num_rows = 1;
   3553 	config->numCol = clabel->num_columns;
   3554 	config->numSpare = 0; /* XXX should this be set here? */
   3555 	config->sectPerSU = clabel->sectPerSU;
   3556 	config->SUsPerPU = clabel->SUsPerPU;
   3557 	config->SUsPerRU = clabel->SUsPerRU;
   3558 	config->parityConfig = clabel->parityConfig;
   3559 	/* XXX... */
   3560 	strcpy(config->diskQueueType,"fifo");
   3561 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3562 	config->layoutSpecificSize = 0; /* XXX ?? */
   3563 
   3564 	while(ac!=NULL) {
   3565 		/* row/col values will be in range due to the checks
   3566 		   in rf_reasonable_label() */
   3567 		strcpy(config->devnames[0][ac->clabel->column],
   3568 		       ac->devname);
   3569 		ac = ac->next;
   3570 	}
   3571 
   3572 	for(i=0;i<RF_MAXDBGV;i++) {
   3573 		config->debugVars[i][0] = 0;
   3574 	}
   3575 }
   3576 
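/*
 * Set the autoconfigure flag for the RAID set and write it out to the
 * component labels of all optimal components and used spares.  Returns
 * the new value.
 */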
   3577 int
   3578 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3579 {
   3580 	RF_ComponentLabel_t *clabel;
   3581 	int column;
   3582 	int sparecol;
   3583 
   3584 	raidPtr->autoconfigure = new_value;
   3585 
   3586 	for(column=0; column<raidPtr->numCol; column++) {
   3587 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3588 			clabel = raidget_component_label(raidPtr, column);
   3589 			clabel->autoconfigure = new_value;
   3590 			raidflush_component_label(raidPtr, column);
   3591 		}
   3592 	}
   3593 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3594 		sparecol = raidPtr->numCol + column;
   3595 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3596 			clabel = raidget_component_label(raidPtr, sparecol);
   3597 			clabel->autoconfigure = new_value;
   3598 			raidflush_component_label(raidPtr, sparecol);
   3599 		}
   3600 	}
   3601 	return(new_value);
   3602 }
   3603 
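/*
 * Likewise for the root_partition flag: record it in the RF_Raid_t and
 * in the component labels of all optimal components and used spares.
 * Returns the new value.
 */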
   3604 int
   3605 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3606 {
   3607 	RF_ComponentLabel_t *clabel;
   3608 	int column;
   3609 	int sparecol;
   3610 
   3611 	raidPtr->root_partition = new_value;
   3612 	for(column=0; column<raidPtr->numCol; column++) {
   3613 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3614 			clabel = raidget_component_label(raidPtr, column);
   3615 			clabel->root_partition = new_value;
   3616 			raidflush_component_label(raidPtr, column);
   3617 		}
   3618 	}
   3619 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3620 		sparecol = raidPtr->numCol + column;
   3621 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3622 			clabel = raidget_component_label(raidPtr, sparecol);
   3623 			clabel->root_partition = new_value;
   3624 			raidflush_component_label(raidPtr, sparecol);
   3625 		}
   3626 	}
   3627 	return(new_value);
   3628 }
   3629 
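/*
 * Close and release any vnodes still held for the components of this
 * configuration set.
 */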
   3630 void
   3631 rf_release_all_vps(RF_ConfigSet_t *cset)
   3632 {
   3633 	RF_AutoConfig_t *ac;
   3634 
   3635 	ac = cset->ac;
   3636 	while(ac!=NULL) {
   3637 		/* Close the vp, and give it back */
   3638 		if (ac->vp) {
   3639 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3640 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
   3641 			vput(ac->vp);
   3642 			ac->vp = NULL;
   3643 		}
   3644 		ac = ac->next;
   3645 	}
   3646 }
   3647 
   3648 
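/*
 * Free the component labels, the RF_AutoConfig_t entries, and finally
 * the configuration set itself.
 */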
   3649 void
   3650 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3651 {
   3652 	RF_AutoConfig_t *ac;
   3653 	RF_AutoConfig_t *next_ac;
   3654 
   3655 	ac = cset->ac;
   3656 	while(ac!=NULL) {
   3657 		next_ac = ac->next;
   3658 		/* nuke the label */
   3659 		free(ac->clabel, M_RAIDFRAME);
   3660 		/* cleanup the config structure */
   3661 		free(ac, M_RAIDFRAME);
   3662 		/* "next.." */
   3663 		ac = next_ac;
   3664 	}
   3665 	/* and, finally, nuke the config set */
   3666 	free(cset, M_RAIDFRAME);
   3667 }
   3668 
   3669 
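/*
 * Initialize a component label from the current state of the RAID set:
 * version, serial number, mod counter, geometry, and the various
 * configuration flags.
 */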
   3670 void
   3671 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   3672 {
   3673 	/* current version number */
   3674 	clabel->version = RF_COMPONENT_LABEL_VERSION;
   3675 	clabel->serial_number = raidPtr->serial_number;
   3676 	clabel->mod_counter = raidPtr->mod_counter;
   3677 
   3678 	clabel->num_rows = 1;
   3679 	clabel->num_columns = raidPtr->numCol;
   3680 	clabel->clean = RF_RAID_DIRTY; /* not clean */
   3681 	clabel->status = rf_ds_optimal; /* "It's good!" */
   3682 
   3683 	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
   3684 	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
   3685 	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
   3686 
   3687 	clabel->blockSize = raidPtr->bytesPerSector;
   3688 	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
   3689 
   3690 	/* XXX not portable */
   3691 	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
   3692 	clabel->maxOutstanding = raidPtr->maxOutstanding;
   3693 	clabel->autoconfigure = raidPtr->autoconfigure;
   3694 	clabel->root_partition = raidPtr->root_partition;
   3695 	clabel->last_unit = raidPtr->raidid;
   3696 	clabel->config_order = raidPtr->config_order;
   3697 
   3698 #ifndef RF_NO_PARITY_MAP
   3699 	rf_paritymap_init_label(raidPtr->parity_map, clabel);
   3700 #endif
   3701 }
   3702 
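/*
 * Configure a RAID set from an autoconfiguration set: find a free raid
 * unit (preferring the one it was last configured as), build the
 * configuration, and bring the set up.  Returns the softc on success,
 * NULL on failure.
 */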
   3703 struct raid_softc *
   3704 rf_auto_config_set(RF_ConfigSet_t *cset)
   3705 {
   3706 	RF_Raid_t *raidPtr;
   3707 	RF_Config_t *config;
   3708 	int raidID;
   3709 	struct raid_softc *sc;
   3710 
   3711 #ifdef DEBUG
   3712 	printf("RAID autoconfigure\n");
   3713 #endif
   3714 
   3715 	/* 1. Create a config structure */
   3716 	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
   3717 	if (config == NULL) {
   3718 		printf("Out of mem!?!?\n");
   3719 				/* XXX do something more intelligent here. */
   3720 		return NULL;
   3721 	}
   3722 
   3723 	/*
   3724 	   2. Figure out what RAID ID this one is supposed to live at.
   3725 	   See if we can get the same RAID dev that it was configured
   3726 	   on last time.
   3727 	*/
   3728 
   3729 	raidID = cset->ac->clabel->last_unit;
   3730 	for (sc = raidget(raidID); sc->sc_r.valid != 0; sc = raidget(++raidID))
   3731 		continue;
   3732 #ifdef DEBUG
   3733 	printf("Configuring raid%d:\n",raidID);
   3734 #endif
   3735 
   3736 	raidPtr = &sc->sc_r;
   3737 
   3738 	/* XXX all this stuff should be done SOMEWHERE ELSE! */
   3739 	raidPtr->raidid = raidID;
   3740 	raidPtr->openings = RAIDOUTSTANDING;
   3741 
   3742 	/* 3. Build the configuration structure */
   3743 	rf_create_configuration(cset->ac, config, raidPtr);
   3744 
   3745 	/* 4. Do the configuration */
   3746 	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
   3747 		raidinit(sc);
   3748 
   3749 		rf_markalldirty(raidPtr);
   3750 		raidPtr->autoconfigure = 1; /* XXX do this here? */
   3751 		if (cset->ac->clabel->root_partition==1) {
   3752 			/* everything configured just fine.  Make a note
   3753 			   that this set is eligible to be root. */
   3754 			cset->rootable = 1;
   3755 			/* XXX do this here? */
   3756 			raidPtr->root_partition = 1;
   3757 		}
   3758 	} else {
   3759 		raidput(sc);
   3760 		sc = NULL;
   3761 	}
   3762 
   3763 	/* 5. Cleanup */
   3764 	free(config, M_RAIDFRAME);
   3765 	return sc;
   3766 }
   3767 
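/*
 * Charge a completed buffer against the disk statistics for this
 * RAID set.
 */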
   3768 void
   3769 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
   3770 {
   3771 	struct buf *bp;
   3772 	struct raid_softc *rs;
   3773 
   3774 	bp = (struct buf *)desc->bp;
   3775 	rs = desc->raidPtr->softc;
   3776 	disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid),
   3777 	    (bp->b_flags & B_READ));
   3778 }
   3779 
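/*
 * Initialize a pool at IPL_BIO, pre-allocate xmin items, and set the
 * low and high watermarks to xmin and xmax respectively.
 */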
   3780 void
   3781 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3782 	     size_t xmin, size_t xmax)
   3783 {
   3784 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3785 	pool_sethiwat(p, xmax);
   3786 	pool_prime(p, xmin);
   3787 	pool_setlowat(p, xmin);
   3788 }
   3789 
   3790 /*
   3791  * rf_buf_queue_check(RF_Raid_t *raidPtr) -- looks into the buf_queue to see
   3792  * if there is IO pending and if that IO could possibly be done for a
   3793  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3794  * otherwise.
   3795  *
   3796  */
   3797 
   3798 int
   3799 rf_buf_queue_check(RF_Raid_t *raidPtr)
   3800 {
   3801 	struct raid_softc *rs = raidPtr->softc;
   3802 	if ((bufq_peek(rs->buf_queue) != NULL) && raidPtr->openings > 0) {
   3803 		/* there is work to do */
   3804 		return 0;
   3805 	}
   3806 	/* default is nothing to do */
   3807 	return 1;
   3808 }
   3809 
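/*
 * Query the size of a component via getdisksize() and record its
 * sector size, usable block count (total sectors minus the protected
 * region), and full partition size in the RF_RaidDisk_t.
 */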
   3810 int
   3811 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3812 {
   3813 	uint64_t numsecs;
   3814 	unsigned secsize;
   3815 	int error;
   3816 
   3817 	error = getdisksize(vp, &numsecs, &secsize);
   3818 	if (error == 0) {
   3819 		diskPtr->blockSize = secsize;
   3820 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3821 		diskPtr->partitionSize = numsecs;
   3822 		return 0;
   3823 	}
   3824 	return error;
   3825 }
   3826 
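/*
 * Autoconfiguration glue for the raid pseudo-device: matching always
 * succeeds and attach has nothing to do here.
 */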
   3827 static int
   3828 raid_match(device_t self, cfdata_t cfdata, void *aux)
   3829 {
   3830 	return 1;
   3831 }
   3832 
   3833 static void
   3834 raid_attach(device_t parent, device_t self, void *aux)
   3835 {
   3836 
   3837 }
   3838 
   3839 
   3840 static int
   3841 raid_detach(device_t self, int flags)
   3842 {
   3843 	int error;
   3844 	struct raid_softc *rs = device_private(self);
   3845 
   3846 	if ((error = raidlock(rs)) != 0)
   3847 		return (error);
   3848 
   3849 	error = raid_detach_unlocked(rs);
   3850 
   3851 	raidunlock(rs);
   3852 
   3853 	return error;
   3854 }
   3855 
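/*
 * Publish a synthetic disk geometry for the RAID set in the device's
 * "disk-info" property dictionary; the track and cylinder figures are
 * derived from the stripe layout rather than any physical geometry.
 */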
   3856 static void
   3857 rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3858 {
   3859 	prop_dictionary_t disk_info, odisk_info, geom;
   3860 	disk_info = prop_dictionary_create();
   3861 	geom = prop_dictionary_create();
   3862 	prop_dictionary_set_uint64(geom, "sectors-per-unit",
   3863 				   raidPtr->totalSectors);
   3864 	prop_dictionary_set_uint32(geom, "sector-size",
   3865 				   raidPtr->bytesPerSector);
   3866 
   3867 	prop_dictionary_set_uint16(geom, "sectors-per-track",
   3868 				   raidPtr->Layout.dataSectorsPerStripe);
   3869 	prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
   3870 				   4 * raidPtr->numCol);
   3871 
   3872 	prop_dictionary_set_uint64(geom, "cylinders-per-unit",
   3873 	   raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe *
   3874 	   (4 * raidPtr->numCol)));
   3875 
   3876 	prop_dictionary_set(disk_info, "geometry", geom);
   3877 	prop_object_release(geom);
   3878 	prop_dictionary_set(device_properties(rs->sc_dev),
   3879 			    "disk-info", disk_info);
   3880 	odisk_info = rs->sc_dkdev.dk_info;
   3881 	rs->sc_dkdev.dk_info = disk_info;
   3882 	if (odisk_info)
   3883 		prop_object_release(odisk_info);
   3884 }
   3885 
   3886 /*
   3887  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3888  * We end up returning whatever error was returned by the first cache flush
   3889  * that fails.
   3890  */
   3891 
   3892 int
   3893 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3894 {
   3895 	int c, sparecol;
   3896 	int e,error;
   3897 	int force = 1;
   3898 
   3899 	error = 0;
   3900 	for (c = 0; c < raidPtr->numCol; c++) {
   3901 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3902 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3903 					  &force, FWRITE, NOCRED);
   3904 			if (e) {
   3905 				if (e != ENODEV)
   3906 					printf("raid%d: cache flush to component %s failed.\n",
   3907 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3908 				if (error == 0) {
   3909 					error = e;
   3910 				}
   3911 			}
   3912 		}
   3913 	}
   3914 
   3915 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3916 		sparecol = raidPtr->numCol + c;
   3917 		/* Need to ensure that the reconstruct actually completed! */
   3918 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3919 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3920 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3921 			if (e) {
   3922 				if (e != ENODEV)
   3923 					printf("raid%d: cache flush to component %s failed.\n",
   3924 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3925 				if (error == 0) {
   3926 					error = e;
   3927 				}
   3928 			}
   3929 		}
   3930 	}
   3931 	return error;
   3932 }
   3933