      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.295.6.4 2014/12/22 04:11:38 msaitoh Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.295.6.4 2014/12/22 04:11:38 msaitoh Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #include "raid.h"
    110 #endif
    111 
    112 #include <sys/param.h>
    113 #include <sys/errno.h>
    114 #include <sys/pool.h>
    115 #include <sys/proc.h>
    116 #include <sys/queue.h>
    117 #include <sys/disk.h>
    118 #include <sys/device.h>
    119 #include <sys/stat.h>
    120 #include <sys/ioctl.h>
    121 #include <sys/fcntl.h>
    122 #include <sys/systm.h>
    123 #include <sys/vnode.h>
    124 #include <sys/disklabel.h>
    125 #include <sys/conf.h>
    126 #include <sys/buf.h>
    127 #include <sys/bufq.h>
    128 #include <sys/reboot.h>
    129 #include <sys/kauth.h>
    130 
    131 #include <prop/proplib.h>
    132 
    133 #include <dev/raidframe/raidframevar.h>
    134 #include <dev/raidframe/raidframeio.h>
    135 #include <dev/raidframe/rf_paritymap.h>
    136 
    137 #include "rf_raid.h"
    138 #include "rf_copyback.h"
    139 #include "rf_dag.h"
    140 #include "rf_dagflags.h"
    141 #include "rf_desc.h"
    142 #include "rf_diskqueue.h"
    143 #include "rf_etimer.h"
    144 #include "rf_general.h"
    145 #include "rf_kintf.h"
    146 #include "rf_options.h"
    147 #include "rf_driver.h"
    148 #include "rf_parityscan.h"
    149 #include "rf_threadstuff.h"
    150 
    151 #ifdef COMPAT_50
    152 #include "rf_compat50.h"
    153 #endif
    154 
    155 #ifdef DEBUG
    156 int     rf_kdebug_level = 0;
    157 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    158 #else				/* DEBUG */
    159 #define db1_printf(a) { }
    160 #endif				/* DEBUG */
    161 
    162 static RF_Raid_t **raidPtrs;	/* global raid device descriptors */
    163 
    164 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    165 static rf_declare_mutex2(rf_sparet_wait_mutex);
    166 static rf_declare_cond2(rf_sparet_wait_cv);
    167 static rf_declare_cond2(rf_sparet_resp_cv);
    168 
    169 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    170 						 * spare table */
    171 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    172 						 * installation process */
    173 #endif
    174 
    175 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    176 
    177 /* prototypes */
    178 static void KernelWakeupFunc(struct buf *);
    179 static void InitBP(struct buf *, struct vnode *, unsigned,
    180     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    181     void *, int, struct proc *);
    182 static void raidinit(RF_Raid_t *);
    183 
    184 void raidattach(int);
    185 static int raid_match(device_t, cfdata_t, void *);
    186 static void raid_attach(device_t, device_t, void *);
    187 static int raid_detach(device_t, int);
    188 
    189 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t);
    191 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    192     daddr_t, daddr_t, int);
    193 
    194 static int raidwrite_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 static int raidread_component_label(unsigned,
    197     dev_t, struct vnode *, RF_ComponentLabel_t *);
    198 
    199 
    200 dev_type_open(raidopen);
    201 dev_type_close(raidclose);
    202 dev_type_read(raidread);
    203 dev_type_write(raidwrite);
    204 dev_type_ioctl(raidioctl);
    205 dev_type_strategy(raidstrategy);
    206 dev_type_dump(raiddump);
    207 dev_type_size(raidsize);
    208 
    209 const struct bdevsw raid_bdevsw = {
    210 	raidopen, raidclose, raidstrategy, raidioctl,
    211 	raiddump, raidsize, D_DISK
    212 };
    213 
    214 const struct cdevsw raid_cdevsw = {
    215 	raidopen, raidclose, raidread, raidwrite, raidioctl,
    216 	nostop, notty, nopoll, nommap, nokqfilter, D_DISK
    217 };
    218 
    219 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
    220 
    221 /* XXX Not sure if the following should be replacing the raidPtrs above,
    222    or if it should be used in conjunction with that...
    223 */
    224 
    225 struct raid_softc {
    226 	device_t sc_dev;
    227 	int     sc_flags;	/* flags */
    228 	int     sc_cflags;	/* configuration flags */
    229 	uint64_t sc_size;	/* size of the raid device */
    230 	char    sc_xname[20];	/* XXX external name */
    231 	struct disk sc_dkdev;	/* generic disk device info */
    232 	struct bufq_state *buf_queue;	/* used for the device queue */
    233 };
    234 /* sc_flags */
    235 #define RAIDF_INITED	0x01	/* unit has been initialized */
    236 #define RAIDF_WLABEL	0x02	/* label area is writable */
    237 #define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
    238 #define RAIDF_SHUTDOWN	0x08	/* unit is being shut down */
    239 #define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
    240 #define RAIDF_LOCKED	0x80	/* unit is locked */
    241 
    242 #define	raidunit(x)	DISKUNIT(x)
    243 int numraid = 0;
    244 
    245 extern struct cfdriver raid_cd;
    246 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    247     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    248     DVF_DETACH_SHUTDOWN);
    249 
    250 /*
    251  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    252  * Be aware that large numbers can allow the driver to consume a lot of
    253  * kernel memory, especially on writes, and in degraded mode reads.
    254  *
    255  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    256  * a single 64K write will typically require 64K for the old data,
    257  * 64K for the old parity, and 64K for the new parity, for a total
    258  * of 192K (if the parity buffer is not re-used immediately).
    259  * Even if it is used immediately, that's still 128K, which when multiplied
    260  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    261  *
    262  * Now in degraded mode, for example, a 64K read on the above setup may
    263  * require data reconstruction, which will require *all* of the 4 remaining
    264  * disks to participate -- 4 * 32K/disk == 128K again.
    265  */
    266 
    267 #ifndef RAIDOUTSTANDING
    268 #define RAIDOUTSTANDING   6
    269 #endif
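        /*
         * A rough restatement of the arithmetic above: each outstanding 64K
         * write can tie up about 3 x 64K = 192K of buffer space (old data,
         * old parity, new parity), so the default of RAIDOUTSTANDING == 6
         * allows on the order of 6 * 192K = 1152K of kernel memory in
         * flight, plus the incoming data itself.  Because of the #ifndef
         * guard, the default can be overridden at build time by defining
         * RAIDOUTSTANDING.
         */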
    270 
    271 #define RAIDLABELDEV(dev)	\
    272 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    273 
    274 /* declared here, and made public, for the benefit of KVM stuff.. */
    275 struct raid_softc *raid_softc;
    276 
    277 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
    278 				     struct disklabel *);
    279 static void raidgetdisklabel(dev_t);
    280 static void raidmakedisklabel(struct raid_softc *);
    281 
    282 static int raidlock(struct raid_softc *);
    283 static void raidunlock(struct raid_softc *);
    284 
    285 static int raid_detach_unlocked(struct raid_softc *);
    286 
    287 static void rf_markalldirty(RF_Raid_t *);
    288 static void rf_set_properties(struct raid_softc *, RF_Raid_t *);
    289 
    290 void rf_ReconThread(struct rf_recon_req *);
    291 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    292 void rf_CopybackThread(RF_Raid_t *raidPtr);
    293 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    294 int rf_autoconfig(device_t);
    295 void rf_buildroothack(RF_ConfigSet_t *);
    296 
    297 RF_AutoConfig_t *rf_find_raid_components(void);
    298 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    299 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    300 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    301 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    302 int rf_set_autoconfig(RF_Raid_t *, int);
    303 int rf_set_rootpartition(RF_Raid_t *, int);
    304 void rf_release_all_vps(RF_ConfigSet_t *);
    305 void rf_cleanup_config_set(RF_ConfigSet_t *);
    306 int rf_have_enough_components(RF_ConfigSet_t *);
    307 int rf_auto_config_set(RF_ConfigSet_t *, int *);
    308 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    309 
    310 /*
    311  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    312  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    313  * in the kernel config file.
    314  */
    315 #ifdef RAID_AUTOCONFIG
    316 int raidautoconfig = 1;
    317 #else
    318 int raidautoconfig = 0;
    319 #endif
    320 static bool raidautoconfigdone = false;
    321 
    322 struct RF_Pools_s rf_pools;
    323 
    324 void
    325 raidattach(int num)
    326 {
    327 	int raidID;
    328 	int i, rc;
    329 
    330 	aprint_debug("raidattach: Asked for %d units\n", num);
    331 
    332 	if (num <= 0) {
    333 #ifdef DIAGNOSTIC
    334 		panic("raidattach: count <= 0");
    335 #endif
    336 		return;
    337 	}
    338 	/* This is where all the initialization stuff gets done. */
    339 
    340 	numraid = num;
    341 
    342 	/* Make some space for requested number of units... */
    343 
    344 	RF_Malloc(raidPtrs, num * sizeof(RF_Raid_t *), (RF_Raid_t **));
    345 	if (raidPtrs == NULL) {
    346 		panic("raidPtrs is NULL!!");
    347 	}
    348 
    349 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    350 	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
    351 	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
    352 	rf_init_cond2(rf_sparet_resp_cv, "rfgst");
    353 
    354 	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
    355 #endif
    356 
    357 	for (i = 0; i < num; i++)
    358 		raidPtrs[i] = NULL;
    359 	rc = rf_BootRaidframe();
    360 	if (rc == 0)
    361 		aprint_verbose("Kernelized RAIDframe activated\n");
    362 	else
    363 		panic("Serious error booting RAID!!");
    364 
    365 	/* put together some data structures like the CCD device does.  This
    366 	 * lets us lock the device and what-not when it gets opened. */
    367 
    368 	raid_softc = (struct raid_softc *)
    369 		malloc(num * sizeof(struct raid_softc),
    370 		       M_RAIDFRAME, M_NOWAIT);
    371 	if (raid_softc == NULL) {
    372 		aprint_error("WARNING: no memory for RAIDframe driver\n");
    373 		return;
    374 	}
    375 
    376 	memset(raid_softc, 0, num * sizeof(struct raid_softc));
    377 
    378 	for (raidID = 0; raidID < num; raidID++) {
    379 		bufq_alloc(&raid_softc[raidID].buf_queue, BUFQ_DISK_DEFAULT_STRAT, BUFQ_SORT_RAWBLOCK);
    380 
    381 		RF_Malloc(raidPtrs[raidID], sizeof(RF_Raid_t),
    382 			  (RF_Raid_t *));
    383 		if (raidPtrs[raidID] == NULL) {
    384 			aprint_error("WARNING: raidPtrs[%d] is NULL\n", raidID);
    385 			numraid = raidID;
    386 			return;
    387 		}
    388 	}
    389 
    390 	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
    391 		aprint_error("raidattach: config_cfattach_attach failed?\n");
    392 	}
    393 
    394 	raidautoconfigdone = false;
    395 
    396 	/*
    397 	 * Register a finalizer which will be used to auto-config RAID
    398 	 * sets once all real hardware devices have been found.
    399 	 */
    400 	if (config_finalize_register(NULL, rf_autoconfig) != 0)
    401 		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
    402 }
    403 
    404 int
    405 rf_autoconfig(device_t self)
    406 {
    407 	RF_AutoConfig_t *ac_list;
    408 	RF_ConfigSet_t *config_sets;
    409 
    410 	if (!raidautoconfig || raidautoconfigdone == true)
    411 		return (0);
    412 
    413 	/* XXX This code can only be run once. */
    414 	raidautoconfigdone = true;
    415 
    416 	/* 1. locate all RAID components on the system */
    417 	aprint_debug("Searching for RAID components...\n");
    418 	ac_list = rf_find_raid_components();
    419 
    420 	/* 2. Sort them into their respective sets. */
    421 	config_sets = rf_create_auto_sets(ac_list);
    422 
    423 	/*
    424 	 * 3. Evaluate each set and configure the valid ones.
    425 	 * This gets done in rf_buildroothack().
    426 	 */
    427 	rf_buildroothack(config_sets);
    428 
    429 	return 1;
    430 }
    431 
    432 void
    433 rf_buildroothack(RF_ConfigSet_t *config_sets)
    434 {
    435 	RF_ConfigSet_t *cset;
    436 	RF_ConfigSet_t *next_cset;
    437 	int retcode;
    438 	int raidID;
    439 	int rootID;
    440 	int col;
    441 	int num_root;
    442 	char *devname;
    443 
    444 	rootID = 0;
    445 	num_root = 0;
    446 	cset = config_sets;
    447 	while (cset != NULL) {
    448 		next_cset = cset->next;
    449 		if (rf_have_enough_components(cset) &&
    450 		    cset->ac->clabel->autoconfigure==1) {
    451 			retcode = rf_auto_config_set(cset,&raidID);
    452 			if (!retcode) {
    453 				aprint_debug("raid%d: configured ok\n", raidID);
    454 				if (cset->rootable) {
    455 					rootID = raidID;
    456 					num_root++;
    457 				}
    458 			} else {
    459 				/* The autoconfig didn't work :( */
    460 				aprint_debug("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
    461 				rf_release_all_vps(cset);
    462 			}
    463 		} else {
    464 			/* we're not autoconfiguring this set...
    465 			   release the associated resources */
    466 			rf_release_all_vps(cset);
    467 		}
    468 		/* cleanup */
    469 		rf_cleanup_config_set(cset);
    470 		cset = next_cset;
    471 	}
    472 
    473 	/* if the user has specified what the root device should be
    474 	   then we don't touch booted_device or boothowto... */
    475 
    476 	if (rootspec != NULL)
    477 		return;
    478 
    479 	/* we found something bootable... */
    480 
    481 	if (num_root == 1) {
    482 		booted_device = raid_softc[rootID].sc_dev;
    483 	} else if (num_root > 1) {
    484 
    485 		/*
    486 		 * Maybe the MD code can help. If it cannot, then
    487 		 * setroot() will discover that we have no
    488 		 * booted_device and will ask the user if nothing was
    489 		 * hardwired in the kernel config file
    490 		 */
    491 
    492 		if (booted_device == NULL)
    493 			cpu_rootconf();
    494 		if (booted_device == NULL)
    495 			return;
    496 
    497 		num_root = 0;
    498 		for (raidID = 0; raidID < numraid; raidID++) {
    499 			if (raidPtrs[raidID]->valid == 0)
    500 				continue;
    501 
    502 			if (raidPtrs[raidID]->root_partition == 0)
    503 				continue;
    504 
    505 			for (col = 0; col < raidPtrs[raidID]->numCol; col++) {
    506 				devname = raidPtrs[raidID]->Disks[col].devname;
    507 				devname += sizeof("/dev/") - 1;
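        				/*
        				 * Illustrative example: a component name like
        				 * "/dev/wd0a" has been reduced to "wd0a" here,
        				 * and the strncmp() below matches it against the
        				 * boot device's xname (e.g. "wd0"), so any
        				 * partition on the boot disk counts as a match.
        				 */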
    508 				if (strncmp(devname, device_xname(booted_device),
    509 					    strlen(device_xname(booted_device))) != 0)
    510 					continue;
    511 				aprint_debug("raid%d includes boot device %s\n",
    512 				       raidID, devname);
    513 				num_root++;
    514 				rootID = raidID;
    515 			}
    516 		}
    517 
    518 		if (num_root == 1) {
    519 			booted_device = raid_softc[rootID].sc_dev;
    520 		} else {
    521 			/* we can't guess.. require the user to answer... */
    522 			boothowto |= RB_ASKNAME;
    523 		}
    524 	}
    525 }
    526 
    527 
    528 int
    529 raidsize(dev_t dev)
    530 {
    531 	struct raid_softc *rs;
    532 	struct disklabel *lp;
    533 	int     part, unit, omask, size;
    534 
    535 	unit = raidunit(dev);
    536 	if (unit >= numraid)
    537 		return (-1);
    538 	rs = &raid_softc[unit];
    539 
    540 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    541 		return (-1);
    542 
    543 	part = DISKPART(dev);
    544 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
    545 	lp = rs->sc_dkdev.dk_label;
    546 
    547 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
    548 		return (-1);
    549 
    550 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
    551 		size = -1;
    552 	else
    553 		size = lp->d_partitions[part].p_size *
    554 		    (lp->d_secsize / DEV_BSIZE);
    555 
    556 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
    557 		return (-1);
    558 
    559 	return (size);
    560 
    561 }
    562 
    563 int
    564 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    565 {
    566 	int     unit = raidunit(dev);
    567 	struct raid_softc *rs;
    568 	const struct bdevsw *bdev;
    569 	struct disklabel *lp;
    570 	RF_Raid_t *raidPtr;
    571 	daddr_t offset;
    572 	int     part, c, sparecol, j, scol, dumpto;
    573 	int     error = 0;
    574 
    575 	if (unit >= numraid)
    576 		return (ENXIO);
    577 
    578 	rs = &raid_softc[unit];
    579 	raidPtr = raidPtrs[unit];
    580 
    581 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    582 		return ENXIO;
    583 
    584 	/* we only support dumping to RAID 1 sets */
    585 	if (raidPtr->Layout.numDataCol != 1 ||
    586 	    raidPtr->Layout.numParityCol != 1)
    587 		return EINVAL;
    588 
    589 
    590 	if ((error = raidlock(rs)) != 0)
    591 		return error;
    592 
    593 	if (size % DEV_BSIZE != 0) {
    594 		error = EINVAL;
    595 		goto out;
    596 	}
    597 
    598 	if (blkno + size / DEV_BSIZE > rs->sc_size) {
    599 		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
    600 		    "rs->sc_size (%" PRIu64 ")\n", __func__, blkno,
    601 		    size / DEV_BSIZE, rs->sc_size);
    602 		error = EINVAL;
    603 		goto out;
    604 	}
    605 
    606 	part = DISKPART(dev);
    607 	lp = rs->sc_dkdev.dk_label;
    608 	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;
    609 
    610 	/* figure out what device is alive.. */
    611 
    612 	/*
    613 	   Look for a component to dump to.  The preference for the
    614 	   component to dump to is as follows:
    615 	   1) the master
    616 	   2) a used_spare of the master
    617 	   3) the slave
    618 	   4) a used_spare of the slave
    619 	*/
    620 
    621 	dumpto = -1;
    622 	for (c = 0; c < raidPtr->numCol; c++) {
    623 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
    624 			/* this might be the one */
    625 			dumpto = c;
    626 			break;
    627 		}
    628 	}
    629 
    630 	/*
    631 	   At this point we have possibly selected a live master or a
    632 	   live slave.  We now check to see if there is a spared
    633 	   master (or a spared slave), if we didn't find a live master
    634 	   or a live slave.
    635 	*/
    636 
    637 	for (c = 0; c < raidPtr->numSpare; c++) {
    638 		sparecol = raidPtr->numCol + c;
    639 		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
    640 			/* How about this one? */
    641 			scol = -1;
    642 			for(j=0;j<raidPtr->numCol;j++) {
    643 				if (raidPtr->Disks[j].spareCol == sparecol) {
    644 					scol = j;
    645 					break;
    646 				}
    647 			}
    648 			if (scol == 0) {
    649 				/*
    650 				   We must have found a spared master!
    651 				   We'll take that over anything else
    652 				   found so far.  (We couldn't have
    653 				   found a real master before, since
    654 				   this is a used spare, and it's
    655 				   saying that it's replacing the
    656 				   master.)  On reboot (with
    657 				   autoconfiguration turned on)
    658 				   sparecol will become the 1st
    659 				   component (component0) of this set.
    660 				*/
    661 				dumpto = sparecol;
    662 				break;
    663 			} else if (scol != -1) {
    664 				/*
    665 				   Must be a spared slave.  We'll dump
    666 				   to that if we haven't found anything
    667 				   else so far.
    668 				*/
    669 				if (dumpto == -1)
    670 					dumpto = sparecol;
    671 			}
    672 		}
    673 	}
    674 
    675 	if (dumpto == -1) {
    676 		/* we couldn't find any live components to dump to!?!?
    677 		 */
    678 		error = EINVAL;
    679 		goto out;
    680 	}
    681 
    682 	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
    683 
    684 	/*
    685 	   Note that blkno is relative to this particular partition.
    686 	   By adding the offset of this partition in the RAID
    687 	   set, and also adding RF_PROTECTED_SECTORS, we get a
    688 	   value that is relative to the partition used for the
    689 	   underlying component.
    690 	*/
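        	/*
        	 * Worked example with illustrative numbers: dumping block 100 of
        	 * a partition whose p_offset is 1000 sends the dump to component
        	 * block 100 + 1000 + RF_PROTECTED_SECTORS.
        	 */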
    691 
    692 	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
    693 				blkno + offset, va, size);
    694 
    695 out:
    696 	raidunlock(rs);
    697 
    698 	return error;
    699 }
    700 /* ARGSUSED */
    701 int
    702 raidopen(dev_t dev, int flags, int fmt,
    703     struct lwp *l)
    704 {
    705 	int     unit = raidunit(dev);
    706 	struct raid_softc *rs;
    707 	struct disklabel *lp;
    708 	int     part, pmask;
    709 	int     error = 0;
    710 
    711 	if (unit >= numraid)
    712 		return (ENXIO);
    713 	rs = &raid_softc[unit];
    714 
    715 	if ((error = raidlock(rs)) != 0)
    716 		return (error);
    717 
    718 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    719 		error = EBUSY;
    720 		goto bad;
    721 	}
    722 
    723 	lp = rs->sc_dkdev.dk_label;
    724 
    725 	part = DISKPART(dev);
    726 
    727 	/*
    728 	 * If there are wedges, and this is not RAW_PART, then we
    729 	 * need to fail.
    730 	 */
    731 	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
    732 		error = EBUSY;
    733 		goto bad;
    734 	}
    735 	pmask = (1 << part);
    736 
    737 	if ((rs->sc_flags & RAIDF_INITED) &&
    738 	    (rs->sc_dkdev.dk_nwedges == 0) &&
    739 	    (rs->sc_dkdev.dk_openmask == 0))
    740 		raidgetdisklabel(dev);
    741 
    742 	/* make sure that this partition exists */
    743 
    744 	if (part != RAW_PART) {
    745 		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
    746 		    ((part >= lp->d_npartitions) ||
    747 			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
    748 			error = ENXIO;
    749 			goto bad;
    750 		}
    751 	}
    752 	/* Prevent this unit from being unconfigured while open. */
    753 	switch (fmt) {
    754 	case S_IFCHR:
    755 		rs->sc_dkdev.dk_copenmask |= pmask;
    756 		break;
    757 
    758 	case S_IFBLK:
    759 		rs->sc_dkdev.dk_bopenmask |= pmask;
    760 		break;
    761 	}
    762 
    763 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    764 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    765 		/* First one... mark things as dirty... Note that we *MUST*
    766 		 have done a configure before this.  I DO NOT WANT TO BE
    767 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    768 		 THAT THEY BELONG TOGETHER!!!!! */
    769 		/* XXX should check to see if we're only open for reading
    770 		   here... If so, we needn't do this, but then need some
    771 		   other way of keeping track of what's happened.. */
    772 
    773 		rf_markalldirty(raidPtrs[unit]);
    774 	}
    775 
    776 
    777 	rs->sc_dkdev.dk_openmask =
    778 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    779 
    780 bad:
    781 	raidunlock(rs);
    782 
    783 	return (error);
    784 
    785 
    786 }
    787 /* ARGSUSED */
    788 int
    789 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    790 {
    791 	int     unit = raidunit(dev);
    792 	struct raid_softc *rs;
    793 	int     error = 0;
    794 	int     part;
    795 
    796 	if (unit >= numraid)
    797 		return (ENXIO);
    798 	rs = &raid_softc[unit];
    799 
    800 	if ((error = raidlock(rs)) != 0)
    801 		return (error);
    802 
    803 	part = DISKPART(dev);
    804 
    805 	/* ...that much closer to allowing unconfiguration... */
    806 	switch (fmt) {
    807 	case S_IFCHR:
    808 		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
    809 		break;
    810 
    811 	case S_IFBLK:
    812 		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
    813 		break;
    814 	}
    815 	rs->sc_dkdev.dk_openmask =
    816 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    817 
    818 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    819 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    820 		/* Last one... device is not unconfigured yet.
    821 		   Mark things as clean here; if RAIDF_INITED is
    822 		   not set, device shutdown has already taken care
    823 		   of setting the clean bits... */
    824 
    825 		rf_update_component_labels(raidPtrs[unit],
    826 						 RF_FINAL_COMPONENT_UPDATE);
    827 
    828 		/* If the kernel is shutting down, it will detach
    829 		 * this RAID set soon enough.
    830 		 */
    831 	}
    832 
    833 	raidunlock(rs);
    834 	return (0);
    835 
    836 }
    837 
    838 void
    839 raidstrategy(struct buf *bp)
    840 {
    841 	unsigned int raidID = raidunit(bp->b_dev);
    842 	RF_Raid_t *raidPtr;
    843 	struct raid_softc *rs;
    844 	int     wlabel;
    845 	/* validate the unit number before touching raid_softc[] */
    846 	if (raidID >= numraid || !raidPtrs[raidID]) {
    847 		bp->b_error = ENODEV;
    848 		goto done;
    849 	}
    850 	rs = &raid_softc[raidID];
    851 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
    852 		bp->b_error = ENXIO;
    853 		goto done;
    854 	}
    855 	raidPtr = raidPtrs[raidID];
    855 	if (!raidPtr->valid) {
    856 		bp->b_error = ENODEV;
    857 		goto done;
    858 	}
    859 	if (bp->b_bcount == 0) {
    860 		db1_printf(("b_bcount is zero..\n"));
    861 		goto done;
    862 	}
    863 
    864 	/*
    865 	 * Do bounds checking and adjust transfer.  If there's an
    866 	 * error, the bounds check will flag that for us.
    867 	 */
    868 
    869 	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
    870 	if (DISKPART(bp->b_dev) == RAW_PART) {
    871 		uint64_t size; /* device size in DEV_BSIZE unit */
    872 
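        		/*
        		 * Convert totalSectors (in native sector units) to
        		 * DEV_BSIZE units.  For example, with 512-byte sectors
        		 * (logBytesPerSector == DEV_BSHIFT) the size is unchanged;
        		 * with 4096-byte sectors it is shifted left by 3.
        		 */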
    873 		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
    874 			size = raidPtr->totalSectors <<
    875 			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
    876 		} else {
    877 			size = raidPtr->totalSectors >>
    878 			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
    879 		}
    880 		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
    881 			goto done;
    882 		}
    883 	} else {
    884 		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
    885 			db1_printf(("Bounds check failed!!:%d %d\n",
    886 				(int) bp->b_blkno, (int) wlabel));
    887 			goto done;
    888 		}
    889 	}
    890 
    891 	rf_lock_mutex2(raidPtr->iodone_lock);
    892 
    893 	bp->b_resid = 0;
    894 
    895 	/* stuff it onto our queue */
    896 	bufq_put(rs->buf_queue, bp);
    897 
    898 	/* schedule the IO to happen at the next convenient time */
    899 	rf_signal_cond2(raidPtr->iodone_cv);
    900 	rf_unlock_mutex2(raidPtr->iodone_lock);
    901 
    902 	return;
    903 
    904 done:
    905 	bp->b_resid = bp->b_bcount;
    906 	biodone(bp);
    907 }
    908 /* ARGSUSED */
    909 int
    910 raidread(dev_t dev, struct uio *uio, int flags)
    911 {
    912 	int     unit = raidunit(dev);
    913 	struct raid_softc *rs;
    914 
    915 	if (unit >= numraid)
    916 		return (ENXIO);
    917 	rs = &raid_softc[unit];
    918 
    919 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    920 		return (ENXIO);
    921 
    922 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    923 
    924 }
    925 /* ARGSUSED */
    926 int
    927 raidwrite(dev_t dev, struct uio *uio, int flags)
    928 {
    929 	int     unit = raidunit(dev);
    930 	struct raid_softc *rs;
    931 
    932 	if (unit >= numraid)
    933 		return (ENXIO);
    934 	rs = &raid_softc[unit];
    935 
    936 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    937 		return (ENXIO);
    938 
    939 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    940 
    941 }
    942 
    943 static int
    944 raid_detach_unlocked(struct raid_softc *rs)
    945 {
    946 	int error;
    947 	RF_Raid_t *raidPtr;
    948 
    949 	raidPtr = raidPtrs[device_unit(rs->sc_dev)];
    950 
    951 	/*
    952 	 * If somebody has a partition mounted, we shouldn't
    953 	 * shut down.
    954 	 */
    955 	if (rs->sc_dkdev.dk_openmask != 0)
    956 		return EBUSY;
    957 
    958 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    959 		;	/* not initialized: nothing to do */
    960 	else if ((error = rf_Shutdown(raidPtr)) != 0)
    961 		return error;
    962 	else
    963 		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
    964 
    965 	/* Detach the disk. */
    966 	dkwedge_delall(&rs->sc_dkdev);
    967 	disk_detach(&rs->sc_dkdev);
    968 	disk_destroy(&rs->sc_dkdev);
    969 
    970 	aprint_normal_dev(rs->sc_dev, "detached\n");
    971 
    972 	return 0;
    973 }
    974 
    975 int
    976 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
    977 {
    978 	int     unit = raidunit(dev);
    979 	int     error = 0;
    980 	int     part, pmask, s;
    981 	cfdata_t cf;
    982 	struct raid_softc *rs;
    983 	RF_Config_t *k_cfg, *u_cfg;
    984 	RF_Raid_t *raidPtr;
    985 	RF_RaidDisk_t *diskPtr;
    986 	RF_AccTotals_t *totals;
    987 	RF_DeviceConfig_t *d_cfg, **ucfgp;
    988 	u_char *specific_buf;
    989 	int retcode = 0;
    990 	int column;
    991 /*	int raidid; */
    992 	struct rf_recon_req *rrcopy, *rr;
    993 	RF_ComponentLabel_t *clabel;
    994 	RF_ComponentLabel_t *ci_label;
    995 	RF_ComponentLabel_t **clabel_ptr;
    996 	RF_SingleComponent_t *sparePtr,*componentPtr;
    997 	RF_SingleComponent_t component;
    998 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
    999 	int i, j, d;
   1000 #ifdef __HAVE_OLD_DISKLABEL
   1001 	struct disklabel newlabel;
   1002 #endif
   1003 	struct dkwedge_info *dkw;
   1004 
   1005 	if (unit >= numraid)
   1006 		return (ENXIO);
   1007 	rs = &raid_softc[unit];
   1008 	raidPtr = raidPtrs[unit];
   1009 
   1010 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1011 		(int) DISKPART(dev), (int) unit, cmd));
   1012 
   1013 	/* Must be open for writes for these commands... */
   1014 	switch (cmd) {
   1015 #ifdef DIOCGSECTORSIZE
   1016 	case DIOCGSECTORSIZE:
   1017 		*(u_int *)data = raidPtr->bytesPerSector;
   1018 		return 0;
   1019 	case DIOCGMEDIASIZE:
   1020 		*(off_t *)data =
   1021 		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
   1022 		return 0;
   1023 #endif
   1024 	case DIOCSDINFO:
   1025 	case DIOCWDINFO:
   1026 #ifdef __HAVE_OLD_DISKLABEL
   1027 	case ODIOCWDINFO:
   1028 	case ODIOCSDINFO:
   1029 #endif
   1030 	case DIOCWLABEL:
   1031 	case DIOCAWEDGE:
   1032 	case DIOCDWEDGE:
   1033 	case DIOCSSTRATEGY:
   1034 		if ((flag & FWRITE) == 0)
   1035 			return (EBADF);
   1036 	}
   1037 
   1038 	/* Must be initialized for these... */
   1039 	switch (cmd) {
   1040 	case DIOCGDINFO:
   1041 	case DIOCSDINFO:
   1042 	case DIOCWDINFO:
   1043 #ifdef __HAVE_OLD_DISKLABEL
   1044 	case ODIOCGDINFO:
   1045 	case ODIOCWDINFO:
   1046 	case ODIOCSDINFO:
   1047 	case ODIOCGDEFLABEL:
   1048 #endif
   1049 	case DIOCGPART:
   1050 	case DIOCWLABEL:
   1051 	case DIOCGDEFLABEL:
   1052 	case DIOCAWEDGE:
   1053 	case DIOCDWEDGE:
   1054 	case DIOCLWEDGES:
   1055 	case DIOCCACHESYNC:
   1056 	case RAIDFRAME_SHUTDOWN:
   1057 	case RAIDFRAME_REWRITEPARITY:
   1058 	case RAIDFRAME_GET_INFO:
   1059 	case RAIDFRAME_RESET_ACCTOTALS:
   1060 	case RAIDFRAME_GET_ACCTOTALS:
   1061 	case RAIDFRAME_KEEP_ACCTOTALS:
   1062 	case RAIDFRAME_GET_SIZE:
   1063 	case RAIDFRAME_FAIL_DISK:
   1064 	case RAIDFRAME_COPYBACK:
   1065 	case RAIDFRAME_CHECK_RECON_STATUS:
   1066 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1067 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1068 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1069 	case RAIDFRAME_ADD_HOT_SPARE:
   1070 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1071 	case RAIDFRAME_INIT_LABELS:
   1072 	case RAIDFRAME_REBUILD_IN_PLACE:
   1073 	case RAIDFRAME_CHECK_PARITY:
   1074 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1075 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1076 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1077 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1078 	case RAIDFRAME_SET_AUTOCONFIG:
   1079 	case RAIDFRAME_SET_ROOT:
   1080 	case RAIDFRAME_DELETE_COMPONENT:
   1081 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1082 	case RAIDFRAME_PARITYMAP_STATUS:
   1083 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1084 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1085 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1086 	case DIOCGSTRATEGY:
   1087 	case DIOCSSTRATEGY:
   1088 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1089 			return (ENXIO);
   1090 	}
   1091 
   1092 	switch (cmd) {
   1093 #ifdef COMPAT_50
   1094 	case RAIDFRAME_GET_INFO50:
   1095 		return rf_get_info50(raidPtr, data);
   1096 
   1097 	case RAIDFRAME_CONFIGURE50:
   1098 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1099 			return retcode;
   1100 		goto config;
   1101 #endif
   1102 		/* configure the system */
   1103 	case RAIDFRAME_CONFIGURE:
   1104 
   1105 		if (raidPtr->valid) {
   1106 			/* There is a valid RAID set running on this unit! */
   1107 			printf("raid%d: Device already configured!\n",unit);
   1108 			return(EINVAL);
   1109 		}
   1110 
   1111 		/* copy-in the configuration information */
   1112 		/* data points to a pointer to the configuration structure */
   1113 
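        		/*
        		 * Sketch of the calling convention expected from userland
        		 * (a hypothetical caller, shown only to document the double
        		 * indirection; raidctl(8) does the equivalent):
        		 *
        		 *	RF_Config_t cfg, *cfgp = &cfg;
        		 *	memset(&cfg, 0, sizeof(cfg));
        		 *	... fill in cfg ...
        		 *	ioctl(fd, RAIDFRAME_CONFIGURE, &cfgp);
        		 *
        		 * i.e. "data" holds a pointer to a user-space pointer, and
        		 * the RF_Config_t itself is copied in below.
        		 */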
   1114 		u_cfg = *((RF_Config_t **) data);
   1115 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1116 		if (k_cfg == NULL) {
   1117 			return (ENOMEM);
   1118 		}
   1119 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1120 		if (retcode) {
   1121 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1122 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1123 				retcode));
   1124 			return (retcode);
   1125 		}
   1126 		goto config;
   1127 	config:
   1128 		/* allocate a buffer for the layout-specific data, and copy it
   1129 		 * in */
   1130 		if (k_cfg->layoutSpecificSize) {
   1131 			if (k_cfg->layoutSpecificSize > 10000) {
   1132 				/* sanity check */
   1133 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1134 				return (EINVAL);
   1135 			}
   1136 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1137 			    (u_char *));
   1138 			if (specific_buf == NULL) {
   1139 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1140 				return (ENOMEM);
   1141 			}
   1142 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1143 			    k_cfg->layoutSpecificSize);
   1144 			if (retcode) {
   1145 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1146 				RF_Free(specific_buf,
   1147 					k_cfg->layoutSpecificSize);
   1148 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1149 					retcode));
   1150 				return (retcode);
   1151 			}
   1152 		} else
   1153 			specific_buf = NULL;
   1154 		k_cfg->layoutSpecific = specific_buf;
   1155 
   1156 		/* should do some kind of sanity check on the configuration.
   1157 		 * Store the sum of all the bytes in the last byte? */
   1158 
   1159 		/* configure the system */
   1160 
   1161 		/*
   1162 		 * Clear the entire RAID descriptor, just to make sure
   1163 		 *  there is no stale data left in the case of a
   1164 		 *  reconfiguration
   1165 		 */
   1166 		memset(raidPtr, 0, sizeof(*raidPtr));
   1167 		raidPtr->raidid = unit;
   1168 
   1169 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1170 
   1171 		if (retcode == 0) {
   1172 
   1173 			/* allow this many simultaneous IO's to
   1174 			   this RAID device */
   1175 			raidPtr->openings = RAIDOUTSTANDING;
   1176 
   1177 			raidinit(raidPtr);
   1178 			rf_markalldirty(raidPtr);
   1179 		}
   1180 		/* free the buffers.  No return code here. */
   1181 		if (k_cfg->layoutSpecificSize) {
   1182 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1183 		}
   1184 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1185 
   1186 		return (retcode);
   1187 
   1188 		/* shutdown the system */
   1189 	case RAIDFRAME_SHUTDOWN:
   1190 
   1191 		part = DISKPART(dev);
   1192 		pmask = (1 << part);
   1193 
   1194 		if ((error = raidlock(rs)) != 0)
   1195 			return (error);
   1196 
   1197 		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
   1198 		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
   1199 			(rs->sc_dkdev.dk_copenmask & pmask)))
   1200 			retcode = EBUSY;
   1201 		else {
   1202 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1203 			rs->sc_dkdev.dk_copenmask &= ~pmask;
   1204 			rs->sc_dkdev.dk_bopenmask &= ~pmask;
   1205 			rs->sc_dkdev.dk_openmask &= ~pmask;
   1206 			retcode = 0;
   1207 		}
   1208 
   1209 		raidunlock(rs);
   1210 
   1211 		if (retcode != 0)
   1212 			return retcode;
   1213 
   1214 		/* free the pseudo device attach bits */
   1215 
   1216 		cf = device_cfdata(rs->sc_dev);
   1217 		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
   1218 			free(cf, M_RAIDFRAME);
   1219 
   1220 		return (retcode);
   1221 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1222 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1223 		/* need to read the component label for the disk indicated
   1224 		   by row,column in clabel */
   1225 
   1226 		/*
   1227 		 * Perhaps there should be an option to skip the in-core
   1228 		 * copy and hit the disk, as with disklabel(8).
   1229 		 */
   1230 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1231 
   1232 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1233 
   1234 		if (retcode) {
   1235 			RF_Free(clabel, sizeof(*clabel));
   1236 			return retcode;
   1237 		}
   1238 
   1239 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1240 
   1241 		column = clabel->column;
   1242 
   1243 		if ((column < 0) || (column >= raidPtr->numCol +
   1244 		    raidPtr->numSpare)) {
   1245 			RF_Free(clabel, sizeof(*clabel));
   1246 			return EINVAL;
   1247 		}
   1248 
   1249 		RF_Free(clabel, sizeof(*clabel));
   1250 
   1251 		clabel = raidget_component_label(raidPtr, column);
   1252 
   1253 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1254 
   1255 #if 0
   1256 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1257 		clabel = (RF_ComponentLabel_t *) data;
   1258 
   1259 		/* XXX check the label for valid stuff... */
   1260 		/* Note that some things *should not* get modified --
   1261 		   the user should be re-initing the labels instead of
   1262 		   trying to patch things.
   1263 		   */
   1264 
   1265 		raidid = raidPtr->raidid;
   1266 #ifdef DEBUG
   1267 		printf("raid%d: Got component label:\n", raidid);
   1268 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1269 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1270 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1271 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1272 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1273 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1274 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1275 #endif
   1276 		clabel->row = 0;
   1277 		column = clabel->column;
   1278 
   1279 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1280 			return(EINVAL);
   1281 		}
   1282 
   1283 		/* XXX this isn't allowed to do anything for now :-) */
   1284 
   1285 		/* XXX and before it is, we need to fill in the rest
   1286 		   of the fields!?!?!?! */
   1287 		memcpy(raidget_component_label(raidPtr, column),
   1288 		    clabel, sizeof(*clabel));
   1289 		raidflush_component_label(raidPtr, column);
   1290 		return (0);
   1291 #endif
   1292 
   1293 	case RAIDFRAME_INIT_LABELS:
   1294 		clabel = (RF_ComponentLabel_t *) data;
   1295 		/*
   1296 		   we only want the serial number from
   1297 		   the above.  We get all the rest of the information
   1298 		   from the config that was used to create this RAID
   1299 		   set.
   1300 		   */
   1301 
   1302 		raidPtr->serial_number = clabel->serial_number;
   1303 
   1304 		for(column=0;column<raidPtr->numCol;column++) {
   1305 			diskPtr = &raidPtr->Disks[column];
   1306 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1307 				ci_label = raidget_component_label(raidPtr,
   1308 				    column);
   1309 				/* Zeroing this is important. */
   1310 				memset(ci_label, 0, sizeof(*ci_label));
   1311 				raid_init_component_label(raidPtr, ci_label);
   1312 				ci_label->serial_number =
   1313 				    raidPtr->serial_number;
   1314 				ci_label->row = 0; /* we don't pretend to support more */
   1315 				rf_component_label_set_partitionsize(ci_label,
   1316 				    diskPtr->partitionSize);
   1317 				ci_label->column = column;
   1318 				raidflush_component_label(raidPtr, column);
   1319 			}
   1320 			/* XXXjld what about the spares? */
   1321 		}
   1322 
   1323 		return (retcode);
   1324 	case RAIDFRAME_SET_AUTOCONFIG:
   1325 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1326 		printf("raid%d: New autoconfig value is: %d\n",
   1327 		       raidPtr->raidid, d);
   1328 		*(int *) data = d;
   1329 		return (retcode);
   1330 
   1331 	case RAIDFRAME_SET_ROOT:
   1332 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1333 		printf("raid%d: New rootpartition value is: %d\n",
   1334 		       raidPtr->raidid, d);
   1335 		*(int *) data = d;
   1336 		return (retcode);
   1337 
   1338 		/* initialize all parity */
   1339 	case RAIDFRAME_REWRITEPARITY:
   1340 
   1341 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1342 			/* Parity for RAID 0 is trivially correct */
   1343 			raidPtr->parity_good = RF_RAID_CLEAN;
   1344 			return(0);
   1345 		}
   1346 
   1347 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1348 			/* Re-write is already in progress! */
   1349 			return(EINVAL);
   1350 		}
   1351 
   1352 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1353 					   rf_RewriteParityThread,
   1354 					   raidPtr,"raid_parity");
   1355 		return (retcode);
   1356 
   1357 
   1358 	case RAIDFRAME_ADD_HOT_SPARE:
   1359 		sparePtr = (RF_SingleComponent_t *) data;
   1360 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1361 		retcode = rf_add_hot_spare(raidPtr, &component);
   1362 		return(retcode);
   1363 
   1364 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1365 		return(retcode);
   1366 
   1367 	case RAIDFRAME_DELETE_COMPONENT:
   1368 		componentPtr = (RF_SingleComponent_t *)data;
   1369 		memcpy( &component, componentPtr,
   1370 			sizeof(RF_SingleComponent_t));
   1371 		retcode = rf_delete_component(raidPtr, &component);
   1372 		return(retcode);
   1373 
   1374 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1375 		componentPtr = (RF_SingleComponent_t *)data;
   1376 		memcpy( &component, componentPtr,
   1377 			sizeof(RF_SingleComponent_t));
   1378 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1379 		return(retcode);
   1380 
   1381 	case RAIDFRAME_REBUILD_IN_PLACE:
   1382 
   1383 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1384 			/* Can't do this on a RAID 0!! */
   1385 			return(EINVAL);
   1386 		}
   1387 
   1388 		if (raidPtr->recon_in_progress == 1) {
   1389 			/* a reconstruct is already in progress! */
   1390 			return(EINVAL);
   1391 		}
   1392 
   1393 		componentPtr = (RF_SingleComponent_t *) data;
   1394 		memcpy( &component, componentPtr,
   1395 			sizeof(RF_SingleComponent_t));
   1396 		component.row = 0; /* we don't support any more */
   1397 		column = component.column;
   1398 
   1399 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1400 			return(EINVAL);
   1401 		}
   1402 
   1403 		rf_lock_mutex2(raidPtr->mutex);
   1404 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1405 		    (raidPtr->numFailures > 0)) {
   1406 			/* XXX 0 above shouldn't be constant!!! */
   1407 			/* some component other than this has failed.
   1408 			   Let's not make things worse than they already
   1409 			   are... */
   1410 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1411 			       raidPtr->raidid);
   1412 			printf("raid%d:     Col: %d   Too many failures.\n",
   1413 			       raidPtr->raidid, column);
   1414 			rf_unlock_mutex2(raidPtr->mutex);
   1415 			return (EINVAL);
   1416 		}
   1417 		if (raidPtr->Disks[column].status ==
   1418 		    rf_ds_reconstructing) {
   1419 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1420 			       raidPtr->raidid);
   1421 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1422 
   1423 			rf_unlock_mutex2(raidPtr->mutex);
   1424 			return (EINVAL);
   1425 		}
   1426 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1427 			rf_unlock_mutex2(raidPtr->mutex);
   1428 			return (EINVAL);
   1429 		}
   1430 		rf_unlock_mutex2(raidPtr->mutex);
   1431 
   1432 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1433 		if (rrcopy == NULL)
   1434 			return(ENOMEM);
   1435 
   1436 		rrcopy->raidPtr = (void *) raidPtr;
   1437 		rrcopy->col = column;
   1438 
   1439 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1440 					   rf_ReconstructInPlaceThread,
   1441 					   rrcopy,"raid_reconip");
   1442 		return(retcode);
   1443 
   1444 	case RAIDFRAME_GET_INFO:
   1445 		if (!raidPtr->valid)
   1446 			return (ENODEV);
   1447 		ucfgp = (RF_DeviceConfig_t **) data;
   1448 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1449 			  (RF_DeviceConfig_t *));
   1450 		if (d_cfg == NULL)
   1451 			return (ENOMEM);
   1452 		d_cfg->rows = 1; /* there is only 1 row now */
   1453 		d_cfg->cols = raidPtr->numCol;
   1454 		d_cfg->ndevs = raidPtr->numCol;
   1455 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1456 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1457 			return (ENOMEM);
   1458 		}
   1459 		d_cfg->nspares = raidPtr->numSpare;
   1460 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1461 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1462 			return (ENOMEM);
   1463 		}
   1464 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1465 		d = 0;
   1466 		for (j = 0; j < d_cfg->cols; j++) {
   1467 			d_cfg->devs[d] = raidPtr->Disks[j];
   1468 			d++;
   1469 		}
   1470 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1471 			d_cfg->spares[i] = raidPtr->Disks[j];
   1472 			if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
   1473 				/* XXX: raidctl(8) expects to see this as a used spare */
   1474 				d_cfg->spares[i].status = rf_ds_used_spare;
   1475 			}
   1476 		}
   1477 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1478 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1479 
   1480 		return (retcode);
   1481 
   1482 	case RAIDFRAME_CHECK_PARITY:
   1483 		*(int *) data = raidPtr->parity_good;
   1484 		return (0);
   1485 
   1486 	case RAIDFRAME_PARITYMAP_STATUS:
   1487 		if (rf_paritymap_ineligible(raidPtr))
   1488 			return EINVAL;
   1489 		rf_paritymap_status(raidPtr->parity_map,
   1490 		    (struct rf_pmstat *)data);
   1491 		return 0;
   1492 
   1493 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1494 		if (rf_paritymap_ineligible(raidPtr))
   1495 			return EINVAL;
   1496 		if (raidPtr->parity_map == NULL)
   1497 			return ENOENT; /* ??? */
   1498 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1499 			(struct rf_pmparams *)data, 1))
   1500 			return EINVAL;
   1501 		return 0;
   1502 
   1503 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1504 		if (rf_paritymap_ineligible(raidPtr))
   1505 			return EINVAL;
   1506 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1507 		return 0;
   1508 
   1509 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1510 		if (rf_paritymap_ineligible(raidPtr))
   1511 			return EINVAL;
   1512 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1513 		/* XXX should errors be passed up? */
   1514 		return 0;
   1515 
   1516 	case RAIDFRAME_RESET_ACCTOTALS:
   1517 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1518 		return (0);
   1519 
   1520 	case RAIDFRAME_GET_ACCTOTALS:
   1521 		totals = (RF_AccTotals_t *) data;
   1522 		*totals = raidPtr->acc_totals;
   1523 		return (0);
   1524 
   1525 	case RAIDFRAME_KEEP_ACCTOTALS:
   1526 		raidPtr->keep_acc_totals = *(int *)data;
   1527 		return (0);
   1528 
   1529 	case RAIDFRAME_GET_SIZE:
   1530 		*(int *) data = raidPtr->totalSectors;
   1531 		return (0);
   1532 
   1533 		/* fail a disk & optionally start reconstruction */
   1534 	case RAIDFRAME_FAIL_DISK:
   1535 
   1536 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1537 			/* Can't do this on a RAID 0!! */
   1538 			return(EINVAL);
   1539 		}
   1540 
   1541 		rr = (struct rf_recon_req *) data;
   1542 		rr->row = 0;
   1543 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1544 			return (EINVAL);
   1545 
   1546 
   1547 		rf_lock_mutex2(raidPtr->mutex);
   1548 		if (raidPtr->status == rf_rs_reconstructing) {
   1549 			/* you can't fail a disk while we're reconstructing! */
   1550 			/* XXX wrong for RAID6 */
   1551 			rf_unlock_mutex2(raidPtr->mutex);
   1552 			return (EINVAL);
   1553 		}
   1554 		if ((raidPtr->Disks[rr->col].status ==
   1555 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1556 			/* some other component has failed.  Let's not make
   1557 			   things worse. XXX wrong for RAID6 */
   1558 			rf_unlock_mutex2(raidPtr->mutex);
   1559 			return (EINVAL);
   1560 		}
   1561 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1562 			/* Can't fail a spared disk! */
   1563 			rf_unlock_mutex2(raidPtr->mutex);
   1564 			return (EINVAL);
   1565 		}
   1566 		rf_unlock_mutex2(raidPtr->mutex);
   1567 
   1568 		/* make a copy of the recon request so that we don't rely on
   1569 		 * the user's buffer */
   1570 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1571 		if (rrcopy == NULL)
   1572 			return(ENOMEM);
   1573 		memcpy(rrcopy, rr, sizeof(*rr));
   1574 		rrcopy->raidPtr = (void *) raidPtr;
   1575 
   1576 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1577 					   rf_ReconThread,
   1578 					   rrcopy,"raid_recon");
   1579 		return (0);
   1580 
   1581 		/* invoke a copyback operation after recon on whatever disk
   1582 		 * needs it, if any */
   1583 	case RAIDFRAME_COPYBACK:
   1584 
   1585 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1586 			/* This makes no sense on a RAID 0!! */
   1587 			return(EINVAL);
   1588 		}
   1589 
   1590 		if (raidPtr->copyback_in_progress == 1) {
   1591 			/* Copyback is already in progress! */
   1592 			return(EINVAL);
   1593 		}
   1594 
   1595 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1596 					   rf_CopybackThread,
   1597 					   raidPtr,"raid_copyback");
   1598 		return (retcode);
   1599 
   1600 		/* return the percentage completion of reconstruction */
   1601 	case RAIDFRAME_CHECK_RECON_STATUS:
   1602 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1603 			/* This makes no sense on a RAID 0, so tell the
   1604 			   user it's done. */
   1605 			*(int *) data = 100;
   1606 			return(0);
   1607 		}
   1608 		if (raidPtr->status != rf_rs_reconstructing)
   1609 			*(int *) data = 100;
   1610 		else {
   1611 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1612 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1613 			} else {
   1614 				*(int *) data = 0;
   1615 			}
   1616 		}
   1617 		return (0);
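         	/*
         	 * The *_STATUS_EXT ioctls take a pointer to a user-space
         	 * RF_ProgressInfo_t: "data" carries the user's pointer and the
         	 * kernel copyout()s total/completed/remaining into it.  An
         	 * illustrative caller (hypothetical sketch, not lifted from
         	 * raidctl(8)) might look like:
         	 *
         	 *	RF_ProgressInfo_t info, *infop = &info;
         	 *	if (ioctl(fd, RAIDFRAME_CHECK_RECON_STATUS_EXT,
         	 *	    &infop) == 0)
         	 *		printf("%ju of %ju RUs rebuilt\n",
         	 *		    (uintmax_t)info.completed,
         	 *		    (uintmax_t)info.total);
         	 */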
   1618 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1619 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1620 		if (raidPtr->status != rf_rs_reconstructing) {
   1621 			progressInfo.remaining = 0;
   1622 			progressInfo.completed = 100;
   1623 			progressInfo.total = 100;
   1624 		} else {
   1625 			progressInfo.total =
   1626 				raidPtr->reconControl->numRUsTotal;
   1627 			progressInfo.completed =
   1628 				raidPtr->reconControl->numRUsComplete;
   1629 			progressInfo.remaining = progressInfo.total -
   1630 				progressInfo.completed;
   1631 		}
   1632 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1633 				  sizeof(RF_ProgressInfo_t));
   1634 		return (retcode);
   1635 
   1636 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1637 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1638 			/* This makes no sense on a RAID 0, so tell the
   1639 			   user it's done. */
   1640 			*(int *) data = 100;
   1641 			return(0);
   1642 		}
   1643 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1644 			*(int *) data = 100 *
   1645 				raidPtr->parity_rewrite_stripes_done /
   1646 				raidPtr->Layout.numStripe;
   1647 		} else {
   1648 			*(int *) data = 100;
   1649 		}
   1650 		return (0);
   1651 
   1652 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1653 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1654 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1655 			progressInfo.total = raidPtr->Layout.numStripe;
   1656 			progressInfo.completed =
   1657 				raidPtr->parity_rewrite_stripes_done;
   1658 			progressInfo.remaining = progressInfo.total -
   1659 				progressInfo.completed;
   1660 		} else {
   1661 			progressInfo.remaining = 0;
   1662 			progressInfo.completed = 100;
   1663 			progressInfo.total = 100;
   1664 		}
   1665 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1666 				  sizeof(RF_ProgressInfo_t));
   1667 		return (retcode);
   1668 
   1669 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1670 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1671 			/* This makes no sense on a RAID 0 */
   1672 			*(int *) data = 100;
   1673 			return(0);
   1674 		}
   1675 		if (raidPtr->copyback_in_progress == 1) {
   1676 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1677 				raidPtr->Layout.numStripe;
   1678 		} else {
   1679 			*(int *) data = 100;
   1680 		}
   1681 		return (0);
   1682 
   1683 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1684 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1685 		if (raidPtr->copyback_in_progress == 1) {
   1686 			progressInfo.total = raidPtr->Layout.numStripe;
   1687 			progressInfo.completed =
   1688 				raidPtr->copyback_stripes_done;
   1689 			progressInfo.remaining = progressInfo.total -
   1690 				progressInfo.completed;
   1691 		} else {
   1692 			progressInfo.remaining = 0;
   1693 			progressInfo.completed = 100;
   1694 			progressInfo.total = 100;
   1695 		}
   1696 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1697 				  sizeof(RF_ProgressInfo_t));
   1698 		return (retcode);
   1699 
   1700 		/* the sparetable daemon calls this to wait for the kernel to
   1701 		 * need a spare table. this ioctl does not return until a
   1702 		 * spare table is needed. XXX -- calling mpsleep here in the
   1703 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1704 		 * -- I should either compute the spare table in the kernel,
   1705 		 * or have a different -- XXX XXX -- interface (a different
   1706 		 * character device) for delivering the table     -- XXX */
   1707 #if 0
   1708 	case RAIDFRAME_SPARET_WAIT:
   1709 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1710 		while (!rf_sparet_wait_queue)
   1711 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1712 		waitreq = rf_sparet_wait_queue;
   1713 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1714 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1715 
   1716 		/* structure assignment */
   1717 		*((RF_SparetWait_t *) data) = *waitreq;
   1718 
   1719 		RF_Free(waitreq, sizeof(*waitreq));
   1720 		return (0);
   1721 
   1722 		/* wakes up a process waiting on SPARET_WAIT and puts an error
    1723 		 * code in it that will cause the daemon to exit */
   1724 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1725 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1726 		waitreq->fcol = -1;
   1727 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1728 		waitreq->next = rf_sparet_wait_queue;
   1729 		rf_sparet_wait_queue = waitreq;
    1730 		rf_broadcast_cond2(rf_sparet_wait_cv);
   1731 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1732 		return (0);
   1733 
   1734 		/* used by the spare table daemon to deliver a spare table
   1735 		 * into the kernel */
   1736 	case RAIDFRAME_SEND_SPARET:
   1737 
   1738 		/* install the spare table */
   1739 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1740 
   1741 		/* respond to the requestor.  the return status of the spare
   1742 		 * table installation is passed in the "fcol" field */
   1743 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1744 		waitreq->fcol = retcode;
   1745 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1746 		waitreq->next = rf_sparet_resp_queue;
   1747 		rf_sparet_resp_queue = waitreq;
   1748 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1749 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1750 
   1751 		return (retcode);
   1752 #endif
   1753 
   1754 	default:
   1755 		break; /* fall through to the os-specific code below */
   1756 
   1757 	}
   1758 
   1759 	if (!raidPtr->valid)
   1760 		return (EINVAL);
   1761 
   1762 	/*
   1763 	 * Add support for "regular" device ioctls here.
   1764 	 */
   1765 
   1766 	error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
   1767 	if (error != EPASSTHROUGH)
   1768 		return (error);
   1769 
   1770 	switch (cmd) {
   1771 	case DIOCGDINFO:
   1772 		*(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
   1773 		break;
   1774 #ifdef __HAVE_OLD_DISKLABEL
   1775 	case ODIOCGDINFO:
   1776 		newlabel = *(rs->sc_dkdev.dk_label);
   1777 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1778 			return ENOTTY;
   1779 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1780 		break;
   1781 #endif
   1782 
   1783 	case DIOCGPART:
   1784 		((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
   1785 		((struct partinfo *) data)->part =
   1786 		    &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
   1787 		break;
   1788 
   1789 	case DIOCWDINFO:
   1790 	case DIOCSDINFO:
   1791 #ifdef __HAVE_OLD_DISKLABEL
   1792 	case ODIOCWDINFO:
   1793 	case ODIOCSDINFO:
   1794 #endif
   1795 	{
   1796 		struct disklabel *lp;
   1797 #ifdef __HAVE_OLD_DISKLABEL
   1798 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
   1799 			memset(&newlabel, 0, sizeof newlabel);
   1800 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
   1801 			lp = &newlabel;
   1802 		} else
   1803 #endif
   1804 		lp = (struct disklabel *)data;
   1805 
   1806 		if ((error = raidlock(rs)) != 0)
   1807 			return (error);
   1808 
   1809 		rs->sc_flags |= RAIDF_LABELLING;
   1810 
   1811 		error = setdisklabel(rs->sc_dkdev.dk_label,
   1812 		    lp, 0, rs->sc_dkdev.dk_cpulabel);
   1813 		if (error == 0) {
   1814 			if (cmd == DIOCWDINFO
   1815 #ifdef __HAVE_OLD_DISKLABEL
   1816 			    || cmd == ODIOCWDINFO
   1817 #endif
   1818 			   )
   1819 				error = writedisklabel(RAIDLABELDEV(dev),
   1820 				    raidstrategy, rs->sc_dkdev.dk_label,
   1821 				    rs->sc_dkdev.dk_cpulabel);
   1822 		}
   1823 		rs->sc_flags &= ~RAIDF_LABELLING;
   1824 
   1825 		raidunlock(rs);
   1826 
   1827 		if (error)
   1828 			return (error);
   1829 		break;
   1830 	}
   1831 
   1832 	case DIOCWLABEL:
   1833 		if (*(int *) data != 0)
   1834 			rs->sc_flags |= RAIDF_WLABEL;
   1835 		else
   1836 			rs->sc_flags &= ~RAIDF_WLABEL;
   1837 		break;
   1838 
   1839 	case DIOCGDEFLABEL:
   1840 		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
   1841 		break;
   1842 
   1843 #ifdef __HAVE_OLD_DISKLABEL
   1844 	case ODIOCGDEFLABEL:
   1845 		raidgetdefaultlabel(raidPtr, rs, &newlabel);
   1846 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1847 			return ENOTTY;
   1848 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1849 		break;
   1850 #endif
   1851 
   1852 	case DIOCAWEDGE:
   1853 	case DIOCDWEDGE:
   1854 	    	dkw = (void *)data;
   1855 
   1856 		/* If the ioctl happens here, the parent is us. */
   1857 		(void)strcpy(dkw->dkw_parent, rs->sc_xname);
   1858 		return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
   1859 
   1860 	case DIOCLWEDGES:
   1861 		return dkwedge_list(&rs->sc_dkdev,
   1862 		    (struct dkwedge_list *)data, l);
   1863 	case DIOCCACHESYNC:
   1864 		return rf_sync_component_caches(raidPtr);
   1865 
   1866 	case DIOCGSTRATEGY:
   1867 	    {
   1868 		struct disk_strategy *dks = (void *)data;
   1869 
   1870 		s = splbio();
   1871 		strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
   1872 		    sizeof(dks->dks_name));
   1873 		splx(s);
   1874 		dks->dks_paramlen = 0;
   1875 
   1876 		return 0;
   1877 	    }
   1878 
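         	/*
         	 * Switching buffer-queue strategies: allocate the new bufq
         	 * first, then, at splbio(), migrate any queued requests onto it
         	 * with bufq_move() and swap it in before freeing the old queue.
         	 */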
   1879 	case DIOCSSTRATEGY:
   1880 	    {
   1881 		struct disk_strategy *dks = (void *)data;
   1882 		struct bufq_state *new;
   1883 		struct bufq_state *old;
   1884 
   1885 		if (dks->dks_param != NULL) {
   1886 			return EINVAL;
   1887 		}
   1888 		dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
   1889 		error = bufq_alloc(&new, dks->dks_name,
   1890 		    BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
   1891 		if (error) {
   1892 			return error;
   1893 		}
   1894 		s = splbio();
   1895 		old = rs->buf_queue;
   1896 		bufq_move(new, old);
   1897 		rs->buf_queue = new;
   1898 		splx(s);
   1899 		bufq_free(old);
   1900 
   1901 		return 0;
   1902 	    }
   1903 
   1904 	default:
   1905 		retcode = ENOTTY;
   1906 	}
   1907 	return (retcode);
   1908 
   1909 }
   1910 
   1911 
   1912 /* raidinit -- complete the rest of the initialization for the
   1913    RAIDframe device.  */
   1914 
   1915 
   1916 static void
   1917 raidinit(RF_Raid_t *raidPtr)
   1918 {
   1919 	cfdata_t cf;
   1920 	struct raid_softc *rs;
   1921 	int     unit;
   1922 
   1923 	unit = raidPtr->raidid;
   1924 
   1925 	rs = &raid_softc[unit];
   1926 
   1927 	/* XXX should check return code first... */
   1928 	rs->sc_flags |= RAIDF_INITED;
   1929 
   1930 	/* XXX doesn't check bounds. */
   1931 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
   1932 
   1933 	/* attach the pseudo device */
   1934 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
   1935 	cf->cf_name = raid_cd.cd_name;
   1936 	cf->cf_atname = raid_cd.cd_name;
   1937 	cf->cf_unit = unit;
   1938 	cf->cf_fstate = FSTATE_STAR;
   1939 
   1940 	rs->sc_dev = config_attach_pseudo(cf);
   1941 
   1942 	if (rs->sc_dev == NULL) {
   1943 		printf("raid%d: config_attach_pseudo failed\n",
   1944 		    raidPtr->raidid);
   1945 		rs->sc_flags &= ~RAIDF_INITED;
   1946 		free(cf, M_RAIDFRAME);
   1947 		return;
   1948 	}
   1949 
   1950 	/* disk_attach actually creates space for the CPU disklabel, among
   1951 	 * other things, so it's critical to call this *BEFORE* we try putzing
   1952 	 * with disklabels. */
   1953 
   1954 	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
   1955 	disk_attach(&rs->sc_dkdev);
   1956 	disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);
   1957 
   1958 	/* XXX There may be a weird interaction here between this, and
   1959 	 * protectedSectors, as used in RAIDframe.  */
   1960 
   1961 	rs->sc_size = raidPtr->totalSectors;
   1962 
   1963 	dkwedge_discover(&rs->sc_dkdev);
   1964 
   1965 	rf_set_properties(rs, raidPtr);
   1966 
   1967 }
   1968 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1969 /* wake up the daemon & tell it to get us a spare table
   1970  * XXX
   1971  * the entries in the queues should be tagged with the raidPtr
   1972  * so that in the extremely rare case that two recons happen at once,
    1973  * we know for which device we're requesting a spare table
   1974  * XXX
   1975  *
   1976  * XXX This code is not currently used. GO
   1977  */
   1978 int
   1979 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
   1980 {
   1981 	int     retcode;
   1982 
   1983 	rf_lock_mutex2(rf_sparet_wait_mutex);
   1984 	req->next = rf_sparet_wait_queue;
   1985 	rf_sparet_wait_queue = req;
   1986 	rf_broadcast_cond2(rf_sparet_wait_cv);
   1987 
    1988 	/* rf_wait_cond2() releases the mutex while we sleep */
   1989 	while (!rf_sparet_resp_queue) {
   1990 		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
   1991 	}
   1992 	req = rf_sparet_resp_queue;
   1993 	rf_sparet_resp_queue = req->next;
   1994 	rf_unlock_mutex2(rf_sparet_wait_mutex);
   1995 
   1996 	retcode = req->fcol;
   1997 	RF_Free(req, sizeof(*req));	/* this is not the same req as we
   1998 					 * alloc'd */
   1999 	return (retcode);
   2000 }
   2001 #endif
   2002 
    2003 /* A wrapper around rf_DoAccess that extracts the appropriate info from
    2004  * each bp on the buffer queue and passes it down.
    2005  * Any calls originating in the kernel must use non-blocking I/O.
    2006  * We do some extra sanity checking to return "appropriate" error values
    2007  * for certain conditions (to make some standard utilities work).
    2008  *
    2009  * Formerly known as: rf_DoAccessKernel
    2010  */
   2011 void
   2012 raidstart(RF_Raid_t *raidPtr)
   2013 {
   2014 	RF_SectorCount_t num_blocks, pb, sum;
   2015 	RF_RaidAddr_t raid_addr;
   2016 	struct partition *pp;
   2017 	daddr_t blocknum;
   2018 	int     unit;
   2019 	struct raid_softc *rs;
   2020 	int     do_async;
   2021 	struct buf *bp;
   2022 	int rc;
   2023 
   2024 	unit = raidPtr->raidid;
   2025 	rs = &raid_softc[unit];
   2026 
   2027 	/* quick check to see if anything has died recently */
   2028 	rf_lock_mutex2(raidPtr->mutex);
   2029 	if (raidPtr->numNewFailures > 0) {
   2030 		rf_unlock_mutex2(raidPtr->mutex);
   2031 		rf_update_component_labels(raidPtr,
   2032 					   RF_NORMAL_COMPONENT_UPDATE);
   2033 		rf_lock_mutex2(raidPtr->mutex);
   2034 		raidPtr->numNewFailures--;
   2035 	}
   2036 
   2037 	/* Check to see if we're at the limit... */
   2038 	while (raidPtr->openings > 0) {
   2039 		rf_unlock_mutex2(raidPtr->mutex);
   2040 
   2041 		/* get the next item, if any, from the queue */
   2042 		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
   2043 			/* nothing more to do */
   2044 			return;
   2045 		}
   2046 
    2047 		/* Ok, for the bp we have here, bp->b_blkno is relative to the
    2048 		 * partition.  We need to make it absolute to the underlying
    2049 		 * device. */
   2050 
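         		/* The shift pair below converts DEV_BSIZE-sized
         		 * (normally 512-byte) block numbers into RAID sectors.
         		 * For example, with 2048-byte RAID sectors
         		 * (logBytesPerSector == 11), a b_blkno of 8 is
         		 * 8 << 9 == 4096 bytes, i.e. RAID sector 2. */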
   2051 		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
   2052 		if (DISKPART(bp->b_dev) != RAW_PART) {
   2053 			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
   2054 			blocknum += pp->p_offset;
   2055 		}
   2056 
   2057 		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   2058 			    (int) blocknum));
   2059 
   2060 		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   2061 		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   2062 
   2063 		/* *THIS* is where we adjust what block we're going to...
   2064 		 * but DO NOT TOUCH bp->b_blkno!!! */
   2065 		raid_addr = blocknum;
   2066 
   2067 		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   2068 		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   2069 		sum = raid_addr + num_blocks + pb;
   2070 		if (1 || rf_debugKernelAccess) {
   2071 			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   2072 				    (int) raid_addr, (int) sum, (int) num_blocks,
   2073 				    (int) pb, (int) bp->b_resid));
   2074 		}
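         		/* Reject requests that run past the end of the array,
         		 * or whose sector arithmetic wrapped around. */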
   2075 		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   2076 		    || (sum < num_blocks) || (sum < pb)) {
   2077 			bp->b_error = ENOSPC;
   2078 			bp->b_resid = bp->b_bcount;
   2079 			biodone(bp);
   2080 			rf_lock_mutex2(raidPtr->mutex);
   2081 			continue;
   2082 		}
   2083 		/*
   2084 		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   2085 		 */
   2086 
   2087 		if (bp->b_bcount & raidPtr->sectorMask) {
   2088 			bp->b_error = EINVAL;
   2089 			bp->b_resid = bp->b_bcount;
   2090 			biodone(bp);
   2091 			rf_lock_mutex2(raidPtr->mutex);
   2092 			continue;
   2093 
   2094 		}
   2095 		db1_printf(("Calling DoAccess..\n"));
   2096 
   2097 
   2098 		rf_lock_mutex2(raidPtr->mutex);
   2099 		raidPtr->openings--;
   2100 		rf_unlock_mutex2(raidPtr->mutex);
   2101 
   2102 		/*
   2103 		 * Everything is async.
   2104 		 */
   2105 		do_async = 1;
   2106 
   2107 		disk_busy(&rs->sc_dkdev);
   2108 
   2109 		/* XXX we're still at splbio() here... do we *really*
   2110 		   need to be? */
   2111 
   2112 		/* don't ever condition on bp->b_flags & B_WRITE.
   2113 		 * always condition on B_READ instead */
   2114 
   2115 		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   2116 				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   2117 				 do_async, raid_addr, num_blocks,
   2118 				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   2119 
   2120 		if (rc) {
   2121 			bp->b_error = rc;
   2122 			bp->b_resid = bp->b_bcount;
   2123 			biodone(bp);
   2124 			/* continue loop */
   2125 		}
   2126 
   2127 		rf_lock_mutex2(raidPtr->mutex);
   2128 	}
   2129 	rf_unlock_mutex2(raidPtr->mutex);
   2130 }
   2131 
   2132 
   2133 
   2134 
   2135 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2136 
   2137 int
   2138 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2139 {
   2140 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2141 	struct buf *bp;
   2142 
   2143 	req->queue = queue;
   2144 	bp = req->bp;
   2145 
   2146 	switch (req->type) {
   2147 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2148 		/* XXX need to do something extra here.. */
   2149 		/* I'm leaving this in, as I've never actually seen it used,
   2150 		 * and I'd like folks to report it... GO */
   2151 		printf(("WAKEUP CALLED\n"));
   2152 		queue->numOutstanding++;
   2153 
   2154 		bp->b_flags = 0;
   2155 		bp->b_private = req;
   2156 
   2157 		KernelWakeupFunc(bp);
   2158 		break;
   2159 
   2160 	case RF_IO_TYPE_READ:
   2161 	case RF_IO_TYPE_WRITE:
   2162 #if RF_ACC_TRACE > 0
   2163 		if (req->tracerec) {
   2164 			RF_ETIMER_START(req->tracerec->timer);
   2165 		}
   2166 #endif
   2167 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2168 		    op, queue->rf_cinfo->ci_dev,
   2169 		    req->sectorOffset, req->numSector,
   2170 		    req->buf, KernelWakeupFunc, (void *) req,
   2171 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2172 
   2173 		if (rf_debugKernelAccess) {
   2174 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2175 				(long) bp->b_blkno));
   2176 		}
   2177 		queue->numOutstanding++;
   2178 		queue->last_deq_sector = req->sectorOffset;
   2179 		/* acc wouldn't have been let in if there were any pending
   2180 		 * reqs at any other priority */
   2181 		queue->curPriority = req->priority;
   2182 
   2183 		db1_printf(("Going for %c to unit %d col %d\n",
   2184 			    req->type, queue->raidPtr->raidid,
   2185 			    queue->col));
   2186 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2187 			(int) req->sectorOffset, (int) req->numSector,
   2188 			(int) (req->numSector <<
   2189 			    queue->raidPtr->logBytesPerSector),
   2190 			(int) queue->raidPtr->logBytesPerSector));
   2191 
   2192 		/*
   2193 		 * XXX: drop lock here since this can block at
   2194 		 * least with backing SCSI devices.  Retake it
   2195 		 * to minimize fuss with calling interfaces.
   2196 		 */
   2197 
   2198 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2199 		bdev_strategy(bp);
   2200 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2201 		break;
   2202 
   2203 	default:
   2204 		panic("bad req->type in rf_DispatchKernelIO");
   2205 	}
   2206 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2207 
   2208 	return (0);
   2209 }
    2210 /* this is the callback function associated with an I/O invoked from
   2211    kernel code.
   2212  */
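         /*
          * Runs as the b_iodone callback for a component I/O.  Under
          * iodone_lock it records bp->b_error in the request, marks the
          * component failed on an I/O error (unless that would leave the set
          * completely broken), queues the request on the raidPtr's iodone
          * list, and signals the raidio thread to finish it off.
          */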
   2213 static void
   2214 KernelWakeupFunc(struct buf *bp)
   2215 {
   2216 	RF_DiskQueueData_t *req = NULL;
   2217 	RF_DiskQueue_t *queue;
   2218 
   2219 	db1_printf(("recovering the request queue:\n"));
   2220 
   2221 	req = bp->b_private;
   2222 
   2223 	queue = (RF_DiskQueue_t *) req->queue;
   2224 
   2225 	rf_lock_mutex2(queue->raidPtr->iodone_lock);
   2226 
   2227 #if RF_ACC_TRACE > 0
   2228 	if (req->tracerec) {
   2229 		RF_ETIMER_STOP(req->tracerec->timer);
   2230 		RF_ETIMER_EVAL(req->tracerec->timer);
   2231 		rf_lock_mutex2(rf_tracing_mutex);
   2232 		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2233 		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2234 		req->tracerec->num_phys_ios++;
   2235 		rf_unlock_mutex2(rf_tracing_mutex);
   2236 	}
   2237 #endif
   2238 
   2239 	/* XXX Ok, let's get aggressive... If b_error is set, let's go
   2240 	 * ballistic, and mark the component as hosed... */
   2241 
   2242 	if (bp->b_error != 0) {
   2243 		/* Mark the disk as dead */
   2244 		/* but only mark it once... */
   2245 		/* and only if it wouldn't leave this RAID set
   2246 		   completely broken */
   2247 		if (((queue->raidPtr->Disks[queue->col].status ==
   2248 		      rf_ds_optimal) ||
   2249 		     (queue->raidPtr->Disks[queue->col].status ==
   2250 		      rf_ds_used_spare)) &&
   2251 		     (queue->raidPtr->numFailures <
   2252 		      queue->raidPtr->Layout.map->faultsTolerated)) {
   2253 			printf("raid%d: IO Error.  Marking %s as failed.\n",
   2254 			       queue->raidPtr->raidid,
   2255 			       queue->raidPtr->Disks[queue->col].devname);
   2256 			queue->raidPtr->Disks[queue->col].status =
   2257 			    rf_ds_failed;
   2258 			queue->raidPtr->status = rf_rs_degraded;
   2259 			queue->raidPtr->numFailures++;
   2260 			queue->raidPtr->numNewFailures++;
   2261 		} else {	/* Disk is already dead... */
   2262 			/* printf("Disk already marked as dead!\n"); */
   2263 		}
   2264 
   2265 	}
   2266 
   2267 	/* Fill in the error value */
   2268 	req->error = bp->b_error;
   2269 
   2270 	/* Drop this one on the "finished" queue... */
   2271 	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
   2272 
   2273 	/* Let the raidio thread know there is work to be done. */
   2274 	rf_signal_cond2(queue->raidPtr->iodone_cv);
   2275 
   2276 	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
   2277 }
   2278 
   2279 
   2280 /*
   2281  * initialize a buf structure for doing an I/O in the kernel.
   2282  */
   2283 static void
   2284 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2285        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2286        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2287        struct proc *b_proc)
   2288 {
   2289 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2290 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2291 	bp->b_oflags = 0;
   2292 	bp->b_cflags = 0;
   2293 	bp->b_bcount = numSect << logBytesPerSector;
   2294 	bp->b_bufsize = bp->b_bcount;
   2295 	bp->b_error = 0;
   2296 	bp->b_dev = dev;
   2297 	bp->b_data = bf;
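         	/* Convert RAID sectors back into DEV_BSIZE blocks; this is the
         	 * inverse of the conversion done in raidstart(). */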
   2298 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2299 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2300 	if (bp->b_bcount == 0) {
   2301 		panic("bp->b_bcount is zero in InitBP!!");
   2302 	}
   2303 	bp->b_proc = b_proc;
   2304 	bp->b_iodone = cbFunc;
   2305 	bp->b_private = cbArg;
   2306 }
   2307 
   2308 static void
   2309 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
   2310 		    struct disklabel *lp)
   2311 {
   2312 	memset(lp, 0, sizeof(*lp));
   2313 
   2314 	/* fabricate a label... */
   2315 	lp->d_secperunit = raidPtr->totalSectors;
   2316 	lp->d_secsize = raidPtr->bytesPerSector;
   2317 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   2318 	lp->d_ntracks = 4 * raidPtr->numCol;
   2319 	lp->d_ncylinders = raidPtr->totalSectors /
   2320 		(lp->d_nsectors * lp->d_ntracks);
   2321 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
   2322 
   2323 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
   2324 	lp->d_type = DTYPE_RAID;
   2325 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
   2326 	lp->d_rpm = 3600;
   2327 	lp->d_interleave = 1;
   2328 	lp->d_flags = 0;
   2329 
   2330 	lp->d_partitions[RAW_PART].p_offset = 0;
   2331 	lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
   2332 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
   2333 	lp->d_npartitions = RAW_PART + 1;
   2334 
   2335 	lp->d_magic = DISKMAGIC;
   2336 	lp->d_magic2 = DISKMAGIC;
   2337 	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
   2338 
   2339 }
   2340 /*
   2341  * Read the disklabel from the raid device.  If one is not present, fake one
   2342  * up.
   2343  */
   2344 static void
   2345 raidgetdisklabel(dev_t dev)
   2346 {
   2347 	int     unit = raidunit(dev);
   2348 	struct raid_softc *rs = &raid_softc[unit];
   2349 	const char   *errstring;
   2350 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2351 	struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
   2352 	RF_Raid_t *raidPtr;
   2353 
   2354 	db1_printf(("Getting the disklabel...\n"));
   2355 
   2356 	memset(clp, 0, sizeof(*clp));
   2357 
   2358 	raidPtr = raidPtrs[unit];
   2359 
   2360 	raidgetdefaultlabel(raidPtr, rs, lp);
   2361 
   2362 	/*
   2363 	 * Call the generic disklabel extraction routine.
   2364 	 */
   2365 	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
   2366 	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
   2367 	if (errstring)
   2368 		raidmakedisklabel(rs);
   2369 	else {
   2370 		int     i;
   2371 		struct partition *pp;
   2372 
   2373 		/*
   2374 		 * Sanity check whether the found disklabel is valid.
   2375 		 *
    2376 		 * This is necessary since the total size of the raid device
    2377 		 * may vary when the interleave is changed even though exactly
    2378 		 * the same components are used, and an old disklabel may be
    2379 		 * used if one is found.
   2380 		 */
   2381 		if (lp->d_secperunit != rs->sc_size)
   2382 			printf("raid%d: WARNING: %s: "
   2383 			    "total sector size in disklabel (%" PRIu32 ") != "
   2384 			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
   2385 			    lp->d_secperunit, rs->sc_size);
   2386 		for (i = 0; i < lp->d_npartitions; i++) {
   2387 			pp = &lp->d_partitions[i];
   2388 			if (pp->p_offset + pp->p_size > rs->sc_size)
   2389 				printf("raid%d: WARNING: %s: end of partition `%c' "
   2390 				       "exceeds the size of raid (%" PRIu64 ")\n",
   2391 				       unit, rs->sc_xname, 'a' + i, rs->sc_size);
   2392 		}
   2393 	}
   2394 
   2395 }
   2396 /*
   2397  * Take care of things one might want to take care of in the event
   2398  * that a disklabel isn't present.
   2399  */
   2400 static void
   2401 raidmakedisklabel(struct raid_softc *rs)
   2402 {
   2403 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2404 	db1_printf(("Making a label..\n"));
   2405 
   2406 	/*
   2407 	 * For historical reasons, if there's no disklabel present
   2408 	 * the raw partition must be marked FS_BSDFFS.
   2409 	 */
   2410 
   2411 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
   2412 
   2413 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
   2414 
   2415 	lp->d_checksum = dkcksum(lp);
   2416 }
   2417 /*
   2418  * Wait interruptibly for an exclusive lock.
   2419  *
   2420  * XXX
   2421  * Several drivers do this; it should be abstracted and made MP-safe.
   2422  * (Hmm... where have we seen this warning before :->  GO )
   2423  */
   2424 static int
   2425 raidlock(struct raid_softc *rs)
   2426 {
   2427 	int     error;
   2428 
   2429 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2430 		rs->sc_flags |= RAIDF_WANTED;
   2431 		if ((error =
   2432 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
   2433 			return (error);
   2434 	}
   2435 	rs->sc_flags |= RAIDF_LOCKED;
   2436 	return (0);
   2437 }
   2438 /*
   2439  * Unlock and wake up any waiters.
   2440  */
   2441 static void
   2442 raidunlock(struct raid_softc *rs)
   2443 {
   2444 
   2445 	rs->sc_flags &= ~RAIDF_LOCKED;
   2446 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2447 		rs->sc_flags &= ~RAIDF_WANTED;
   2448 		wakeup(rs);
   2449 	}
   2450 }
   2451 
   2452 
   2453 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2454 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2455 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
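         /*
          * Reserved metadata layout on each component, as implemented by the
          * helpers below: the component label lives at byte offset
          * RF_COMPONENT_INFO_OFFSET and is padded out to at least one sector
          * (or RF_COMPONENT_INFO_SIZE, whichever is larger); the parity map
          * follows that padded area and is likewise rounded up to at least
          * one sector.
          */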
   2456 
   2457 static daddr_t
   2458 rf_component_info_offset(void)
   2459 {
   2460 
   2461 	return RF_COMPONENT_INFO_OFFSET;
   2462 }
   2463 
   2464 static daddr_t
   2465 rf_component_info_size(unsigned secsize)
   2466 {
   2467 	daddr_t info_size;
   2468 
   2469 	KASSERT(secsize);
   2470 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2471 		info_size = secsize;
   2472 	else
   2473 		info_size = RF_COMPONENT_INFO_SIZE;
   2474 
   2475 	return info_size;
   2476 }
   2477 
   2478 static daddr_t
   2479 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2480 {
   2481 	daddr_t map_offset;
   2482 
   2483 	KASSERT(raidPtr->bytesPerSector);
   2484 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2485 		map_offset = raidPtr->bytesPerSector;
   2486 	else
   2487 		map_offset = RF_COMPONENT_INFO_SIZE;
   2488 	map_offset += rf_component_info_offset();
   2489 
   2490 	return map_offset;
   2491 }
   2492 
   2493 static daddr_t
   2494 rf_parity_map_size(RF_Raid_t *raidPtr)
   2495 {
   2496 	daddr_t map_size;
   2497 
   2498 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2499 		map_size = raidPtr->bytesPerSector;
   2500 	else
   2501 		map_size = RF_PARITY_MAP_SIZE;
   2502 
   2503 	return map_size;
   2504 }
   2505 
   2506 int
   2507 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2508 {
   2509 	RF_ComponentLabel_t *clabel;
   2510 
   2511 	clabel = raidget_component_label(raidPtr, col);
   2512 	clabel->clean = RF_RAID_CLEAN;
   2513 	raidflush_component_label(raidPtr, col);
   2514 	return(0);
   2515 }
   2516 
   2517 
   2518 int
   2519 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2520 {
   2521 	RF_ComponentLabel_t *clabel;
   2522 
   2523 	clabel = raidget_component_label(raidPtr, col);
   2524 	clabel->clean = RF_RAID_DIRTY;
   2525 	raidflush_component_label(raidPtr, col);
   2526 	return(0);
   2527 }
   2528 
   2529 int
   2530 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2531 {
   2532 	KASSERT(raidPtr->bytesPerSector);
   2533 	return raidread_component_label(raidPtr->bytesPerSector,
   2534 	    raidPtr->Disks[col].dev,
   2535 	    raidPtr->raid_cinfo[col].ci_vp,
   2536 	    &raidPtr->raid_cinfo[col].ci_label);
   2537 }
   2538 
   2539 RF_ComponentLabel_t *
   2540 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2541 {
   2542 	return &raidPtr->raid_cinfo[col].ci_label;
   2543 }
   2544 
   2545 int
   2546 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2547 {
   2548 	RF_ComponentLabel_t *label;
   2549 
   2550 	label = &raidPtr->raid_cinfo[col].ci_label;
   2551 	label->mod_counter = raidPtr->mod_counter;
   2552 #ifndef RF_NO_PARITY_MAP
   2553 	label->parity_map_modcount = label->mod_counter;
   2554 #endif
   2555 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2556 	    raidPtr->Disks[col].dev,
   2557 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2558 }
   2559 
   2560 
   2561 static int
   2562 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2563     RF_ComponentLabel_t *clabel)
   2564 {
   2565 	return raidread_component_area(dev, b_vp, clabel,
   2566 	    sizeof(RF_ComponentLabel_t),
   2567 	    rf_component_info_offset(),
   2568 	    rf_component_info_size(secsize));
   2569 }
   2570 
   2571 /* ARGSUSED */
   2572 static int
   2573 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2574     size_t msize, daddr_t offset, daddr_t dsize)
   2575 {
   2576 	struct buf *bp;
   2577 	const struct bdevsw *bdev;
   2578 	int error;
   2579 
   2580 	/* XXX should probably ensure that we don't try to do this if
   2581 	   someone has changed rf_protected_sectors. */
   2582 
   2583 	if (b_vp == NULL) {
   2584 		/* For whatever reason, this component is not valid.
   2585 		   Don't try to read a component label from it. */
   2586 		return(EINVAL);
   2587 	}
   2588 
   2589 	/* get a block of the appropriate size... */
   2590 	bp = geteblk((int)dsize);
   2591 	bp->b_dev = dev;
   2592 
   2593 	/* get our ducks in a row for the read */
   2594 	bp->b_blkno = offset / DEV_BSIZE;
   2595 	bp->b_bcount = dsize;
   2596 	bp->b_flags |= B_READ;
   2597  	bp->b_resid = dsize;
   2598 
   2599 	bdev = bdevsw_lookup(bp->b_dev);
   2600 	if (bdev == NULL)
   2601 		return (ENXIO);
   2602 	(*bdev->d_strategy)(bp);
   2603 
   2604 	error = biowait(bp);
   2605 
   2606 	if (!error) {
   2607 		memcpy(data, bp->b_data, msize);
   2608 	}
   2609 
   2610 	brelse(bp, 0);
   2611 	return(error);
   2612 }
   2613 
   2614 
   2615 static int
   2616 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2617     RF_ComponentLabel_t *clabel)
   2618 {
   2619 	return raidwrite_component_area(dev, b_vp, clabel,
   2620 	    sizeof(RF_ComponentLabel_t),
   2621 	    rf_component_info_offset(),
   2622 	    rf_component_info_size(secsize), 0);
   2623 }
   2624 
   2625 /* ARGSUSED */
   2626 static int
   2627 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2628     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2629 {
   2630 	struct buf *bp;
   2631 	const struct bdevsw *bdev;
   2632 	int error;
   2633 
   2634 	/* get a block of the appropriate size... */
   2635 	bp = geteblk((int)dsize);
   2636 	bp->b_dev = dev;
   2637 
   2638 	/* get our ducks in a row for the write */
   2639 	bp->b_blkno = offset / DEV_BSIZE;
   2640 	bp->b_bcount = dsize;
   2641 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2642  	bp->b_resid = dsize;
   2643 
   2644 	memset(bp->b_data, 0, dsize);
   2645 	memcpy(bp->b_data, data, msize);
   2646 
   2647 	bdev = bdevsw_lookup(bp->b_dev);
   2648 	if (bdev == NULL)
   2649 		return (ENXIO);
   2650 	(*bdev->d_strategy)(bp);
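         	/* For asynchronous writes we return without waiting; B_ASYNC
         	 * was set above, which should arrange for the buffer to be
         	 * released when the I/O completes. */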
   2651 	if (asyncp)
   2652 		return 0;
   2653 	error = biowait(bp);
   2654 	brelse(bp, 0);
   2655 	if (error) {
   2656 #if 1
   2657 		printf("Failed to write RAID component info!\n");
   2658 #endif
   2659 	}
   2660 
   2661 	return(error);
   2662 }
   2663 
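         /* Write the in-core parity map into the reserved parity-map area of
          * every component that is still alive. */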
   2664 void
   2665 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2666 {
   2667 	int c;
   2668 
   2669 	for (c = 0; c < raidPtr->numCol; c++) {
   2670 		/* Skip dead disks. */
   2671 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2672 			continue;
   2673 		/* XXXjld: what if an error occurs here? */
   2674 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2675 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2676 		    RF_PARITYMAP_NBYTE,
   2677 		    rf_parity_map_offset(raidPtr),
   2678 		    rf_parity_map_size(raidPtr), 0);
   2679 	}
   2680 }
   2681 
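         /* Read the parity map back from each live component and merge the
          * copies via rf_paritymap_merge(); presumably the merge is a
          * conservative union, so a region is treated as dirty if any
          * component's copy says so. */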
   2682 void
   2683 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2684 {
   2685 	struct rf_paritymap_ondisk tmp;
   2686 	int c,first;
   2687 
   2688 	first=1;
   2689 	for (c = 0; c < raidPtr->numCol; c++) {
   2690 		/* Skip dead disks. */
   2691 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2692 			continue;
   2693 		raidread_component_area(raidPtr->Disks[c].dev,
   2694 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2695 		    RF_PARITYMAP_NBYTE,
   2696 		    rf_parity_map_offset(raidPtr),
   2697 		    rf_parity_map_size(raidPtr));
   2698 		if (first) {
   2699 			memcpy(map, &tmp, sizeof(*map));
   2700 			first = 0;
   2701 		} else {
   2702 			rf_paritymap_merge(map, &tmp);
   2703 		}
   2704 	}
   2705 }
   2706 
   2707 void
   2708 rf_markalldirty(RF_Raid_t *raidPtr)
   2709 {
   2710 	RF_ComponentLabel_t *clabel;
   2711 	int sparecol;
   2712 	int c;
   2713 	int j;
   2714 	int scol = -1;
   2715 
   2716 	raidPtr->mod_counter++;
   2717 	for (c = 0; c < raidPtr->numCol; c++) {
   2718 		/* we don't want to touch (at all) a disk that has
   2719 		   failed */
   2720 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
   2721 			clabel = raidget_component_label(raidPtr, c);
   2722 			if (clabel->status == rf_ds_spared) {
   2723 				/* XXX do something special...
   2724 				   but whatever you do, don't
   2725 				   try to access it!! */
   2726 			} else {
   2727 				raidmarkdirty(raidPtr, c);
   2728 			}
   2729 		}
   2730 	}
   2731 
   2732 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2733 		sparecol = raidPtr->numCol + c;
   2734 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2735 			/*
   2736 
   2737 			   we claim this disk is "optimal" if it's
   2738 			   rf_ds_used_spare, as that means it should be
   2739 			   directly substitutable for the disk it replaced.
   2740 			   We note that too...
   2741 
   2742 			 */
   2743 
   2744 			for(j=0;j<raidPtr->numCol;j++) {
   2745 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2746 					scol = j;
   2747 					break;
   2748 				}
   2749 			}
   2750 
   2751 			clabel = raidget_component_label(raidPtr, sparecol);
   2752 			/* make sure status is noted */
   2753 
   2754 			raid_init_component_label(raidPtr, clabel);
   2755 
   2756 			clabel->row = 0;
   2757 			clabel->column = scol;
   2758 			/* Note: we *don't* change status from rf_ds_used_spare
   2759 			   to rf_ds_optimal */
   2760 			/* clabel.status = rf_ds_optimal; */
   2761 
   2762 			raidmarkdirty(raidPtr, sparecol);
   2763 		}
   2764 	}
   2765 }
   2766 
   2767 
   2768 void
   2769 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
   2770 {
   2771 	RF_ComponentLabel_t *clabel;
   2772 	int sparecol;
   2773 	int c;
   2774 	int j;
   2775 	int scol;
   2776 
   2777 	scol = -1;
   2778 
   2779 	/* XXX should do extra checks to make sure things really are clean,
   2780 	   rather than blindly setting the clean bit... */
   2781 
   2782 	raidPtr->mod_counter++;
   2783 
   2784 	for (c = 0; c < raidPtr->numCol; c++) {
   2785 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   2786 			clabel = raidget_component_label(raidPtr, c);
   2787 			/* make sure status is noted */
   2788 			clabel->status = rf_ds_optimal;
   2789 
   2790 			/* note what unit we are configured as */
   2791 			clabel->last_unit = raidPtr->raidid;
   2792 
   2793 			raidflush_component_label(raidPtr, c);
   2794 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2795 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2796 					raidmarkclean(raidPtr, c);
   2797 				}
   2798 			}
   2799 		}
   2800 		/* else we don't touch it.. */
   2801 	}
   2802 
   2803 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2804 		sparecol = raidPtr->numCol + c;
   2805 		/* Need to ensure that the reconstruct actually completed! */
   2806 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2807 			/*
   2808 
   2809 			   we claim this disk is "optimal" if it's
   2810 			   rf_ds_used_spare, as that means it should be
   2811 			   directly substitutable for the disk it replaced.
   2812 			   We note that too...
   2813 
   2814 			 */
   2815 
   2816 			for(j=0;j<raidPtr->numCol;j++) {
   2817 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2818 					scol = j;
   2819 					break;
   2820 				}
   2821 			}
   2822 
   2823 			/* XXX shouldn't *really* need this... */
   2824 			clabel = raidget_component_label(raidPtr, sparecol);
   2825 			/* make sure status is noted */
   2826 
   2827 			raid_init_component_label(raidPtr, clabel);
   2828 
   2829 			clabel->column = scol;
   2830 			clabel->status = rf_ds_optimal;
   2831 			clabel->last_unit = raidPtr->raidid;
   2832 
   2833 			raidflush_component_label(raidPtr, sparecol);
   2834 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2835 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2836 					raidmarkclean(raidPtr, sparecol);
   2837 				}
   2838 			}
   2839 		}
   2840 	}
   2841 }
   2842 
   2843 void
   2844 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2845 {
   2846 
   2847 	if (vp != NULL) {
   2848 		if (auto_configured == 1) {
   2849 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2850 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2851 			vput(vp);
   2852 
   2853 		} else {
   2854 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2855 		}
   2856 	}
   2857 }
   2858 
   2859 
   2860 void
   2861 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2862 {
   2863 	int r,c;
   2864 	struct vnode *vp;
   2865 	int acd;
   2866 
   2867 
   2868 	/* We take this opportunity to close the vnodes like we should.. */
   2869 
   2870 	for (c = 0; c < raidPtr->numCol; c++) {
   2871 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2872 		acd = raidPtr->Disks[c].auto_configured;
   2873 		rf_close_component(raidPtr, vp, acd);
   2874 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2875 		raidPtr->Disks[c].auto_configured = 0;
   2876 	}
   2877 
   2878 	for (r = 0; r < raidPtr->numSpare; r++) {
   2879 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2880 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2881 		rf_close_component(raidPtr, vp, acd);
   2882 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2883 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2884 	}
   2885 }
   2886 
   2887 
   2888 void
   2889 rf_ReconThread(struct rf_recon_req *req)
   2890 {
   2891 	int     s;
   2892 	RF_Raid_t *raidPtr;
   2893 
   2894 	s = splbio();
   2895 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2896 	raidPtr->recon_in_progress = 1;
   2897 
   2898 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2899 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2900 
   2901 	RF_Free(req, sizeof(*req));
   2902 
   2903 	raidPtr->recon_in_progress = 0;
   2904 	splx(s);
   2905 
   2906 	/* That's all... */
   2907 	kthread_exit(0);	/* does not return */
   2908 }
   2909 
   2910 void
   2911 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2912 {
   2913 	int retcode;
   2914 	int s;
   2915 
   2916 	raidPtr->parity_rewrite_stripes_done = 0;
   2917 	raidPtr->parity_rewrite_in_progress = 1;
   2918 	s = splbio();
   2919 	retcode = rf_RewriteParity(raidPtr);
   2920 	splx(s);
   2921 	if (retcode) {
   2922 		printf("raid%d: Error re-writing parity (%d)!\n",
   2923 		    raidPtr->raidid, retcode);
   2924 	} else {
   2925 		/* set the clean bit!  If we shutdown correctly,
   2926 		   the clean bit on each component label will get
   2927 		   set */
   2928 		raidPtr->parity_good = RF_RAID_CLEAN;
   2929 	}
   2930 	raidPtr->parity_rewrite_in_progress = 0;
   2931 
   2932 	/* Anyone waiting for us to stop?  If so, inform them... */
   2933 	if (raidPtr->waitShutdown) {
   2934 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2935 	}
   2936 
   2937 	/* That's all... */
   2938 	kthread_exit(0);	/* does not return */
   2939 }
   2940 
   2941 
   2942 void
   2943 rf_CopybackThread(RF_Raid_t *raidPtr)
   2944 {
   2945 	int s;
   2946 
   2947 	raidPtr->copyback_in_progress = 1;
   2948 	s = splbio();
   2949 	rf_CopybackReconstructedData(raidPtr);
   2950 	splx(s);
   2951 	raidPtr->copyback_in_progress = 0;
   2952 
   2953 	/* That's all... */
   2954 	kthread_exit(0);	/* does not return */
   2955 }
   2956 
   2957 
   2958 void
   2959 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
   2960 {
   2961 	int s;
   2962 	RF_Raid_t *raidPtr;
   2963 
   2964 	s = splbio();
   2965 	raidPtr = req->raidPtr;
   2966 	raidPtr->recon_in_progress = 1;
   2967 	rf_ReconstructInPlace(raidPtr, req->col);
   2968 	RF_Free(req, sizeof(*req));
   2969 	raidPtr->recon_in_progress = 0;
   2970 	splx(s);
   2971 
   2972 	/* That's all... */
   2973 	kthread_exit(0);	/* does not return */
   2974 }
   2975 
   2976 static RF_AutoConfig_t *
   2977 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
   2978     const char *cname, RF_SectorCount_t size, uint64_t numsecs,
   2979     unsigned secsize)
   2980 {
   2981 	int good_one = 0;
   2982 	RF_ComponentLabel_t *clabel;
   2983 	RF_AutoConfig_t *ac;
   2984 
   2985 	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
   2986 	if (clabel == NULL) {
   2987 oomem:
   2988 		    while(ac_list) {
   2989 			    ac = ac_list;
   2990 			    if (ac->clabel)
   2991 				    free(ac->clabel, M_RAIDFRAME);
   2992 			    ac_list = ac_list->next;
   2993 			    free(ac, M_RAIDFRAME);
   2994 		    }
   2995 		    printf("RAID auto config: out of memory!\n");
   2996 		    return NULL; /* XXX probably should panic? */
   2997 	}
   2998 
   2999 	if (!raidread_component_label(secsize, dev, vp, clabel)) {
   3000 		/* Got the label.  Does it look reasonable? */
   3001 		if (rf_reasonable_label(clabel, numsecs) &&
   3002 		    (rf_component_label_partitionsize(clabel) <= size)) {
   3003 #ifdef DEBUG
   3004 			printf("Component on: %s: %llu\n",
   3005 				cname, (unsigned long long)size);
   3006 			rf_print_component_label(clabel);
   3007 #endif
   3008 			/* if it's reasonable, add it, else ignore it. */
   3009 			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
   3010 				M_NOWAIT);
   3011 			if (ac == NULL) {
   3012 				free(clabel, M_RAIDFRAME);
   3013 				goto oomem;
   3014 			}
   3015 			strlcpy(ac->devname, cname, sizeof(ac->devname));
   3016 			ac->dev = dev;
   3017 			ac->vp = vp;
   3018 			ac->clabel = clabel;
   3019 			ac->next = ac_list;
   3020 			ac_list = ac;
   3021 			good_one = 1;
   3022 		}
   3023 	}
   3024 	if (!good_one) {
   3025 		/* cleanup */
   3026 		free(clabel, M_RAIDFRAME);
   3027 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3028 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3029 		vput(vp);
   3030 	}
   3031 	return ac_list;
   3032 }
   3033 
   3034 RF_AutoConfig_t *
   3035 rf_find_raid_components(void)
   3036 {
   3037 	struct vnode *vp;
   3038 	struct disklabel label;
   3039 	device_t dv;
   3040 	deviter_t di;
   3041 	dev_t dev;
   3042 	int bmajor, bminor, wedge, rf_part_found;
   3043 	int error;
   3044 	int i;
   3045 	RF_AutoConfig_t *ac_list;
   3046 	uint64_t numsecs;
   3047 	unsigned secsize;
   3048 
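         	/*
         	 * Component labels can live in three places: on a wedge of
         	 * type DKW_PTYPE_RAIDFRAME, on a disklabel partition of type
         	 * FS_RAID, or (failing both of those) on the raw partition
         	 * itself.  Each candidate is handed to rf_get_component(),
         	 * which appends plausible-looking components to ac_list.
         	 */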
   3049 	/* initialize the AutoConfig list */
   3050 	ac_list = NULL;
   3051 
   3052 	/* we begin by trolling through *all* the devices on the system */
   3053 
   3054 	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
   3055 	     dv = deviter_next(&di)) {
   3056 
   3057 		/* we are only interested in disks... */
   3058 		if (device_class(dv) != DV_DISK)
   3059 			continue;
   3060 
   3061 		/* we don't care about floppies... */
   3062 		if (device_is_a(dv, "fd")) {
   3063 			continue;
   3064 		}
   3065 
   3066 		/* we don't care about CD's... */
   3067 		if (device_is_a(dv, "cd")) {
   3068 			continue;
   3069 		}
   3070 
   3071 		/* we don't care about md's... */
   3072 		if (device_is_a(dv, "md")) {
   3073 			continue;
   3074 		}
   3075 
   3076 		/* hdfd is the Atari/Hades floppy driver */
   3077 		if (device_is_a(dv, "hdfd")) {
   3078 			continue;
   3079 		}
   3080 
   3081 		/* fdisa is the Atari/Milan floppy driver */
   3082 		if (device_is_a(dv, "fdisa")) {
   3083 			continue;
   3084 		}
   3085 
   3086 		/* need to find the device_name_to_block_device_major stuff */
   3087 		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
   3088 
   3089 		rf_part_found = 0; /*No raid partition as yet*/
   3090 
   3091 		/* get a vnode for the raw partition of this disk */
   3092 
   3093 		wedge = device_is_a(dv, "dk");
   3094 		bminor = minor(device_unit(dv));
   3095 		dev = wedge ? makedev(bmajor, bminor) :
   3096 		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
   3097 		if (bdevvp(dev, &vp))
   3098 			panic("RAID can't alloc vnode");
   3099 
   3100 		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
   3101 
   3102 		if (error) {
   3103 			/* "Who cares."  Continue looking
   3104 			   for something that exists*/
   3105 			vput(vp);
   3106 			continue;
   3107 		}
   3108 
   3109 		error = getdisksize(vp, &numsecs, &secsize);
   3110 		if (error) {
   3111 			vput(vp);
   3112 			continue;
   3113 		}
   3114 		if (wedge) {
   3115 			struct dkwedge_info dkw;
   3116 			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
   3117 			    NOCRED);
   3118 			if (error) {
   3119 				printf("RAIDframe: can't get wedge info for "
   3120 				    "dev %s (%d)\n", device_xname(dv), error);
   3121 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3122 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3123 				vput(vp);
   3124 				continue;
   3125 			}
   3126 
   3127 			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
   3128 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3129 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3130 				vput(vp);
   3131 				continue;
   3132 			}
   3133 
   3134 			ac_list = rf_get_component(ac_list, dev, vp,
   3135 			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
   3136 			rf_part_found = 1; /*There is a raid component on this disk*/
   3137 			continue;
   3138 		}
   3139 
   3140 		/* Ok, the disk exists.  Go get the disklabel. */
   3141 		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
   3142 		if (error) {
   3143 			/*
   3144 			 * XXX can't happen - open() would
   3145 			 * have errored out (or faked up one)
   3146 			 */
   3147 			if (error != ENOTTY)
   3148 				printf("RAIDframe: can't get label for dev "
   3149 				    "%s (%d)\n", device_xname(dv), error);
   3150 		}
   3151 
   3152 		/* don't need this any more.  We'll allocate it again
   3153 		   a little later if we really do... */
   3154 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3155 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3156 		vput(vp);
   3157 
   3158 		if (error)
   3159 			continue;
   3160 
   3161 		rf_part_found = 0; /*No raid partitions yet*/
   3162 		for (i = 0; i < label.d_npartitions; i++) {
   3163 			char cname[sizeof(ac_list->devname)];
   3164 
   3165 			/* We only support partitions marked as RAID */
   3166 			if (label.d_partitions[i].p_fstype != FS_RAID)
   3167 				continue;
   3168 
   3169 			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
   3170 			if (bdevvp(dev, &vp))
   3171 				panic("RAID can't alloc vnode");
   3172 
   3173 			error = VOP_OPEN(vp, FREAD, NOCRED);
   3174 			if (error) {
   3175 				/* Whatever... */
   3176 				vput(vp);
   3177 				continue;
   3178 			}
   3179 			snprintf(cname, sizeof(cname), "%s%c",
   3180 			    device_xname(dv), 'a' + i);
   3181 			ac_list = rf_get_component(ac_list, dev, vp, cname,
   3182 				label.d_partitions[i].p_size, numsecs, secsize);
    3183 			rf_part_found = 1; /* There is at least one raid partition on this disk */
   3184 		}
   3185 
   3186 		/*
    3187 		 * If there is no raid component on this disk, either in a
    3188 		 * disklabel or inside a wedge, check the raw partition as well,
    3189 		 * as it is possible to configure raid components on raw disk
    3190 		 * devices.
   3191 		 */
   3192 
   3193 		if (!rf_part_found) {
   3194 			char cname[sizeof(ac_list->devname)];
   3195 
   3196 			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
   3197 			if (bdevvp(dev, &vp))
   3198 				panic("RAID can't alloc vnode");
   3199 
   3200 			error = VOP_OPEN(vp, FREAD, NOCRED);
   3201 			if (error) {
   3202 				/* Whatever... */
   3203 				vput(vp);
   3204 				continue;
   3205 			}
   3206 			snprintf(cname, sizeof(cname), "%s%c",
   3207 			    device_xname(dv), 'a' + RAW_PART);
   3208 			ac_list = rf_get_component(ac_list, dev, vp, cname,
   3209 				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
   3210 		}
   3211 	}
   3212 	deviter_release(&di);
   3213 	return ac_list;
   3214 }
   3215 
   3216 
   3217 int
   3218 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3219 {
   3220 
   3221 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3222 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3223 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3224 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3225 	    clabel->row >=0 &&
   3226 	    clabel->column >= 0 &&
   3227 	    clabel->num_rows > 0 &&
   3228 	    clabel->num_columns > 0 &&
   3229 	    clabel->row < clabel->num_rows &&
   3230 	    clabel->column < clabel->num_columns &&
   3231 	    clabel->blockSize > 0 &&
   3232 	    /*
   3233 	     * numBlocksHi may contain garbage, but it is ok since
   3234 	     * the type is unsigned.  If it is really garbage,
   3235 	     * rf_fix_old_label_size() will fix it.
   3236 	     */
   3237 	    rf_component_label_numblocks(clabel) > 0) {
   3238 		/*
   3239 		 * label looks reasonable enough...
   3240 		 * let's make sure it has no old garbage.
   3241 		 */
   3242 		if (numsecs)
   3243 			rf_fix_old_label_size(clabel, numsecs);
   3244 		return(1);
   3245 	}
   3246 	return(0);
   3247 }
   3248 
   3249 
   3250 /*
   3251  * For reasons yet unknown, some old component labels have garbage in
   3252  * the newer numBlocksHi region, and this causes lossage.  Since those
   3253  * disks will also have numsecs set to less than 32 bits of sectors,
    3254  * we can determine when this corruption has occurred, and fix it.
   3255  *
    3256  * The exact same problem, for the same unknown reason, happens to
   3257  * the partitionSizeHi member as well.
   3258  */
   3259 static void
   3260 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3261 {
   3262 
   3263 	if (numsecs < ((uint64_t)1 << 32)) {
   3264 		if (clabel->numBlocksHi) {
   3265 			printf("WARNING: total sectors < 32 bits, yet "
   3266 			       "numBlocksHi set\n"
   3267 			       "WARNING: resetting numBlocksHi to zero.\n");
   3268 			clabel->numBlocksHi = 0;
   3269 		}
   3270 
   3271 		if (clabel->partitionSizeHi) {
   3272 			printf("WARNING: total sectors < 32 bits, yet "
   3273 			       "partitionSizeHi set\n"
   3274 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3275 			clabel->partitionSizeHi = 0;
   3276 		}
   3277 	}
   3278 }
   3279 
   3280 
   3281 #ifdef DEBUG
   3282 void
   3283 rf_print_component_label(RF_ComponentLabel_t *clabel)
   3284 {
   3285 	uint64_t numBlocks;
   3286 
   3287 	numBlocks = rf_component_label_numblocks(clabel);
   3288 
   3289 	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
   3290 	       clabel->row, clabel->column,
   3291 	       clabel->num_rows, clabel->num_columns);
   3292 	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
   3293 	       clabel->version, clabel->serial_number,
   3294 	       clabel->mod_counter);
   3295 	printf("   Clean: %s Status: %d\n",
   3296 	       clabel->clean ? "Yes" : "No", clabel->status);
   3297 	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
   3298 	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
   3299 	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
   3300 	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
   3301 	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
   3302 	printf("   Contains root partition: %s\n",
   3303 	       clabel->root_partition ? "Yes" : "No");
   3304 	printf("   Last configured as: raid%d\n", clabel->last_unit);
   3305 #if 0
   3306 	   printf("   Config order: %d\n", clabel->config_order);
   3307 #endif
   3308 
   3309 }
   3310 #endif
   3311 
   3312 RF_ConfigSet_t *
   3313 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3314 {
   3315 	RF_AutoConfig_t *ac;
   3316 	RF_ConfigSet_t *config_sets;
   3317 	RF_ConfigSet_t *cset;
   3318 	RF_AutoConfig_t *ac_next;
   3319 
   3320 
   3321 	config_sets = NULL;
   3322 
   3323 	/* Go through the AutoConfig list, and figure out which components
    3324 	   belong to which sets.  */
   3325 	ac = ac_list;
   3326 	while(ac!=NULL) {
   3327 		/* we're going to putz with ac->next, so save it here
   3328 		   for use at the end of the loop */
   3329 		ac_next = ac->next;
   3330 
   3331 		if (config_sets == NULL) {
   3332 			/* will need at least this one... */
   3333 			config_sets = (RF_ConfigSet_t *)
   3334 				malloc(sizeof(RF_ConfigSet_t),
   3335 				       M_RAIDFRAME, M_NOWAIT);
   3336 			if (config_sets == NULL) {
   3337 				panic("rf_create_auto_sets: No memory!");
   3338 			}
   3339 			/* this one is easy :) */
   3340 			config_sets->ac = ac;
   3341 			config_sets->next = NULL;
   3342 			config_sets->rootable = 0;
   3343 			ac->next = NULL;
   3344 		} else {
   3345 			/* which set does this component fit into? */
   3346 			cset = config_sets;
   3347 			while(cset!=NULL) {
   3348 				if (rf_does_it_fit(cset, ac)) {
   3349 					/* looks like it matches... */
   3350 					ac->next = cset->ac;
   3351 					cset->ac = ac;
   3352 					break;
   3353 				}
   3354 				cset = cset->next;
   3355 			}
   3356 			if (cset==NULL) {
    3357 				/* didn't find a match above... start a new set */
   3358 				cset = (RF_ConfigSet_t *)
   3359 					malloc(sizeof(RF_ConfigSet_t),
   3360 					       M_RAIDFRAME, M_NOWAIT);
   3361 				if (cset == NULL) {
   3362 					panic("rf_create_auto_sets: No memory!");
   3363 				}
   3364 				cset->ac = ac;
   3365 				ac->next = NULL;
   3366 				cset->next = config_sets;
   3367 				cset->rootable = 0;
   3368 				config_sets = cset;
   3369 			}
   3370 		}
   3371 		ac = ac_next;
   3372 	}
   3373 
   3374 
   3375 	return(config_sets);
   3376 }
   3377 
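/*
 * Return non-zero if component 'ac' belongs in config set 'cset',
 * i.e. if its component label is consistent with the label of the
 * first member of the set.
 */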
   3378 static int
   3379 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3380 {
   3381 	RF_ComponentLabel_t *clabel1, *clabel2;
   3382 
   3383 	/* If this one matches the *first* one in the set, that's good
   3384 	   enough, since the other members of the set would have been
   3385 	   through here too... */
   3386 	/* note that we are not checking partitionSize here..
   3387 
   3388 	   Note that we are also not checking the mod_counters here.
    3389 	   If everything else matches except the mod_counter, that's
   3390 	   good enough for this test.  We will deal with the mod_counters
   3391 	   a little later in the autoconfiguration process.
   3392 
   3393 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3394 
   3395 	   The reason we don't check for this is that failed disks
   3396 	   will have lower modification counts.  If those disks are
   3397 	   not added to the set they used to belong to, then they will
   3398 	   form their own set, which may result in 2 different sets,
   3399 	   for example, competing to be configured at raid0, and
   3400 	   perhaps competing to be the root filesystem set.  If the
   3401 	   wrong ones get configured, or both attempt to become /,
    3402 	   weird behaviour and/or serious lossage will occur.  Thus we
   3403 	   need to bring them into the fold here, and kick them out at
   3404 	   a later point.
   3405 
   3406 	*/
   3407 
   3408 	clabel1 = cset->ac->clabel;
   3409 	clabel2 = ac->clabel;
   3410 	if ((clabel1->version == clabel2->version) &&
   3411 	    (clabel1->serial_number == clabel2->serial_number) &&
   3412 	    (clabel1->num_rows == clabel2->num_rows) &&
   3413 	    (clabel1->num_columns == clabel2->num_columns) &&
   3414 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3415 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3416 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3417 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3418 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3419 	    (clabel1->blockSize == clabel2->blockSize) &&
   3420 	    rf_component_label_numblocks(clabel1) ==
   3421 	    rf_component_label_numblocks(clabel2) &&
   3422 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3423 	    (clabel1->root_partition == clabel2->root_partition) &&
   3424 	    (clabel1->last_unit == clabel2->last_unit) &&
   3425 	    (clabel1->config_order == clabel2->config_order)) {
    3426 		/* if it gets here, it almost *has* to be a match */
   3427 	} else {
   3428 		/* it's not consistent with somebody in the set..
   3429 		   punt */
   3430 		return(0);
   3431 	}
   3432 	/* all was fine.. it must fit... */
   3433 	return(1);
   3434 }
   3435 
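/*
 * Return non-zero if the config set has enough live components
 * (i.e. components carrying the newest mod_counter) to be configured.
 * RAID 1 is special-cased on a per-pair basis; other levels simply
 * count the number of missing components.
 */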
   3436 int
   3437 rf_have_enough_components(RF_ConfigSet_t *cset)
   3438 {
   3439 	RF_AutoConfig_t *ac;
   3440 	RF_AutoConfig_t *auto_config;
   3441 	RF_ComponentLabel_t *clabel;
   3442 	int c;
   3443 	int num_cols;
   3444 	int num_missing;
   3445 	int mod_counter;
   3446 	int mod_counter_found;
   3447 	int even_pair_failed;
   3448 	char parity_type;
   3449 
   3450 
   3451 	/* check to see that we have enough 'live' components
   3452 	   of this set.  If so, we can configure it if necessary */
   3453 
   3454 	num_cols = cset->ac->clabel->num_columns;
   3455 	parity_type = cset->ac->clabel->parityConfig;
   3456 
   3457 	/* XXX Check for duplicate components!?!?!? */
   3458 
   3459 	/* Determine what the mod_counter is supposed to be for this set. */
   3460 
   3461 	mod_counter_found = 0;
   3462 	mod_counter = 0;
   3463 	ac = cset->ac;
   3464 	while(ac!=NULL) {
   3465 		if (mod_counter_found==0) {
   3466 			mod_counter = ac->clabel->mod_counter;
   3467 			mod_counter_found = 1;
   3468 		} else {
   3469 			if (ac->clabel->mod_counter > mod_counter) {
   3470 				mod_counter = ac->clabel->mod_counter;
   3471 			}
   3472 		}
   3473 		ac = ac->next;
   3474 	}
   3475 
   3476 	num_missing = 0;
   3477 	auto_config = cset->ac;
   3478 
   3479 	even_pair_failed = 0;
   3480 	for(c=0; c<num_cols; c++) {
   3481 		ac = auto_config;
   3482 		while(ac!=NULL) {
   3483 			if ((ac->clabel->column == c) &&
   3484 			    (ac->clabel->mod_counter == mod_counter)) {
   3485 				/* it's this one... */
   3486 #ifdef DEBUG
   3487 				printf("Found: %s at %d\n",
   3488 				       ac->devname,c);
   3489 #endif
   3490 				break;
   3491 			}
   3492 			ac=ac->next;
   3493 		}
   3494 		if (ac==NULL) {
   3495 				/* Didn't find one here! */
   3496 				/* special case for RAID 1, especially
   3497 				   where there are more than 2
   3498 				   components (where RAIDframe treats
   3499 				   things a little differently :( ) */
   3500 			if (parity_type == '1') {
   3501 				if (c%2 == 0) { /* even component */
   3502 					even_pair_failed = 1;
   3503 				} else { /* odd component.  If
   3504 					    we're failed, and
   3505 					    so is the even
   3506 					    component, it's
   3507 					    "Good Night, Charlie" */
   3508 					if (even_pair_failed == 1) {
   3509 						return(0);
   3510 					}
   3511 				}
   3512 			} else {
   3513 				/* normal accounting */
   3514 				num_missing++;
   3515 			}
   3516 		}
   3517 		if ((parity_type == '1') && (c%2 == 1)) {
    3518 				/* Just finished the odd half of a pair and
    3519 				   didn't bail, so reset the even_pair_failed
    3520 				   flag and go on to the next pair.... */
   3521 			even_pair_failed = 0;
   3522 		}
   3523 	}
   3524 
   3525 	clabel = cset->ac->clabel;
   3526 
   3527 	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
   3528 	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
   3529 	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
   3530 		/* XXX this needs to be made *much* more general */
   3531 		/* Too many failures */
   3532 		return(0);
   3533 	}
   3534 	/* otherwise, all is well, and we've got enough to take a kick
   3535 	   at autoconfiguring this set */
   3536 	return(1);
   3537 }
   3538 
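/*
 * Build an RF_Config_t for the components in 'ac', using the geometry
 * recorded in their component labels.  Queueing parameters and the
 * spare count are filled in with defaults.
 */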
   3539 void
   3540 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3541 			RF_Raid_t *raidPtr)
   3542 {
   3543 	RF_ComponentLabel_t *clabel;
   3544 	int i;
   3545 
   3546 	clabel = ac->clabel;
   3547 
   3548 	/* 1. Fill in the common stuff */
   3549 	config->numRow = clabel->num_rows = 1;
   3550 	config->numCol = clabel->num_columns;
   3551 	config->numSpare = 0; /* XXX should this be set here? */
   3552 	config->sectPerSU = clabel->sectPerSU;
   3553 	config->SUsPerPU = clabel->SUsPerPU;
   3554 	config->SUsPerRU = clabel->SUsPerRU;
   3555 	config->parityConfig = clabel->parityConfig;
   3556 	/* XXX... */
   3557 	strcpy(config->diskQueueType,"fifo");
   3558 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3559 	config->layoutSpecificSize = 0; /* XXX ?? */
   3560 
   3561 	while(ac!=NULL) {
   3562 		/* row/col values will be in range due to the checks
    3563 		   in rf_reasonable_label() */
   3564 		strcpy(config->devnames[0][ac->clabel->column],
   3565 		       ac->devname);
   3566 		ac = ac->next;
   3567 	}
   3568 
   3569 	for(i=0;i<RF_MAXDBGV;i++) {
   3570 		config->debugVars[i][0] = 0;
   3571 	}
   3572 }
   3573 
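/*
 * Set the autoconfigure flag for the RAID set and record the new value
 * in the component label of every optimal component and used spare.
 * Returns the new value.
 */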
   3574 int
   3575 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3576 {
   3577 	RF_ComponentLabel_t *clabel;
   3578 	int column;
   3579 	int sparecol;
   3580 
   3581 	raidPtr->autoconfigure = new_value;
   3582 
   3583 	for(column=0; column<raidPtr->numCol; column++) {
   3584 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3585 			clabel = raidget_component_label(raidPtr, column);
   3586 			clabel->autoconfigure = new_value;
   3587 			raidflush_component_label(raidPtr, column);
   3588 		}
   3589 	}
   3590 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3591 		sparecol = raidPtr->numCol + column;
   3592 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3593 			clabel = raidget_component_label(raidPtr, sparecol);
   3594 			clabel->autoconfigure = new_value;
   3595 			raidflush_component_label(raidPtr, sparecol);
   3596 		}
   3597 	}
   3598 	return(new_value);
   3599 }
   3600 
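/*
 * Set the root_partition flag for the RAID set and record the new value
 * in the component label of every optimal component and used spare.
 * Returns the new value.
 */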
   3601 int
   3602 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3603 {
   3604 	RF_ComponentLabel_t *clabel;
   3605 	int column;
   3606 	int sparecol;
   3607 
   3608 	raidPtr->root_partition = new_value;
   3609 	for(column=0; column<raidPtr->numCol; column++) {
   3610 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3611 			clabel = raidget_component_label(raidPtr, column);
   3612 			clabel->root_partition = new_value;
   3613 			raidflush_component_label(raidPtr, column);
   3614 		}
   3615 	}
   3616 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3617 		sparecol = raidPtr->numCol + column;
   3618 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3619 			clabel = raidget_component_label(raidPtr, sparecol);
   3620 			clabel->root_partition = new_value;
   3621 			raidflush_component_label(raidPtr, sparecol);
   3622 		}
   3623 	}
   3624 	return(new_value);
   3625 }
   3626 
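/*
 * Close and release the vnode of every component in the config set.
 */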
   3627 void
   3628 rf_release_all_vps(RF_ConfigSet_t *cset)
   3629 {
   3630 	RF_AutoConfig_t *ac;
   3631 
   3632 	ac = cset->ac;
   3633 	while(ac!=NULL) {
   3634 		/* Close the vp, and give it back */
   3635 		if (ac->vp) {
   3636 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3637 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
   3638 			vput(ac->vp);
   3639 			ac->vp = NULL;
   3640 		}
   3641 		ac = ac->next;
   3642 	}
   3643 }
   3644 
   3645 
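/*
 * Free the component labels and RF_AutoConfig_t structures belonging
 * to a config set, and then the set itself.
 */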
   3646 void
   3647 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3648 {
   3649 	RF_AutoConfig_t *ac;
   3650 	RF_AutoConfig_t *next_ac;
   3651 
   3652 	ac = cset->ac;
   3653 	while(ac!=NULL) {
   3654 		next_ac = ac->next;
   3655 		/* nuke the label */
   3656 		free(ac->clabel, M_RAIDFRAME);
   3657 		/* cleanup the config structure */
   3658 		free(ac, M_RAIDFRAME);
   3659 		/* "next.." */
   3660 		ac = next_ac;
   3661 	}
   3662 	/* and, finally, nuke the config set */
   3663 	free(cset, M_RAIDFRAME);
   3664 }
   3665 
   3666 
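/*
 * Initialize a component label from the current state of 'raidPtr':
 * serial number, mod counter, geometry, and the various configuration
 * flags.
 */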
   3667 void
   3668 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   3669 {
   3670 	/* current version number */
   3671 	clabel->version = RF_COMPONENT_LABEL_VERSION;
   3672 	clabel->serial_number = raidPtr->serial_number;
   3673 	clabel->mod_counter = raidPtr->mod_counter;
   3674 
   3675 	clabel->num_rows = 1;
   3676 	clabel->num_columns = raidPtr->numCol;
   3677 	clabel->clean = RF_RAID_DIRTY; /* not clean */
   3678 	clabel->status = rf_ds_optimal; /* "It's good!" */
   3679 
   3680 	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
   3681 	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
   3682 	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
   3683 
   3684 	clabel->blockSize = raidPtr->bytesPerSector;
   3685 	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
   3686 
   3687 	/* XXX not portable */
   3688 	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
   3689 	clabel->maxOutstanding = raidPtr->maxOutstanding;
   3690 	clabel->autoconfigure = raidPtr->autoconfigure;
   3691 	clabel->root_partition = raidPtr->root_partition;
   3692 	clabel->last_unit = raidPtr->raidid;
   3693 	clabel->config_order = raidPtr->config_order;
   3694 
   3695 #ifndef RF_NO_PARITY_MAP
   3696 	rf_paritymap_init_label(raidPtr->parity_map, clabel);
   3697 #endif
   3698 }
   3699 
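/*
 * Configure the RAID set described by 'cset': build an RF_Config_t
 * from the component labels, pick a unit number (preferring the unit
 * the set was last configured as), and run the configuration.
 * *unit is set to the unit number used, or -1 if no free unit could
 * be found; returns 0 on success.
 */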
   3700 int
   3701 rf_auto_config_set(RF_ConfigSet_t *cset, int *unit)
   3702 {
   3703 	RF_Raid_t *raidPtr;
   3704 	RF_Config_t *config;
   3705 	int raidID;
   3706 	int retcode;
   3707 
   3708 #ifdef DEBUG
   3709 	printf("RAID autoconfigure\n");
   3710 #endif
   3711 
   3712 	retcode = 0;
   3713 	*unit = -1;
   3714 
   3715 	/* 1. Create a config structure */
   3716 
   3717 	config = (RF_Config_t *)malloc(sizeof(RF_Config_t),
   3718 				       M_RAIDFRAME,
   3719 				       M_NOWAIT);
   3720 	if (config==NULL) {
   3721 		printf("Out of mem!?!?\n");
   3722 				/* XXX do something more intelligent here. */
   3723 		return(1);
   3724 	}
   3725 
   3726 	memset(config, 0, sizeof(RF_Config_t));
   3727 
   3728 	/*
   3729 	   2. Figure out what RAID ID this one is supposed to live at
   3730 	   See if we can get the same RAID dev that it was configured
   3731 	   on last time..
   3732 	*/
   3733 
   3734 	raidID = cset->ac->clabel->last_unit;
   3735 	if ((raidID < 0) || (raidID >= numraid)) {
   3736 		/* let's not wander off into lala land. */
   3737 		raidID = numraid - 1;
   3738 	}
   3739 	if (raidPtrs[raidID]->valid != 0) {
   3740 
   3741 		/*
   3742 		   Nope... Go looking for an alternative...
    3743 		   Start the search at the top so that we don't grab raid0
    3744 		   just because it happens to be free.
   3745 		*/
   3746 
   3747 		for(raidID = numraid - 1; raidID >= 0; raidID--) {
   3748 			if (raidPtrs[raidID]->valid == 0) {
   3749 				/* can use this one! */
   3750 				break;
   3751 			}
   3752 		}
   3753 	}
   3754 
   3755 	if (raidID < 0) {
   3756 		/* punt... */
   3757 		printf("Unable to auto configure this set!\n");
   3758 		printf("(Out of RAID devs!)\n");
   3759 		free(config, M_RAIDFRAME);
   3760 		return(1);
   3761 	}
   3762 
   3763 #ifdef DEBUG
   3764 	printf("Configuring raid%d:\n",raidID);
   3765 #endif
   3766 
   3767 	raidPtr = raidPtrs[raidID];
   3768 
   3769 	/* XXX all this stuff should be done SOMEWHERE ELSE! */
   3770 	raidPtr->raidid = raidID;
   3771 	raidPtr->openings = RAIDOUTSTANDING;
   3772 
   3773 	/* 3. Build the configuration structure */
   3774 	rf_create_configuration(cset->ac, config, raidPtr);
   3775 
   3776 	/* 4. Do the configuration */
   3777 	retcode = rf_Configure(raidPtr, config, cset->ac);
   3778 
   3779 	if (retcode == 0) {
   3780 
   3781 		raidinit(raidPtrs[raidID]);
   3782 
   3783 		rf_markalldirty(raidPtrs[raidID]);
   3784 		raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */
   3785 		if (cset->ac->clabel->root_partition==1) {
   3786 			/* everything configured just fine.  Make a note
   3787 			   that this set is eligible to be root. */
   3788 			cset->rootable = 1;
   3789 			/* XXX do this here? */
   3790 			raidPtrs[raidID]->root_partition = 1;
   3791 		}
   3792 	}
   3793 
   3794 	/* 5. Cleanup */
   3795 	free(config, M_RAIDFRAME);
   3796 
   3797 	*unit = raidID;
   3798 	return(retcode);
   3799 }
   3800 
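/*
 * Record completion of the access described by 'desc' in the disk
 * statistics of the corresponding raid device.
 */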
   3801 void
   3802 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
   3803 {
   3804 	struct buf *bp;
   3805 
   3806 	bp = (struct buf *)desc->bp;
   3807 	disk_unbusy(&raid_softc[desc->raidPtr->raidid].sc_dkdev,
   3808 	    (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ));
   3809 }
   3810 
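/*
 * Initialize a pool at IPL_BIO, pre-allocating 'xmin' items and setting
 * the low and high watermarks to 'xmin' and 'xmax' respectively.
 */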
   3811 void
   3812 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3813 	     size_t xmin, size_t xmax)
   3814 {
   3815 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3816 	pool_sethiwat(p, xmax);
   3817 	pool_prime(p, xmin);
   3818 	pool_setlowat(p, xmin);
   3819 }
   3820 
   3821 /*
   3822  * rf_buf_queue_check(int raidid) -- looks into the buf_queue to see
   3823  * if there is IO pending and if that IO could possibly be done for a
   3824  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3825  * otherwise.
   3826  *
   3827  */
   3828 
   3829 int
   3830 rf_buf_queue_check(int raidid)
   3831 {
   3832 	if ((bufq_peek(raid_softc[raidid].buf_queue) != NULL) &&
   3833 	    raidPtrs[raidid]->openings > 0) {
   3834 		/* there is work to do */
   3835 		return 0;
   3836 	}
   3837 	/* default is nothing to do */
   3838 	return 1;
   3839 }
   3840 
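/*
 * Query the size and sector size of the component open on 'vp' and
 * record them in 'diskPtr'.  The usable size excludes the RAIDframe
 * protected sectors (rf_protectedSectors).
 */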
   3841 int
   3842 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
   3843 {
   3844 	uint64_t numsecs;
   3845 	unsigned secsize;
   3846 	int error;
   3847 
   3848 	error = getdisksize(vp, &numsecs, &secsize);
   3849 	if (error == 0) {
   3850 		diskPtr->blockSize = secsize;
   3851 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3852 		diskPtr->partitionSize = numsecs;
   3853 		return 0;
   3854 	}
   3855 	return error;
   3856 }
   3857 
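/*
 * Autoconfiguration glue: the match function always succeeds and the
 * attach function does nothing; the real setup happens when a RAID set
 * is configured.
 */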
   3858 static int
   3859 raid_match(device_t self, cfdata_t cfdata, void *aux)
   3860 {
   3861 	return 1;
   3862 }
   3863 
   3864 static void
   3865 raid_attach(device_t parent, device_t self, void *aux)
   3866 {
   3867 
   3868 }
   3869 
   3870 
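/*
 * Detach a raid unit: take the unit's lock, tear it down via
 * raid_detach_unlocked(), and release the lock.
 */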
   3871 static int
   3872 raid_detach(device_t self, int flags)
   3873 {
   3874 	int error;
   3875 	struct raid_softc *rs = &raid_softc[device_unit(self)];
   3876 
   3877 	if ((error = raidlock(rs)) != 0)
   3878 		return (error);
   3879 
   3880 	error = raid_detach_unlocked(rs);
   3881 
   3882 	raidunlock(rs);
   3883 
   3884 	return error;
   3885 }
   3886 
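/*
 * Publish a synthetic disk geometry for the RAID set (one "track" per
 * data stripe, 4 * numCol "tracks" per cylinder) via the device's
 * property dictionary.
 */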
   3887 static void
   3888 rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3889 {
   3890 	prop_dictionary_t disk_info, odisk_info, geom;
   3891 	disk_info = prop_dictionary_create();
   3892 	geom = prop_dictionary_create();
   3893 	prop_dictionary_set_uint64(geom, "sectors-per-unit",
   3894 				   raidPtr->totalSectors);
   3895 	prop_dictionary_set_uint32(geom, "sector-size",
   3896 				   raidPtr->bytesPerSector);
   3897 
   3898 	prop_dictionary_set_uint16(geom, "sectors-per-track",
   3899 				   raidPtr->Layout.dataSectorsPerStripe);
   3900 	prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
   3901 				   4 * raidPtr->numCol);
   3902 
   3903 	prop_dictionary_set_uint64(geom, "cylinders-per-unit",
   3904 	   raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe *
   3905 	   (4 * raidPtr->numCol)));
   3906 
   3907 	prop_dictionary_set(disk_info, "geometry", geom);
   3908 	prop_object_release(geom);
   3909 	prop_dictionary_set(device_properties(rs->sc_dev),
   3910 			    "disk-info", disk_info);
   3911 	odisk_info = rs->sc_dkdev.dk_info;
   3912 	rs->sc_dkdev.dk_info = disk_info;
   3913 	if (odisk_info)
   3914 		prop_object_release(odisk_info);
   3915 }
   3916 
   3917 /*
   3918  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3919  * We end up returning whatever error was returned by the first cache flush
   3920  * that fails.
   3921  */
   3922 
   3923 int
   3924 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3925 {
   3926 	int c, sparecol;
   3927 	int e,error;
   3928 	int force = 1;
   3929 
   3930 	error = 0;
   3931 	for (c = 0; c < raidPtr->numCol; c++) {
   3932 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3933 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3934 					  &force, FWRITE, NOCRED);
   3935 			if (e) {
   3936 				if (e != ENODEV)
   3937 					printf("raid%d: cache flush to component %s failed.\n",
   3938 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3939 				if (error == 0) {
   3940 					error = e;
   3941 				}
   3942 			}
   3943 		}
   3944 	}
   3945 
   3946 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3947 		sparecol = raidPtr->numCol + c;
   3948 		/* Need to ensure that the reconstruct actually completed! */
   3949 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3950 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3951 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3952 			if (e) {
   3953 				if (e != ENODEV)
   3954 					printf("raid%d: cache flush to component %s failed.\n",
   3955 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3956 				if (error == 0) {
   3957 					error = e;
   3958 				}
   3959 			}
   3960 		}
   3961 	}
   3962 	return error;
   3963 }
   3964