rf_netbsdkintf.c revision 1.295.6.3
      1 /*	$NetBSD: rf_netbsdkintf.c,v 1.295.6.3 2014/12/02 22:05:14 snj Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Greg Oster; Jason R. Thorpe.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * Copyright (c) 1988 University of Utah.
     34  * Copyright (c) 1990, 1993
     35  *      The Regents of the University of California.  All rights reserved.
     36  *
     37  * This code is derived from software contributed to Berkeley by
     38  * the Systems Programming Group of the University of Utah Computer
     39  * Science Department.
     40  *
     41  * Redistribution and use in source and binary forms, with or without
     42  * modification, are permitted provided that the following conditions
     43  * are met:
     44  * 1. Redistributions of source code must retain the above copyright
     45  *    notice, this list of conditions and the following disclaimer.
     46  * 2. Redistributions in binary form must reproduce the above copyright
     47  *    notice, this list of conditions and the following disclaimer in the
     48  *    documentation and/or other materials provided with the distribution.
     49  * 3. Neither the name of the University nor the names of its contributors
     50  *    may be used to endorse or promote products derived from this software
     51  *    without specific prior written permission.
     52  *
     53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     63  * SUCH DAMAGE.
     64  *
     65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
     66  *
     67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
     68  */
     69 
     70 /*
     71  * Copyright (c) 1995 Carnegie-Mellon University.
     72  * All rights reserved.
     73  *
     74  * Authors: Mark Holland, Jim Zelenka
     75  *
     76  * Permission to use, copy, modify and distribute this software and
     77  * its documentation is hereby granted, provided that both the copyright
     78  * notice and this permission notice appear in all copies of the
     79  * software, derivative works or modified versions, and any portions
     80  * thereof, and that both notices appear in supporting documentation.
     81  *
     82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
     83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
     84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
     85  *
     86  * Carnegie Mellon requests users of this software to return to
     87  *
     88  *  Software Distribution Coordinator  or  Software.Distribution (at) CS.CMU.EDU
     89  *  School of Computer Science
     90  *  Carnegie Mellon University
     91  *  Pittsburgh PA 15213-3890
     92  *
     93  * any improvements or extensions that they make and grant Carnegie the
     94  * rights to redistribute these changes.
     95  */
     96 
     97 /***********************************************************
     98  *
     99  * rf_kintf.c -- the kernel interface routines for RAIDframe
    100  *
    101  ***********************************************************/
    102 
    103 #include <sys/cdefs.h>
    104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.295.6.3 2014/12/02 22:05:14 snj Exp $");
    105 
    106 #ifdef _KERNEL_OPT
    107 #include "opt_compat_netbsd.h"
    108 #include "opt_raid_autoconfig.h"
    109 #include "raid.h"
    110 #endif
    111 
    112 #include <sys/param.h>
    113 #include <sys/errno.h>
    114 #include <sys/pool.h>
    115 #include <sys/proc.h>
    116 #include <sys/queue.h>
    117 #include <sys/disk.h>
    118 #include <sys/device.h>
    119 #include <sys/stat.h>
    120 #include <sys/ioctl.h>
    121 #include <sys/fcntl.h>
    122 #include <sys/systm.h>
    123 #include <sys/vnode.h>
    124 #include <sys/disklabel.h>
    125 #include <sys/conf.h>
    126 #include <sys/buf.h>
    127 #include <sys/bufq.h>
    128 #include <sys/reboot.h>
    129 #include <sys/kauth.h>
    130 
    131 #include <prop/proplib.h>
    132 
    133 #include <dev/raidframe/raidframevar.h>
    134 #include <dev/raidframe/raidframeio.h>
    135 #include <dev/raidframe/rf_paritymap.h>
    136 
    137 #include "rf_raid.h"
    138 #include "rf_copyback.h"
    139 #include "rf_dag.h"
    140 #include "rf_dagflags.h"
    141 #include "rf_desc.h"
    142 #include "rf_diskqueue.h"
    143 #include "rf_etimer.h"
    144 #include "rf_general.h"
    145 #include "rf_kintf.h"
    146 #include "rf_options.h"
    147 #include "rf_driver.h"
    148 #include "rf_parityscan.h"
    149 #include "rf_threadstuff.h"
    150 
    151 #ifdef COMPAT_50
    152 #include "rf_compat50.h"
    153 #endif
    154 
    155 #ifdef DEBUG
    156 int     rf_kdebug_level = 0;
    157 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
    158 #else				/* DEBUG */
    159 #define db1_printf(a) { }
    160 #endif				/* DEBUG */
    161 
    162 static RF_Raid_t **raidPtrs;	/* global raid device descriptors */
    163 
    164 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    165 static rf_declare_mutex2(rf_sparet_wait_mutex);
    166 static rf_declare_cond2(rf_sparet_wait_cv);
    167 static rf_declare_cond2(rf_sparet_resp_cv);
    168 
    169 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
    170 						 * spare table */
    171 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
    172 						 * installation process */
    173 #endif
    174 
    175 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
    176 
    177 /* prototypes */
    178 static void KernelWakeupFunc(struct buf *);
    179 static void InitBP(struct buf *, struct vnode *, unsigned,
    180     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    181     void *, int, struct proc *);
    182 static void raidinit(RF_Raid_t *);
    183 
    184 void raidattach(int);
    185 static int raid_match(device_t, cfdata_t, void *);
    186 static void raid_attach(device_t, device_t, void *);
    187 static int raid_detach(device_t, int);
    188 
    189 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    190     daddr_t, daddr_t);
    191 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    192     daddr_t, daddr_t, int);
    193 
    194 static int raidwrite_component_label(unsigned,
    195     dev_t, struct vnode *, RF_ComponentLabel_t *);
    196 static int raidread_component_label(unsigned,
    197     dev_t, struct vnode *, RF_ComponentLabel_t *);
    198 
    199 
    200 dev_type_open(raidopen);
    201 dev_type_close(raidclose);
    202 dev_type_read(raidread);
    203 dev_type_write(raidwrite);
    204 dev_type_ioctl(raidioctl);
    205 dev_type_strategy(raidstrategy);
    206 dev_type_dump(raiddump);
    207 dev_type_size(raidsize);
    208 
    209 const struct bdevsw raid_bdevsw = {
    210 	raidopen, raidclose, raidstrategy, raidioctl,
    211 	raiddump, raidsize, D_DISK
    212 };
    213 
    214 const struct cdevsw raid_cdevsw = {
    215 	raidopen, raidclose, raidread, raidwrite, raidioctl,
    216 	nostop, notty, nopoll, nommap, nokqfilter, D_DISK
    217 };
    218 
    219 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
    220 
    221 /* XXX Not sure if the following should be replacing the raidPtrs above,
    222    or if it should be used in conjunction with that...
    223 */
    224 
    225 struct raid_softc {
    226 	device_t sc_dev;
    227 	int     sc_flags;	/* flags */
    228 	int     sc_cflags;	/* configuration flags */
    229 	uint64_t sc_size;	/* size of the raid device */
    230 	char    sc_xname[20];	/* XXX external name */
    231 	struct disk sc_dkdev;	/* generic disk device info */
    232 	struct bufq_state *buf_queue;	/* used for the device queue */
    233 };
    234 /* sc_flags */
    235 #define RAIDF_INITED	0x01	/* unit has been initialized */
    236 #define RAIDF_WLABEL	0x02	/* label area is writable */
    237 #define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
     238 #define RAIDF_SHUTDOWN	0x08	/* unit is being shut down */
    239 #define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
    240 #define RAIDF_LOCKED	0x80	/* unit is locked */
    241 
    242 #define	raidunit(x)	DISKUNIT(x)
    243 int numraid = 0;
    244 
    245 extern struct cfdriver raid_cd;
    246 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    247     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    248     DVF_DETACH_SHUTDOWN);
    249 
    250 /*
    251  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
    252  * Be aware that large numbers can allow the driver to consume a lot of
    253  * kernel memory, especially on writes, and in degraded mode reads.
    254  *
    255  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
    256  * a single 64K write will typically require 64K for the old data,
    257  * 64K for the old parity, and 64K for the new parity, for a total
    258  * of 192K (if the parity buffer is not re-used immediately).
     259  * Even if it is used immediately, that's still 128K, which when multiplied
    260  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
    261  *
    262  * Now in degraded mode, for example, a 64K read on the above setup may
    263  * require data reconstruction, which will require *all* of the 4 remaining
    264  * disks to participate -- 4 * 32K/disk == 128K again.
    265  */
    266 
    267 #ifndef RAIDOUTSTANDING
    268 #define RAIDOUTSTANDING   6
    269 #endif
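/*
 * A rough worked example of the bound described above, using the default
 * RAIDOUTSTANDING of 6 and the 64K-write, 5-disk layout from the comment
 * (illustrative arithmetic only; nothing below executes this):
 *
 *   per small write: 64K old data + 64K old parity + 64K new parity = 192K
 *   6 outstanding writes:  6 * 192K = 1152K of internal buffers,
 *   on top of              6 *  64K =  384K of incoming data.
 *
 * Raising RAIDOUTSTANDING therefore trades kernel memory for concurrency.
 */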
    270 
    271 #define RAIDLABELDEV(dev)	\
    272 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
    273 
    274 /* declared here, and made public, for the benefit of KVM stuff.. */
    275 struct raid_softc *raid_softc;
    276 
    277 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
    278 				     struct disklabel *);
    279 static void raidgetdisklabel(dev_t);
    280 static void raidmakedisklabel(struct raid_softc *);
    281 
    282 static int raidlock(struct raid_softc *);
    283 static void raidunlock(struct raid_softc *);
    284 
    285 static int raid_detach_unlocked(struct raid_softc *);
    286 
    287 static void rf_markalldirty(RF_Raid_t *);
    288 static void rf_set_properties(struct raid_softc *, RF_Raid_t *);
    289 
    290 void rf_ReconThread(struct rf_recon_req *);
    291 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
    292 void rf_CopybackThread(RF_Raid_t *raidPtr);
    293 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
    294 int rf_autoconfig(device_t);
    295 void rf_buildroothack(RF_ConfigSet_t *);
    296 
    297 RF_AutoConfig_t *rf_find_raid_components(void);
    298 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
    299 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
    300 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
    301 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
    302 int rf_set_autoconfig(RF_Raid_t *, int);
    303 int rf_set_rootpartition(RF_Raid_t *, int);
    304 void rf_release_all_vps(RF_ConfigSet_t *);
    305 void rf_cleanup_config_set(RF_ConfigSet_t *);
    306 int rf_have_enough_components(RF_ConfigSet_t *);
    307 int rf_auto_config_set(RF_ConfigSet_t *, int *);
    308 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
    309 
    310 /*
    311  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
    312  * Note that this is overridden by having RAID_AUTOCONFIG as an option
    313  * in the kernel config file.
    314  */
    315 #ifdef RAID_AUTOCONFIG
    316 int raidautoconfig = 1;
    317 #else
    318 int raidautoconfig = 0;
    319 #endif
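/*
 * For reference: a kernel built with the config-file line
 *
 *	options 	RAID_AUTOCONFIG
 *
 * gets RAID_AUTOCONFIG defined via opt_raid_autoconfig.h, so the #ifdef
 * above selects raidautoconfig = 1 without patching this file.
 */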
    320 static bool raidautoconfigdone = false;
    321 
    322 struct RF_Pools_s rf_pools;
    323 
    324 void
    325 raidattach(int num)
    326 {
    327 	int raidID;
    328 	int i, rc;
    329 
    330 	aprint_debug("raidattach: Asked for %d units\n", num);
    331 
    332 	if (num <= 0) {
    333 #ifdef DIAGNOSTIC
    334 		panic("raidattach: count <= 0");
    335 #endif
    336 		return;
    337 	}
    338 	/* This is where all the initialization stuff gets done. */
    339 
    340 	numraid = num;
    341 
    342 	/* Make some space for requested number of units... */
    343 
    344 	RF_Malloc(raidPtrs, num * sizeof(RF_Raid_t *), (RF_Raid_t **));
    345 	if (raidPtrs == NULL) {
    346 		panic("raidPtrs is NULL!!");
    347 	}
    348 
    349 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
    350 	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
    351 	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
    352 	rf_init_cond2(rf_sparet_resp_cv, "rfgst");
    353 
    354 	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
    355 #endif
    356 
    357 	for (i = 0; i < num; i++)
    358 		raidPtrs[i] = NULL;
    359 	rc = rf_BootRaidframe();
    360 	if (rc == 0)
    361 		aprint_verbose("Kernelized RAIDframe activated\n");
    362 	else
    363 		panic("Serious error booting RAID!!");
    364 
     365 	/* put together some data structures like the CCD device does.  This
    366 	 * lets us lock the device and what-not when it gets opened. */
    367 
    368 	raid_softc = (struct raid_softc *)
    369 		malloc(num * sizeof(struct raid_softc),
    370 		       M_RAIDFRAME, M_NOWAIT);
    371 	if (raid_softc == NULL) {
    372 		aprint_error("WARNING: no memory for RAIDframe driver\n");
    373 		return;
    374 	}
    375 
    376 	memset(raid_softc, 0, num * sizeof(struct raid_softc));
    377 
    378 	for (raidID = 0; raidID < num; raidID++) {
    379 		bufq_alloc(&raid_softc[raidID].buf_queue, BUFQ_DISK_DEFAULT_STRAT, BUFQ_SORT_RAWBLOCK);
    380 
    381 		RF_Malloc(raidPtrs[raidID], sizeof(RF_Raid_t),
    382 			  (RF_Raid_t *));
    383 		if (raidPtrs[raidID] == NULL) {
    384 			aprint_error("WARNING: raidPtrs[%d] is NULL\n", raidID);
    385 			numraid = raidID;
    386 			return;
    387 		}
    388 	}
    389 
    390 	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
    391 		aprint_error("raidattach: config_cfattach_attach failed?\n");
    392 	}
    393 
    394 	raidautoconfigdone = false;
    395 
    396 	/*
    397 	 * Register a finalizer which will be used to auto-config RAID
    398 	 * sets once all real hardware devices have been found.
    399 	 */
    400 	if (config_finalize_register(NULL, rf_autoconfig) != 0)
    401 		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
    402 }
    403 
    404 int
    405 rf_autoconfig(device_t self)
    406 {
    407 	RF_AutoConfig_t *ac_list;
    408 	RF_ConfigSet_t *config_sets;
    409 
    410 	if (!raidautoconfig || raidautoconfigdone == true)
    411 		return (0);
    412 
    413 	/* XXX This code can only be run once. */
    414 	raidautoconfigdone = true;
    415 
    416 	/* 1. locate all RAID components on the system */
    417 	aprint_debug("Searching for RAID components...\n");
    418 	ac_list = rf_find_raid_components();
    419 
    420 	/* 2. Sort them into their respective sets. */
    421 	config_sets = rf_create_auto_sets(ac_list);
    422 
    423 	/*
     424 	 * 3. Evaluate each set and configure the valid ones.
    425 	 * This gets done in rf_buildroothack().
    426 	 */
    427 	rf_buildroothack(config_sets);
    428 
    429 	return 1;
    430 }
    431 
    432 void
    433 rf_buildroothack(RF_ConfigSet_t *config_sets)
    434 {
    435 	RF_ConfigSet_t *cset;
    436 	RF_ConfigSet_t *next_cset;
    437 	int retcode;
    438 	int raidID;
    439 	int rootID;
    440 	int col;
    441 	int num_root;
    442 	char *devname;
    443 
    444 	rootID = 0;
    445 	num_root = 0;
    446 	cset = config_sets;
    447 	while (cset != NULL) {
    448 		next_cset = cset->next;
    449 		if (rf_have_enough_components(cset) &&
    450 		    cset->ac->clabel->autoconfigure==1) {
    451 			retcode = rf_auto_config_set(cset,&raidID);
    452 			if (!retcode) {
    453 				aprint_debug("raid%d: configured ok\n", raidID);
    454 				if (cset->rootable) {
    455 					rootID = raidID;
    456 					num_root++;
    457 				}
    458 			} else {
    459 				/* The autoconfig didn't work :( */
    460 				aprint_debug("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
    461 				rf_release_all_vps(cset);
    462 			}
    463 		} else {
    464 			/* we're not autoconfiguring this set...
    465 			   release the associated resources */
    466 			rf_release_all_vps(cset);
    467 		}
    468 		/* cleanup */
    469 		rf_cleanup_config_set(cset);
    470 		cset = next_cset;
    471 	}
    472 
    473 	/* if the user has specified what the root device should be
    474 	   then we don't touch booted_device or boothowto... */
    475 
    476 	if (rootspec != NULL)
    477 		return;
    478 
    479 	/* we found something bootable... */
    480 
    481 	if (num_root == 1) {
    482 		booted_device = raid_softc[rootID].sc_dev;
    483 	} else if (num_root > 1) {
    484 
    485 		/*
    486 		 * Maybe the MD code can help. If it cannot, then
    487 		 * setroot() will discover that we have no
    488 		 * booted_device and will ask the user if nothing was
    489 		 * hardwired in the kernel config file
    490 		 */
    491 
    492 		if (booted_device == NULL)
    493 			cpu_rootconf();
    494 		if (booted_device == NULL)
    495 			return;
    496 
    497 		num_root = 0;
    498 		for (raidID = 0; raidID < numraid; raidID++) {
    499 			if (raidPtrs[raidID]->valid == 0)
    500 				continue;
    501 
    502 			if (raidPtrs[raidID]->root_partition == 0)
    503 				continue;
    504 
    505 			for (col = 0; col < raidPtrs[raidID]->numCol; col++) {
    506 				devname = raidPtrs[raidID]->Disks[col].devname;
    507 				devname += sizeof("/dev/") - 1;
    508 				if (strncmp(devname, device_xname(booted_device),
    509 					    strlen(device_xname(booted_device))) != 0)
    510 					continue;
    511 				aprint_debug("raid%d includes boot device %s\n",
    512 				       raidID, devname);
    513 				num_root++;
    514 				rootID = raidID;
    515 			}
    516 		}
    517 
    518 		if (num_root == 1) {
    519 			booted_device = raid_softc[rootID].sc_dev;
    520 		} else {
    521 			/* we can't guess.. require the user to answer... */
    522 			boothowto |= RB_ASKNAME;
    523 		}
    524 	}
    525 }
    526 
    527 
    528 int
    529 raidsize(dev_t dev)
    530 {
    531 	struct raid_softc *rs;
    532 	struct disklabel *lp;
    533 	int     part, unit, omask, size;
    534 
    535 	unit = raidunit(dev);
    536 	if (unit >= numraid)
    537 		return (-1);
    538 	rs = &raid_softc[unit];
    539 
    540 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    541 		return (-1);
    542 
    543 	part = DISKPART(dev);
    544 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
    545 	lp = rs->sc_dkdev.dk_label;
    546 
    547 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
    548 		return (-1);
    549 
    550 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
    551 		size = -1;
    552 	else
    553 		size = lp->d_partitions[part].p_size *
    554 		    (lp->d_secsize / DEV_BSIZE);
    555 
    556 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
    557 		return (-1);
    558 
    559 	return (size);
    560 
    561 }
    562 
    563 int
    564 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
    565 {
    566 	int     unit = raidunit(dev);
    567 	struct raid_softc *rs;
    568 	const struct bdevsw *bdev;
    569 	struct disklabel *lp;
    570 	RF_Raid_t *raidPtr;
    571 	daddr_t offset;
    572 	int     part, c, sparecol, j, scol, dumpto;
    573 	int     error = 0;
    574 
    575 	if (unit >= numraid)
    576 		return (ENXIO);
    577 
    578 	rs = &raid_softc[unit];
    579 	raidPtr = raidPtrs[unit];
    580 
    581 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    582 		return ENXIO;
    583 
    584 	/* we only support dumping to RAID 1 sets */
    585 	if (raidPtr->Layout.numDataCol != 1 ||
    586 	    raidPtr->Layout.numParityCol != 1)
    587 		return EINVAL;
    588 
    589 
    590 	if ((error = raidlock(rs)) != 0)
    591 		return error;
    592 
    593 	if (size % DEV_BSIZE != 0) {
    594 		error = EINVAL;
    595 		goto out;
    596 	}
    597 
    598 	if (blkno + size / DEV_BSIZE > rs->sc_size) {
    599 		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
    600 		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
    601 		    size / DEV_BSIZE, rs->sc_size);
    602 		error = EINVAL;
    603 		goto out;
    604 	}
    605 
    606 	part = DISKPART(dev);
    607 	lp = rs->sc_dkdev.dk_label;
    608 	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;
    609 
    610 	/* figure out what device is alive.. */
    611 
    612 	/*
    613 	   Look for a component to dump to.  The preference for the
    614 	   component to dump to is as follows:
    615 	   1) the master
    616 	   2) a used_spare of the master
    617 	   3) the slave
    618 	   4) a used_spare of the slave
    619 	*/
    620 
    621 	dumpto = -1;
    622 	for (c = 0; c < raidPtr->numCol; c++) {
    623 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
    624 			/* this might be the one */
    625 			dumpto = c;
    626 			break;
    627 		}
    628 	}
    629 
    630 	/*
    631 	   At this point we have possibly selected a live master or a
    632 	   live slave.  We now check to see if there is a spared
    633 	   master (or a spared slave), if we didn't find a live master
    634 	   or a live slave.
    635 	*/
    636 
    637 	for (c = 0; c < raidPtr->numSpare; c++) {
    638 		sparecol = raidPtr->numCol + c;
    639 		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
    640 			/* How about this one? */
    641 			scol = -1;
    642 			for(j=0;j<raidPtr->numCol;j++) {
    643 				if (raidPtr->Disks[j].spareCol == sparecol) {
    644 					scol = j;
    645 					break;
    646 				}
    647 			}
    648 			if (scol == 0) {
    649 				/*
    650 				   We must have found a spared master!
    651 				   We'll take that over anything else
    652 				   found so far.  (We couldn't have
    653 				   found a real master before, since
    654 				   this is a used spare, and it's
    655 				   saying that it's replacing the
    656 				   master.)  On reboot (with
    657 				   autoconfiguration turned on)
    658 				   sparecol will become the 1st
    659 				   component (component0) of this set.
    660 				*/
    661 				dumpto = sparecol;
    662 				break;
    663 			} else if (scol != -1) {
    664 				/*
    665 				   Must be a spared slave.  We'll dump
     666 				   to that if we haven't found anything
    667 				   else so far.
    668 				*/
    669 				if (dumpto == -1)
    670 					dumpto = sparecol;
    671 			}
    672 		}
    673 	}
    674 
    675 	if (dumpto == -1) {
    676 		/* we couldn't find any live components to dump to!?!?
    677 		 */
    678 		error = EINVAL;
    679 		goto out;
    680 	}
    681 
    682 	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
    683 
    684 	/*
    685 	   Note that blkno is relative to this particular partition.
    686 	   By adding the offset of this partition in the RAID
    687 	   set, and also adding RF_PROTECTED_SECTORS, we get a
    688 	   value that is relative to the partition used for the
    689 	   underlying component.
    690 	*/
    691 
    692 	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
    693 				blkno + offset, va, size);
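	/*
	 * Worked example with made-up numbers: if this partition starts at
	 * p_offset 1024 within the RAID set, a dump of blkno 100 is issued
	 * to the component at sector 100 + 1024 + RF_PROTECTED_SECTORS,
	 * i.e. just past the area RAIDframe reserves at the front of each
	 * component for its label.
	 */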
    694 
    695 out:
    696 	raidunlock(rs);
    697 
    698 	return error;
    699 }
    700 /* ARGSUSED */
    701 int
    702 raidopen(dev_t dev, int flags, int fmt,
    703     struct lwp *l)
    704 {
    705 	int     unit = raidunit(dev);
    706 	struct raid_softc *rs;
    707 	struct disklabel *lp;
    708 	int     part, pmask;
    709 	int     error = 0;
    710 
    711 	if (unit >= numraid)
    712 		return (ENXIO);
    713 	rs = &raid_softc[unit];
    714 
    715 	if ((error = raidlock(rs)) != 0)
    716 		return (error);
    717 
    718 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
    719 		error = EBUSY;
    720 		goto bad;
    721 	}
    722 
    723 	lp = rs->sc_dkdev.dk_label;
    724 
    725 	part = DISKPART(dev);
    726 
    727 	/*
    728 	 * If there are wedges, and this is not RAW_PART, then we
    729 	 * need to fail.
    730 	 */
    731 	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
    732 		error = EBUSY;
    733 		goto bad;
    734 	}
    735 	pmask = (1 << part);
    736 
    737 	if ((rs->sc_flags & RAIDF_INITED) &&
    738 	    (rs->sc_dkdev.dk_openmask == 0))
    739 		raidgetdisklabel(dev);
    740 
    741 	/* make sure that this partition exists */
    742 
    743 	if (part != RAW_PART) {
    744 		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
    745 		    ((part >= lp->d_npartitions) ||
    746 			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
    747 			error = ENXIO;
    748 			goto bad;
    749 		}
    750 	}
    751 	/* Prevent this unit from being unconfigured while open. */
    752 	switch (fmt) {
    753 	case S_IFCHR:
    754 		rs->sc_dkdev.dk_copenmask |= pmask;
    755 		break;
    756 
    757 	case S_IFBLK:
    758 		rs->sc_dkdev.dk_bopenmask |= pmask;
    759 		break;
    760 	}
    761 
    762 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    763 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
    764 		/* First one... mark things as dirty... Note that we *MUST*
    765 		 have done a configure before this.  I DO NOT WANT TO BE
    766 		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
    767 		 THAT THEY BELONG TOGETHER!!!!! */
    768 		/* XXX should check to see if we're only open for reading
    769 		   here... If so, we needn't do this, but then need some
    770 		   other way of keeping track of what's happened.. */
    771 
    772 		rf_markalldirty(raidPtrs[unit]);
    773 	}
    774 
    775 
    776 	rs->sc_dkdev.dk_openmask =
    777 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    778 
    779 bad:
    780 	raidunlock(rs);
    781 
    782 	return (error);
    783 
    784 
    785 }
    786 /* ARGSUSED */
    787 int
    788 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
    789 {
    790 	int     unit = raidunit(dev);
    791 	struct raid_softc *rs;
    792 	int     error = 0;
    793 	int     part;
    794 
    795 	if (unit >= numraid)
    796 		return (ENXIO);
    797 	rs = &raid_softc[unit];
    798 
    799 	if ((error = raidlock(rs)) != 0)
    800 		return (error);
    801 
    802 	part = DISKPART(dev);
    803 
    804 	/* ...that much closer to allowing unconfiguration... */
    805 	switch (fmt) {
    806 	case S_IFCHR:
    807 		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
    808 		break;
    809 
    810 	case S_IFBLK:
    811 		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
    812 		break;
    813 	}
    814 	rs->sc_dkdev.dk_openmask =
    815 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
    816 
    817 	if ((rs->sc_dkdev.dk_openmask == 0) &&
    818 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
     819 		/* Last one... device is not unconfigured yet.
     820 		   Device shutdown has taken care of setting the
     821 		   clean bits if RAIDF_INITED is not set;
     822 		   otherwise, mark things as clean here... */
    823 
    824 		rf_update_component_labels(raidPtrs[unit],
    825 						 RF_FINAL_COMPONENT_UPDATE);
    826 
    827 		/* If the kernel is shutting down, it will detach
    828 		 * this RAID set soon enough.
    829 		 */
    830 	}
    831 
    832 	raidunlock(rs);
    833 	return (0);
    834 
    835 }
    836 
    837 void
    838 raidstrategy(struct buf *bp)
    839 {
    840 	unsigned int raidID = raidunit(bp->b_dev);
    841 	RF_Raid_t *raidPtr;
    842 	struct raid_softc *rs = &raid_softc[raidID];
    843 	int     wlabel;
    844 
    845 	if ((rs->sc_flags & RAIDF_INITED) ==0) {
    846 		bp->b_error = ENXIO;
    847 		goto done;
    848 	}
    849 	if (raidID >= numraid || !raidPtrs[raidID]) {
    850 		bp->b_error = ENODEV;
    851 		goto done;
    852 	}
    853 	raidPtr = raidPtrs[raidID];
    854 	if (!raidPtr->valid) {
    855 		bp->b_error = ENODEV;
    856 		goto done;
    857 	}
    858 	if (bp->b_bcount == 0) {
    859 		db1_printf(("b_bcount is zero..\n"));
    860 		goto done;
    861 	}
    862 
    863 	/*
    864 	 * Do bounds checking and adjust transfer.  If there's an
    865 	 * error, the bounds check will flag that for us.
    866 	 */
    867 
    868 	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
    869 	if (DISKPART(bp->b_dev) == RAW_PART) {
    870 		uint64_t size; /* device size in DEV_BSIZE unit */
    871 
    872 		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
    873 			size = raidPtr->totalSectors <<
    874 			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
    875 		} else {
    876 			size = raidPtr->totalSectors >>
    877 			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
    878 		}
    879 		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
    880 			goto done;
    881 		}
    882 	} else {
    883 		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
    884 			db1_printf(("Bounds check failed!!:%d %d\n",
    885 				(int) bp->b_blkno, (int) wlabel));
    886 			goto done;
    887 		}
    888 	}
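	/*
	 * Example of the sector-size conversion above, with assumed values:
	 * a set with 2048-byte sectors has logBytesPerSector == 11, so with
	 * DEV_BSHIFT == 9 the size in DEV_BSIZE units is
	 * totalSectors << (11 - 9); a 512-byte-sector set takes the other
	 * branch with a shift of zero.
	 */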
    889 
    890 	rf_lock_mutex2(raidPtr->iodone_lock);
    891 
    892 	bp->b_resid = 0;
    893 
    894 	/* stuff it onto our queue */
    895 	bufq_put(rs->buf_queue, bp);
    896 
     897 	/* schedule the IO to happen at the next convenient time */
    898 	rf_signal_cond2(raidPtr->iodone_cv);
    899 	rf_unlock_mutex2(raidPtr->iodone_lock);
    900 
    901 	return;
    902 
    903 done:
    904 	bp->b_resid = bp->b_bcount;
    905 	biodone(bp);
    906 }
    907 /* ARGSUSED */
    908 int
    909 raidread(dev_t dev, struct uio *uio, int flags)
    910 {
    911 	int     unit = raidunit(dev);
    912 	struct raid_softc *rs;
    913 
    914 	if (unit >= numraid)
    915 		return (ENXIO);
    916 	rs = &raid_softc[unit];
    917 
    918 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    919 		return (ENXIO);
    920 
    921 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
    922 
    923 }
    924 /* ARGSUSED */
    925 int
    926 raidwrite(dev_t dev, struct uio *uio, int flags)
    927 {
    928 	int     unit = raidunit(dev);
    929 	struct raid_softc *rs;
    930 
    931 	if (unit >= numraid)
    932 		return (ENXIO);
    933 	rs = &raid_softc[unit];
    934 
    935 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    936 		return (ENXIO);
    937 
    938 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
    939 
    940 }
    941 
    942 static int
    943 raid_detach_unlocked(struct raid_softc *rs)
    944 {
    945 	int error;
    946 	RF_Raid_t *raidPtr;
    947 
    948 	raidPtr = raidPtrs[device_unit(rs->sc_dev)];
    949 
    950 	/*
    951 	 * If somebody has a partition mounted, we shouldn't
    952 	 * shutdown.
    953 	 */
    954 	if (rs->sc_dkdev.dk_openmask != 0)
    955 		return EBUSY;
    956 
    957 	if ((rs->sc_flags & RAIDF_INITED) == 0)
    958 		;	/* not initialized: nothing to do */
    959 	else if ((error = rf_Shutdown(raidPtr)) != 0)
    960 		return error;
    961 	else
    962 		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
    963 
    964 	/* Detach the disk. */
    965 	dkwedge_delall(&rs->sc_dkdev);
    966 	disk_detach(&rs->sc_dkdev);
    967 	disk_destroy(&rs->sc_dkdev);
    968 
    969 	aprint_normal_dev(rs->sc_dev, "detached\n");
    970 
    971 	return 0;
    972 }
    973 
    974 int
    975 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
    976 {
    977 	int     unit = raidunit(dev);
    978 	int     error = 0;
    979 	int     part, pmask, s;
    980 	cfdata_t cf;
    981 	struct raid_softc *rs;
    982 	RF_Config_t *k_cfg, *u_cfg;
    983 	RF_Raid_t *raidPtr;
    984 	RF_RaidDisk_t *diskPtr;
    985 	RF_AccTotals_t *totals;
    986 	RF_DeviceConfig_t *d_cfg, **ucfgp;
    987 	u_char *specific_buf;
    988 	int retcode = 0;
    989 	int column;
    990 /*	int raidid; */
    991 	struct rf_recon_req *rrcopy, *rr;
    992 	RF_ComponentLabel_t *clabel;
    993 	RF_ComponentLabel_t *ci_label;
    994 	RF_ComponentLabel_t **clabel_ptr;
    995 	RF_SingleComponent_t *sparePtr,*componentPtr;
    996 	RF_SingleComponent_t component;
    997 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
    998 	int i, j, d;
    999 #ifdef __HAVE_OLD_DISKLABEL
   1000 	struct disklabel newlabel;
   1001 #endif
   1002 	struct dkwedge_info *dkw;
   1003 
   1004 	if (unit >= numraid)
   1005 		return (ENXIO);
   1006 	rs = &raid_softc[unit];
   1007 	raidPtr = raidPtrs[unit];
   1008 
   1009 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
   1010 		(int) DISKPART(dev), (int) unit, cmd));
   1011 
   1012 	/* Must be open for writes for these commands... */
   1013 	switch (cmd) {
   1014 #ifdef DIOCGSECTORSIZE
   1015 	case DIOCGSECTORSIZE:
   1016 		*(u_int *)data = raidPtr->bytesPerSector;
   1017 		return 0;
   1018 	case DIOCGMEDIASIZE:
   1019 		*(off_t *)data =
   1020 		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
   1021 		return 0;
   1022 #endif
   1023 	case DIOCSDINFO:
   1024 	case DIOCWDINFO:
   1025 #ifdef __HAVE_OLD_DISKLABEL
   1026 	case ODIOCWDINFO:
   1027 	case ODIOCSDINFO:
   1028 #endif
   1029 	case DIOCWLABEL:
   1030 	case DIOCAWEDGE:
   1031 	case DIOCDWEDGE:
   1032 	case DIOCSSTRATEGY:
   1033 		if ((flag & FWRITE) == 0)
   1034 			return (EBADF);
   1035 	}
   1036 
   1037 	/* Must be initialized for these... */
   1038 	switch (cmd) {
   1039 	case DIOCGDINFO:
   1040 	case DIOCSDINFO:
   1041 	case DIOCWDINFO:
   1042 #ifdef __HAVE_OLD_DISKLABEL
   1043 	case ODIOCGDINFO:
   1044 	case ODIOCWDINFO:
   1045 	case ODIOCSDINFO:
   1046 	case ODIOCGDEFLABEL:
   1047 #endif
   1048 	case DIOCGPART:
   1049 	case DIOCWLABEL:
   1050 	case DIOCGDEFLABEL:
   1051 	case DIOCAWEDGE:
   1052 	case DIOCDWEDGE:
   1053 	case DIOCLWEDGES:
   1054 	case DIOCCACHESYNC:
   1055 	case RAIDFRAME_SHUTDOWN:
   1056 	case RAIDFRAME_REWRITEPARITY:
   1057 	case RAIDFRAME_GET_INFO:
   1058 	case RAIDFRAME_RESET_ACCTOTALS:
   1059 	case RAIDFRAME_GET_ACCTOTALS:
   1060 	case RAIDFRAME_KEEP_ACCTOTALS:
   1061 	case RAIDFRAME_GET_SIZE:
   1062 	case RAIDFRAME_FAIL_DISK:
   1063 	case RAIDFRAME_COPYBACK:
   1064 	case RAIDFRAME_CHECK_RECON_STATUS:
   1065 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1066 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1067 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1068 	case RAIDFRAME_ADD_HOT_SPARE:
   1069 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1070 	case RAIDFRAME_INIT_LABELS:
   1071 	case RAIDFRAME_REBUILD_IN_PLACE:
   1072 	case RAIDFRAME_CHECK_PARITY:
   1073 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1074 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1075 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1076 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1077 	case RAIDFRAME_SET_AUTOCONFIG:
   1078 	case RAIDFRAME_SET_ROOT:
   1079 	case RAIDFRAME_DELETE_COMPONENT:
   1080 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1081 	case RAIDFRAME_PARITYMAP_STATUS:
   1082 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1083 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1084 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1085 	case DIOCGSTRATEGY:
   1086 	case DIOCSSTRATEGY:
   1087 		if ((rs->sc_flags & RAIDF_INITED) == 0)
   1088 			return (ENXIO);
   1089 	}
   1090 
   1091 	switch (cmd) {
   1092 #ifdef COMPAT_50
   1093 	case RAIDFRAME_GET_INFO50:
   1094 		return rf_get_info50(raidPtr, data);
   1095 
   1096 	case RAIDFRAME_CONFIGURE50:
   1097 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
   1098 			return retcode;
   1099 		goto config;
   1100 #endif
   1101 		/* configure the system */
   1102 	case RAIDFRAME_CONFIGURE:
   1103 
   1104 		if (raidPtr->valid) {
   1105 			/* There is a valid RAID set running on this unit! */
   1106 			printf("raid%d: Device already configured!\n",unit);
   1107 			return(EINVAL);
   1108 		}
   1109 
   1110 		/* copy-in the configuration information */
   1111 		/* data points to a pointer to the configuration structure */
   1112 
   1113 		u_cfg = *((RF_Config_t **) data);
   1114 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
   1115 		if (k_cfg == NULL) {
   1116 			return (ENOMEM);
   1117 		}
   1118 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
   1119 		if (retcode) {
   1120 			RF_Free(k_cfg, sizeof(RF_Config_t));
   1121 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
   1122 				retcode));
   1123 			return (retcode);
   1124 		}
   1125 		goto config;
   1126 	config:
   1127 		/* allocate a buffer for the layout-specific data, and copy it
   1128 		 * in */
   1129 		if (k_cfg->layoutSpecificSize) {
   1130 			if (k_cfg->layoutSpecificSize > 10000) {
   1131 				/* sanity check */
   1132 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1133 				return (EINVAL);
   1134 			}
   1135 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
   1136 			    (u_char *));
   1137 			if (specific_buf == NULL) {
   1138 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1139 				return (ENOMEM);
   1140 			}
   1141 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
   1142 			    k_cfg->layoutSpecificSize);
   1143 			if (retcode) {
   1144 				RF_Free(k_cfg, sizeof(RF_Config_t));
   1145 				RF_Free(specific_buf,
   1146 					k_cfg->layoutSpecificSize);
   1147 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
   1148 					retcode));
   1149 				return (retcode);
   1150 			}
   1151 		} else
   1152 			specific_buf = NULL;
   1153 		k_cfg->layoutSpecific = specific_buf;
   1154 
   1155 		/* should do some kind of sanity check on the configuration.
   1156 		 * Store the sum of all the bytes in the last byte? */
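		/*
		 * A minimal sketch of the byte-sum check suggested above
		 * (hypothetical; rf_cfg_checksum() is not part of RAIDframe
		 * and nothing below calls it):
		 *
		 *	static uint8_t
		 *	rf_cfg_checksum(const uint8_t *p, size_t len)
		 *	{
		 *		uint8_t sum = 0;
		 *
		 *		while (len-- > 0)
		 *			sum += *p++;
		 *		return sum;
		 *	}
		 *
		 * The tool writing the config would store the sum of all but
		 * the last byte in the last byte; the kernel would recompute
		 * it here and reject k_cfg on a mismatch.
		 */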
   1157 
   1158 		/* configure the system */
   1159 
   1160 		/*
   1161 		 * Clear the entire RAID descriptor, just to make sure
   1162 		 *  there is no stale data left in the case of a
   1163 		 *  reconfiguration
   1164 		 */
   1165 		memset(raidPtr, 0, sizeof(*raidPtr));
   1166 		raidPtr->raidid = unit;
   1167 
   1168 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
   1169 
   1170 		if (retcode == 0) {
   1171 
   1172 			/* allow this many simultaneous IO's to
   1173 			   this RAID device */
   1174 			raidPtr->openings = RAIDOUTSTANDING;
   1175 
   1176 			raidinit(raidPtr);
   1177 			rf_markalldirty(raidPtr);
   1178 		}
   1179 		/* free the buffers.  No return code here. */
   1180 		if (k_cfg->layoutSpecificSize) {
   1181 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
   1182 		}
   1183 		RF_Free(k_cfg, sizeof(RF_Config_t));
   1184 
   1185 		return (retcode);
   1186 
   1187 		/* shutdown the system */
   1188 	case RAIDFRAME_SHUTDOWN:
   1189 
   1190 		part = DISKPART(dev);
   1191 		pmask = (1 << part);
   1192 
   1193 		if ((error = raidlock(rs)) != 0)
   1194 			return (error);
   1195 
   1196 		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
   1197 		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
   1198 			(rs->sc_dkdev.dk_copenmask & pmask)))
   1199 			retcode = EBUSY;
   1200 		else {
   1201 			rs->sc_flags |= RAIDF_SHUTDOWN;
   1202 			rs->sc_dkdev.dk_copenmask &= ~pmask;
   1203 			rs->sc_dkdev.dk_bopenmask &= ~pmask;
   1204 			rs->sc_dkdev.dk_openmask &= ~pmask;
   1205 			retcode = 0;
   1206 		}
   1207 
   1208 		raidunlock(rs);
   1209 
   1210 		if (retcode != 0)
   1211 			return retcode;
   1212 
   1213 		/* free the pseudo device attach bits */
   1214 
   1215 		cf = device_cfdata(rs->sc_dev);
   1216 		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
   1217 			free(cf, M_RAIDFRAME);
   1218 
   1219 		return (retcode);
   1220 	case RAIDFRAME_GET_COMPONENT_LABEL:
   1221 		clabel_ptr = (RF_ComponentLabel_t **) data;
   1222 		/* need to read the component label for the disk indicated
   1223 		   by row,column in clabel */
   1224 
   1225 		/*
   1226 		 * Perhaps there should be an option to skip the in-core
   1227 		 * copy and hit the disk, as with disklabel(8).
   1228 		 */
   1229 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
   1230 
   1231 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
   1232 
   1233 		if (retcode) {
   1234 			RF_Free(clabel, sizeof(*clabel));
   1235 			return retcode;
   1236 		}
   1237 
   1238 		clabel->row = 0; /* Don't allow looking at anything else.*/
   1239 
   1240 		column = clabel->column;
   1241 
   1242 		if ((column < 0) || (column >= raidPtr->numCol +
   1243 		    raidPtr->numSpare)) {
   1244 			RF_Free(clabel, sizeof(*clabel));
   1245 			return EINVAL;
   1246 		}
   1247 
   1248 		RF_Free(clabel, sizeof(*clabel));
   1249 
   1250 		clabel = raidget_component_label(raidPtr, column);
   1251 
   1252 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
   1253 
   1254 #if 0
   1255 	case RAIDFRAME_SET_COMPONENT_LABEL:
   1256 		clabel = (RF_ComponentLabel_t *) data;
   1257 
   1258 		/* XXX check the label for valid stuff... */
   1259 		/* Note that some things *should not* get modified --
   1260 		   the user should be re-initing the labels instead of
   1261 		   trying to patch things.
   1262 		   */
   1263 
   1264 		raidid = raidPtr->raidid;
   1265 #ifdef DEBUG
   1266 		printf("raid%d: Got component label:\n", raidid);
   1267 		printf("raid%d: Version: %d\n", raidid, clabel->version);
   1268 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
   1269 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
   1270 		printf("raid%d: Column: %d\n", raidid, clabel->column);
   1271 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
   1272 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
   1273 		printf("raid%d: Status: %d\n", raidid, clabel->status);
   1274 #endif
   1275 		clabel->row = 0;
   1276 		column = clabel->column;
   1277 
   1278 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1279 			return(EINVAL);
   1280 		}
   1281 
   1282 		/* XXX this isn't allowed to do anything for now :-) */
   1283 
   1284 		/* XXX and before it is, we need to fill in the rest
   1285 		   of the fields!?!?!?! */
   1286 		memcpy(raidget_component_label(raidPtr, column),
   1287 		    clabel, sizeof(*clabel));
   1288 		raidflush_component_label(raidPtr, column);
   1289 		return (0);
   1290 #endif
   1291 
   1292 	case RAIDFRAME_INIT_LABELS:
   1293 		clabel = (RF_ComponentLabel_t *) data;
   1294 		/*
   1295 		   we only want the serial number from
   1296 		   the above.  We get all the rest of the information
   1297 		   from the config that was used to create this RAID
   1298 		   set.
   1299 		   */
   1300 
   1301 		raidPtr->serial_number = clabel->serial_number;
   1302 
   1303 		for(column=0;column<raidPtr->numCol;column++) {
   1304 			diskPtr = &raidPtr->Disks[column];
   1305 			if (!RF_DEAD_DISK(diskPtr->status)) {
   1306 				ci_label = raidget_component_label(raidPtr,
   1307 				    column);
   1308 				/* Zeroing this is important. */
   1309 				memset(ci_label, 0, sizeof(*ci_label));
   1310 				raid_init_component_label(raidPtr, ci_label);
   1311 				ci_label->serial_number =
   1312 				    raidPtr->serial_number;
    1313 				ci_label->row = 0; /* we don't pretend to support more */
   1314 				rf_component_label_set_partitionsize(ci_label,
   1315 				    diskPtr->partitionSize);
   1316 				ci_label->column = column;
   1317 				raidflush_component_label(raidPtr, column);
   1318 			}
   1319 			/* XXXjld what about the spares? */
   1320 		}
   1321 
   1322 		return (retcode);
   1323 	case RAIDFRAME_SET_AUTOCONFIG:
   1324 		d = rf_set_autoconfig(raidPtr, *(int *) data);
   1325 		printf("raid%d: New autoconfig value is: %d\n",
   1326 		       raidPtr->raidid, d);
   1327 		*(int *) data = d;
   1328 		return (retcode);
   1329 
   1330 	case RAIDFRAME_SET_ROOT:
   1331 		d = rf_set_rootpartition(raidPtr, *(int *) data);
   1332 		printf("raid%d: New rootpartition value is: %d\n",
   1333 		       raidPtr->raidid, d);
   1334 		*(int *) data = d;
   1335 		return (retcode);
   1336 
   1337 		/* initialize all parity */
   1338 	case RAIDFRAME_REWRITEPARITY:
   1339 
   1340 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1341 			/* Parity for RAID 0 is trivially correct */
   1342 			raidPtr->parity_good = RF_RAID_CLEAN;
   1343 			return(0);
   1344 		}
   1345 
   1346 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1347 			/* Re-write is already in progress! */
   1348 			return(EINVAL);
   1349 		}
   1350 
   1351 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
   1352 					   rf_RewriteParityThread,
   1353 					   raidPtr,"raid_parity");
   1354 		return (retcode);
   1355 
   1356 
   1357 	case RAIDFRAME_ADD_HOT_SPARE:
   1358 		sparePtr = (RF_SingleComponent_t *) data;
   1359 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
   1360 		retcode = rf_add_hot_spare(raidPtr, &component);
   1361 		return(retcode);
   1362 
   1363 	case RAIDFRAME_REMOVE_HOT_SPARE:
   1364 		return(retcode);
   1365 
   1366 	case RAIDFRAME_DELETE_COMPONENT:
   1367 		componentPtr = (RF_SingleComponent_t *)data;
   1368 		memcpy( &component, componentPtr,
   1369 			sizeof(RF_SingleComponent_t));
   1370 		retcode = rf_delete_component(raidPtr, &component);
   1371 		return(retcode);
   1372 
   1373 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
   1374 		componentPtr = (RF_SingleComponent_t *)data;
   1375 		memcpy( &component, componentPtr,
   1376 			sizeof(RF_SingleComponent_t));
   1377 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
   1378 		return(retcode);
   1379 
   1380 	case RAIDFRAME_REBUILD_IN_PLACE:
   1381 
   1382 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1383 			/* Can't do this on a RAID 0!! */
   1384 			return(EINVAL);
   1385 		}
   1386 
   1387 		if (raidPtr->recon_in_progress == 1) {
   1388 			/* a reconstruct is already in progress! */
   1389 			return(EINVAL);
   1390 		}
   1391 
   1392 		componentPtr = (RF_SingleComponent_t *) data;
   1393 		memcpy( &component, componentPtr,
   1394 			sizeof(RF_SingleComponent_t));
   1395 		component.row = 0; /* we don't support any more */
   1396 		column = component.column;
   1397 
   1398 		if ((column < 0) || (column >= raidPtr->numCol)) {
   1399 			return(EINVAL);
   1400 		}
   1401 
   1402 		rf_lock_mutex2(raidPtr->mutex);
   1403 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
   1404 		    (raidPtr->numFailures > 0)) {
   1405 			/* XXX 0 above shouldn't be constant!!! */
   1406 			/* some component other than this has failed.
   1407 			   Let's not make things worse than they already
   1408 			   are... */
   1409 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1410 			       raidPtr->raidid);
   1411 			printf("raid%d:     Col: %d   Too many failures.\n",
   1412 			       raidPtr->raidid, column);
   1413 			rf_unlock_mutex2(raidPtr->mutex);
   1414 			return (EINVAL);
   1415 		}
   1416 		if (raidPtr->Disks[column].status ==
   1417 		    rf_ds_reconstructing) {
   1418 			printf("raid%d: Unable to reconstruct to disk at:\n",
   1419 			       raidPtr->raidid);
    1420 			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);
   1421 
   1422 			rf_unlock_mutex2(raidPtr->mutex);
   1423 			return (EINVAL);
   1424 		}
   1425 		if (raidPtr->Disks[column].status == rf_ds_spared) {
   1426 			rf_unlock_mutex2(raidPtr->mutex);
   1427 			return (EINVAL);
   1428 		}
   1429 		rf_unlock_mutex2(raidPtr->mutex);
   1430 
   1431 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1432 		if (rrcopy == NULL)
   1433 			return(ENOMEM);
   1434 
   1435 		rrcopy->raidPtr = (void *) raidPtr;
   1436 		rrcopy->col = column;
   1437 
   1438 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1439 					   rf_ReconstructInPlaceThread,
   1440 					   rrcopy,"raid_reconip");
   1441 		return(retcode);
   1442 
   1443 	case RAIDFRAME_GET_INFO:
   1444 		if (!raidPtr->valid)
   1445 			return (ENODEV);
   1446 		ucfgp = (RF_DeviceConfig_t **) data;
   1447 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
   1448 			  (RF_DeviceConfig_t *));
   1449 		if (d_cfg == NULL)
   1450 			return (ENOMEM);
   1451 		d_cfg->rows = 1; /* there is only 1 row now */
   1452 		d_cfg->cols = raidPtr->numCol;
   1453 		d_cfg->ndevs = raidPtr->numCol;
   1454 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
   1455 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1456 			return (ENOMEM);
   1457 		}
   1458 		d_cfg->nspares = raidPtr->numSpare;
   1459 		if (d_cfg->nspares >= RF_MAX_DISKS) {
   1460 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1461 			return (ENOMEM);
   1462 		}
   1463 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
   1464 		d = 0;
   1465 		for (j = 0; j < d_cfg->cols; j++) {
   1466 			d_cfg->devs[d] = raidPtr->Disks[j];
   1467 			d++;
   1468 		}
   1469 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
   1470 			d_cfg->spares[i] = raidPtr->Disks[j];
   1471 			if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
   1472 				/* XXX: raidctl(8) expects to see this as a used spare */
   1473 				d_cfg->spares[i].status = rf_ds_used_spare;
   1474 			}
   1475 		}
   1476 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
   1477 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
   1478 
   1479 		return (retcode);
   1480 
   1481 	case RAIDFRAME_CHECK_PARITY:
   1482 		*(int *) data = raidPtr->parity_good;
   1483 		return (0);
   1484 
   1485 	case RAIDFRAME_PARITYMAP_STATUS:
   1486 		if (rf_paritymap_ineligible(raidPtr))
   1487 			return EINVAL;
   1488 		rf_paritymap_status(raidPtr->parity_map,
   1489 		    (struct rf_pmstat *)data);
   1490 		return 0;
   1491 
   1492 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
   1493 		if (rf_paritymap_ineligible(raidPtr))
   1494 			return EINVAL;
   1495 		if (raidPtr->parity_map == NULL)
   1496 			return ENOENT; /* ??? */
   1497 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
   1498 			(struct rf_pmparams *)data, 1))
   1499 			return EINVAL;
   1500 		return 0;
   1501 
   1502 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
   1503 		if (rf_paritymap_ineligible(raidPtr))
   1504 			return EINVAL;
   1505 		*(int *) data = rf_paritymap_get_disable(raidPtr);
   1506 		return 0;
   1507 
   1508 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
   1509 		if (rf_paritymap_ineligible(raidPtr))
   1510 			return EINVAL;
   1511 		rf_paritymap_set_disable(raidPtr, *(int *)data);
   1512 		/* XXX should errors be passed up? */
   1513 		return 0;
   1514 
   1515 	case RAIDFRAME_RESET_ACCTOTALS:
   1516 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
   1517 		return (0);
   1518 
   1519 	case RAIDFRAME_GET_ACCTOTALS:
   1520 		totals = (RF_AccTotals_t *) data;
   1521 		*totals = raidPtr->acc_totals;
   1522 		return (0);
   1523 
   1524 	case RAIDFRAME_KEEP_ACCTOTALS:
   1525 		raidPtr->keep_acc_totals = *(int *)data;
   1526 		return (0);
   1527 
   1528 	case RAIDFRAME_GET_SIZE:
   1529 		*(int *) data = raidPtr->totalSectors;
   1530 		return (0);
   1531 
   1532 		/* fail a disk & optionally start reconstruction */
   1533 	case RAIDFRAME_FAIL_DISK:
   1534 
   1535 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1536 			/* Can't do this on a RAID 0!! */
   1537 			return(EINVAL);
   1538 		}
   1539 
   1540 		rr = (struct rf_recon_req *) data;
   1541 		rr->row = 0;
   1542 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
   1543 			return (EINVAL);
   1544 
   1545 
   1546 		rf_lock_mutex2(raidPtr->mutex);
   1547 		if (raidPtr->status == rf_rs_reconstructing) {
   1548 			/* you can't fail a disk while we're reconstructing! */
   1549 			/* XXX wrong for RAID6 */
   1550 			rf_unlock_mutex2(raidPtr->mutex);
   1551 			return (EINVAL);
   1552 		}
   1553 		if ((raidPtr->Disks[rr->col].status ==
   1554 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
   1555 			/* some other component has failed.  Let's not make
   1556 			   things worse. XXX wrong for RAID6 */
   1557 			rf_unlock_mutex2(raidPtr->mutex);
   1558 			return (EINVAL);
   1559 		}
   1560 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
   1561 			/* Can't fail a spared disk! */
   1562 			rf_unlock_mutex2(raidPtr->mutex);
   1563 			return (EINVAL);
   1564 		}
   1565 		rf_unlock_mutex2(raidPtr->mutex);
   1566 
   1567 		/* make a copy of the recon request so that we don't rely on
   1568 		 * the user's buffer */
   1569 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
   1570 		if (rrcopy == NULL)
   1571 			return(ENOMEM);
   1572 		memcpy(rrcopy, rr, sizeof(*rr));
   1573 		rrcopy->raidPtr = (void *) raidPtr;
   1574 
   1575 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
   1576 					   rf_ReconThread,
   1577 					   rrcopy,"raid_recon");
   1578 		return (0);
   1579 
   1580 		/* invoke a copyback operation after recon on whatever disk
   1581 		 * needs it, if any */
   1582 	case RAIDFRAME_COPYBACK:
   1583 
   1584 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1585 			/* This makes no sense on a RAID 0!! */
   1586 			return(EINVAL);
   1587 		}
   1588 
   1589 		if (raidPtr->copyback_in_progress == 1) {
   1590 			/* Copyback is already in progress! */
   1591 			return(EINVAL);
   1592 		}
   1593 
   1594 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
   1595 					   rf_CopybackThread,
   1596 					   raidPtr,"raid_copyback");
   1597 		return (retcode);
   1598 
   1599 		/* return the percentage completion of reconstruction */
   1600 	case RAIDFRAME_CHECK_RECON_STATUS:
   1601 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1602 			/* This makes no sense on a RAID 0, so tell the
   1603 			   user it's done. */
   1604 			*(int *) data = 100;
   1605 			return(0);
   1606 		}
   1607 		if (raidPtr->status != rf_rs_reconstructing)
   1608 			*(int *) data = 100;
   1609 		else {
   1610 			if (raidPtr->reconControl->numRUsTotal > 0) {
   1611 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
   1612 			} else {
   1613 				*(int *) data = 0;
   1614 			}
   1615 		}
   1616 		return (0);
   1617 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
   1618 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1619 		if (raidPtr->status != rf_rs_reconstructing) {
   1620 			progressInfo.remaining = 0;
   1621 			progressInfo.completed = 100;
   1622 			progressInfo.total = 100;
   1623 		} else {
   1624 			progressInfo.total =
   1625 				raidPtr->reconControl->numRUsTotal;
   1626 			progressInfo.completed =
   1627 				raidPtr->reconControl->numRUsComplete;
   1628 			progressInfo.remaining = progressInfo.total -
   1629 				progressInfo.completed;
   1630 		}
   1631 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1632 				  sizeof(RF_ProgressInfo_t));
   1633 		return (retcode);
   1634 
   1635 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
   1636 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1637 			/* This makes no sense on a RAID 0, so tell the
   1638 			   user it's done. */
   1639 			*(int *) data = 100;
   1640 			return(0);
   1641 		}
   1642 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1643 			*(int *) data = 100 *
   1644 				raidPtr->parity_rewrite_stripes_done /
   1645 				raidPtr->Layout.numStripe;
   1646 		} else {
   1647 			*(int *) data = 100;
   1648 		}
   1649 		return (0);
   1650 
   1651 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
   1652 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1653 		if (raidPtr->parity_rewrite_in_progress == 1) {
   1654 			progressInfo.total = raidPtr->Layout.numStripe;
   1655 			progressInfo.completed =
   1656 				raidPtr->parity_rewrite_stripes_done;
   1657 			progressInfo.remaining = progressInfo.total -
   1658 				progressInfo.completed;
   1659 		} else {
   1660 			progressInfo.remaining = 0;
   1661 			progressInfo.completed = 100;
   1662 			progressInfo.total = 100;
   1663 		}
   1664 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1665 				  sizeof(RF_ProgressInfo_t));
   1666 		return (retcode);
   1667 
   1668 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
   1669 		if (raidPtr->Layout.map->faultsTolerated == 0) {
   1670 			/* This makes no sense on a RAID 0 */
   1671 			*(int *) data = 100;
   1672 			return(0);
   1673 		}
   1674 		if (raidPtr->copyback_in_progress == 1) {
   1675 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
   1676 				raidPtr->Layout.numStripe;
   1677 		} else {
   1678 			*(int *) data = 100;
   1679 		}
   1680 		return (0);
   1681 
   1682 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
   1683 		progressInfoPtr = (RF_ProgressInfo_t **) data;
   1684 		if (raidPtr->copyback_in_progress == 1) {
   1685 			progressInfo.total = raidPtr->Layout.numStripe;
   1686 			progressInfo.completed =
   1687 				raidPtr->copyback_stripes_done;
   1688 			progressInfo.remaining = progressInfo.total -
   1689 				progressInfo.completed;
   1690 		} else {
   1691 			progressInfo.remaining = 0;
   1692 			progressInfo.completed = 100;
   1693 			progressInfo.total = 100;
   1694 		}
   1695 		retcode = copyout(&progressInfo, *progressInfoPtr,
   1696 				  sizeof(RF_ProgressInfo_t));
   1697 		return (retcode);
   1698 
   1699 		/* the sparetable daemon calls this to wait for the kernel to
   1700 		 * need a spare table. this ioctl does not return until a
   1701 		 * spare table is needed. XXX -- calling mpsleep here in the
   1702 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
   1703 		 * -- I should either compute the spare table in the kernel,
   1704 		 * or have a different -- XXX XXX -- interface (a different
   1705 		 * character device) for delivering the table     -- XXX */
   1706 #if 0
   1707 	case RAIDFRAME_SPARET_WAIT:
   1708 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1709 		while (!rf_sparet_wait_queue)
   1710 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
   1711 		waitreq = rf_sparet_wait_queue;
   1712 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
   1713 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1714 
   1715 		/* structure assignment */
   1716 		*((RF_SparetWait_t *) data) = *waitreq;
   1717 
   1718 		RF_Free(waitreq, sizeof(*waitreq));
   1719 		return (0);
   1720 
   1721 		/* wakes up a process waiting on SPARET_WAIT and puts an error
    1722 		 * code in it that will cause the daemon to exit */
   1723 	case RAIDFRAME_ABORT_SPARET_WAIT:
   1724 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1725 		waitreq->fcol = -1;
   1726 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1727 		waitreq->next = rf_sparet_wait_queue;
   1728 		rf_sparet_wait_queue = waitreq;
    1729 			rf_broadcast_cond2(rf_sparet_wait_cv);
   1730 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1731 		return (0);
   1732 
   1733 		/* used by the spare table daemon to deliver a spare table
   1734 		 * into the kernel */
   1735 	case RAIDFRAME_SEND_SPARET:
   1736 
   1737 		/* install the spare table */
   1738 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
   1739 
   1740 		/* respond to the requestor.  the return status of the spare
   1741 		 * table installation is passed in the "fcol" field */
   1742 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
   1743 		waitreq->fcol = retcode;
   1744 		rf_lock_mutex2(rf_sparet_wait_mutex);
   1745 		waitreq->next = rf_sparet_resp_queue;
   1746 		rf_sparet_resp_queue = waitreq;
   1747 		rf_broadcast_cond2(rf_sparet_resp_cv);
   1748 		rf_unlock_mutex2(rf_sparet_wait_mutex);
   1749 
   1750 		return (retcode);
   1751 #endif
   1752 
   1753 	default:
   1754 		break; /* fall through to the os-specific code below */
   1755 
   1756 	}
   1757 
   1758 	if (!raidPtr->valid)
   1759 		return (EINVAL);
   1760 
   1761 	/*
   1762 	 * Add support for "regular" device ioctls here.
   1763 	 */
   1764 
   1765 	error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
   1766 	if (error != EPASSTHROUGH)
   1767 		return (error);
   1768 
   1769 	switch (cmd) {
   1770 	case DIOCGDINFO:
   1771 		*(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
   1772 		break;
   1773 #ifdef __HAVE_OLD_DISKLABEL
   1774 	case ODIOCGDINFO:
   1775 		newlabel = *(rs->sc_dkdev.dk_label);
   1776 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1777 			return ENOTTY;
   1778 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1779 		break;
   1780 #endif
   1781 
   1782 	case DIOCGPART:
   1783 		((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
   1784 		((struct partinfo *) data)->part =
   1785 		    &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
   1786 		break;
   1787 
   1788 	case DIOCWDINFO:
   1789 	case DIOCSDINFO:
   1790 #ifdef __HAVE_OLD_DISKLABEL
   1791 	case ODIOCWDINFO:
   1792 	case ODIOCSDINFO:
   1793 #endif
   1794 	{
   1795 		struct disklabel *lp;
   1796 #ifdef __HAVE_OLD_DISKLABEL
   1797 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
   1798 			memset(&newlabel, 0, sizeof newlabel);
   1799 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
   1800 			lp = &newlabel;
   1801 		} else
   1802 #endif
   1803 		lp = (struct disklabel *)data;
   1804 
   1805 		if ((error = raidlock(rs)) != 0)
   1806 			return (error);
   1807 
   1808 		rs->sc_flags |= RAIDF_LABELLING;
   1809 
   1810 		error = setdisklabel(rs->sc_dkdev.dk_label,
   1811 		    lp, 0, rs->sc_dkdev.dk_cpulabel);
   1812 		if (error == 0) {
   1813 			if (cmd == DIOCWDINFO
   1814 #ifdef __HAVE_OLD_DISKLABEL
   1815 			    || cmd == ODIOCWDINFO
   1816 #endif
   1817 			   )
   1818 				error = writedisklabel(RAIDLABELDEV(dev),
   1819 				    raidstrategy, rs->sc_dkdev.dk_label,
   1820 				    rs->sc_dkdev.dk_cpulabel);
   1821 		}
   1822 		rs->sc_flags &= ~RAIDF_LABELLING;
   1823 
   1824 		raidunlock(rs);
   1825 
   1826 		if (error)
   1827 			return (error);
   1828 		break;
   1829 	}
   1830 
   1831 	case DIOCWLABEL:
   1832 		if (*(int *) data != 0)
   1833 			rs->sc_flags |= RAIDF_WLABEL;
   1834 		else
   1835 			rs->sc_flags &= ~RAIDF_WLABEL;
   1836 		break;
   1837 
   1838 	case DIOCGDEFLABEL:
   1839 		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
   1840 		break;
   1841 
   1842 #ifdef __HAVE_OLD_DISKLABEL
   1843 	case ODIOCGDEFLABEL:
   1844 		raidgetdefaultlabel(raidPtr, rs, &newlabel);
   1845 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
   1846 			return ENOTTY;
   1847 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
   1848 		break;
   1849 #endif
   1850 
   1851 	case DIOCAWEDGE:
   1852 	case DIOCDWEDGE:
   1853 	    	dkw = (void *)data;
   1854 
   1855 		/* If the ioctl happens here, the parent is us. */
   1856 		(void)strcpy(dkw->dkw_parent, rs->sc_xname);
   1857 		return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
   1858 
   1859 	case DIOCLWEDGES:
   1860 		return dkwedge_list(&rs->sc_dkdev,
   1861 		    (struct dkwedge_list *)data, l);
   1862 	case DIOCCACHESYNC:
   1863 		return rf_sync_component_caches(raidPtr);
   1864 
   1865 	case DIOCGSTRATEGY:
   1866 	    {
   1867 		struct disk_strategy *dks = (void *)data;
   1868 
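         		/* Report the current buffer queue strategy; splbio keeps
         		 * rs->buf_queue stable while the name is copied out. */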
   1869 		s = splbio();
   1870 		strlcpy(dks->dks_name, bufq_getstrategyname(rs->buf_queue),
   1871 		    sizeof(dks->dks_name));
   1872 		splx(s);
   1873 		dks->dks_paramlen = 0;
   1874 
   1875 		return 0;
   1876 	    }
   1877 
   1878 	case DIOCSSTRATEGY:
   1879 	    {
   1880 		struct disk_strategy *dks = (void *)data;
   1881 		struct bufq_state *new;
   1882 		struct bufq_state *old;
   1883 
   1884 		if (dks->dks_param != NULL) {
   1885 			return EINVAL;
   1886 		}
   1887 		dks->dks_name[sizeof(dks->dks_name) - 1] = 0; /* ensure term */
   1888 		error = bufq_alloc(&new, dks->dks_name,
   1889 		    BUFQ_EXACT|BUFQ_SORT_RAWBLOCK);
   1890 		if (error) {
   1891 			return error;
   1892 		}
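         		/* Swap in the new strategy queue at splbio so raidstrategy
         		 * cannot race with us; any buffers still pending on the old
         		 * queue are migrated before it is freed. */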
   1893 		s = splbio();
   1894 		old = rs->buf_queue;
   1895 		bufq_move(new, old);
   1896 		rs->buf_queue = new;
   1897 		splx(s);
   1898 		bufq_free(old);
   1899 
   1900 		return 0;
   1901 	    }
   1902 
   1903 	default:
   1904 		retcode = ENOTTY;
   1905 	}
   1906 	return (retcode);
   1907 
   1908 }
   1909 
   1910 
   1911 /* raidinit -- complete the rest of the initialization for the
   1912    RAIDframe device.  */
   1913 
   1914 
   1915 static void
   1916 raidinit(RF_Raid_t *raidPtr)
   1917 {
   1918 	cfdata_t cf;
   1919 	struct raid_softc *rs;
   1920 	int     unit;
   1921 
   1922 	unit = raidPtr->raidid;
   1923 
   1924 	rs = &raid_softc[unit];
   1925 
   1926 	/* XXX should check return code first... */
   1927 	rs->sc_flags |= RAIDF_INITED;
   1928 
   1929 	/* XXX doesn't check bounds. */
   1930 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
   1931 
   1932 	/* attach the pseudo device */
   1933 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
   1934 	cf->cf_name = raid_cd.cd_name;
   1935 	cf->cf_atname = raid_cd.cd_name;
   1936 	cf->cf_unit = unit;
   1937 	cf->cf_fstate = FSTATE_STAR;
   1938 
   1939 	rs->sc_dev = config_attach_pseudo(cf);
   1940 
   1941 	if (rs->sc_dev == NULL) {
   1942 		printf("raid%d: config_attach_pseudo failed\n",
   1943 		    raidPtr->raidid);
   1944 		rs->sc_flags &= ~RAIDF_INITED;
   1945 		free(cf, M_RAIDFRAME);
   1946 		return;
   1947 	}
   1948 
   1949 	/* disk_attach actually creates space for the CPU disklabel, among
   1950 	 * other things, so it's critical to call this *BEFORE* we try putzing
   1951 	 * with disklabels. */
   1952 
   1953 	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
   1954 	disk_attach(&rs->sc_dkdev);
   1955 	disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);
   1956 
   1957 	/* XXX There may be a weird interaction here between this, and
   1958 	 * protectedSectors, as used in RAIDframe.  */
   1959 
   1960 	rs->sc_size = raidPtr->totalSectors;
   1961 
   1962 	dkwedge_discover(&rs->sc_dkdev);
   1963 
   1964 	rf_set_properties(rs, raidPtr);
   1965 
   1966 }
   1967 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
   1968 /* wake up the daemon & tell it to get us a spare table
   1969  * XXX
   1970  * the entries in the queues should be tagged with the raidPtr
   1971  * so that in the extremely rare case that two recons happen at once,
    1972  * we know for which device we're requesting a spare table
   1973  * XXX
   1974  *
   1975  * XXX This code is not currently used. GO
   1976  */
   1977 int
   1978 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
   1979 {
   1980 	int     retcode;
   1981 
   1982 	rf_lock_mutex2(rf_sparet_wait_mutex);
   1983 	req->next = rf_sparet_wait_queue;
   1984 	rf_sparet_wait_queue = req;
   1985 	rf_broadcast_cond2(rf_sparet_wait_cv);
   1986 
    1987 	/* rf_wait_cond2() drops the mutex while we sleep */
   1988 	while (!rf_sparet_resp_queue) {
   1989 		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
   1990 	}
   1991 	req = rf_sparet_resp_queue;
   1992 	rf_sparet_resp_queue = req->next;
   1993 	rf_unlock_mutex2(rf_sparet_wait_mutex);
   1994 
   1995 	retcode = req->fcol;
   1996 	RF_Free(req, sizeof(*req));	/* this is not the same req as we
   1997 					 * alloc'd */
   1998 	return (retcode);
   1999 }
   2000 #endif
   2001 
   2002 /* a wrapper around rf_DoAccess that extracts appropriate info from the
   2003  * bp & passes it down.
    2004  * Any calls originating in the kernel must use non-blocking I/O.  We
    2005  * do some extra sanity checking to return "appropriate" error values
    2006  * for certain conditions (to make some standard utilities work).
   2007  *
   2008  * Formerly known as: rf_DoAccessKernel
   2009  */
   2010 void
   2011 raidstart(RF_Raid_t *raidPtr)
   2012 {
   2013 	RF_SectorCount_t num_blocks, pb, sum;
   2014 	RF_RaidAddr_t raid_addr;
   2015 	struct partition *pp;
   2016 	daddr_t blocknum;
   2017 	int     unit;
   2018 	struct raid_softc *rs;
   2019 	int     do_async;
   2020 	struct buf *bp;
   2021 	int rc;
   2022 
   2023 	unit = raidPtr->raidid;
   2024 	rs = &raid_softc[unit];
   2025 
   2026 	/* quick check to see if anything has died recently */
   2027 	rf_lock_mutex2(raidPtr->mutex);
   2028 	if (raidPtr->numNewFailures > 0) {
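         		/* Drop the array mutex across the label update: writing
         		 * component labels does disk I/O and can sleep. */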
   2029 		rf_unlock_mutex2(raidPtr->mutex);
   2030 		rf_update_component_labels(raidPtr,
   2031 					   RF_NORMAL_COMPONENT_UPDATE);
   2032 		rf_lock_mutex2(raidPtr->mutex);
   2033 		raidPtr->numNewFailures--;
   2034 	}
   2035 
   2036 	/* Check to see if we're at the limit... */
   2037 	while (raidPtr->openings > 0) {
   2038 		rf_unlock_mutex2(raidPtr->mutex);
   2039 
   2040 		/* get the next item, if any, from the queue */
   2041 		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
   2042 			/* nothing more to do */
   2043 			return;
   2044 		}
   2045 
   2046 		/* Ok, for the bp we have here, bp->b_blkno is relative to the
   2047 		 * partition.. Need to make it absolute to the underlying
   2048 		 * device.. */
   2049 
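         		/* b_blkno is in DEV_BSIZE units; shift up to bytes and
         		 * back down by logBytesPerSector to get the array's
         		 * native sector number. */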
   2050 		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
   2051 		if (DISKPART(bp->b_dev) != RAW_PART) {
   2052 			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
   2053 			blocknum += pp->p_offset;
   2054 		}
   2055 
   2056 		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
   2057 			    (int) blocknum));
   2058 
   2059 		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
   2060 		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
   2061 
   2062 		/* *THIS* is where we adjust what block we're going to...
   2063 		 * but DO NOT TOUCH bp->b_blkno!!! */
   2064 		raid_addr = blocknum;
   2065 
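         		/* num_blocks is the transfer length in full sectors; pb
         		 * adds one for a trailing partial sector.  sum is checked
         		 * below against the array size and for wrap-around. */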
   2066 		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
   2067 		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
   2068 		sum = raid_addr + num_blocks + pb;
   2069 		if (1 || rf_debugKernelAccess) {
   2070 			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
   2071 				    (int) raid_addr, (int) sum, (int) num_blocks,
   2072 				    (int) pb, (int) bp->b_resid));
   2073 		}
   2074 		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
   2075 		    || (sum < num_blocks) || (sum < pb)) {
   2076 			bp->b_error = ENOSPC;
   2077 			bp->b_resid = bp->b_bcount;
   2078 			biodone(bp);
   2079 			rf_lock_mutex2(raidPtr->mutex);
   2080 			continue;
   2081 		}
   2082 		/*
   2083 		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
   2084 		 */
   2085 
   2086 		if (bp->b_bcount & raidPtr->sectorMask) {
   2087 			bp->b_error = EINVAL;
   2088 			bp->b_resid = bp->b_bcount;
   2089 			biodone(bp);
   2090 			rf_lock_mutex2(raidPtr->mutex);
   2091 			continue;
   2092 
   2093 		}
   2094 		db1_printf(("Calling DoAccess..\n"));
   2095 
   2096 
   2097 		rf_lock_mutex2(raidPtr->mutex);
   2098 		raidPtr->openings--;
   2099 		rf_unlock_mutex2(raidPtr->mutex);
   2100 
   2101 		/*
   2102 		 * Everything is async.
   2103 		 */
   2104 		do_async = 1;
   2105 
   2106 		disk_busy(&rs->sc_dkdev);
   2107 
   2108 		/* XXX we're still at splbio() here... do we *really*
   2109 		   need to be? */
   2110 
   2111 		/* don't ever condition on bp->b_flags & B_WRITE.
   2112 		 * always condition on B_READ instead */
   2113 
   2114 		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
   2115 				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
   2116 				 do_async, raid_addr, num_blocks,
   2117 				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
   2118 
   2119 		if (rc) {
   2120 			bp->b_error = rc;
   2121 			bp->b_resid = bp->b_bcount;
   2122 			biodone(bp);
   2123 			/* continue loop */
   2124 		}
   2125 
   2126 		rf_lock_mutex2(raidPtr->mutex);
   2127 	}
   2128 	rf_unlock_mutex2(raidPtr->mutex);
   2129 }
   2130 
   2131 
   2132 
   2133 
   2134 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
   2135 
   2136 int
   2137 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
   2138 {
   2139 	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
   2140 	struct buf *bp;
   2141 
   2142 	req->queue = queue;
   2143 	bp = req->bp;
   2144 
   2145 	switch (req->type) {
   2146 	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
   2147 		/* XXX need to do something extra here.. */
   2148 		/* I'm leaving this in, as I've never actually seen it used,
   2149 		 * and I'd like folks to report it... GO */
    2150 		printf("WAKEUP CALLED\n");
   2151 		queue->numOutstanding++;
   2152 
   2153 		bp->b_flags = 0;
   2154 		bp->b_private = req;
   2155 
   2156 		KernelWakeupFunc(bp);
   2157 		break;
   2158 
   2159 	case RF_IO_TYPE_READ:
   2160 	case RF_IO_TYPE_WRITE:
   2161 #if RF_ACC_TRACE > 0
   2162 		if (req->tracerec) {
   2163 			RF_ETIMER_START(req->tracerec->timer);
   2164 		}
   2165 #endif
   2166 		InitBP(bp, queue->rf_cinfo->ci_vp,
   2167 		    op, queue->rf_cinfo->ci_dev,
   2168 		    req->sectorOffset, req->numSector,
   2169 		    req->buf, KernelWakeupFunc, (void *) req,
   2170 		    queue->raidPtr->logBytesPerSector, req->b_proc);
   2171 
   2172 		if (rf_debugKernelAccess) {
   2173 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
   2174 				(long) bp->b_blkno));
   2175 		}
   2176 		queue->numOutstanding++;
   2177 		queue->last_deq_sector = req->sectorOffset;
   2178 		/* acc wouldn't have been let in if there were any pending
   2179 		 * reqs at any other priority */
   2180 		queue->curPriority = req->priority;
   2181 
   2182 		db1_printf(("Going for %c to unit %d col %d\n",
   2183 			    req->type, queue->raidPtr->raidid,
   2184 			    queue->col));
   2185 		db1_printf(("sector %d count %d (%d bytes) %d\n",
   2186 			(int) req->sectorOffset, (int) req->numSector,
   2187 			(int) (req->numSector <<
   2188 			    queue->raidPtr->logBytesPerSector),
   2189 			(int) queue->raidPtr->logBytesPerSector));
   2190 
   2191 		/*
   2192 		 * XXX: drop lock here since this can block at
   2193 		 * least with backing SCSI devices.  Retake it
   2194 		 * to minimize fuss with calling interfaces.
   2195 		 */
   2196 
   2197 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
   2198 		bdev_strategy(bp);
   2199 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
   2200 		break;
   2201 
   2202 	default:
   2203 		panic("bad req->type in rf_DispatchKernelIO");
   2204 	}
   2205 	db1_printf(("Exiting from DispatchKernelIO\n"));
   2206 
   2207 	return (0);
   2208 }
    2209 /* this is the callback function associated with an I/O invoked from
   2210    kernel code.
   2211  */
   2212 static void
   2213 KernelWakeupFunc(struct buf *bp)
   2214 {
   2215 	RF_DiskQueueData_t *req = NULL;
   2216 	RF_DiskQueue_t *queue;
   2217 
   2218 	db1_printf(("recovering the request queue:\n"));
   2219 
   2220 	req = bp->b_private;
   2221 
   2222 	queue = (RF_DiskQueue_t *) req->queue;
   2223 
   2224 	rf_lock_mutex2(queue->raidPtr->iodone_lock);
   2225 
   2226 #if RF_ACC_TRACE > 0
   2227 	if (req->tracerec) {
   2228 		RF_ETIMER_STOP(req->tracerec->timer);
   2229 		RF_ETIMER_EVAL(req->tracerec->timer);
   2230 		rf_lock_mutex2(rf_tracing_mutex);
   2231 		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2232 		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
   2233 		req->tracerec->num_phys_ios++;
   2234 		rf_unlock_mutex2(rf_tracing_mutex);
   2235 	}
   2236 #endif
   2237 
   2238 	/* XXX Ok, let's get aggressive... If b_error is set, let's go
   2239 	 * ballistic, and mark the component as hosed... */
   2240 
   2241 	if (bp->b_error != 0) {
   2242 		/* Mark the disk as dead */
   2243 		/* but only mark it once... */
   2244 		/* and only if it wouldn't leave this RAID set
   2245 		   completely broken */
   2246 		if (((queue->raidPtr->Disks[queue->col].status ==
   2247 		      rf_ds_optimal) ||
   2248 		     (queue->raidPtr->Disks[queue->col].status ==
   2249 		      rf_ds_used_spare)) &&
   2250 		     (queue->raidPtr->numFailures <
   2251 		      queue->raidPtr->Layout.map->faultsTolerated)) {
   2252 			printf("raid%d: IO Error.  Marking %s as failed.\n",
   2253 			       queue->raidPtr->raidid,
   2254 			       queue->raidPtr->Disks[queue->col].devname);
   2255 			queue->raidPtr->Disks[queue->col].status =
   2256 			    rf_ds_failed;
   2257 			queue->raidPtr->status = rf_rs_degraded;
   2258 			queue->raidPtr->numFailures++;
   2259 			queue->raidPtr->numNewFailures++;
   2260 		} else {	/* Disk is already dead... */
   2261 			/* printf("Disk already marked as dead!\n"); */
   2262 		}
   2263 
   2264 	}
   2265 
   2266 	/* Fill in the error value */
   2267 	req->error = bp->b_error;
   2268 
   2269 	/* Drop this one on the "finished" queue... */
   2270 	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
   2271 
   2272 	/* Let the raidio thread know there is work to be done. */
   2273 	rf_signal_cond2(queue->raidPtr->iodone_cv);
   2274 
   2275 	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
   2276 }
   2277 
   2278 
   2279 /*
   2280  * initialize a buf structure for doing an I/O in the kernel.
   2281  */
   2282 static void
   2283 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
   2284        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
   2285        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
   2286        struct proc *b_proc)
   2287 {
   2288 	/* bp->b_flags       = B_PHYS | rw_flag; */
   2289 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
   2290 	bp->b_oflags = 0;
   2291 	bp->b_cflags = 0;
   2292 	bp->b_bcount = numSect << logBytesPerSector;
   2293 	bp->b_bufsize = bp->b_bcount;
   2294 	bp->b_error = 0;
   2295 	bp->b_dev = dev;
   2296 	bp->b_data = bf;
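         	/* startSect is in array sectors; convert to DEV_BSIZE blocks. */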
   2297 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
   2298 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
   2299 	if (bp->b_bcount == 0) {
   2300 		panic("bp->b_bcount is zero in InitBP!!");
   2301 	}
   2302 	bp->b_proc = b_proc;
   2303 	bp->b_iodone = cbFunc;
   2304 	bp->b_private = cbArg;
   2305 }
   2306 
   2307 static void
   2308 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
   2309 		    struct disklabel *lp)
   2310 {
   2311 	memset(lp, 0, sizeof(*lp));
   2312 
   2313 	/* fabricate a label... */
   2314 	lp->d_secperunit = raidPtr->totalSectors;
   2315 	lp->d_secsize = raidPtr->bytesPerSector;
   2316 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
   2317 	lp->d_ntracks = 4 * raidPtr->numCol;
   2318 	lp->d_ncylinders = raidPtr->totalSectors /
   2319 		(lp->d_nsectors * lp->d_ntracks);
   2320 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
   2321 
   2322 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
   2323 	lp->d_type = DTYPE_RAID;
   2324 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
   2325 	lp->d_rpm = 3600;
   2326 	lp->d_interleave = 1;
   2327 	lp->d_flags = 0;
   2328 
   2329 	lp->d_partitions[RAW_PART].p_offset = 0;
   2330 	lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
   2331 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
   2332 	lp->d_npartitions = RAW_PART + 1;
   2333 
   2334 	lp->d_magic = DISKMAGIC;
   2335 	lp->d_magic2 = DISKMAGIC;
   2336 	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
   2337 
   2338 }
   2339 /*
   2340  * Read the disklabel from the raid device.  If one is not present, fake one
   2341  * up.
   2342  */
   2343 static void
   2344 raidgetdisklabel(dev_t dev)
   2345 {
   2346 	int     unit = raidunit(dev);
   2347 	struct raid_softc *rs = &raid_softc[unit];
   2348 	const char   *errstring;
   2349 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2350 	struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
   2351 	RF_Raid_t *raidPtr;
   2352 
   2353 	db1_printf(("Getting the disklabel...\n"));
   2354 
   2355 	memset(clp, 0, sizeof(*clp));
   2356 
   2357 	raidPtr = raidPtrs[unit];
   2358 
   2359 	raidgetdefaultlabel(raidPtr, rs, lp);
   2360 
   2361 	/*
   2362 	 * Call the generic disklabel extraction routine.
   2363 	 */
   2364 	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
   2365 	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
   2366 	if (errstring)
   2367 		raidmakedisklabel(rs);
   2368 	else {
   2369 		int     i;
   2370 		struct partition *pp;
   2371 
   2372 		/*
   2373 		 * Sanity check whether the found disklabel is valid.
   2374 		 *
    2375 		 * This is necessary since the total size of the raid device
    2376 		 * may vary when the interleave is changed even though exactly
    2377 		 * the same components are used, and an old disklabel may be
    2378 		 * used if one is found.
   2379 		 */
   2380 		if (lp->d_secperunit != rs->sc_size)
   2381 			printf("raid%d: WARNING: %s: "
   2382 			    "total sector size in disklabel (%" PRIu32 ") != "
   2383 			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
   2384 			    lp->d_secperunit, rs->sc_size);
   2385 		for (i = 0; i < lp->d_npartitions; i++) {
   2386 			pp = &lp->d_partitions[i];
   2387 			if (pp->p_offset + pp->p_size > rs->sc_size)
   2388 				printf("raid%d: WARNING: %s: end of partition `%c' "
   2389 				       "exceeds the size of raid (%" PRIu64 ")\n",
   2390 				       unit, rs->sc_xname, 'a' + i, rs->sc_size);
   2391 		}
   2392 	}
   2393 
   2394 }
   2395 /*
   2396  * Take care of things one might want to take care of in the event
   2397  * that a disklabel isn't present.
   2398  */
   2399 static void
   2400 raidmakedisklabel(struct raid_softc *rs)
   2401 {
   2402 	struct disklabel *lp = rs->sc_dkdev.dk_label;
   2403 	db1_printf(("Making a label..\n"));
   2404 
   2405 	/*
   2406 	 * For historical reasons, if there's no disklabel present
   2407 	 * the raw partition must be marked FS_BSDFFS.
   2408 	 */
   2409 
   2410 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
   2411 
   2412 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
   2413 
   2414 	lp->d_checksum = dkcksum(lp);
   2415 }
   2416 /*
   2417  * Wait interruptibly for an exclusive lock.
   2418  *
   2419  * XXX
   2420  * Several drivers do this; it should be abstracted and made MP-safe.
   2421  * (Hmm... where have we seen this warning before :->  GO )
   2422  */
   2423 static int
   2424 raidlock(struct raid_softc *rs)
   2425 {
   2426 	int     error;
   2427 
   2428 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
   2429 		rs->sc_flags |= RAIDF_WANTED;
   2430 		if ((error =
   2431 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
   2432 			return (error);
   2433 	}
   2434 	rs->sc_flags |= RAIDF_LOCKED;
   2435 	return (0);
   2436 }
   2437 /*
   2438  * Unlock and wake up any waiters.
   2439  */
   2440 static void
   2441 raidunlock(struct raid_softc *rs)
   2442 {
   2443 
   2444 	rs->sc_flags &= ~RAIDF_LOCKED;
   2445 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
   2446 		rs->sc_flags &= ~RAIDF_WANTED;
   2447 		wakeup(rs);
   2448 	}
   2449 }
   2450 
   2451 
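         /*
          * Component metadata layout: the component label lives
          * RF_COMPONENT_INFO_OFFSET bytes from the start of each component and
          * occupies at least RF_COMPONENT_INFO_SIZE bytes (rounded up to one
          * sector when the sector size is larger).  The parity map, when used,
          * immediately follows that region.
          */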
   2452 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
   2453 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
   2454 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
   2455 
   2456 static daddr_t
   2457 rf_component_info_offset(void)
   2458 {
   2459 
   2460 	return RF_COMPONENT_INFO_OFFSET;
   2461 }
   2462 
   2463 static daddr_t
   2464 rf_component_info_size(unsigned secsize)
   2465 {
   2466 	daddr_t info_size;
   2467 
   2468 	KASSERT(secsize);
   2469 	if (secsize > RF_COMPONENT_INFO_SIZE)
   2470 		info_size = secsize;
   2471 	else
   2472 		info_size = RF_COMPONENT_INFO_SIZE;
   2473 
   2474 	return info_size;
   2475 }
   2476 
   2477 static daddr_t
   2478 rf_parity_map_offset(RF_Raid_t *raidPtr)
   2479 {
   2480 	daddr_t map_offset;
   2481 
   2482 	KASSERT(raidPtr->bytesPerSector);
   2483 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
   2484 		map_offset = raidPtr->bytesPerSector;
   2485 	else
   2486 		map_offset = RF_COMPONENT_INFO_SIZE;
   2487 	map_offset += rf_component_info_offset();
   2488 
   2489 	return map_offset;
   2490 }
   2491 
   2492 static daddr_t
   2493 rf_parity_map_size(RF_Raid_t *raidPtr)
   2494 {
   2495 	daddr_t map_size;
   2496 
   2497 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
   2498 		map_size = raidPtr->bytesPerSector;
   2499 	else
   2500 		map_size = RF_PARITY_MAP_SIZE;
   2501 
   2502 	return map_size;
   2503 }
   2504 
   2505 int
   2506 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2507 {
   2508 	RF_ComponentLabel_t *clabel;
   2509 
   2510 	clabel = raidget_component_label(raidPtr, col);
   2511 	clabel->clean = RF_RAID_CLEAN;
   2512 	raidflush_component_label(raidPtr, col);
   2513 	return(0);
   2514 }
   2515 
   2516 
   2517 int
   2518 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2519 {
   2520 	RF_ComponentLabel_t *clabel;
   2521 
   2522 	clabel = raidget_component_label(raidPtr, col);
   2523 	clabel->clean = RF_RAID_DIRTY;
   2524 	raidflush_component_label(raidPtr, col);
   2525 	return(0);
   2526 }
   2527 
   2528 int
   2529 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2530 {
   2531 	KASSERT(raidPtr->bytesPerSector);
   2532 	return raidread_component_label(raidPtr->bytesPerSector,
   2533 	    raidPtr->Disks[col].dev,
   2534 	    raidPtr->raid_cinfo[col].ci_vp,
   2535 	    &raidPtr->raid_cinfo[col].ci_label);
   2536 }
   2537 
   2538 RF_ComponentLabel_t *
   2539 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2540 {
   2541 	return &raidPtr->raid_cinfo[col].ci_label;
   2542 }
   2543 
   2544 int
   2545 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
   2546 {
   2547 	RF_ComponentLabel_t *label;
   2548 
   2549 	label = &raidPtr->raid_cinfo[col].ci_label;
   2550 	label->mod_counter = raidPtr->mod_counter;
   2551 #ifndef RF_NO_PARITY_MAP
   2552 	label->parity_map_modcount = label->mod_counter;
   2553 #endif
   2554 	return raidwrite_component_label(raidPtr->bytesPerSector,
   2555 	    raidPtr->Disks[col].dev,
   2556 	    raidPtr->raid_cinfo[col].ci_vp, label);
   2557 }
   2558 
   2559 
   2560 static int
   2561 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2562     RF_ComponentLabel_t *clabel)
   2563 {
   2564 	return raidread_component_area(dev, b_vp, clabel,
   2565 	    sizeof(RF_ComponentLabel_t),
   2566 	    rf_component_info_offset(),
   2567 	    rf_component_info_size(secsize));
   2568 }
   2569 
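         /*
          * Read dsize bytes of metadata starting offset bytes into the component,
          * and copy the first msize bytes into the caller's buffer.
          */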
   2570 /* ARGSUSED */
   2571 static int
   2572 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2573     size_t msize, daddr_t offset, daddr_t dsize)
   2574 {
   2575 	struct buf *bp;
   2576 	const struct bdevsw *bdev;
   2577 	int error;
   2578 
   2579 	/* XXX should probably ensure that we don't try to do this if
   2580 	   someone has changed rf_protected_sectors. */
   2581 
   2582 	if (b_vp == NULL) {
   2583 		/* For whatever reason, this component is not valid.
   2584 		   Don't try to read a component label from it. */
   2585 		return(EINVAL);
   2586 	}
   2587 
   2588 	/* get a block of the appropriate size... */
   2589 	bp = geteblk((int)dsize);
   2590 	bp->b_dev = dev;
   2591 
   2592 	/* get our ducks in a row for the read */
   2593 	bp->b_blkno = offset / DEV_BSIZE;
   2594 	bp->b_bcount = dsize;
   2595 	bp->b_flags |= B_READ;
   2596  	bp->b_resid = dsize;
   2597 
   2598 	bdev = bdevsw_lookup(bp->b_dev);
   2599 	if (bdev == NULL)
   2600 		return (ENXIO);
   2601 	(*bdev->d_strategy)(bp);
   2602 
   2603 	error = biowait(bp);
   2604 
   2605 	if (!error) {
   2606 		memcpy(data, bp->b_data, msize);
   2607 	}
   2608 
   2609 	brelse(bp, 0);
   2610 	return(error);
   2611 }
   2612 
   2613 
   2614 static int
   2615 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
   2616     RF_ComponentLabel_t *clabel)
   2617 {
   2618 	return raidwrite_component_area(dev, b_vp, clabel,
   2619 	    sizeof(RF_ComponentLabel_t),
   2620 	    rf_component_info_offset(),
   2621 	    rf_component_info_size(secsize), 0);
   2622 }
   2623 
   2624 /* ARGSUSED */
   2625 static int
   2626 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
   2627     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
   2628 {
   2629 	struct buf *bp;
   2630 	const struct bdevsw *bdev;
   2631 	int error;
   2632 
   2633 	/* get a block of the appropriate size... */
   2634 	bp = geteblk((int)dsize);
   2635 	bp->b_dev = dev;
   2636 
   2637 	/* get our ducks in a row for the write */
   2638 	bp->b_blkno = offset / DEV_BSIZE;
   2639 	bp->b_bcount = dsize;
   2640 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
   2641  	bp->b_resid = dsize;
   2642 
   2643 	memset(bp->b_data, 0, dsize);
   2644 	memcpy(bp->b_data, data, msize);
   2645 
   2646 	bdev = bdevsw_lookup(bp->b_dev);
   2647 	if (bdev == NULL)
   2648 		return (ENXIO);
   2649 	(*bdev->d_strategy)(bp);
   2650 	if (asyncp)
   2651 		return 0;
   2652 	error = biowait(bp);
   2653 	brelse(bp, 0);
   2654 	if (error) {
   2655 #if 1
   2656 		printf("Failed to write RAID component info!\n");
   2657 #endif
   2658 	}
   2659 
   2660 	return(error);
   2661 }
   2662 
   2663 void
   2664 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2665 {
   2666 	int c;
   2667 
   2668 	for (c = 0; c < raidPtr->numCol; c++) {
   2669 		/* Skip dead disks. */
   2670 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2671 			continue;
   2672 		/* XXXjld: what if an error occurs here? */
   2673 		raidwrite_component_area(raidPtr->Disks[c].dev,
   2674 		    raidPtr->raid_cinfo[c].ci_vp, map,
   2675 		    RF_PARITYMAP_NBYTE,
   2676 		    rf_parity_map_offset(raidPtr),
   2677 		    rf_parity_map_size(raidPtr), 0);
   2678 	}
   2679 }
   2680 
   2681 void
   2682 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
   2683 {
   2684 	struct rf_paritymap_ondisk tmp;
   2685 	int c,first;
   2686 
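         	/* Read the parity map from each live component and merge the
         	 * copies into a single in-core map. */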
   2687 	first=1;
   2688 	for (c = 0; c < raidPtr->numCol; c++) {
   2689 		/* Skip dead disks. */
   2690 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
   2691 			continue;
   2692 		raidread_component_area(raidPtr->Disks[c].dev,
   2693 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
   2694 		    RF_PARITYMAP_NBYTE,
   2695 		    rf_parity_map_offset(raidPtr),
   2696 		    rf_parity_map_size(raidPtr));
   2697 		if (first) {
   2698 			memcpy(map, &tmp, sizeof(*map));
   2699 			first = 0;
   2700 		} else {
   2701 			rf_paritymap_merge(map, &tmp);
   2702 		}
   2703 	}
   2704 }
   2705 
   2706 void
   2707 rf_markalldirty(RF_Raid_t *raidPtr)
   2708 {
   2709 	RF_ComponentLabel_t *clabel;
   2710 	int sparecol;
   2711 	int c;
   2712 	int j;
   2713 	int scol = -1;
   2714 
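         	/* Bump the modification counter so the labels written below
         	 * supersede any older copies on disk. */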
   2715 	raidPtr->mod_counter++;
   2716 	for (c = 0; c < raidPtr->numCol; c++) {
   2717 		/* we don't want to touch (at all) a disk that has
   2718 		   failed */
   2719 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
   2720 			clabel = raidget_component_label(raidPtr, c);
   2721 			if (clabel->status == rf_ds_spared) {
   2722 				/* XXX do something special...
   2723 				   but whatever you do, don't
   2724 				   try to access it!! */
   2725 			} else {
   2726 				raidmarkdirty(raidPtr, c);
   2727 			}
   2728 		}
   2729 	}
   2730 
   2731 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2732 		sparecol = raidPtr->numCol + c;
   2733 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2734 			/*
   2735 
   2736 			   we claim this disk is "optimal" if it's
   2737 			   rf_ds_used_spare, as that means it should be
   2738 			   directly substitutable for the disk it replaced.
   2739 			   We note that too...
   2740 
   2741 			 */
   2742 
   2743 			for(j=0;j<raidPtr->numCol;j++) {
   2744 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2745 					scol = j;
   2746 					break;
   2747 				}
   2748 			}
   2749 
   2750 			clabel = raidget_component_label(raidPtr, sparecol);
   2751 			/* make sure status is noted */
   2752 
   2753 			raid_init_component_label(raidPtr, clabel);
   2754 
   2755 			clabel->row = 0;
   2756 			clabel->column = scol;
   2757 			/* Note: we *don't* change status from rf_ds_used_spare
   2758 			   to rf_ds_optimal */
   2759 			/* clabel.status = rf_ds_optimal; */
   2760 
   2761 			raidmarkdirty(raidPtr, sparecol);
   2762 		}
   2763 	}
   2764 }
   2765 
   2766 
   2767 void
   2768 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
   2769 {
   2770 	RF_ComponentLabel_t *clabel;
   2771 	int sparecol;
   2772 	int c;
   2773 	int j;
   2774 	int scol;
   2775 
   2776 	scol = -1;
   2777 
   2778 	/* XXX should do extra checks to make sure things really are clean,
   2779 	   rather than blindly setting the clean bit... */
   2780 
   2781 	raidPtr->mod_counter++;
   2782 
   2783 	for (c = 0; c < raidPtr->numCol; c++) {
   2784 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   2785 			clabel = raidget_component_label(raidPtr, c);
   2786 			/* make sure status is noted */
   2787 			clabel->status = rf_ds_optimal;
   2788 
   2789 			/* note what unit we are configured as */
   2790 			clabel->last_unit = raidPtr->raidid;
   2791 
   2792 			raidflush_component_label(raidPtr, c);
   2793 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2794 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2795 					raidmarkclean(raidPtr, c);
   2796 				}
   2797 			}
   2798 		}
   2799 		/* else we don't touch it.. */
   2800 	}
   2801 
   2802 	for( c = 0; c < raidPtr->numSpare ; c++) {
   2803 		sparecol = raidPtr->numCol + c;
   2804 		/* Need to ensure that the reconstruct actually completed! */
   2805 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   2806 			/*
   2807 
   2808 			   we claim this disk is "optimal" if it's
   2809 			   rf_ds_used_spare, as that means it should be
   2810 			   directly substitutable for the disk it replaced.
   2811 			   We note that too...
   2812 
   2813 			 */
   2814 
   2815 			for(j=0;j<raidPtr->numCol;j++) {
   2816 				if (raidPtr->Disks[j].spareCol == sparecol) {
   2817 					scol = j;
   2818 					break;
   2819 				}
   2820 			}
   2821 
   2822 			/* XXX shouldn't *really* need this... */
   2823 			clabel = raidget_component_label(raidPtr, sparecol);
   2824 			/* make sure status is noted */
   2825 
   2826 			raid_init_component_label(raidPtr, clabel);
   2827 
   2828 			clabel->column = scol;
   2829 			clabel->status = rf_ds_optimal;
   2830 			clabel->last_unit = raidPtr->raidid;
   2831 
   2832 			raidflush_component_label(raidPtr, sparecol);
   2833 			if (final == RF_FINAL_COMPONENT_UPDATE) {
   2834 				if (raidPtr->parity_good == RF_RAID_CLEAN) {
   2835 					raidmarkclean(raidPtr, sparecol);
   2836 				}
   2837 			}
   2838 		}
   2839 	}
   2840 }
   2841 
   2842 void
   2843 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
   2844 {
   2845 
   2846 	if (vp != NULL) {
   2847 		if (auto_configured == 1) {
   2848 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   2849 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   2850 			vput(vp);
   2851 
   2852 		} else {
   2853 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
   2854 		}
   2855 	}
   2856 }
   2857 
   2858 
   2859 void
   2860 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
   2861 {
   2862 	int r,c;
   2863 	struct vnode *vp;
   2864 	int acd;
   2865 
   2866 
   2867 	/* We take this opportunity to close the vnodes like we should.. */
   2868 
   2869 	for (c = 0; c < raidPtr->numCol; c++) {
   2870 		vp = raidPtr->raid_cinfo[c].ci_vp;
   2871 		acd = raidPtr->Disks[c].auto_configured;
   2872 		rf_close_component(raidPtr, vp, acd);
   2873 		raidPtr->raid_cinfo[c].ci_vp = NULL;
   2874 		raidPtr->Disks[c].auto_configured = 0;
   2875 	}
   2876 
   2877 	for (r = 0; r < raidPtr->numSpare; r++) {
   2878 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
   2879 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
   2880 		rf_close_component(raidPtr, vp, acd);
   2881 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
   2882 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
   2883 	}
   2884 }
   2885 
   2886 
   2887 void
   2888 rf_ReconThread(struct rf_recon_req *req)
   2889 {
   2890 	int     s;
   2891 	RF_Raid_t *raidPtr;
   2892 
   2893 	s = splbio();
   2894 	raidPtr = (RF_Raid_t *) req->raidPtr;
   2895 	raidPtr->recon_in_progress = 1;
   2896 
   2897 	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
   2898 		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
   2899 
   2900 	RF_Free(req, sizeof(*req));
   2901 
   2902 	raidPtr->recon_in_progress = 0;
   2903 	splx(s);
   2904 
   2905 	/* That's all... */
   2906 	kthread_exit(0);	/* does not return */
   2907 }
   2908 
   2909 void
   2910 rf_RewriteParityThread(RF_Raid_t *raidPtr)
   2911 {
   2912 	int retcode;
   2913 	int s;
   2914 
   2915 	raidPtr->parity_rewrite_stripes_done = 0;
   2916 	raidPtr->parity_rewrite_in_progress = 1;
   2917 	s = splbio();
   2918 	retcode = rf_RewriteParity(raidPtr);
   2919 	splx(s);
   2920 	if (retcode) {
   2921 		printf("raid%d: Error re-writing parity (%d)!\n",
   2922 		    raidPtr->raidid, retcode);
   2923 	} else {
   2924 		/* set the clean bit!  If we shutdown correctly,
   2925 		   the clean bit on each component label will get
   2926 		   set */
   2927 		raidPtr->parity_good = RF_RAID_CLEAN;
   2928 	}
   2929 	raidPtr->parity_rewrite_in_progress = 0;
   2930 
   2931 	/* Anyone waiting for us to stop?  If so, inform them... */
   2932 	if (raidPtr->waitShutdown) {
   2933 		wakeup(&raidPtr->parity_rewrite_in_progress);
   2934 	}
   2935 
   2936 	/* That's all... */
   2937 	kthread_exit(0);	/* does not return */
   2938 }
   2939 
   2940 
   2941 void
   2942 rf_CopybackThread(RF_Raid_t *raidPtr)
   2943 {
   2944 	int s;
   2945 
   2946 	raidPtr->copyback_in_progress = 1;
   2947 	s = splbio();
   2948 	rf_CopybackReconstructedData(raidPtr);
   2949 	splx(s);
   2950 	raidPtr->copyback_in_progress = 0;
   2951 
   2952 	/* That's all... */
   2953 	kthread_exit(0);	/* does not return */
   2954 }
   2955 
   2956 
   2957 void
   2958 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
   2959 {
   2960 	int s;
   2961 	RF_Raid_t *raidPtr;
   2962 
   2963 	s = splbio();
   2964 	raidPtr = req->raidPtr;
   2965 	raidPtr->recon_in_progress = 1;
   2966 	rf_ReconstructInPlace(raidPtr, req->col);
   2967 	RF_Free(req, sizeof(*req));
   2968 	raidPtr->recon_in_progress = 0;
   2969 	splx(s);
   2970 
   2971 	/* That's all... */
   2972 	kthread_exit(0);	/* does not return */
   2973 }
   2974 
   2975 static RF_AutoConfig_t *
   2976 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
   2977     const char *cname, RF_SectorCount_t size, uint64_t numsecs,
   2978     unsigned secsize)
   2979 {
   2980 	int good_one = 0;
   2981 	RF_ComponentLabel_t *clabel;
   2982 	RF_AutoConfig_t *ac;
   2983 
   2984 	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
   2985 	if (clabel == NULL) {
   2986 oomem:
   2987 		    while(ac_list) {
   2988 			    ac = ac_list;
   2989 			    if (ac->clabel)
   2990 				    free(ac->clabel, M_RAIDFRAME);
   2991 			    ac_list = ac_list->next;
   2992 			    free(ac, M_RAIDFRAME);
   2993 		    }
   2994 		    printf("RAID auto config: out of memory!\n");
   2995 		    return NULL; /* XXX probably should panic? */
   2996 	}
   2997 
   2998 	if (!raidread_component_label(secsize, dev, vp, clabel)) {
   2999 		/* Got the label.  Does it look reasonable? */
   3000 		if (rf_reasonable_label(clabel, numsecs) &&
   3001 		    (rf_component_label_partitionsize(clabel) <= size)) {
   3002 #ifdef DEBUG
   3003 			printf("Component on: %s: %llu\n",
   3004 				cname, (unsigned long long)size);
   3005 			rf_print_component_label(clabel);
   3006 #endif
   3007 			/* if it's reasonable, add it, else ignore it. */
   3008 			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
   3009 				M_NOWAIT);
   3010 			if (ac == NULL) {
   3011 				free(clabel, M_RAIDFRAME);
   3012 				goto oomem;
   3013 			}
   3014 			strlcpy(ac->devname, cname, sizeof(ac->devname));
   3015 			ac->dev = dev;
   3016 			ac->vp = vp;
   3017 			ac->clabel = clabel;
   3018 			ac->next = ac_list;
   3019 			ac_list = ac;
   3020 			good_one = 1;
   3021 		}
   3022 	}
   3023 	if (!good_one) {
   3024 		/* cleanup */
   3025 		free(clabel, M_RAIDFRAME);
   3026 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3027 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3028 		vput(vp);
   3029 	}
   3030 	return ac_list;
   3031 }
   3032 
   3033 RF_AutoConfig_t *
   3034 rf_find_raid_components(void)
   3035 {
   3036 	struct vnode *vp;
   3037 	struct disklabel label;
   3038 	device_t dv;
   3039 	deviter_t di;
   3040 	dev_t dev;
   3041 	int bmajor, bminor, wedge, rf_part_found;
   3042 	int error;
   3043 	int i;
   3044 	RF_AutoConfig_t *ac_list;
   3045 	uint64_t numsecs;
   3046 	unsigned secsize;
   3047 
   3048 	/* initialize the AutoConfig list */
   3049 	ac_list = NULL;
   3050 
   3051 	/* we begin by trolling through *all* the devices on the system */
   3052 
   3053 	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
   3054 	     dv = deviter_next(&di)) {
   3055 
   3056 		/* we are only interested in disks... */
   3057 		if (device_class(dv) != DV_DISK)
   3058 			continue;
   3059 
   3060 		/* we don't care about floppies... */
   3061 		if (device_is_a(dv, "fd")) {
   3062 			continue;
   3063 		}
   3064 
   3065 		/* we don't care about CD's... */
   3066 		if (device_is_a(dv, "cd")) {
   3067 			continue;
   3068 		}
   3069 
   3070 		/* we don't care about md's... */
   3071 		if (device_is_a(dv, "md")) {
   3072 			continue;
   3073 		}
   3074 
   3075 		/* hdfd is the Atari/Hades floppy driver */
   3076 		if (device_is_a(dv, "hdfd")) {
   3077 			continue;
   3078 		}
   3079 
   3080 		/* fdisa is the Atari/Milan floppy driver */
   3081 		if (device_is_a(dv, "fdisa")) {
   3082 			continue;
   3083 		}
   3084 
   3085 		/* need to find the device_name_to_block_device_major stuff */
   3086 		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
   3087 
   3088 		rf_part_found = 0; /*No raid partition as yet*/
   3089 
   3090 		/* get a vnode for the raw partition of this disk */
   3091 
   3092 		wedge = device_is_a(dv, "dk");
   3093 		bminor = minor(device_unit(dv));
   3094 		dev = wedge ? makedev(bmajor, bminor) :
   3095 		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
   3096 		if (bdevvp(dev, &vp))
   3097 			panic("RAID can't alloc vnode");
   3098 
   3099 		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
   3100 
   3101 		if (error) {
   3102 			/* "Who cares."  Continue looking
   3103 			   for something that exists*/
   3104 			vput(vp);
   3105 			continue;
   3106 		}
   3107 
   3108 		error = getdisksize(vp, &numsecs, &secsize);
   3109 		if (error) {
   3110 			vput(vp);
   3111 			continue;
   3112 		}
   3113 		if (wedge) {
   3114 			struct dkwedge_info dkw;
   3115 			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
   3116 			    NOCRED);
   3117 			if (error) {
   3118 				printf("RAIDframe: can't get wedge info for "
   3119 				    "dev %s (%d)\n", device_xname(dv), error);
   3120 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3121 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3122 				vput(vp);
   3123 				continue;
   3124 			}
   3125 
   3126 			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
   3127 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3128 				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3129 				vput(vp);
   3130 				continue;
   3131 			}
   3132 
   3133 			ac_list = rf_get_component(ac_list, dev, vp,
   3134 			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
   3135 			rf_part_found = 1; /*There is a raid component on this disk*/
   3136 			continue;
   3137 		}
   3138 
   3139 		/* Ok, the disk exists.  Go get the disklabel. */
   3140 		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
   3141 		if (error) {
   3142 			/*
   3143 			 * XXX can't happen - open() would
   3144 			 * have errored out (or faked up one)
   3145 			 */
   3146 			if (error != ENOTTY)
   3147 				printf("RAIDframe: can't get label for dev "
   3148 				    "%s (%d)\n", device_xname(dv), error);
   3149 		}
   3150 
   3151 		/* don't need this any more.  We'll allocate it again
   3152 		   a little later if we really do... */
   3153 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
   3154 		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
   3155 		vput(vp);
   3156 
   3157 		if (error)
   3158 			continue;
   3159 
   3160 		rf_part_found = 0; /*No raid partitions yet*/
   3161 		for (i = 0; i < label.d_npartitions; i++) {
   3162 			char cname[sizeof(ac_list->devname)];
   3163 
   3164 			/* We only support partitions marked as RAID */
   3165 			if (label.d_partitions[i].p_fstype != FS_RAID)
   3166 				continue;
   3167 
   3168 			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
   3169 			if (bdevvp(dev, &vp))
   3170 				panic("RAID can't alloc vnode");
   3171 
   3172 			error = VOP_OPEN(vp, FREAD, NOCRED);
   3173 			if (error) {
   3174 				/* Whatever... */
   3175 				vput(vp);
   3176 				continue;
   3177 			}
   3178 			snprintf(cname, sizeof(cname), "%s%c",
   3179 			    device_xname(dv), 'a' + i);
   3180 			ac_list = rf_get_component(ac_list, dev, vp, cname,
   3181 				label.d_partitions[i].p_size, numsecs, secsize);
    3182 			rf_part_found = 1; /*There is at least one raid partition on this disk*/
   3183 		}
   3184 
   3185 		/*
    3186 		 * If there is no raid component on this disk, either in a
    3187 		 * disklabel or inside a wedge, check the raw partition as well,
    3188 		 * as it is possible to configure raid components on raw disk
    3189 		 * devices.
   3190 		 */
   3191 
   3192 		if (!rf_part_found) {
   3193 			char cname[sizeof(ac_list->devname)];
   3194 
   3195 			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
   3196 			if (bdevvp(dev, &vp))
   3197 				panic("RAID can't alloc vnode");
   3198 
   3199 			error = VOP_OPEN(vp, FREAD, NOCRED);
   3200 			if (error) {
   3201 				/* Whatever... */
   3202 				vput(vp);
   3203 				continue;
   3204 			}
   3205 			snprintf(cname, sizeof(cname), "%s%c",
   3206 			    device_xname(dv), 'a' + RAW_PART);
   3207 			ac_list = rf_get_component(ac_list, dev, vp, cname,
   3208 				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
   3209 		}
   3210 	}
   3211 	deviter_release(&di);
   3212 	return ac_list;
   3213 }
   3214 
   3215 
   3216 int
   3217 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3218 {
   3219 
   3220 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
   3221 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
   3222 	    ((clabel->clean == RF_RAID_CLEAN) ||
   3223 	     (clabel->clean == RF_RAID_DIRTY)) &&
   3224 	    clabel->row >=0 &&
   3225 	    clabel->column >= 0 &&
   3226 	    clabel->num_rows > 0 &&
   3227 	    clabel->num_columns > 0 &&
   3228 	    clabel->row < clabel->num_rows &&
   3229 	    clabel->column < clabel->num_columns &&
   3230 	    clabel->blockSize > 0 &&
   3231 	    /*
   3232 	     * numBlocksHi may contain garbage, but it is ok since
   3233 	     * the type is unsigned.  If it is really garbage,
   3234 	     * rf_fix_old_label_size() will fix it.
   3235 	     */
   3236 	    rf_component_label_numblocks(clabel) > 0) {
   3237 		/*
   3238 		 * label looks reasonable enough...
   3239 		 * let's make sure it has no old garbage.
   3240 		 */
   3241 		if (numsecs)
   3242 			rf_fix_old_label_size(clabel, numsecs);
   3243 		return(1);
   3244 	}
   3245 	return(0);
   3246 }
   3247 
   3248 
   3249 /*
   3250  * For reasons yet unknown, some old component labels have garbage in
   3251  * the newer numBlocksHi region, and this causes lossage.  Since those
   3252  * disks will also have numsecs set to less than 32 bits of sectors,
    3253  * we can determine when this corruption has occurred, and fix it.
   3254  *
   3255  * The exact same problem, with the same unknown reason, happens to
   3256  * the partitionSizeHi member as well.
   3257  */
   3258 static void
   3259 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
   3260 {
   3261 
   3262 	if (numsecs < ((uint64_t)1 << 32)) {
   3263 		if (clabel->numBlocksHi) {
   3264 			printf("WARNING: total sectors < 32 bits, yet "
   3265 			       "numBlocksHi set\n"
   3266 			       "WARNING: resetting numBlocksHi to zero.\n");
   3267 			clabel->numBlocksHi = 0;
   3268 		}
   3269 
   3270 		if (clabel->partitionSizeHi) {
   3271 			printf("WARNING: total sectors < 32 bits, yet "
   3272 			       "partitionSizeHi set\n"
   3273 			       "WARNING: resetting partitionSizeHi to zero.\n");
   3274 			clabel->partitionSizeHi = 0;
   3275 		}
   3276 	}
   3277 }
   3278 
   3279 
   3280 #ifdef DEBUG
   3281 void
   3282 rf_print_component_label(RF_ComponentLabel_t *clabel)
   3283 {
   3284 	uint64_t numBlocks;
   3285 
   3286 	numBlocks = rf_component_label_numblocks(clabel);
   3287 
   3288 	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
   3289 	       clabel->row, clabel->column,
   3290 	       clabel->num_rows, clabel->num_columns);
   3291 	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
   3292 	       clabel->version, clabel->serial_number,
   3293 	       clabel->mod_counter);
   3294 	printf("   Clean: %s Status: %d\n",
   3295 	       clabel->clean ? "Yes" : "No", clabel->status);
   3296 	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
   3297 	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
   3298 	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
   3299 	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
   3300 	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
   3301 	printf("   Contains root partition: %s\n",
   3302 	       clabel->root_partition ? "Yes" : "No");
   3303 	printf("   Last configured as: raid%d\n", clabel->last_unit);
   3304 #if 0
   3305 	   printf("   Config order: %d\n", clabel->config_order);
   3306 #endif
   3307 
   3308 }
   3309 #endif
   3310 
   3311 RF_ConfigSet_t *
   3312 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
   3313 {
   3314 	RF_AutoConfig_t *ac;
   3315 	RF_ConfigSet_t *config_sets;
   3316 	RF_ConfigSet_t *cset;
   3317 	RF_AutoConfig_t *ac_next;
   3318 
   3319 
   3320 	config_sets = NULL;
   3321 
   3322 	/* Go through the AutoConfig list, and figure out which components
   3323 	   belong to what sets.  */
   3324 	ac = ac_list;
   3325 	while(ac!=NULL) {
   3326 		/* we're going to putz with ac->next, so save it here
   3327 		   for use at the end of the loop */
   3328 		ac_next = ac->next;
   3329 
   3330 		if (config_sets == NULL) {
   3331 			/* will need at least this one... */
   3332 			config_sets = (RF_ConfigSet_t *)
   3333 				malloc(sizeof(RF_ConfigSet_t),
   3334 				       M_RAIDFRAME, M_NOWAIT);
   3335 			if (config_sets == NULL) {
   3336 				panic("rf_create_auto_sets: No memory!");
   3337 			}
   3338 			/* this one is easy :) */
   3339 			config_sets->ac = ac;
   3340 			config_sets->next = NULL;
   3341 			config_sets->rootable = 0;
   3342 			ac->next = NULL;
   3343 		} else {
   3344 			/* which set does this component fit into? */
   3345 			cset = config_sets;
   3346 			while(cset!=NULL) {
   3347 				if (rf_does_it_fit(cset, ac)) {
   3348 					/* looks like it matches... */
   3349 					ac->next = cset->ac;
   3350 					cset->ac = ac;
   3351 					break;
   3352 				}
   3353 				cset = cset->next;
   3354 			}
   3355 			if (cset==NULL) {
   3356 				/* didn't find a match above... new set..*/
   3357 				cset = (RF_ConfigSet_t *)
   3358 					malloc(sizeof(RF_ConfigSet_t),
   3359 					       M_RAIDFRAME, M_NOWAIT);
   3360 				if (cset == NULL) {
   3361 					panic("rf_create_auto_sets: No memory!");
   3362 				}
   3363 				cset->ac = ac;
   3364 				ac->next = NULL;
   3365 				cset->next = config_sets;
   3366 				cset->rootable = 0;
   3367 				config_sets = cset;
   3368 			}
   3369 		}
   3370 		ac = ac_next;
   3371 	}
   3372 
   3373 
   3374 	return(config_sets);
   3375 }
   3376 
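/*
 * Decide whether component 'ac' belongs in configuration set 'cset'
 * by comparing its component label against that of the first member
 * of the set.  Returns 1 if it fits, 0 if it does not.
 */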
   3377 static int
   3378 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
   3379 {
   3380 	RF_ComponentLabel_t *clabel1, *clabel2;
   3381 
   3382 	/* If this one matches the *first* one in the set, that's good
   3383 	   enough, since the other members of the set would have been
   3384 	   through here too... */
   3385 	/* note that we are not checking partitionSize here..
   3386 
   3387 	   Note that we are also not checking the mod_counters here.
   3388 	   If everything else matches except the mod_counter, that's
   3389 	   good enough for this test.  We will deal with the mod_counters
   3390 	   a little later in the autoconfiguration process.
   3391 
   3392 	    (clabel1->mod_counter == clabel2->mod_counter) &&
   3393 
   3394 	   The reason we don't check for this is that failed disks
   3395 	   will have lower modification counts.  If those disks are
   3396 	   not added to the set they used to belong to, then they will
   3397 	   form their own set, which may result in 2 different sets,
   3398 	   for example, competing to be configured at raid0, and
   3399 	   perhaps competing to be the root filesystem set.  If the
   3400 	   wrong ones get configured, or both attempt to become /,
   3401 	   weird behaviour and/or serious lossage will occur.  Thus we
   3402 	   need to bring them into the fold here, and kick them out at
   3403 	   a later point.
   3404 
   3405 	*/
   3406 
   3407 	clabel1 = cset->ac->clabel;
   3408 	clabel2 = ac->clabel;
   3409 	if ((clabel1->version == clabel2->version) &&
   3410 	    (clabel1->serial_number == clabel2->serial_number) &&
   3411 	    (clabel1->num_rows == clabel2->num_rows) &&
   3412 	    (clabel1->num_columns == clabel2->num_columns) &&
   3413 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
   3414 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
   3415 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
   3416 	    (clabel1->parityConfig == clabel2->parityConfig) &&
   3417 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
   3418 	    (clabel1->blockSize == clabel2->blockSize) &&
   3419 	    rf_component_label_numblocks(clabel1) ==
   3420 	    rf_component_label_numblocks(clabel2) &&
   3421 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
   3422 	    (clabel1->root_partition == clabel2->root_partition) &&
   3423 	    (clabel1->last_unit == clabel2->last_unit) &&
   3424 	    (clabel1->config_order == clabel2->config_order)) {
   3425 		/* if it gets here, it almost *has* to be a match */
   3426 	} else {
   3427 		/* it's not consistent with somebody in the set..
   3428 		   punt */
   3429 		return(0);
   3430 	}
   3431 	/* all was fine.. it must fit... */
   3432 	return(1);
   3433 }
   3434 
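/*
 * Check whether a configuration set has enough live components to be
 * configured.  Returns 1 if the set is usable, 0 if too many
 * components are missing for the parity level in use.
 */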
   3435 int
   3436 rf_have_enough_components(RF_ConfigSet_t *cset)
   3437 {
   3438 	RF_AutoConfig_t *ac;
   3439 	RF_AutoConfig_t *auto_config;
   3440 	RF_ComponentLabel_t *clabel;
   3441 	int c;
   3442 	int num_cols;
   3443 	int num_missing;
   3444 	int mod_counter;
   3445 	int mod_counter_found;
   3446 	int even_pair_failed;
   3447 	char parity_type;
   3448 
   3449 
   3450 	/* check to see that we have enough 'live' components
   3451 	   of this set.  If so, we can configure it if necessary */
   3452 
   3453 	num_cols = cset->ac->clabel->num_columns;
   3454 	parity_type = cset->ac->clabel->parityConfig;
   3455 
   3456 	/* XXX Check for duplicate components!?!?!? */
   3457 
   3458 	/* Determine what the mod_counter is supposed to be for this set. */
   3459 
   3460 	mod_counter_found = 0;
   3461 	mod_counter = 0;
   3462 	ac = cset->ac;
   3463 	while(ac!=NULL) {
   3464 		if (mod_counter_found==0) {
   3465 			mod_counter = ac->clabel->mod_counter;
   3466 			mod_counter_found = 1;
   3467 		} else {
   3468 			if (ac->clabel->mod_counter > mod_counter) {
   3469 				mod_counter = ac->clabel->mod_counter;
   3470 			}
   3471 		}
   3472 		ac = ac->next;
   3473 	}
   3474 
   3475 	num_missing = 0;
   3476 	auto_config = cset->ac;
   3477 
   3478 	even_pair_failed = 0;
   3479 	for(c=0; c<num_cols; c++) {
   3480 		ac = auto_config;
   3481 		while(ac!=NULL) {
   3482 			if ((ac->clabel->column == c) &&
   3483 			    (ac->clabel->mod_counter == mod_counter)) {
   3484 				/* it's this one... */
   3485 #ifdef DEBUG
   3486 				printf("Found: %s at %d\n",
   3487 				       ac->devname,c);
   3488 #endif
   3489 				break;
   3490 			}
   3491 			ac=ac->next;
   3492 		}
   3493 		if (ac==NULL) {
   3494 				/* Didn't find one here! */
   3495 				/* special case for RAID 1, especially
   3496 				   where there are more than 2
   3497 				   components (where RAIDframe treats
   3498 				   things a little differently :( ) */
   3499 			if (parity_type == '1') {
   3500 				if (c%2 == 0) { /* even component */
   3501 					even_pair_failed = 1;
   3502 				} else { /* odd component.  If
   3503 					    we're failed, and
   3504 					    so is the even
   3505 					    component, it's
   3506 					    "Good Night, Charlie" */
   3507 					if (even_pair_failed == 1) {
   3508 						return(0);
   3509 					}
   3510 				}
   3511 			} else {
   3512 				/* normal accounting */
   3513 				num_missing++;
   3514 			}
   3515 		}
   3516 		if ((parity_type == '1') && (c%2 == 1)) {
   3517 				/* Just finished checking the odd member of an
   3518 				   even/odd pair without bailing; reset the
   3519 				   even_pair_failed flag and go on to the next pair. */
   3520 			even_pair_failed = 0;
   3521 		}
   3522 	}
   3523 
   3524 	clabel = cset->ac->clabel;
   3525 
   3526 	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
   3527 	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
   3528 	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
   3529 		/* XXX this needs to be made *much* more general */
   3530 		/* Too many failures */
   3531 		return(0);
   3532 	}
   3533 	/* otherwise, all is well, and we've got enough to take a kick
   3534 	   at autoconfiguring this set */
   3535 	return(1);
   3536 }
   3537 
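/*
 * Fill in the RF_Config_t for an auto-detected set: geometry, layout
 * and queueing parameters are taken from the component labels, and
 * the device name of each component is recorded in its column slot.
 */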
   3538 void
   3539 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
   3540 			RF_Raid_t *raidPtr)
   3541 {
   3542 	RF_ComponentLabel_t *clabel;
   3543 	int i;
   3544 
   3545 	clabel = ac->clabel;
   3546 
   3547 	/* 1. Fill in the common stuff */
   3548 	config->numRow = clabel->num_rows = 1;
   3549 	config->numCol = clabel->num_columns;
   3550 	config->numSpare = 0; /* XXX should this be set here? */
   3551 	config->sectPerSU = clabel->sectPerSU;
   3552 	config->SUsPerPU = clabel->SUsPerPU;
   3553 	config->SUsPerRU = clabel->SUsPerRU;
   3554 	config->parityConfig = clabel->parityConfig;
   3555 	/* XXX... */
   3556 	strcpy(config->diskQueueType,"fifo");
   3557 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
   3558 	config->layoutSpecificSize = 0; /* XXX ?? */
   3559 
   3560 	while(ac!=NULL) {
   3561 		/* row/col values will be in range due to the checks
   3562 		   in rf_reasonable_label() */
   3563 		strcpy(config->devnames[0][ac->clabel->column],
   3564 		       ac->devname);
   3565 		ac = ac->next;
   3566 	}
   3567 
   3568 	for(i=0;i<RF_MAXDBGV;i++) {
   3569 		config->debugVars[i][0] = 0;
   3570 	}
   3571 }
   3572 
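/*
 * Set the autoconfigure flag for this RAID set and write it into the
 * component labels of all optimal components and in-use spares.
 * Returns the new value.
 */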
   3573 int
   3574 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
   3575 {
   3576 	RF_ComponentLabel_t *clabel;
   3577 	int column;
   3578 	int sparecol;
   3579 
   3580 	raidPtr->autoconfigure = new_value;
   3581 
   3582 	for(column=0; column<raidPtr->numCol; column++) {
   3583 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3584 			clabel = raidget_component_label(raidPtr, column);
   3585 			clabel->autoconfigure = new_value;
   3586 			raidflush_component_label(raidPtr, column);
   3587 		}
   3588 	}
   3589 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3590 		sparecol = raidPtr->numCol + column;
   3591 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3592 			clabel = raidget_component_label(raidPtr, sparecol);
   3593 			clabel->autoconfigure = new_value;
   3594 			raidflush_component_label(raidPtr, sparecol);
   3595 		}
   3596 	}
   3597 	return(new_value);
   3598 }
   3599 
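/*
 * Set the root_partition flag for this RAID set and write it into the
 * component labels of all optimal components and in-use spares.
 * Returns the new value.
 */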
   3600 int
   3601 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
   3602 {
   3603 	RF_ComponentLabel_t *clabel;
   3604 	int column;
   3605 	int sparecol;
   3606 
   3607 	raidPtr->root_partition = new_value;
   3608 	for(column=0; column<raidPtr->numCol; column++) {
   3609 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
   3610 			clabel = raidget_component_label(raidPtr, column);
   3611 			clabel->root_partition = new_value;
   3612 			raidflush_component_label(raidPtr, column);
   3613 		}
   3614 	}
   3615 	for(column = 0; column < raidPtr->numSpare ; column++) {
   3616 		sparecol = raidPtr->numCol + column;
   3617 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3618 			clabel = raidget_component_label(raidPtr, sparecol);
   3619 			clabel->root_partition = new_value;
   3620 			raidflush_component_label(raidPtr, sparecol);
   3621 		}
   3622 	}
   3623 	return(new_value);
   3624 }
   3625 
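/*
 * Close and release the vnode of every component in the given
 * configuration set.
 */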
   3626 void
   3627 rf_release_all_vps(RF_ConfigSet_t *cset)
   3628 {
   3629 	RF_AutoConfig_t *ac;
   3630 
   3631 	ac = cset->ac;
   3632 	while(ac!=NULL) {
   3633 		/* Close the vp, and give it back */
   3634 		if (ac->vp) {
   3635 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
   3636 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
   3637 			vput(ac->vp);
   3638 			ac->vp = NULL;
   3639 		}
   3640 		ac = ac->next;
   3641 	}
   3642 }
   3643 
   3644 
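/*
 * Free each RF_AutoConfig_t in the set along with its component
 * label, then free the configuration set itself.
 */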
   3645 void
   3646 rf_cleanup_config_set(RF_ConfigSet_t *cset)
   3647 {
   3648 	RF_AutoConfig_t *ac;
   3649 	RF_AutoConfig_t *next_ac;
   3650 
   3651 	ac = cset->ac;
   3652 	while(ac!=NULL) {
   3653 		next_ac = ac->next;
   3654 		/* nuke the label */
   3655 		free(ac->clabel, M_RAIDFRAME);
   3656 		/* cleanup the config structure */
   3657 		free(ac, M_RAIDFRAME);
   3658 		/* "next.." */
   3659 		ac = next_ac;
   3660 	}
   3661 	/* and, finally, nuke the config set */
   3662 	free(cset, M_RAIDFRAME);
   3663 }
   3664 
   3665 
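/*
 * Initialize a component label from the current state of the RAID
 * set: geometry, serial number, mod counter and configuration flags.
 * The label is marked dirty and the component status set to optimal.
 */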
   3666 void
   3667 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
   3668 {
   3669 	/* current version number */
   3670 	clabel->version = RF_COMPONENT_LABEL_VERSION;
   3671 	clabel->serial_number = raidPtr->serial_number;
   3672 	clabel->mod_counter = raidPtr->mod_counter;
   3673 
   3674 	clabel->num_rows = 1;
   3675 	clabel->num_columns = raidPtr->numCol;
   3676 	clabel->clean = RF_RAID_DIRTY; /* not clean */
   3677 	clabel->status = rf_ds_optimal; /* "It's good!" */
   3678 
   3679 	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
   3680 	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
   3681 	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
   3682 
   3683 	clabel->blockSize = raidPtr->bytesPerSector;
   3684 	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
   3685 
   3686 	/* XXX not portable */
   3687 	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
   3688 	clabel->maxOutstanding = raidPtr->maxOutstanding;
   3689 	clabel->autoconfigure = raidPtr->autoconfigure;
   3690 	clabel->root_partition = raidPtr->root_partition;
   3691 	clabel->last_unit = raidPtr->raidid;
   3692 	clabel->config_order = raidPtr->config_order;
   3693 
   3694 #ifndef RF_NO_PARITY_MAP
   3695 	rf_paritymap_init_label(raidPtr->parity_map, clabel);
   3696 #endif
   3697 }
   3698 
   3699 int
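/*
 * Auto-configure a detected set: pick a RAID unit (preferring the one
 * recorded in the component labels), build a configuration from the
 * labels and configure the device.  On success, *unit is set to the
 * unit number used; returns 0 on success, nonzero on failure.
 */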
   3700 rf_auto_config_set(RF_ConfigSet_t *cset, int *unit)
   3701 {
   3702 	RF_Raid_t *raidPtr;
   3703 	RF_Config_t *config;
   3704 	int raidID;
   3705 	int retcode;
   3706 
   3707 #ifdef DEBUG
   3708 	printf("RAID autoconfigure\n");
   3709 #endif
   3710 
   3711 	retcode = 0;
   3712 	*unit = -1;
   3713 
   3714 	/* 1. Create a config structure */
   3715 
   3716 	config = (RF_Config_t *)malloc(sizeof(RF_Config_t),
   3717 				       M_RAIDFRAME,
   3718 				       M_NOWAIT);
   3719 	if (config==NULL) {
   3720 		printf("Out of mem!?!?\n");
   3721 				/* XXX do something more intelligent here. */
   3722 		return(1);
   3723 	}
   3724 
   3725 	memset(config, 0, sizeof(RF_Config_t));
   3726 
   3727 	/*
   3728 	   2. Figure out what RAID ID this set is supposed to live at.
   3729 	   See if we can get the same RAID dev that it was configured
   3730 	   on last time..
   3731 	*/
   3732 
   3733 	raidID = cset->ac->clabel->last_unit;
   3734 	if ((raidID < 0) || (raidID >= numraid)) {
   3735 		/* let's not wander off into lala land. */
   3736 		raidID = numraid - 1;
   3737 	}
   3738 	if (raidPtrs[raidID]->valid != 0) {
   3739 
   3740 		/*
   3741 		   Nope... Go looking for an alternative...
   3742 		   Start high so we don't immediately use raid0 if that's
   3743 		   not taken.
   3744 		*/
   3745 
   3746 		for(raidID = numraid - 1; raidID >= 0; raidID--) {
   3747 			if (raidPtrs[raidID]->valid == 0) {
   3748 				/* can use this one! */
   3749 				break;
   3750 			}
   3751 		}
   3752 	}
   3753 
   3754 	if (raidID < 0) {
   3755 		/* punt... */
   3756 		printf("Unable to auto configure this set!\n");
   3757 		printf("(Out of RAID devs!)\n");
   3758 		free(config, M_RAIDFRAME);
   3759 		return(1);
   3760 	}
   3761 
   3762 #ifdef DEBUG
   3763 	printf("Configuring raid%d:\n",raidID);
   3764 #endif
   3765 
   3766 	raidPtr = raidPtrs[raidID];
   3767 
   3768 	/* XXX all this stuff should be done SOMEWHERE ELSE! */
   3769 	raidPtr->raidid = raidID;
   3770 	raidPtr->openings = RAIDOUTSTANDING;
   3771 
   3772 	/* 3. Build the configuration structure */
   3773 	rf_create_configuration(cset->ac, config, raidPtr);
   3774 
   3775 	/* 4. Do the configuration */
   3776 	retcode = rf_Configure(raidPtr, config, cset->ac);
   3777 
   3778 	if (retcode == 0) {
   3779 
   3780 		raidinit(raidPtrs[raidID]);
   3781 
   3782 		rf_markalldirty(raidPtrs[raidID]);
   3783 		raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */
   3784 		if (cset->ac->clabel->root_partition==1) {
   3785 			/* everything configured just fine.  Make a note
   3786 			   that this set is eligible to be root. */
   3787 			cset->rootable = 1;
   3788 			/* XXX do this here? */
   3789 			raidPtrs[raidID]->root_partition = 1;
   3790 		}
   3791 	}
   3792 
   3793 	/* 5. Cleanup */
   3794 	free(config, M_RAIDFRAME);
   3795 
   3796 	*unit = raidID;
   3797 	return(retcode);
   3798 }
   3799 
   3800 void
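/*
 * Record the completion of the I/O described by the access
 * descriptor's buffer in the disk statistics for this RAID unit.
 */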
   3801 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
   3802 {
   3803 	struct buf *bp;
   3804 
   3805 	bp = (struct buf *)desc->bp;
   3806 	disk_unbusy(&raid_softc[desc->raidPtr->raidid].sc_dkdev,
   3807 	    (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ));
   3808 }
   3809 
   3810 void
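/*
 * Convenience wrapper around pool_init(9): create a pool for objects
 * of the given size, prime it with xmin items, and set the low and
 * high watermarks to xmin and xmax respectively.
 */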
   3811 rf_pool_init(struct pool *p, size_t size, const char *w_chan,
   3812 	     size_t xmin, size_t xmax)
   3813 {
   3814 	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
   3815 	pool_sethiwat(p, xmax);
   3816 	pool_prime(p, xmin);
   3817 	pool_setlowat(p, xmin);
   3818 }
   3819 
   3820 /*
   3821  * rf_buf_queue_check(int raidid) -- looks into the buf_queue to see
   3822  * if there is IO pending and if that IO could possibly be done for a
   3823  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
   3824  * otherwise.
   3825  *
   3826  */
   3827 
   3828 int
   3829 rf_buf_queue_check(int raidid)
   3830 {
   3831 	if ((bufq_peek(raid_softc[raidid].buf_queue) != NULL) &&
   3832 	    raidPtrs[raidid]->openings > 0) {
   3833 		/* there is work to do */
   3834 		return 0;
   3835 	}
   3836 	/* default is nothing to do */
   3837 	return 1;
   3838 }
   3839 
   3840 int
   3841 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
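/*
 * Query the size and sector size of the component open at 'vp' and
 * record them in the disk structure; the usable block count excludes
 * the rf_protectedSectors reserved by RAIDframe.
 */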
   3842 {
   3843 	uint64_t numsecs;
   3844 	unsigned secsize;
   3845 	int error;
   3846 
   3847 	error = getdisksize(vp, &numsecs, &secsize);
   3848 	if (error == 0) {
   3849 		diskPtr->blockSize = secsize;
   3850 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
   3851 		diskPtr->partitionSize = numsecs;
   3852 		return 0;
   3853 	}
   3854 	return error;
   3855 }
   3856 
   3857 static int
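/*
 * Autoconfiguration glue for the raid pseudo-device: match always
 * succeeds and attach has nothing to do, since the real setup happens
 * when a RAID set is configured.
 */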
   3858 raid_match(device_t self, cfdata_t cfdata, void *aux)
   3859 {
   3860 	return 1;
   3861 }
   3862 
   3863 static void
   3864 raid_attach(device_t parent, device_t self, void *aux)
   3865 {
   3866 
   3867 }
   3868 
   3869 
   3870 static int
   3871 raid_detach(device_t self, int flags)
   3872 {
   3873 	int error;
   3874 	struct raid_softc *rs = &raid_softc[device_unit(self)];
   3875 
   3876 	if ((error = raidlock(rs)) != 0)
   3877 		return (error);
   3878 
   3879 	error = raid_detach_unlocked(rs);
   3880 
   3881 	raidunlock(rs);
   3882 
   3883 	return error;
   3884 }
   3885 
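/*
 * Publish a synthetic disk geometry for the RAID device through the
 * device properties dictionary: one "track" per data stripe and a
 * fixed number of tracks per cylinder derived from the column count.
 */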
   3886 static void
   3887 rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr)
   3888 {
   3889 	prop_dictionary_t disk_info, odisk_info, geom;
   3890 	disk_info = prop_dictionary_create();
   3891 	geom = prop_dictionary_create();
   3892 	prop_dictionary_set_uint64(geom, "sectors-per-unit",
   3893 				   raidPtr->totalSectors);
   3894 	prop_dictionary_set_uint32(geom, "sector-size",
   3895 				   raidPtr->bytesPerSector);
   3896 
   3897 	prop_dictionary_set_uint16(geom, "sectors-per-track",
   3898 				   raidPtr->Layout.dataSectorsPerStripe);
   3899 	prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
   3900 				   4 * raidPtr->numCol);
   3901 
   3902 	prop_dictionary_set_uint64(geom, "cylinders-per-unit",
   3903 	   raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe *
   3904 	   (4 * raidPtr->numCol)));
   3905 
   3906 	prop_dictionary_set(disk_info, "geometry", geom);
   3907 	prop_object_release(geom);
   3908 	prop_dictionary_set(device_properties(rs->sc_dev),
   3909 			    "disk-info", disk_info);
   3910 	odisk_info = rs->sc_dkdev.dk_info;
   3911 	rs->sc_dkdev.dk_info = disk_info;
   3912 	if (odisk_info)
   3913 		prop_object_release(odisk_info);
   3914 }
   3915 
   3916 /*
   3917  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
   3918  * We end up returning whatever error was returned by the first cache flush
   3919  * that fails.
   3920  */
   3921 
   3922 int
   3923 rf_sync_component_caches(RF_Raid_t *raidPtr)
   3924 {
   3925 	int c, sparecol;
   3926 	int e,error;
   3927 	int force = 1;
   3928 
   3929 	error = 0;
   3930 	for (c = 0; c < raidPtr->numCol; c++) {
   3931 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
   3932 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
   3933 					  &force, FWRITE, NOCRED);
   3934 			if (e) {
   3935 				if (e != ENODEV)
   3936 					printf("raid%d: cache flush to component %s failed.\n",
   3937 					       raidPtr->raidid, raidPtr->Disks[c].devname);
   3938 				if (error == 0) {
   3939 					error = e;
   3940 				}
   3941 			}
   3942 		}
   3943 	}
   3944 
   3945 	for( c = 0; c < raidPtr->numSpare ; c++) {
   3946 		sparecol = raidPtr->numCol + c;
   3947 		/* Need to ensure that the reconstruct actually completed! */
   3948 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
   3949 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
   3950 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
   3951 			if (e) {
   3952 				if (e != ENODEV)
   3953 					printf("raid%d: cache flush to component %s failed.\n",
   3954 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
   3955 				if (error == 0) {
   3956 					error = e;
   3957 				}
   3958 			}
   3959 		}
   3960 	}
   3961 	return error;
   3962 }
   3963