rf_netbsdkintf.c revision 1.345.2.2 1 /* $NetBSD: rf_netbsdkintf.c,v 1.345.2.2 2016/07/17 05:05:10 pgoyette Exp $ */
2
3 /*-
4 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Greg Oster; Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1988 University of Utah.
34 * Copyright (c) 1990, 1993
35 * The Regents of the University of California. All rights reserved.
36 *
37 * This code is derived from software contributed to Berkeley by
38 * the Systems Programming Group of the University of Utah Computer
39 * Science Department.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * from: Utah $Hdr: cd.c 1.6 90/11/28$
66 *
67 * @(#)cd.c 8.2 (Berkeley) 11/16/93
68 */
69
70 /*
71 * Copyright (c) 1995 Carnegie-Mellon University.
72 * All rights reserved.
73 *
74 * Authors: Mark Holland, Jim Zelenka
75 *
76 * Permission to use, copy, modify and distribute this software and
77 * its documentation is hereby granted, provided that both the copyright
78 * notice and this permission notice appear in all copies of the
79 * software, derivative works or modified versions, and any portions
80 * thereof, and that both notices appear in supporting documentation.
81 *
82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85 *
86 * Carnegie Mellon requests users of this software to return to
87 *
88 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
89 * School of Computer Science
90 * Carnegie Mellon University
91 * Pittsburgh PA 15213-3890
92 *
93 * any improvements or extensions that they make and grant Carnegie the
94 * rights to redistribute these changes.
95 */
96
97 /***********************************************************
98 *
99 * rf_kintf.c -- the kernel interface routines for RAIDframe
100 *
101 ***********************************************************/
102
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.345.2.2 2016/07/17 05:05:10 pgoyette Exp $");
105
106 #ifdef _KERNEL_OPT
107 #include "opt_compat_netbsd.h"
108 #include "opt_raid_autoconfig.h"
109 #endif
110
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129 #include <sys/module.h>
130 #include <sys/localcount.h>
131
132 #include <prop/proplib.h>
133
134 #include <dev/raidframe/raidframevar.h>
135 #include <dev/raidframe/raidframeio.h>
136 #include <dev/raidframe/rf_paritymap.h>
137
138 #include "rf_raid.h"
139 #include "rf_copyback.h"
140 #include "rf_dag.h"
141 #include "rf_dagflags.h"
142 #include "rf_desc.h"
143 #include "rf_diskqueue.h"
144 #include "rf_etimer.h"
145 #include "rf_general.h"
146 #include "rf_kintf.h"
147 #include "rf_options.h"
148 #include "rf_driver.h"
149 #include "rf_parityscan.h"
150 #include "rf_threadstuff.h"
151
152 #ifdef COMPAT_50
153 #include "rf_compat50.h"
154 #endif
155
156 #include "ioconf.h"
157
158 #ifdef DEBUG
159 int rf_kdebug_level = 0;
160 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
161 #else /* DEBUG */
162 #define db1_printf(a) { }
163 #endif /* DEBUG */
164
165 #ifdef DEBUG_ROOT
166 #define DPRINTF(a, ...) printf(a, __VA_ARGS__)
167 #else
168 #define DPRINTF(a, ...)
169 #endif
170
171 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
172 static rf_declare_mutex2(rf_sparet_wait_mutex);
173 static rf_declare_cond2(rf_sparet_wait_cv);
174 static rf_declare_cond2(rf_sparet_resp_cv);
175
176 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
177 * spare table */
178 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
179 * installation process */
180 #endif
181
182 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
183
184 /* prototypes */
185 static void KernelWakeupFunc(struct buf *);
186 static void InitBP(struct buf *, struct vnode *, unsigned,
187 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
188 void *, int, struct proc *);
189 struct raid_softc;
190 static void raidinit(struct raid_softc *);
191 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
192
193 static int raid_match(device_t, cfdata_t, void *);
194 static void raid_attach(device_t, device_t, void *);
195 static int raid_detach(device_t, int);
196
197 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
198 daddr_t, daddr_t);
199 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
200 daddr_t, daddr_t, int);
201
202 static int raidwrite_component_label(unsigned,
203 dev_t, struct vnode *, RF_ComponentLabel_t *);
204 static int raidread_component_label(unsigned,
205 dev_t, struct vnode *, RF_ComponentLabel_t *);
206
207 static int raid_diskstart(device_t, struct buf *bp);
208 static int raid_dumpblocks(device_t, void *, daddr_t, int);
209 static int raid_lastclose(device_t);
210
211 static dev_type_open(raidopen);
212 static dev_type_close(raidclose);
213 static dev_type_read(raidread);
214 static dev_type_write(raidwrite);
215 static dev_type_ioctl(raidioctl);
216 static dev_type_strategy(raidstrategy);
217 static dev_type_dump(raiddump);
218 static dev_type_size(raidsize);
219
220 #ifdef _MODULE
221 struct localcount raid_localcount_bdev, raid_localcount_cdev;
222 #endif
223
/*
 * Block device switch for RAIDframe.  All block I/O funnels through
 * raidstrategy(); raiddump()/raidsize() support crash dumps to a
 * RAID 1 set.  Marked D_DISK so the kernel treats units as disks.
 */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
#ifdef _MODULE
	/* per-devsw reference count used when loaded as a module */
	.d_localcount = &raid_localcount_bdev,
#endif
	.d_flag = D_DISK
};
237
238 const struct cdevsw raid_cdevsw = {
239 .d_open = raidopen,
240 .d_close = raidclose,
241 .d_read = raidread,
242 .d_write = raidwrite,
243 .d_ioctl = raidioctl,
244 .d_stop = nostop,
245 .d_tty = notty,
246 .d_poll = nopoll,
247 .d_mmap = nommap,
248 .d_kqfilter = nokqfilter,
249 .d_discard = nodiscard,
250 #ifdef _MODULE
251 .d_localcount = &raid_localcount_bdev,
252 #endif
253 .d_flag = D_DISK
254 };
255
/*
 * Hooks handed to the generic disk (dk) framework: it calls back into
 * these for open/close, starting queued I/O (raid_diskstart), crash
 * dumps (raid_dumpblocks) and final-close processing (raid_lastclose).
 */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
265
/*
 * Per-unit software state for a RAID device.  Allocated by
 * raidcreate(), kept on the global `raids' list (raid_lock), and
 * interlocked through sc_mutex/sc_cv via raidlock()/raidunlock().
 */
struct raid_softc {
	struct dk_softc sc_dksc;	/* generic disk framework state */
	int     sc_unit;		/* raid unit number */
	int     sc_flags;	/* flags */
	int     sc_cflags;	/* configuration flags */
	kmutex_t sc_mutex;	/* interlock mutex */
	kcondvar_t sc_cv;	/* and the condvar */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	RF_Raid_t sc_r;		/* RAIDframe per-array state */
	LIST_ENTRY(raid_softc) sc_link;	/* linkage on global `raids' list */
};
/* sc_flags */
#define RAIDF_INITED	0x01	/* unit has been initialized */
#define RAIDF_SHUTDOWN	0x02	/* unit is being shutdown */
#define RAIDF_DETACH	0x04	/* detach after final close */
#define RAIDF_WANTED	0x08	/* someone waiting to obtain a lock */
#define RAIDF_LOCKED	0x10	/* unit is locked */
#define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */

/* extract the unit number / softc from a dev_t or device_t */
#define	raidunit(x)	DISKUNIT(x)
#define raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
288
289 extern struct cfdriver raid_cd;
290 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
291 raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
292 DVF_DETACH_SHUTDOWN);
293
294 /*
295 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
296 * Be aware that large numbers can allow the driver to consume a lot of
297 * kernel memory, especially on writes, and in degraded mode reads.
298 *
299 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
300 * a single 64K write will typically require 64K for the old data,
301 * 64K for the old parity, and 64K for the new parity, for a total
302 * of 192K (if the parity buffer is not re-used immediately).
303 * Even it if is used immediately, that's still 128K, which when multiplied
304 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
305 *
306 * Now in degraded mode, for example, a 64K read on the above setup may
307 * require data reconstruction, which will require *all* of the 4 remaining
308 * disks to participate -- 4 * 32K/disk == 128K again.
309 */
310
311 #ifndef RAIDOUTSTANDING
312 #define RAIDOUTSTANDING 6
313 #endif
314
315 #define RAIDLABELDEV(dev) \
316 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
317
318 /* declared here, and made public, for the benefit of KVM stuff.. */
319
320 static int raidlock(struct raid_softc *);
321 static void raidunlock(struct raid_softc *);
322
323 static int raid_detach_unlocked(struct raid_softc *);
324
325 static void rf_markalldirty(RF_Raid_t *);
326 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
327
328 void rf_ReconThread(struct rf_recon_req *);
329 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
330 void rf_CopybackThread(RF_Raid_t *raidPtr);
331 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
332 int rf_autoconfig(device_t);
333 void rf_buildroothack(RF_ConfigSet_t *);
334
335 RF_AutoConfig_t *rf_find_raid_components(void);
336 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
337 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
338 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
339 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
340 int rf_set_autoconfig(RF_Raid_t *, int);
341 int rf_set_rootpartition(RF_Raid_t *, int);
342 void rf_release_all_vps(RF_ConfigSet_t *);
343 void rf_cleanup_config_set(RF_ConfigSet_t *);
344 int rf_have_enough_components(RF_ConfigSet_t *);
345 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
346 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
347
348 /*
349 * Debugging, mostly. Set to 0 to not allow autoconfig to take place.
350 * Note that this is overridden by having RAID_AUTOCONFIG as an option
351 * in the kernel config file.
352 */
353 #ifdef RAID_AUTOCONFIG
354 int raidautoconfig = 1;
355 #else
356 int raidautoconfig = 0;
357 #endif
358 static bool raidautoconfigdone = false;
359
360 struct RF_Pools_s rf_pools;
361
362 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
363 static kmutex_t raid_lock;
364
365 static struct raid_softc *
366 raidcreate(int unit) {
367 struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
368 if (sc == NULL) {
369 #ifdef DIAGNOSTIC
370 printf("%s: out of memory\n", __func__);
371 #endif
372 return NULL;
373 }
374 sc->sc_unit = unit;
375 cv_init(&sc->sc_cv, "raidunit");
376 mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
377 return sc;
378 }
379
380 static void
381 raiddestroy(struct raid_softc *sc) {
382 cv_destroy(&sc->sc_cv);
383 mutex_destroy(&sc->sc_mutex);
384 kmem_free(sc, sizeof(*sc));
385 }
386
387 static struct raid_softc *
388 raidget(int unit, bool create) {
389 struct raid_softc *sc;
390 if (unit < 0) {
391 #ifdef DIAGNOSTIC
392 panic("%s: unit %d!", __func__, unit);
393 #endif
394 return NULL;
395 }
396 mutex_enter(&raid_lock);
397 LIST_FOREACH(sc, &raids, sc_link) {
398 if (sc->sc_unit == unit) {
399 mutex_exit(&raid_lock);
400 return sc;
401 }
402 }
403 mutex_exit(&raid_lock);
404 if (!create)
405 return NULL;
406 if ((sc = raidcreate(unit)) == NULL)
407 return NULL;
408 mutex_enter(&raid_lock);
409 LIST_INSERT_HEAD(&raids, sc, sc_link);
410 mutex_exit(&raid_lock);
411 return sc;
412 }
413
414 static void
415 raidput(struct raid_softc *sc) {
416 mutex_enter(&raid_lock);
417 LIST_REMOVE(sc, sc_link);
418 mutex_exit(&raid_lock);
419 raiddestroy(sc);
420 }
421
/*
 * Legacy pseudo-device attach hook; `num' is ignored.
 */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}
431
/*
 * Scan all disks for RAIDframe component labels and autoconfigure the
 * discovered sets (via rf_buildroothack).  Runs at most once per
 * boot; returns 1 when a scan was performed, 0 when autoconfig is
 * disabled or has already run.
 */
int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return (0);

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}
469
470 static int
471 rf_containsboot(RF_Raid_t *r, device_t bdv) {
472 const char *bootname = device_xname(bdv);
473 size_t len = strlen(bootname);
474
475 for (int col = 0; col < r->numCol; col++) {
476 const char *devname = r->Disks[col].devname;
477 devname += sizeof("/dev/") - 1;
478 if (strncmp(devname, "dk", 2) == 0) {
479 const char *parent =
480 dkwedge_get_parent_name(r->Disks[col].dev);
481 if (parent != NULL)
482 devname = parent;
483 }
484 if (strncmp(devname, bootname, len) == 0) {
485 struct raid_softc *sc = r->softc;
486 aprint_debug("raid%d includes boot device %s\n",
487 sc->sc_unit, devname);
488 return 1;
489 }
490 }
491 return 0;
492 }
493
/*
 * Configure all autoconfigurable sets in `config_sets' and, unless
 * the user hardwired a root device (rootspec), try to point
 * booted_device at a RAID set marked rootable.  Consumes/cleans up
 * every config set passed in.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc;

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	/*
	 * Pass 1: configure each complete set that asked for
	 * autoconfiguration; remember the last rootable one in `rsc'
	 * and count them in `num_root'.  Everything else just has its
	 * resources released.
	 */
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok\n",
				    sc->sc_unit);
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}
	/*
	 * NOTE(review): when no rootable set was found, rsc is still
	 * NULL here and this computes &NULL->sc_dksc.  dksc is only
	 * used under num_root > 0, so it is never dereferenced, but
	 * the address computation itself is dubious -- consider
	 * guarding.  TODO confirm against later upstream revisions.
	 */
	dksc = &rsc->sc_dksc;

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			DPRINTF("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			DPRINTF("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
		DPRINTF("%s: booted_device=%p root_partition=%d "
		   "contains_boot=%d\n", __func__, booted_device,
		   rsc->sc_r.root_partition,
		   rf_containsboot(&rsc->sc_r, booted_device));
		/* Override root only if forced, or if the boot disk is
		   a component of this set. */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_partition = 0;	/* XXX assume 'a' */
		}
	} else if (num_root > 1) {
		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/*
		 * Several rootable sets: disambiguate by picking the
		 * one that actually contains the boot device.
		 */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
629
630 static int
631 raidsize(dev_t dev)
632 {
633 struct raid_softc *rs;
634 struct dk_softc *dksc;
635 unsigned int unit;
636
637 unit = raidunit(dev);
638 if ((rs = raidget(unit, false)) == NULL)
639 return -1;
640 dksc = &rs->sc_dksc;
641
642 if ((rs->sc_flags & RAIDF_INITED) == 0)
643 return -1;
644
645 return dk_size(dksc, dev);
646 }
647
648 static int
649 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
650 {
651 unsigned int unit;
652 struct raid_softc *rs;
653 struct dk_softc *dksc;
654
655 unit = raidunit(dev);
656 if ((rs = raidget(unit, false)) == NULL)
657 return ENXIO;
658 dksc = &rs->sc_dksc;
659
660 if ((rs->sc_flags & RAIDF_INITED) == 0)
661 return ENODEV;
662
663 /*
664 Note that blkno is relative to this particular partition.
665 By adding adding RF_PROTECTED_SECTORS, we get a value that
666 is relative to the partition used for the underlying component.
667 */
668 blkno += RF_PROTECTED_SECTORS;
669
670 return dk_dump(dksc, dev, blkno, va, size);
671 }
672
/*
 * dkdriver dump hook: write `nblk' blocks from `va' at block `blkno'
 * directly to one live component.  Only RAID 1 sets (one data column,
 * one parity column) are supported.  Returns 0 on success, EINVAL if
 * the layout is unsupported or no live component exists, ENXIO if
 * the chosen component has no block devsw, or a raidlock() error.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int c, sparecol, j, scol, dumpto;
	int error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	 */

	dumpto = -1;
	/* First choice: the lowest-numbered optimal column. */
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	 */

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			/* Find which column this spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				 */
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				 */
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* Dump straight through the component's own d_dump routine. */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
778
/*
 * d_open entry point.  Creates the softc on first reference (so
 * `raidctl' can configure a not-yet-existing unit), refuses opens
 * while a shutdown is pending, and on the first open of a configured
 * set marks all components dirty before handing off to dk_open().
 */
/* ARGSUSED */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int     error = 0;
	int     part, pmask;

	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	/* Only on the transition from fully-closed to open. */
	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return (error);


}
828
829 static int
830 raid_lastclose(device_t self)
831 {
832 struct raid_softc *rs = raidsoftc(self);
833
834 /* Last one... device is not unconfigured yet.
835 Device shutdown has taken care of setting the
836 clean bits if RAIDF_INITED is not set
837 mark things as clean... */
838
839 rf_update_component_labels(&rs->sc_r,
840 RF_FINAL_COMPONENT_UPDATE);
841
842 /* pass to unlocked code */
843 if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
844 rs->sc_flags |= RAIDF_DETACH;
845
846 return 0;
847 }
848
/*
 * d_close entry point.  Hands the close to dk_close() (which may
 * trigger raid_lastclose()), then — outside the unit lock — either
 * detaches the pseudo-device (RAIDF_DETACH, set by raid_lastclose)
 * or frees the never-configured softc (RAIDF_SHUTDOWN).
 */
/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int     error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	/* Detach/destroy must happen without the unit lock held. */
	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return (error);

}
888
889 static void
890 raid_wakeup(RF_Raid_t *raidPtr)
891 {
892 rf_lock_mutex2(raidPtr->iodone_lock);
893 rf_signal_cond2(raidPtr->iodone_cv);
894 rf_unlock_mutex2(raidPtr->iodone_lock);
895 }
896
897 static void
898 raidstrategy(struct buf *bp)
899 {
900 unsigned int unit;
901 struct raid_softc *rs;
902 struct dk_softc *dksc;
903 RF_Raid_t *raidPtr;
904
905 unit = raidunit(bp->b_dev);
906 if ((rs = raidget(unit, false)) == NULL) {
907 bp->b_error = ENXIO;
908 goto fail;
909 }
910 if ((rs->sc_flags & RAIDF_INITED) == 0) {
911 bp->b_error = ENXIO;
912 goto fail;
913 }
914 dksc = &rs->sc_dksc;
915 raidPtr = &rs->sc_r;
916
917 /* Queue IO only */
918 if (dk_strategy_defer(dksc, bp))
919 goto done;
920
921 /* schedule the IO to happen at the next convenient time */
922 raid_wakeup(raidPtr);
923
924 done:
925 return;
926
927 fail:
928 bp->b_resid = bp->b_bcount;
929 biodone(bp);
930 }
931
932 static int
933 raid_diskstart(device_t dev, struct buf *bp)
934 {
935 struct raid_softc *rs = raidsoftc(dev);
936 RF_Raid_t *raidPtr;
937
938 raidPtr = &rs->sc_r;
939 if (!raidPtr->valid) {
940 db1_printf(("raid is not valid..\n"));
941 return ENODEV;
942 }
943
944 /* XXX */
945 bp->b_resid = 0;
946
947 return raiddoaccess(raidPtr, bp);
948 }
949
950 void
951 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
952 {
953 struct raid_softc *rs;
954 struct dk_softc *dksc;
955
956 rs = raidPtr->softc;
957 dksc = &rs->sc_dksc;
958
959 dk_done(dksc, bp);
960
961 rf_lock_mutex2(raidPtr->mutex);
962 raidPtr->openings++;
963 rf_unlock_mutex2(raidPtr->mutex);
964
965 /* schedule more IO */
966 raid_wakeup(raidPtr);
967 }
968
969 /* ARGSUSED */
970 static int
971 raidread(dev_t dev, struct uio *uio, int flags)
972 {
973 int unit = raidunit(dev);
974 struct raid_softc *rs;
975
976 if ((rs = raidget(unit, false)) == NULL)
977 return ENXIO;
978
979 if ((rs->sc_flags & RAIDF_INITED) == 0)
980 return (ENXIO);
981
982 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
983
984 }
985
986 /* ARGSUSED */
987 static int
988 raidwrite(dev_t dev, struct uio *uio, int flags)
989 {
990 int unit = raidunit(dev);
991 struct raid_softc *rs;
992
993 if ((rs = raidget(unit, false)) == NULL)
994 return ENXIO;
995
996 if ((rs->sc_flags & RAIDF_INITED) == 0)
997 return (ENXIO);
998
999 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
1000
1001 }
1002
/*
 * Unconfigure a RAID set and tear down its disk framework state.
 * Caller must hold the unit lock (raidlock).  Fails with EBUSY while
 * the device is open or reconstruction/parity-rewrite/copyback is in
 * progress; succeeds trivially if the unit was never configured.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	/* Shutdown is no longer pending once we commit to it here. */
	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
1040
/*
 * raidioctl: ioctl entry point for the RAIDframe pseudo-device.
 *
 * Handles the RAIDFRAME_* management commands directly; anything it does
 * not recognize is passed to dk_ioctl() for generic disk handling.
 * `data' is the kernel copy of the ioctl argument; for several commands
 * it contains a user-space pointer that is copyin()'d/copyout()'d
 * explicitly below.
 *
 * Returns 0 on success or an errno value.
 */
static int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int unit = raidunit(dev);
	int error = 0;
	int part, pmask;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Config_t *k_cfg, *u_cfg;
	RF_Raid_t *raidPtr;
	RF_RaidDisk_t *diskPtr;
	RF_AccTotals_t *totals;
	RF_DeviceConfig_t *d_cfg, **ucfgp;
	u_char *specific_buf;
	int retcode = 0;
	int column;
	/* int raidid; */
	struct rf_recon_req *rrcopy, *rr;
	RF_ComponentLabel_t *clabel;
	RF_ComponentLabel_t *ci_label;
	RF_ComponentLabel_t **clabel_ptr;
	RF_SingleComponent_t *sparePtr,*componentPtr;
	RF_SingleComponent_t component;
	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
	int i, j, d;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
		(int) DISKPART(dev), (int) unit, cmd));

	/*
	 * These commands only make sense on a configured set; reject them
	 * with ENXIO before looking at the command body.
	 */
	switch (cmd) {
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_ROOT:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if ((rs->sc_flags & RAIDF_INITED) == 0)
			return (ENXIO);
	}

	switch (cmd) {
#ifdef COMPAT_50
	case RAIDFRAME_GET_INFO50:
		return rf_get_info50(raidPtr, data);

	case RAIDFRAME_CONFIGURE50:
		/* Old-style configure: translate, then share the common path. */
		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
			return retcode;
		goto config;
#endif
		/* configure the system */
	case RAIDFRAME_CONFIGURE:

		if (raidPtr->valid) {
			/* There is a valid RAID set running on this unit! */
			printf("raid%d: Device already configured!\n",unit);
			return(EINVAL);
		}

		/* copy-in the configuration information */
		/* data points to a pointer to the configuration structure */

		u_cfg = *((RF_Config_t **) data);
		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
		if (k_cfg == NULL) {
			return (ENOMEM);
		}
		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
		if (retcode) {
			RF_Free(k_cfg, sizeof(RF_Config_t));
			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
				retcode));
			goto no_config;
		}
		goto config;
	config:
		rs->sc_flags &= ~RAIDF_SHUTDOWN;

		/* allocate a buffer for the layout-specific data, and copy it
		 * in */
		if (k_cfg->layoutSpecificSize) {
			if (k_cfg->layoutSpecificSize > 10000) {
				/* sanity check */
				RF_Free(k_cfg, sizeof(RF_Config_t));
				retcode = EINVAL;
				goto no_config;
			}
			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
			    (u_char *));
			if (specific_buf == NULL) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				retcode = ENOMEM;
				goto no_config;
			}
			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
			    k_cfg->layoutSpecificSize);
			if (retcode) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				RF_Free(specific_buf,
					k_cfg->layoutSpecificSize);
				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
					retcode));
				goto no_config;
			}
		} else
			specific_buf = NULL;
		/* k_cfg now owns the kernel copy of the layout data. */
		k_cfg->layoutSpecific = specific_buf;

		/* should do some kind of sanity check on the configuration.
		 * Store the sum of all the bytes in the last byte? */

		/* configure the system */

		/*
		 * Clear the entire RAID descriptor, just to make sure
		 * there is no stale data left in the case of a
		 * reconfiguration
		 */
		memset(raidPtr, 0, sizeof(*raidPtr));
		raidPtr->softc = rs;
		raidPtr->raidid = unit;

		retcode = rf_Configure(raidPtr, k_cfg, NULL);

		if (retcode == 0) {

			/* allow this many simultaneous IO's to
			   this RAID device */
			raidPtr->openings = RAIDOUTSTANDING;

			raidinit(rs);
			raid_wakeup(raidPtr);
			rf_markalldirty(raidPtr);
		}
		/* free the buffers.  No return code here. */
		if (k_cfg->layoutSpecificSize) {
			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
		}
		RF_Free(k_cfg, sizeof(RF_Config_t));

	no_config:
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		if (retcode != 0)
			rs->sc_flags |= RAIDF_SHUTDOWN;
		return (retcode);

		/* shutdown the system */
	case RAIDFRAME_SHUTDOWN:

		part = DISKPART(dev);
		pmask = (1 << part);

		if ((error = raidlock(rs)) != 0)
			return (error);

		/*
		 * Refuse to shut down while the unit is open or any
		 * background operation (recon/parity/copyback) is running.
		 */
		if (DK_BUSY(dksc, pmask) ||
		    raidPtr->recon_in_progress != 0 ||
		    raidPtr->parity_rewrite_in_progress != 0 ||
		    raidPtr->copyback_in_progress != 0)
			retcode = EBUSY;
		else {
			/* detach and free on close */
			rs->sc_flags |= RAIDF_SHUTDOWN;
			retcode = 0;
		}

		raidunlock(rs);

		return (retcode);
	case RAIDFRAME_GET_COMPONENT_LABEL:
		clabel_ptr = (RF_ComponentLabel_t **) data;
		/* need to read the component label for the disk indicated
		   by row,column in clabel */

		/*
		 * Perhaps there should be an option to skip the in-core
		 * copy and hit the disk, as with disklabel(8).
		 */
		/*
		 * NOTE(review): RF_Malloc result is not NULL-checked here
		 * before the copyin below -- presumably RF_Malloc waits
		 * rather than failing; verify against its definition.
		 */
		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));

		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));

		if (retcode) {
			RF_Free(clabel, sizeof(*clabel));
			return retcode;
		}

		clabel->row = 0; /* Don't allow looking at anything else.*/

		column = clabel->column;

		/* Spares live at columns numCol..numCol+numSpare-1. */
		if ((column < 0) || (column >= raidPtr->numCol +
				     raidPtr->numSpare)) {
			RF_Free(clabel, sizeof(*clabel));
			return EINVAL;
		}

		RF_Free(clabel, sizeof(*clabel));

		/* Hand back the in-core label, not the scratch copy. */
		clabel = raidget_component_label(raidPtr, column);

		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));

#if 0
	case RAIDFRAME_SET_COMPONENT_LABEL:
		clabel = (RF_ComponentLabel_t *) data;

		/* XXX check the label for valid stuff... */
		/* Note that some things *should not* get modified --
		   the user should be re-initing the labels instead of
		   trying to patch things.
		   */

		raidid = raidPtr->raidid;
#ifdef DEBUG
		printf("raid%d: Got component label:\n", raidid);
		printf("raid%d: Version: %d\n", raidid, clabel->version);
		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
		printf("raid%d: Column: %d\n", raidid, clabel->column);
		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
		printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif
		clabel->row = 0;
		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		/* XXX this isn't allowed to do anything for now :-) */

		/* XXX and before it is, we need to fill in the rest
		   of the fields!?!?!?! */
		memcpy(raidget_component_label(raidPtr, column),
		    clabel, sizeof(*clabel));
		raidflush_component_label(raidPtr, column);
		return (0);
#endif

	case RAIDFRAME_INIT_LABELS:
		clabel = (RF_ComponentLabel_t *) data;
		/*
		   we only want the serial number from
		   the above.  We get all the rest of the information
		   from the config that was used to create this RAID
		   set.
		   */

		raidPtr->serial_number = clabel->serial_number;

		for(column=0;column<raidPtr->numCol;column++) {
			diskPtr = &raidPtr->Disks[column];
			if (!RF_DEAD_DISK(diskPtr->status)) {
				ci_label = raidget_component_label(raidPtr,
				    column);
				/* Zeroing this is important. */
				memset(ci_label, 0, sizeof(*ci_label));
				raid_init_component_label(raidPtr, ci_label);
				ci_label->serial_number =
				    raidPtr->serial_number;
				ci_label->row = 0; /* we don't pretend to support more */
				rf_component_label_set_partitionsize(ci_label,
				    diskPtr->partitionSize);
				ci_label->column = column;
				raidflush_component_label(raidPtr, column);
			}
			/* XXXjld what about the spares? */
		}

		return (retcode);
	case RAIDFRAME_SET_AUTOCONFIG:
		d = rf_set_autoconfig(raidPtr, *(int *) data);
		printf("raid%d: New autoconfig value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

	case RAIDFRAME_SET_ROOT:
		d = rf_set_rootpartition(raidPtr, *(int *) data);
		printf("raid%d: New rootpartition value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

		/* initialize all parity */
	case RAIDFRAME_REWRITEPARITY:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Parity for RAID 0 is trivially correct */
			raidPtr->parity_good = RF_RAID_CLEAN;
			return(0);
		}

		if (raidPtr->parity_rewrite_in_progress == 1) {
			/* Re-write is already in progress! */
			return(EINVAL);
		}

		/* The rewrite proceeds asynchronously in its own thread. */
		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
					   rf_RewriteParityThread,
					   raidPtr,"raid_parity");
		return (retcode);


	case RAIDFRAME_ADD_HOT_SPARE:
		sparePtr = (RF_SingleComponent_t *) data;
		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
		retcode = rf_add_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REMOVE_HOT_SPARE:
		/* Not implemented; returns the (still zero) retcode. */
		return(retcode);

	case RAIDFRAME_DELETE_COMPONENT:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_delete_component(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_incorporate_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REBUILD_IN_PLACE:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->recon_in_progress == 1) {
			/* a reconstruct is already in progress! */
			return(EINVAL);
		}

		componentPtr = (RF_SingleComponent_t *) data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		component.row = 0; /* we don't support any more */
		column = component.column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		/* Sanity-check the target disk state under the array lock. */
		rf_lock_mutex2(raidPtr->mutex);
		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
		    (raidPtr->numFailures > 0)) {
			/* XXX 0 above shouldn't be constant!!! */
			/* some component other than this has failed.
			   Let's not make things worse than they already
			   are... */
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:     Col: %d   Too many failures.\n",
			       raidPtr->raidid, column);
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status ==
		    rf_ds_reconstructing) {
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:    Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);

			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status == rf_ds_spared) {
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		/* The recon thread owns rrcopy and frees it when done. */
		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
		if (rrcopy == NULL)
			return(ENOMEM);

		rrcopy->raidPtr = (void *) raidPtr;
		rrcopy->col = column;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconstructInPlaceThread,
					   rrcopy,"raid_reconip");
		return(retcode);

	case RAIDFRAME_GET_INFO:
		if (!raidPtr->valid)
			return (ENODEV);
		ucfgp = (RF_DeviceConfig_t **) data;
		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
			  (RF_DeviceConfig_t *));
		if (d_cfg == NULL)
			return (ENOMEM);
		d_cfg->rows = 1; /* there is only 1 row now */
		d_cfg->cols = raidPtr->numCol;
		d_cfg->ndevs = raidPtr->numCol;
		if (d_cfg->ndevs >= RF_MAX_DISKS) {
			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
			return (ENOMEM);
		}
		d_cfg->nspares = raidPtr->numSpare;
		if (d_cfg->nspares >= RF_MAX_DISKS) {
			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
			return (ENOMEM);
		}
		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
		d = 0;
		for (j = 0; j < d_cfg->cols; j++) {
			d_cfg->devs[d] = raidPtr->Disks[j];
			d++;
		}
		/* Spares follow the data columns in the Disks[] array. */
		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
			d_cfg->spares[i] = raidPtr->Disks[j];
			if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
				/* XXX: raidctl(8) expects to see this as a used spare */
				d_cfg->spares[i].status = rf_ds_used_spare;
			}
		}
		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));

		return (retcode);

	case RAIDFRAME_CHECK_PARITY:
		*(int *) data = raidPtr->parity_good;
		return (0);

	case RAIDFRAME_PARITYMAP_STATUS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_status(raidPtr->parity_map,
		    (struct rf_pmstat *)data);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		if (raidPtr->parity_map == NULL)
			return ENOENT; /* ??? */
		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
		    (struct rf_pmparams *)data, 1))
			return EINVAL;
		return 0;

	case RAIDFRAME_PARITYMAP_GET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		*(int *) data = rf_paritymap_get_disable(raidPtr);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_set_disable(raidPtr, *(int *)data);
		/* XXX should errors be passed up? */
		return 0;

	case RAIDFRAME_RESET_ACCTOTALS:
		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
		return (0);

	case RAIDFRAME_GET_ACCTOTALS:
		totals = (RF_AccTotals_t *) data;
		*totals = raidPtr->acc_totals;
		return (0);

	case RAIDFRAME_KEEP_ACCTOTALS:
		raidPtr->keep_acc_totals = *(int *)data;
		return (0);

	case RAIDFRAME_GET_SIZE:
		*(int *) data = raidPtr->totalSectors;
		return (0);

		/* fail a disk & optionally start reconstruction */
	case RAIDFRAME_FAIL_DISK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		rr = (struct rf_recon_req *) data;
		rr->row = 0;
		if (rr->col < 0 || rr->col >= raidPtr->numCol)
			return (EINVAL);


		rf_lock_mutex2(raidPtr->mutex);
		if (raidPtr->status == rf_rs_reconstructing) {
			/* you can't fail a disk while we're reconstructing! */
			/* XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if ((raidPtr->Disks[rr->col].status ==
		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
			/* some other component has failed.  Let's not make
			   things worse. XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
			/* Can't fail a spared disk! */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		/* make a copy of the recon request so that we don't rely on
		 * the user's buffer */
		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
		if (rrcopy == NULL)
			return(ENOMEM);
		memcpy(rrcopy, rr, sizeof(*rr));
		rrcopy->raidPtr = (void *) raidPtr;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconThread,
					   rrcopy,"raid_recon");
		/*
		 * NOTE(review): RF_CREATE_THREAD's status in retcode is
		 * discarded here (0 is returned unconditionally); compare
		 * RAIDFRAME_REWRITEPARITY above, which returns retcode.
		 */
		return (0);

		/* invoke a copyback operation after recon on whatever disk
		 * needs it, if any */
	case RAIDFRAME_COPYBACK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->copyback_in_progress == 1) {
			/* Copyback is already in progress! */
			return(EINVAL);
		}

		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
					   rf_CopybackThread,
					   raidPtr,"raid_copyback");
		return (retcode);

		/* return the percentage completion of reconstruction */
	case RAIDFRAME_CHECK_RECON_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->status != rf_rs_reconstructing)
			*(int *) data = 100;
		else {
			if (raidPtr->reconControl->numRUsTotal > 0) {
				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
			} else {
				*(int *) data = 0;
			}
		}
		return (0);
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
		/* Extended variant: report raw RU counts, not a percentage. */
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->status != rf_rs_reconstructing) {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		} else {
			progressInfo.total =
				raidPtr->reconControl->numRUsTotal;
			progressInfo.completed =
				raidPtr->reconControl->numRUsComplete;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->parity_rewrite_in_progress == 1) {
			*(int *) data = 100 *
				raidPtr->parity_rewrite_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->parity_rewrite_in_progress == 1) {
			progressInfo.total = raidPtr->Layout.numStripe;
			progressInfo.completed =
				raidPtr->parity_rewrite_stripes_done;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		} else {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_CHECK_COPYBACK_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0 */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->copyback_in_progress == 1) {
			*(int *) data = 100 * raidPtr->copyback_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->copyback_in_progress == 1) {
			progressInfo.total = raidPtr->Layout.numStripe;
			progressInfo.completed =
				raidPtr->copyback_stripes_done;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		} else {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_SET_LAST_UNIT:
		/* Only allowed when every component is healthy. */
		for (column = 0; column < raidPtr->numCol; column++)
			if (raidPtr->Disks[column].status != rf_ds_optimal)
				return EBUSY;

		for (column = 0; column < raidPtr->numCol; column++) {
			clabel = raidget_component_label(raidPtr, column);
			clabel->last_unit = *(int *)data;
			raidflush_component_label(raidPtr, column);
		}
		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
		return 0;

		/* the sparetable daemon calls this to wait for the kernel to
		 * need a spare table. this ioctl does not return until a
		 * spare table is needed. XXX -- calling mpsleep here in the
		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
		 * -- I should either compute the spare table in the kernel,
		 * or have a different -- XXX XXX -- interface (a different
		 * character device) for delivering the table -- XXX */
#if 0
	case RAIDFRAME_SPARET_WAIT:
		rf_lock_mutex2(rf_sparet_wait_mutex);
		while (!rf_sparet_wait_queue)
			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
		waitreq = rf_sparet_wait_queue;
		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		/* structure assignment */
		*((RF_SparetWait_t *) data) = *waitreq;

		RF_Free(waitreq, sizeof(*waitreq));
		return (0);

		/* wakes up a process waiting on SPARET_WAIT and puts an error
		 * code in it that will cause the daemon to exit */
	case RAIDFRAME_ABORT_SPARET_WAIT:
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = -1;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_wait_queue;
		rf_sparet_wait_queue = waitreq;
		/*
		 * NOTE(review): rf_broadcast_conf2 looks like a typo for
		 * rf_broadcast_cond2 (cf. RAIDFRAME_SEND_SPARET below);
		 * harmless only because this block is compiled out.
		 */
		rf_broadcast_conf2(rf_sparet_wait_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);
		return (0);

		/* used by the spare table daemon to deliver a spare table
		 * into the kernel */
	case RAIDFRAME_SEND_SPARET:

		/* install the spare table */
		retcode = rf_SetSpareTable(raidPtr, *(void **) data);

		/* respond to the requestor.  the return status of the spare
		 * table installation is passed in the "fcol" field */
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = retcode;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_resp_queue;
		rf_sparet_resp_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_resp_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		return (retcode);
#endif

	default:
		break; /* fall through to the os-specific code below */

	}

	if (!raidPtr->valid)
		return (EINVAL);

	/*
	 * Add support for "regular" device ioctls here.
	 */

	error = dk_ioctl(dksc, dev, cmd, data, flag, l);
	if (error != EPASSTHROUGH)
		return (error);

	/* dk_ioctl() didn't want it either; handle the leftovers. */
	switch (cmd) {
	case DIOCCACHESYNC:
		return rf_sync_component_caches(raidPtr);

	default:
		retcode = ENOTTY;
	}
	return (retcode);

}
1814
1815
1816 /* raidinit -- complete the rest of the initialization for the
1817 RAIDframe device. */
1818
1819
/*
 * raidinit: complete the rest of the initialization for the RAIDframe
 * device -- attach the pseudo-device, initialize the dk/disk layers,
 * and mark the unit usable.
 *
 * Called from raidioctl() after a successful rf_Configure().  On a
 * config_attach_pseudo() failure the unit is left un-INITED (no error
 * is propagated to the caller).
 */
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* Look for wedges (GPT partitions etc.) on the new disk. */
	dkwedge_discover(&dksc->sc_dkdev);
}
1875
1876 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1877 /* wake up the daemon & tell it to get us a spare table
1878 * XXX
1879 * the entries in the queues should be tagged with the raidPtr
1880 * so that in the extremely rare case that two recons happen at once,
1881 * we know for which device were requesting a spare table
1882 * XXX
1883 *
1884 * XXX This code is not currently used. GO
1885 */
/*
 * rf_GetSpareTableFromDaemon: enqueue a spare-table request for the
 * user-level sparetable daemon and block until it delivers a response.
 *
 * The request is posted on rf_sparet_wait_queue; the daemon's answer
 * arrives on rf_sparet_resp_queue (a different RF_SparetWait_t than the
 * one we enqueued).  Returns the "fcol" status from the response.
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int retcode;

	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* rf_wait_cond2() drops the mutex while sleeping and retakes it */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
1909 #endif
1910
1911 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1912 * bp & passes it down.
1913 * any calls originating in the kernel must use non-blocking I/O
1914 * do some extra sanity checking to return "appropriate" error values for
1915 * certain conditions (to make some standard utilities work)
1916 *
1917 * Formerly known as: rf_DoAccessKernel
1918 */
/*
 * raidstart: kick the dk layer to start queued I/O on this array.
 *
 * Before dispatching, flush any newly-recorded component failures out
 * to the component labels.  Note the unlock/update/relock sequence:
 * rf_update_component_labels() is called without raidPtr->mutex held,
 * and numNewFailures is only decremented after the update completes.
 */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* Let dksubr pull requests off the bufq and feed raiddoaccess(). */
	dk_start(dksc, NULL);
}
1945
1946 static int
1947 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
1948 {
1949 RF_SectorCount_t num_blocks, pb, sum;
1950 RF_RaidAddr_t raid_addr;
1951 daddr_t blocknum;
1952 int do_async;
1953 int rc;
1954
1955 rf_lock_mutex2(raidPtr->mutex);
1956 if (raidPtr->openings == 0) {
1957 rf_unlock_mutex2(raidPtr->mutex);
1958 return EAGAIN;
1959 }
1960 rf_unlock_mutex2(raidPtr->mutex);
1961
1962 blocknum = bp->b_rawblkno;
1963
1964 db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
1965 (int) blocknum));
1966
1967 db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
1968 db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
1969
1970 /* *THIS* is where we adjust what block we're going to...
1971 * but DO NOT TOUCH bp->b_blkno!!! */
1972 raid_addr = blocknum;
1973
1974 num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
1975 pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
1976 sum = raid_addr + num_blocks + pb;
1977 if (1 || rf_debugKernelAccess) {
1978 db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
1979 (int) raid_addr, (int) sum, (int) num_blocks,
1980 (int) pb, (int) bp->b_resid));
1981 }
1982 if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
1983 || (sum < num_blocks) || (sum < pb)) {
1984 rc = ENOSPC;
1985 goto done;
1986 }
1987 /*
1988 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
1989 */
1990
1991 if (bp->b_bcount & raidPtr->sectorMask) {
1992 rc = ENOSPC;
1993 goto done;
1994 }
1995 db1_printf(("Calling DoAccess..\n"));
1996
1997
1998 rf_lock_mutex2(raidPtr->mutex);
1999 raidPtr->openings--;
2000 rf_unlock_mutex2(raidPtr->mutex);
2001
2002 /*
2003 * Everything is async.
2004 */
2005 do_async = 1;
2006
2007 /* don't ever condition on bp->b_flags & B_WRITE.
2008 * always condition on B_READ instead */
2009
2010 rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
2011 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
2012 do_async, raid_addr, num_blocks,
2013 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
2014
2015 done:
2016 return rc;
2017 }
2018
2019 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
2020
/* rf_DispatchKernelIO: invoke an I/O from kernel mode.
 *
 * Disk queue should be locked upon entry.  For READ/WRITE requests the
 * buf is initialized via InitBP() and sent to the component with
 * bdev_strategy(); completion is reported through KernelWakeupFunc().
 * The queue mutex is dropped around bdev_strategy() because it can
 * block (see the XXX comment below).  Always returns 0.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		/* (note the extra parens below are harmless but redundant) */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete the NOP immediately through the normal path. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
2094 /* this is the callback function associated with a I/O invoked from
2095 kernel code.
2096 */
/* KernelWakeupFunc: this is the callback function associated with a I/O
 * invoked from kernel code (set as b_iodone by InitBP()).
 *
 * Runs at biodone time: records the I/O's error status, possibly marks
 * the component as failed, moves the request to the raidPtr->iodone
 * list, and signals the raidio thread.  All bookkeeping happens under
 * raidPtr->iodone_lock.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* InitBP() stashed the request in b_private. */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures is drained by raidstart(), which
			 * triggers a component-label update. */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2163
2164
2165 /*
2166 * initialize a buf structure for doing an I/O in the kernel.
2167 */
2168 static void
2169 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2170 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2171 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2172 struct proc *b_proc)
2173 {
2174 /* bp->b_flags = B_PHYS | rw_flag; */
2175 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */
2176 bp->b_oflags = 0;
2177 bp->b_cflags = 0;
2178 bp->b_bcount = numSect << logBytesPerSector;
2179 bp->b_bufsize = bp->b_bcount;
2180 bp->b_error = 0;
2181 bp->b_dev = dev;
2182 bp->b_data = bf;
2183 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2184 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2185 if (bp->b_bcount == 0) {
2186 panic("bp->b_bcount is zero in InitBP!!");
2187 }
2188 bp->b_proc = b_proc;
2189 bp->b_iodone = cbFunc;
2190 bp->b_private = cbArg;
2191 }
2192
2193 /*
2194 * Wait interruptibly for an exclusive lock.
2195 *
2196 * XXX
2197 * Several drivers do this; it should be abstracted and made MP-safe.
2198 * (Hmm... where have we seen this warning before :-> GO )
2199 */
2200 static int
2201 raidlock(struct raid_softc *rs)
2202 {
2203 int error;
2204
2205 error = 0;
2206 mutex_enter(&rs->sc_mutex);
2207 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2208 rs->sc_flags |= RAIDF_WANTED;
2209 error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2210 if (error != 0)
2211 goto done;
2212 }
2213 rs->sc_flags |= RAIDF_LOCKED;
2214 done:
2215 mutex_exit(&rs->sc_mutex);
2216 return (error);
2217 }
2218 /*
2219 * Unlock and wake up any waiters.
2220 */
2221 static void
2222 raidunlock(struct raid_softc *rs)
2223 {
2224
2225 mutex_enter(&rs->sc_mutex);
2226 rs->sc_flags &= ~RAIDF_LOCKED;
2227 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2228 rs->sc_flags &= ~RAIDF_WANTED;
2229 cv_broadcast(&rs->sc_cv);
2230 }
2231 mutex_exit(&rs->sc_mutex);
2232 }
2233
2234
2235 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2236 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2237 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2238
2239 static daddr_t
2240 rf_component_info_offset(void)
2241 {
2242
2243 return RF_COMPONENT_INFO_OFFSET;
2244 }
2245
2246 static daddr_t
2247 rf_component_info_size(unsigned secsize)
2248 {
2249 daddr_t info_size;
2250
2251 KASSERT(secsize);
2252 if (secsize > RF_COMPONENT_INFO_SIZE)
2253 info_size = secsize;
2254 else
2255 info_size = RF_COMPONENT_INFO_SIZE;
2256
2257 return info_size;
2258 }
2259
2260 static daddr_t
2261 rf_parity_map_offset(RF_Raid_t *raidPtr)
2262 {
2263 daddr_t map_offset;
2264
2265 KASSERT(raidPtr->bytesPerSector);
2266 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2267 map_offset = raidPtr->bytesPerSector;
2268 else
2269 map_offset = RF_COMPONENT_INFO_SIZE;
2270 map_offset += rf_component_info_offset();
2271
2272 return map_offset;
2273 }
2274
2275 static daddr_t
2276 rf_parity_map_size(RF_Raid_t *raidPtr)
2277 {
2278 daddr_t map_size;
2279
2280 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2281 map_size = raidPtr->bytesPerSector;
2282 else
2283 map_size = RF_PARITY_MAP_SIZE;
2284
2285 return map_size;
2286 }
2287
2288 int
2289 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2290 {
2291 RF_ComponentLabel_t *clabel;
2292
2293 clabel = raidget_component_label(raidPtr, col);
2294 clabel->clean = RF_RAID_CLEAN;
2295 raidflush_component_label(raidPtr, col);
2296 return(0);
2297 }
2298
2299
2300 int
2301 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2302 {
2303 RF_ComponentLabel_t *clabel;
2304
2305 clabel = raidget_component_label(raidPtr, col);
2306 clabel->clean = RF_RAID_DIRTY;
2307 raidflush_component_label(raidPtr, col);
2308 return(0);
2309 }
2310
2311 int
2312 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2313 {
2314 KASSERT(raidPtr->bytesPerSector);
2315 return raidread_component_label(raidPtr->bytesPerSector,
2316 raidPtr->Disks[col].dev,
2317 raidPtr->raid_cinfo[col].ci_vp,
2318 &raidPtr->raid_cinfo[col].ci_label);
2319 }
2320
2321 RF_ComponentLabel_t *
2322 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2323 {
2324 return &raidPtr->raid_cinfo[col].ci_label;
2325 }
2326
2327 int
2328 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2329 {
2330 RF_ComponentLabel_t *label;
2331
2332 label = &raidPtr->raid_cinfo[col].ci_label;
2333 label->mod_counter = raidPtr->mod_counter;
2334 #ifndef RF_NO_PARITY_MAP
2335 label->parity_map_modcount = label->mod_counter;
2336 #endif
2337 return raidwrite_component_label(raidPtr->bytesPerSector,
2338 raidPtr->Disks[col].dev,
2339 raidPtr->raid_cinfo[col].ci_vp, label);
2340 }
2341
2342
2343 static int
2344 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2345 RF_ComponentLabel_t *clabel)
2346 {
2347 return raidread_component_area(dev, b_vp, clabel,
2348 sizeof(RF_ComponentLabel_t),
2349 rf_component_info_offset(),
2350 rf_component_info_size(secsize));
2351 }
2352
2353 /* ARGSUSED */
2354 static int
2355 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2356 size_t msize, daddr_t offset, daddr_t dsize)
2357 {
2358 struct buf *bp;
2359 int error;
2360
2361 /* XXX should probably ensure that we don't try to do this if
2362 someone has changed rf_protected_sectors. */
2363
2364 if (b_vp == NULL) {
2365 /* For whatever reason, this component is not valid.
2366 Don't try to read a component label from it. */
2367 return(EINVAL);
2368 }
2369
2370 /* get a block of the appropriate size... */
2371 bp = geteblk((int)dsize);
2372 bp->b_dev = dev;
2373
2374 /* get our ducks in a row for the read */
2375 bp->b_blkno = offset / DEV_BSIZE;
2376 bp->b_bcount = dsize;
2377 bp->b_flags |= B_READ;
2378 bp->b_resid = dsize;
2379
2380 bdev_strategy(bp);
2381 error = biowait(bp);
2382
2383 if (!error) {
2384 memcpy(data, bp->b_data, msize);
2385 }
2386
2387 brelse(bp, 0);
2388 return(error);
2389 }
2390
2391
2392 static int
2393 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2394 RF_ComponentLabel_t *clabel)
2395 {
2396 return raidwrite_component_area(dev, b_vp, clabel,
2397 sizeof(RF_ComponentLabel_t),
2398 rf_component_info_offset(),
2399 rf_component_info_size(secsize), 0);
2400 }
2401
2402 /* ARGSUSED */
2403 static int
2404 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2405 size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2406 {
2407 struct buf *bp;
2408 int error;
2409
2410 /* get a block of the appropriate size... */
2411 bp = geteblk((int)dsize);
2412 bp->b_dev = dev;
2413
2414 /* get our ducks in a row for the write */
2415 bp->b_blkno = offset / DEV_BSIZE;
2416 bp->b_bcount = dsize;
2417 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2418 bp->b_resid = dsize;
2419
2420 memset(bp->b_data, 0, dsize);
2421 memcpy(bp->b_data, data, msize);
2422
2423 bdev_strategy(bp);
2424 if (asyncp)
2425 return 0;
2426 error = biowait(bp);
2427 brelse(bp, 0);
2428 if (error) {
2429 #if 1
2430 printf("Failed to write RAID component info!\n");
2431 #endif
2432 }
2433
2434 return(error);
2435 }
2436
2437 void
2438 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2439 {
2440 int c;
2441
2442 for (c = 0; c < raidPtr->numCol; c++) {
2443 /* Skip dead disks. */
2444 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2445 continue;
2446 /* XXXjld: what if an error occurs here? */
2447 raidwrite_component_area(raidPtr->Disks[c].dev,
2448 raidPtr->raid_cinfo[c].ci_vp, map,
2449 RF_PARITYMAP_NBYTE,
2450 rf_parity_map_offset(raidPtr),
2451 rf_parity_map_size(raidPtr), 0);
2452 }
2453 }
2454
2455 void
2456 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2457 {
2458 struct rf_paritymap_ondisk tmp;
2459 int c,first;
2460
2461 first=1;
2462 for (c = 0; c < raidPtr->numCol; c++) {
2463 /* Skip dead disks. */
2464 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2465 continue;
2466 raidread_component_area(raidPtr->Disks[c].dev,
2467 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2468 RF_PARITYMAP_NBYTE,
2469 rf_parity_map_offset(raidPtr),
2470 rf_parity_map_size(raidPtr));
2471 if (first) {
2472 memcpy(map, &tmp, sizeof(*map));
2473 first = 0;
2474 } else {
2475 rf_paritymap_merge(map, &tmp);
2476 }
2477 }
2478 }
2479
/*
 * rf_markalldirty: bump the set's modification counter and mark the
 * component label of every non-failed component as dirty; in-use spares
 * get their labels (re)initialized and marked dirty as well.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				/* raidmarkdirty() also flushes the label,
				   picking up the new mod_counter. */
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): if no column maps to this spare,
			   scol keeps its prior value (initially -1) and is
			   written into the label below — confirm a mapping
			   always exists for rf_ds_used_spare disks. */

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2539
2540
/*
 * rf_update_component_labels: bump the modification counter and rewrite
 * the labels of all optimal components and all in-use spares.  When
 * `final' is RF_FINAL_COMPONENT_UPDATE and parity is known good, the
 * clean bit is also set (this is the clean-shutdown path).
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					/* Flushes the label again, now with
					   the clean bit set. */
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the column this spare stands in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			/* NOTE(review): scol stays at its prior value
			   (initially -1) if no column maps to this spare —
			   confirm a mapping always exists here. */

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2618
2619 void
2620 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2621 {
2622
2623 if (vp != NULL) {
2624 if (auto_configured == 1) {
2625 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2626 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2627 vput(vp);
2628
2629 } else {
2630 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2631 }
2632 }
2633 }
2634
2635
2636 void
2637 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2638 {
2639 int r,c;
2640 struct vnode *vp;
2641 int acd;
2642
2643
2644 /* We take this opportunity to close the vnodes like we should.. */
2645
2646 for (c = 0; c < raidPtr->numCol; c++) {
2647 vp = raidPtr->raid_cinfo[c].ci_vp;
2648 acd = raidPtr->Disks[c].auto_configured;
2649 rf_close_component(raidPtr, vp, acd);
2650 raidPtr->raid_cinfo[c].ci_vp = NULL;
2651 raidPtr->Disks[c].auto_configured = 0;
2652 }
2653
2654 for (r = 0; r < raidPtr->numSpare; r++) {
2655 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2656 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2657 rf_close_component(raidPtr, vp, acd);
2658 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2659 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2660 }
2661 }
2662
2663
2664 void
2665 rf_ReconThread(struct rf_recon_req *req)
2666 {
2667 int s;
2668 RF_Raid_t *raidPtr;
2669
2670 s = splbio();
2671 raidPtr = (RF_Raid_t *) req->raidPtr;
2672 raidPtr->recon_in_progress = 1;
2673
2674 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2675 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2676
2677 RF_Free(req, sizeof(*req));
2678
2679 raidPtr->recon_in_progress = 0;
2680 splx(s);
2681
2682 /* That's all... */
2683 kthread_exit(0); /* does not return */
2684 }
2685
2686 void
2687 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2688 {
2689 int retcode;
2690 int s;
2691
2692 raidPtr->parity_rewrite_stripes_done = 0;
2693 raidPtr->parity_rewrite_in_progress = 1;
2694 s = splbio();
2695 retcode = rf_RewriteParity(raidPtr);
2696 splx(s);
2697 if (retcode) {
2698 printf("raid%d: Error re-writing parity (%d)!\n",
2699 raidPtr->raidid, retcode);
2700 } else {
2701 /* set the clean bit! If we shutdown correctly,
2702 the clean bit on each component label will get
2703 set */
2704 raidPtr->parity_good = RF_RAID_CLEAN;
2705 }
2706 raidPtr->parity_rewrite_in_progress = 0;
2707
2708 /* Anyone waiting for us to stop? If so, inform them... */
2709 if (raidPtr->waitShutdown) {
2710 wakeup(&raidPtr->parity_rewrite_in_progress);
2711 }
2712
2713 /* That's all... */
2714 kthread_exit(0); /* does not return */
2715 }
2716
2717
2718 void
2719 rf_CopybackThread(RF_Raid_t *raidPtr)
2720 {
2721 int s;
2722
2723 raidPtr->copyback_in_progress = 1;
2724 s = splbio();
2725 rf_CopybackReconstructedData(raidPtr);
2726 splx(s);
2727 raidPtr->copyback_in_progress = 0;
2728
2729 /* That's all... */
2730 kthread_exit(0); /* does not return */
2731 }
2732
2733
2734 void
2735 rf_ReconstructInPlaceThread(struct rf_recon_req *req)
2736 {
2737 int s;
2738 RF_Raid_t *raidPtr;
2739
2740 s = splbio();
2741 raidPtr = req->raidPtr;
2742 raidPtr->recon_in_progress = 1;
2743 rf_ReconstructInPlace(raidPtr, req->col);
2744 RF_Free(req, sizeof(*req));
2745 raidPtr->recon_in_progress = 0;
2746 splx(s);
2747
2748 /* That's all... */
2749 kthread_exit(0); /* does not return */
2750 }
2751
/*
 * rf_get_component: read the component label from vp and, if it looks
 * reasonable and fits within `size' sectors, prepend an RF_AutoConfig_t
 * for it to ac_list and return the new list head (the vnode stays open,
 * owned by the list entry).  If the label is unusable the vnode is
 * closed and released.  On allocation failure the entire ac_list is
 * freed and NULL is returned.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		/* Out of memory: tear down everything collected so far. */
		while(ac_list) {
			ac = ac_list;
			if (ac->clabel)
				free(ac->clabel, M_RAIDFRAME);
			ac_list = ac_list->next;
			free(ac, M_RAIDFRAME);
		}
		printf("RAID auto config: out of memory!\n");
		return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label. Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
			    cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
			    M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			/* The open vnode is now owned by this entry. */
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: free the label and give back the vnode. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
2809
/*
 * rf_find_raid_components: scan every disk device in the system for
 * RAIDframe component labels.  The device tree is walked twice — wedges
 * ("dk") first, everything else second — so that a wedge covering a
 * whole disk is preferred over that disk's raw partition.  Returns a
 * list of RF_AutoConfig_t entries for all plausible components found.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		    dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares." Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* Wedge pass: accept only wedges whose
				   partition type says RAIDframe. */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component takes over the vnode. */
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists. Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more. We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
				    label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
				    label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
3013
3014
3015 int
3016 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3017 {
3018
3019 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
3020 (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
3021 ((clabel->clean == RF_RAID_CLEAN) ||
3022 (clabel->clean == RF_RAID_DIRTY)) &&
3023 clabel->row >=0 &&
3024 clabel->column >= 0 &&
3025 clabel->num_rows > 0 &&
3026 clabel->num_columns > 0 &&
3027 clabel->row < clabel->num_rows &&
3028 clabel->column < clabel->num_columns &&
3029 clabel->blockSize > 0 &&
3030 /*
3031 * numBlocksHi may contain garbage, but it is ok since
3032 * the type is unsigned. If it is really garbage,
3033 * rf_fix_old_label_size() will fix it.
3034 */
3035 rf_component_label_numblocks(clabel) > 0) {
3036 /*
3037 * label looks reasonable enough...
3038 * let's make sure it has no old garbage.
3039 */
3040 if (numsecs)
3041 rf_fix_old_label_size(clabel, numsecs);
3042 return(1);
3043 }
3044 return(0);
3045 }
3046
3047
3048 /*
3049 * For reasons yet unknown, some old component labels have garbage in
3050 * the newer numBlocksHi region, and this causes lossage. Since those
3051 * disks will also have numsecs set to less than 32 bits of sectors,
3052 * we can determine when this corruption has occurred, and fix it.
3053 *
3054 * The exact same problem, with the same unknown reason, happens to
3055 * the partitionSizeHi member as well.
3056 */
3057 static void
3058 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3059 {
3060
3061 if (numsecs < ((uint64_t)1 << 32)) {
3062 if (clabel->numBlocksHi) {
3063 printf("WARNING: total sectors < 32 bits, yet "
3064 "numBlocksHi set\n"
3065 "WARNING: resetting numBlocksHi to zero.\n");
3066 clabel->numBlocksHi = 0;
3067 }
3068
3069 if (clabel->partitionSizeHi) {
3070 printf("WARNING: total sectors < 32 bits, yet "
3071 "partitionSizeHi set\n"
3072 "WARNING: resetting partitionSizeHi to zero.\n");
3073 clabel->partitionSizeHi = 0;
3074 }
3075 }
3076 }
3077
3078
#ifdef DEBUG
/*
 * rf_print_component_label: dump the interesting fields of a component
 * label to the console.  Debug builds only.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* root_partition is printed via its low two bits; index 3 is
	   the catch-all for out-of-range values. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
3112
3113 RF_ConfigSet_t *
3114 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3115 {
3116 RF_AutoConfig_t *ac;
3117 RF_ConfigSet_t *config_sets;
3118 RF_ConfigSet_t *cset;
3119 RF_AutoConfig_t *ac_next;
3120
3121
3122 config_sets = NULL;
3123
3124 /* Go through the AutoConfig list, and figure out which components
3125 belong to what sets. */
3126 ac = ac_list;
3127 while(ac!=NULL) {
3128 /* we're going to putz with ac->next, so save it here
3129 for use at the end of the loop */
3130 ac_next = ac->next;
3131
3132 if (config_sets == NULL) {
3133 /* will need at least this one... */
3134 config_sets = (RF_ConfigSet_t *)
3135 malloc(sizeof(RF_ConfigSet_t),
3136 M_RAIDFRAME, M_NOWAIT);
3137 if (config_sets == NULL) {
3138 panic("rf_create_auto_sets: No memory!");
3139 }
3140 /* this one is easy :) */
3141 config_sets->ac = ac;
3142 config_sets->next = NULL;
3143 config_sets->rootable = 0;
3144 ac->next = NULL;
3145 } else {
3146 /* which set does this component fit into? */
3147 cset = config_sets;
3148 while(cset!=NULL) {
3149 if (rf_does_it_fit(cset, ac)) {
3150 /* looks like it matches... */
3151 ac->next = cset->ac;
3152 cset->ac = ac;
3153 break;
3154 }
3155 cset = cset->next;
3156 }
3157 if (cset==NULL) {
3158 /* didn't find a match above... new set..*/
3159 cset = (RF_ConfigSet_t *)
3160 malloc(sizeof(RF_ConfigSet_t),
3161 M_RAIDFRAME, M_NOWAIT);
3162 if (cset == NULL) {
3163 panic("rf_create_auto_sets: No memory!");
3164 }
3165 cset->ac = ac;
3166 ac->next = NULL;
3167 cset->next = config_sets;
3168 cset->rootable = 0;
3169 config_sets = cset;
3170 }
3171 }
3172 ac = ac_next;
3173 }
3174
3175
3176 return(config_sets);
3177 }
3178
3179 static int
3180 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3181 {
3182 RF_ComponentLabel_t *clabel1, *clabel2;
3183
3184 /* If this one matches the *first* one in the set, that's good
3185 enough, since the other members of the set would have been
3186 through here too... */
3187 /* note that we are not checking partitionSize here..
3188
3189 Note that we are also not checking the mod_counters here.
3190 If everything else matches except the mod_counter, that's
3191 good enough for this test. We will deal with the mod_counters
3192 a little later in the autoconfiguration process.
3193
3194 (clabel1->mod_counter == clabel2->mod_counter) &&
3195
3196 The reason we don't check for this is that failed disks
3197 will have lower modification counts. If those disks are
3198 not added to the set they used to belong to, then they will
3199 form their own set, which may result in 2 different sets,
3200 for example, competing to be configured at raid0, and
3201 perhaps competing to be the root filesystem set. If the
3202 wrong ones get configured, or both attempt to become /,
3203 weird behaviour and or serious lossage will occur. Thus we
3204 need to bring them into the fold here, and kick them out at
3205 a later point.
3206
3207 */
3208
3209 clabel1 = cset->ac->clabel;
3210 clabel2 = ac->clabel;
3211 if ((clabel1->version == clabel2->version) &&
3212 (clabel1->serial_number == clabel2->serial_number) &&
3213 (clabel1->num_rows == clabel2->num_rows) &&
3214 (clabel1->num_columns == clabel2->num_columns) &&
3215 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3216 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3217 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3218 (clabel1->parityConfig == clabel2->parityConfig) &&
3219 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3220 (clabel1->blockSize == clabel2->blockSize) &&
3221 rf_component_label_numblocks(clabel1) ==
3222 rf_component_label_numblocks(clabel2) &&
3223 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3224 (clabel1->root_partition == clabel2->root_partition) &&
3225 (clabel1->last_unit == clabel2->last_unit) &&
3226 (clabel1->config_order == clabel2->config_order)) {
3227 /* if it get's here, it almost *has* to be a match */
3228 } else {
3229 /* it's not consistent with somebody in the set..
3230 punt */
3231 return(0);
3232 }
3233 /* all was fine.. it must fit... */
3234 return(1);
3235 }
3236
/*
 * rf_have_enough_components: decide whether the config set contains
 * enough live components (at the set's newest mod_counter) to be
 * configured.  RAID 1 is special-cased: each even/odd column pair is a
 * mirror, and losing both halves of any pair is fatal.  Returns 1 if
 * configurable, 0 if not.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set. If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */
	/* The newest (highest) counter wins; components with older
	   counters are stale and don't count as present below. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* Look for a current (newest mod_counter) component
		   claiming column c. */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
			/* Didn't find one here! */
			/* special case for RAID 1, especially
			   where there are more than 2
			   components (where RAIDframe treats
			   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
			/* Just did an even component, and we didn't
			   bail.. reset the even_pair_failed flag,
			   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* Per-level failure tolerance: RAID 0 tolerates none, RAID 4/5
	   tolerate one missing component. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3339
3340 void
3341 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3342 RF_Raid_t *raidPtr)
3343 {
3344 RF_ComponentLabel_t *clabel;
3345 int i;
3346
3347 clabel = ac->clabel;
3348
3349 /* 1. Fill in the common stuff */
3350 config->numRow = clabel->num_rows = 1;
3351 config->numCol = clabel->num_columns;
3352 config->numSpare = 0; /* XXX should this be set here? */
3353 config->sectPerSU = clabel->sectPerSU;
3354 config->SUsPerPU = clabel->SUsPerPU;
3355 config->SUsPerRU = clabel->SUsPerRU;
3356 config->parityConfig = clabel->parityConfig;
3357 /* XXX... */
3358 strcpy(config->diskQueueType,"fifo");
3359 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3360 config->layoutSpecificSize = 0; /* XXX ?? */
3361
3362 while(ac!=NULL) {
3363 /* row/col values will be in range due to the checks
3364 in reasonable_label() */
3365 strcpy(config->devnames[0][ac->clabel->column],
3366 ac->devname);
3367 ac = ac->next;
3368 }
3369
3370 for(i=0;i<RF_MAXDBGV;i++) {
3371 config->debugVars[i][0] = 0;
3372 }
3373 }
3374
3375 int
3376 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3377 {
3378 RF_ComponentLabel_t *clabel;
3379 int column;
3380 int sparecol;
3381
3382 raidPtr->autoconfigure = new_value;
3383
3384 for(column=0; column<raidPtr->numCol; column++) {
3385 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3386 clabel = raidget_component_label(raidPtr, column);
3387 clabel->autoconfigure = new_value;
3388 raidflush_component_label(raidPtr, column);
3389 }
3390 }
3391 for(column = 0; column < raidPtr->numSpare ; column++) {
3392 sparecol = raidPtr->numCol + column;
3393 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3394 clabel = raidget_component_label(raidPtr, sparecol);
3395 clabel->autoconfigure = new_value;
3396 raidflush_component_label(raidPtr, sparecol);
3397 }
3398 }
3399 return(new_value);
3400 }
3401
3402 int
3403 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3404 {
3405 RF_ComponentLabel_t *clabel;
3406 int column;
3407 int sparecol;
3408
3409 raidPtr->root_partition = new_value;
3410 for(column=0; column<raidPtr->numCol; column++) {
3411 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3412 clabel = raidget_component_label(raidPtr, column);
3413 clabel->root_partition = new_value;
3414 raidflush_component_label(raidPtr, column);
3415 }
3416 }
3417 for(column = 0; column < raidPtr->numSpare ; column++) {
3418 sparecol = raidPtr->numCol + column;
3419 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3420 clabel = raidget_component_label(raidPtr, sparecol);
3421 clabel->root_partition = new_value;
3422 raidflush_component_label(raidPtr, sparecol);
3423 }
3424 }
3425 return(new_value);
3426 }
3427
3428 void
3429 rf_release_all_vps(RF_ConfigSet_t *cset)
3430 {
3431 RF_AutoConfig_t *ac;
3432
3433 ac = cset->ac;
3434 while(ac!=NULL) {
3435 /* Close the vp, and give it back */
3436 if (ac->vp) {
3437 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3438 VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3439 vput(ac->vp);
3440 ac->vp = NULL;
3441 }
3442 ac = ac->next;
3443 }
3444 }
3445
3446
3447 void
3448 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3449 {
3450 RF_AutoConfig_t *ac;
3451 RF_AutoConfig_t *next_ac;
3452
3453 ac = cset->ac;
3454 while(ac!=NULL) {
3455 next_ac = ac->next;
3456 /* nuke the label */
3457 free(ac->clabel, M_RAIDFRAME);
3458 /* cleanup the config structure */
3459 free(ac, M_RAIDFRAME);
3460 /* "next.." */
3461 ac = next_ac;
3462 }
3463 /* and, finally, nuke the config set */
3464 free(cset, M_RAIDFRAME);
3465 }
3466
3467
/*
 * Populate a component label with the current state of the RAID set.
 * All fields are taken from raidPtr; the caller is responsible for
 * flushing the label out to the component afterwards.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* only single-row sets are produced here */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	/* layout parameters */
	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
3500
/*
 * Autoconfigure one config set: allocate a softc at (or after) the
 * unit number recorded in the labels, build an RF_Config_t from the
 * labels, and configure the set.  Returns the softc on success, or
 * NULL on allocation/configuration failure.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
		/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* Walk forward from last_unit until we find a unit that either
	   does not exist yet or exists but is not valid (in use). */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	    sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	/* sc == NULL means no softc existed at raidID; create one now */
	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
		/* XXX do something more intelligent here. */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed; release the softc we claimed */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
3584
/*
 * Initialize a pool(9) for RAIDframe structures: create the pool,
 * cap it at xmax items, pre-allocate xmin items, and keep at least
 * xmin available via the low watermark.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
    size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
3594
3595 /*
3596 * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
3597 * to see if there is IO pending and if that IO could possibly be done
3598 * for a given RAID set. Returns 0 if IO is waiting and can be done, 1
3599 * otherwise.
3600 *
3601 */
3602 int
3603 rf_buf_queue_check(RF_Raid_t *raidPtr)
3604 {
3605 struct raid_softc *rs;
3606 struct dk_softc *dksc;
3607
3608 rs = raidPtr->softc;
3609 dksc = &rs->sc_dksc;
3610
3611 if ((rs->sc_flags & RAIDF_INITED) == 0)
3612 return 1;
3613
3614 if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3615 /* there is work to do */
3616 return 0;
3617 }
3618 /* default is nothing to do */
3619 return 1;
3620 }
3621
3622 int
3623 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3624 {
3625 uint64_t numsecs;
3626 unsigned secsize;
3627 int error;
3628
3629 error = getdisksize(vp, &numsecs, &secsize);
3630 if (error == 0) {
3631 diskPtr->blockSize = secsize;
3632 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3633 diskPtr->partitionSize = numsecs;
3634 return 0;
3635 }
3636 return error;
3637 }
3638
/*
 * autoconf(9) match function.  raid devices are pseudo-devices created
 * on demand, so every match attempt succeeds.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3644
/*
 * autoconf(9) attach function.  Intentionally empty: per-set state is
 * established elsewhere when the set is actually configured.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
3649
3650
3651 static int
3652 raid_detach(device_t self, int flags)
3653 {
3654 int error;
3655 struct raid_softc *rs = raidsoftc(self);
3656
3657 if (rs == NULL)
3658 return ENXIO;
3659
3660 if ((error = raidlock(rs)) != 0)
3661 return (error);
3662
3663 error = raid_detach_unlocked(rs);
3664
3665 raidunlock(rs);
3666
3667 /* XXX raid can be referenced here */
3668
3669 if (error)
3670 return error;
3671
3672 /* Free the softc */
3673 raidput(rs);
3674
3675 return 0;
3676 }
3677
/*
 * Fill in the disk_geom presented via disk(9) for this RAID set and
 * register it.  A RAID set has no physical geometry, so the values
 * are synthesized from the layout.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* NOTE(review): presumably an arbitrary fake track count for a
	   plausible-looking geometry -- TODO confirm the factor of 4 */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
3693
3694 /*
3695 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3696 * We end up returning whatever error was returned by the first cache flush
3697 * that fails.
3698 */
3699
3700 int
3701 rf_sync_component_caches(RF_Raid_t *raidPtr)
3702 {
3703 int c, sparecol;
3704 int e,error;
3705 int force = 1;
3706
3707 error = 0;
3708 for (c = 0; c < raidPtr->numCol; c++) {
3709 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3710 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3711 &force, FWRITE, NOCRED);
3712 if (e) {
3713 if (e != ENODEV)
3714 printf("raid%d: cache flush to component %s failed.\n",
3715 raidPtr->raidid, raidPtr->Disks[c].devname);
3716 if (error == 0) {
3717 error = e;
3718 }
3719 }
3720 }
3721 }
3722
3723 for( c = 0; c < raidPtr->numSpare ; c++) {
3724 sparecol = raidPtr->numCol + c;
3725 /* Need to ensure that the reconstruct actually completed! */
3726 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3727 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3728 DIOCCACHESYNC, &force, FWRITE, NOCRED);
3729 if (e) {
3730 if (e != ENODEV)
3731 printf("raid%d: cache flush to component %s failed.\n",
3732 raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3733 if (error == 0) {
3734 error = e;
3735 }
3736 }
3737 }
3738 }
3739 return error;
3740 }
3741
/*
 * Module interface
 */

/* Driver-class module; depends on the dk(4) subroutine module. */
MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr");

#ifdef _MODULE
/* When built as a loadable module we must supply our own cfdriver. */
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);
3755
3756 static int
3757 raid_modcmd(modcmd_t cmd, void *data)
3758 {
3759 int error;
3760
3761 error = 0;
3762 switch (cmd) {
3763 case MODULE_CMD_INIT:
3764 error = raid_modcmd_init();
3765 break;
3766 case MODULE_CMD_FINI:
3767 error = raid_modcmd_fini();
3768 break;
3769 default:
3770 error = ENOTTY;
3771 break;
3772 }
3773 return error;
3774 }
3775
/*
 * Module initialization: set up global locks, attach the devsw and
 * autoconf glue (module builds only for devsw/cfdriver), boot the
 * RAIDframe core, and register the autoconfiguration finalizer.
 * Each failure path unwinds whatever was attached before it.
 */
static int
raid_modcmd_init(void)
{
	int error;
#ifdef _MODULE
	int bmajor, cmajor;
#endif

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

#ifdef _MODULE
	/* let devsw_attach pick the major numbers */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	if (error != 0) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		/* NOTE(review): devsw_detach runs even when !_MODULE,
		   where devsw_attach was never called above -- verify
		   this unwind is intended for built-in kernels */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* non-fatal: autoconfig just won't happen automatically */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
3848
/*
 * Module teardown: refuse to unload while any raid device exists,
 * then detach the autoconf glue and devsw (re-attaching what was
 * already detached if a later step fails), shut down the RAIDframe
 * core, and destroy the global locks.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* roll back the cfattach detach done above */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* roll back both earlier detaches */
		config_cfdriver_attach(&raid_cd);
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
3896