rf_netbsdkintf.c revision 1.409 1 /* $NetBSD: rf_netbsdkintf.c,v 1.409 2022/08/28 00:26:04 oster Exp $ */
2
3 /*-
4 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Greg Oster; Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1988 University of Utah.
34 * Copyright (c) 1990, 1993
35 * The Regents of the University of California. All rights reserved.
36 *
37 * This code is derived from software contributed to Berkeley by
38 * the Systems Programming Group of the University of Utah Computer
39 * Science Department.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * from: Utah $Hdr: cd.c 1.6 90/11/28$
66 *
67 * @(#)cd.c 8.2 (Berkeley) 11/16/93
68 */
69
70 /*
71 * Copyright (c) 1995 Carnegie-Mellon University.
72 * All rights reserved.
73 *
74 * Authors: Mark Holland, Jim Zelenka
75 *
76 * Permission to use, copy, modify and distribute this software and
77 * its documentation is hereby granted, provided that both the copyright
78 * notice and this permission notice appear in all copies of the
79 * software, derivative works or modified versions, and any portions
80 * thereof, and that both notices appear in supporting documentation.
81 *
82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85 *
86 * Carnegie Mellon requests users of this software to return to
87 *
88 * Software Distribution Coordinator or Software.Distribution (at) CS.CMU.EDU
89 * School of Computer Science
90 * Carnegie Mellon University
91 * Pittsburgh PA 15213-3890
92 *
93 * any improvements or extensions that they make and grant Carnegie the
94 * rights to redistribute these changes.
95 */
96
97 /***********************************************************
98 *
99 * rf_kintf.c -- the kernel interface routines for RAIDframe
100 *
101 ***********************************************************/
102
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.409 2022/08/28 00:26:04 oster Exp $");
105
106 #ifdef _KERNEL_OPT
107 #include "opt_raid_autoconfig.h"
108 #include "opt_compat_netbsd32.h"
109 #endif
110
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129 #include <sys/module.h>
130 #include <sys/compat_stub.h>
131
132 #include <prop/proplib.h>
133
134 #include <dev/raidframe/raidframevar.h>
135 #include <dev/raidframe/raidframeio.h>
136 #include <dev/raidframe/rf_paritymap.h>
137
138 #include "rf_raid.h"
139 #include "rf_copyback.h"
140 #include "rf_dag.h"
141 #include "rf_dagflags.h"
142 #include "rf_desc.h"
143 #include "rf_diskqueue.h"
144 #include "rf_etimer.h"
145 #include "rf_general.h"
146 #include "rf_kintf.h"
147 #include "rf_options.h"
148 #include "rf_driver.h"
149 #include "rf_parityscan.h"
150 #include "rf_threadstuff.h"
151
152 #include "ioconf.h"
153
154 #ifdef DEBUG
155 int rf_kdebug_level = 0;
156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
157 #else /* DEBUG */
158 #define db1_printf(a) { }
159 #endif /* DEBUG */
160
161 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
162 static rf_declare_mutex2(rf_sparet_wait_mutex);
163 static rf_declare_cond2(rf_sparet_wait_cv);
164 static rf_declare_cond2(rf_sparet_resp_cv);
165
166 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
167 * spare table */
168 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
169 * installation process */
170 #endif
171
172 const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);
173
174 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
175
176 /* prototypes */
177 static void KernelWakeupFunc(struct buf *);
178 static void InitBP(struct buf *, struct vnode *, unsigned,
179 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
180 void *, int);
181 static void raidinit(struct raid_softc *);
182 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
183 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
184
185 static int raid_match(device_t, cfdata_t, void *);
186 static void raid_attach(device_t, device_t, void *);
187 static int raid_detach(device_t, int);
188
189 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
190 daddr_t, daddr_t);
191 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
192 daddr_t, daddr_t, int);
193
194 static int raidwrite_component_label(unsigned,
195 dev_t, struct vnode *, RF_ComponentLabel_t *);
196 static int raidread_component_label(unsigned,
197 dev_t, struct vnode *, RF_ComponentLabel_t *);
198
199 static int raid_diskstart(device_t, struct buf *bp);
200 static int raid_dumpblocks(device_t, void *, daddr_t, int);
201 static int raid_lastclose(device_t);
202
203 static dev_type_open(raidopen);
204 static dev_type_close(raidclose);
205 static dev_type_read(raidread);
206 static dev_type_write(raidwrite);
207 static dev_type_ioctl(raidioctl);
208 static dev_type_strategy(raidstrategy);
209 static dev_type_dump(raiddump);
210 static dev_type_size(raidsize);
211
/* Block-device entry points for /dev/raidN (D_DISK: disk-like device). */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
222
/* Character-device (raw) entry points for /dev/rraidN. */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
237
/* Hooks handed to the generic dk(4) disk framework for this driver. */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
247
248 #define raidunit(x) DISKUNIT(x)
249 #define raidsoftc(dev) (((struct raid_softc *)device_private(dev))->sc_r.softc)
250
251 extern struct cfdriver raid_cd;
252 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
253 raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
254 DVF_DETACH_SHUTDOWN);
255
/* Internal (kernel-private) representation of a rf_recon_req. */
struct rf_recon_req_internal {
	RF_RowCol_t col;		/* column of the component to act on */
	RF_ReconReqFlags_t flags;	/* request flags */
	void *raidPtr;			/* associated RAID set (stored as void *) */
};
262
263 /*
264 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
265 * Be aware that large numbers can allow the driver to consume a lot of
266 * kernel memory, especially on writes, and in degraded mode reads.
267 *
268 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
269 * a single 64K write will typically require 64K for the old data,
270 * 64K for the old parity, and 64K for the new parity, for a total
271 * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
273 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
274 *
275 * Now in degraded mode, for example, a 64K read on the above setup may
276 * require data reconstruction, which will require *all* of the 4 remaining
277 * disks to participate -- 4 * 32K/disk == 128K again.
278 */
279
280 #ifndef RAIDOUTSTANDING
281 #define RAIDOUTSTANDING 6
282 #endif
283
284 #define RAIDLABELDEV(dev) \
285 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
286
287 /* declared here, and made public, for the benefit of KVM stuff.. */
288
289 static int raidlock(struct raid_softc *);
290 static void raidunlock(struct raid_softc *);
291
292 static int raid_detach_unlocked(struct raid_softc *);
293
294 static void rf_markalldirty(RF_Raid_t *);
295 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
296
297 static void rf_ReconThread(struct rf_recon_req_internal *);
298 static void rf_RewriteParityThread(RF_Raid_t *raidPtr);
299 static void rf_CopybackThread(RF_Raid_t *raidPtr);
300 static void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
301 static int rf_autoconfig(device_t);
302 static int rf_rescan(void);
303 static void rf_buildroothack(RF_ConfigSet_t *);
304
305 static RF_AutoConfig_t *rf_find_raid_components(void);
306 static RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
307 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
308 static void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
309 static int rf_set_autoconfig(RF_Raid_t *, int);
310 static int rf_set_rootpartition(RF_Raid_t *, int);
311 static void rf_release_all_vps(RF_ConfigSet_t *);
312 static void rf_cleanup_config_set(RF_ConfigSet_t *);
313 static int rf_have_enough_components(RF_ConfigSet_t *);
314 static struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
315 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
316
317 /*
318 * Debugging, mostly. Set to 0 to not allow autoconfig to take place.
319 * Note that this is overridden by having RAID_AUTOCONFIG as an option
320 * in the kernel config file.
321 */
322 #ifdef RAID_AUTOCONFIG
323 int raidautoconfig = 1;
324 #else
325 int raidautoconfig = 0;
326 #endif
327 static bool raidautoconfigdone = false;
328
329 struct pool rf_alloclist_pool; /* AllocList */
330
331 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
332 static kmutex_t raid_lock;
333
334 static struct raid_softc *
335 raidcreate(int unit) {
336 struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
337 sc->sc_unit = unit;
338 cv_init(&sc->sc_cv, "raidunit");
339 mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
340 return sc;
341 }
342
343 static void
344 raiddestroy(struct raid_softc *sc) {
345 cv_destroy(&sc->sc_cv);
346 mutex_destroy(&sc->sc_mutex);
347 kmem_free(sc, sizeof(*sc));
348 }
349
350 static struct raid_softc *
351 raidget(int unit, bool create) {
352 struct raid_softc *sc;
353 if (unit < 0) {
354 #ifdef DIAGNOSTIC
355 panic("%s: unit %d!", __func__, unit);
356 #endif
357 return NULL;
358 }
359 mutex_enter(&raid_lock);
360 LIST_FOREACH(sc, &raids, sc_link) {
361 if (sc->sc_unit == unit) {
362 mutex_exit(&raid_lock);
363 return sc;
364 }
365 }
366 mutex_exit(&raid_lock);
367 if (!create)
368 return NULL;
369 sc = raidcreate(unit);
370 mutex_enter(&raid_lock);
371 LIST_INSERT_HEAD(&raids, sc, sc_link);
372 mutex_exit(&raid_lock);
373 return sc;
374 }
375
/*
 * Unlink `sc' from the global list of RAID units and free it.
 * The caller must ensure no other references to the softc remain.
 */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
383
/*
 * Legacy pseudo-device attach hook; intentionally empty.
 * Device attachment and the associated initialization now happen
 * as part of module initialization instead.
 */
void
raidattach(int num)
{
	/* Nothing to do here. */
}
393
394 static int
395 rf_autoconfig(device_t self)
396 {
397 RF_AutoConfig_t *ac_list;
398 RF_ConfigSet_t *config_sets;
399
400 if (!raidautoconfig || raidautoconfigdone == true)
401 return 0;
402
403 /* XXX This code can only be run once. */
404 raidautoconfigdone = true;
405
406 #ifdef __HAVE_CPU_BOOTCONF
407 /*
408 * 0. find the boot device if needed first so we can use it later
409 * this needs to be done before we autoconfigure any raid sets,
410 * because if we use wedges we are not going to be able to open
411 * the boot device later
412 */
413 if (booted_device == NULL)
414 cpu_bootconf();
415 #endif
416 /* 1. locate all RAID components on the system */
417 aprint_debug("Searching for RAID components...\n");
418 ac_list = rf_find_raid_components();
419
420 /* 2. Sort them into their respective sets. */
421 config_sets = rf_create_auto_sets(ac_list);
422
423 /*
424 * 3. Evaluate each set and configure the valid ones.
425 * This gets done in rf_buildroothack().
426 */
427 rf_buildroothack(config_sets);
428
429 return 1;
430 }
431
/* Has this RAID set completed configuration (RAIDF_INITED set)? */
int
rf_inited(const struct raid_softc *rs) {
	return (rs->sc_flags & RAIDF_INITED) != 0;
}
436
/* Accessor: the RF_Raid_t embedded in this softc. */
RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}
441
/* Accessor: the unit number of this RAID softc. */
int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}
446
447 static int
448 rf_containsboot(RF_Raid_t *r, device_t bdv) {
449 const char *bootname;
450 size_t len;
451
452 /* if bdv is NULL, the set can't contain it. exit early. */
453 if (bdv == NULL)
454 return 0;
455
456 bootname = device_xname(bdv);
457 len = strlen(bootname);
458
459 for (int col = 0; col < r->numCol; col++) {
460 const char *devname = r->Disks[col].devname;
461 devname += sizeof("/dev/") - 1;
462 if (strncmp(devname, "dk", 2) == 0) {
463 const char *parent =
464 dkwedge_get_parent_name(r->Disks[col].dev);
465 if (parent != NULL)
466 devname = parent;
467 }
468 if (strncmp(devname, bootname, len) == 0) {
469 struct raid_softc *sc = r->softc;
470 aprint_debug("raid%d includes boot device %s\n",
471 sc->sc_unit, devname);
472 return 1;
473 }
474 }
475 return 0;
476 }
477
/*
 * Re-scan the system for RAID components and autoconfigure every
 * complete set whose label requests autoconfiguration.  After each
 * pass that configured at least one new set, the scan is repeated so
 * that RAID sets stacked on other RAID sets ("recursive RAID") are
 * picked up as their underlying sets appear.  Always returns 0.
 *
 * NOTE(review): this duplicates the configuration loop in
 * rf_buildroothack(), minus the root-device bookkeeping.
 */
static int
rf_rescan(void)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets, *cset, *next_cset;
	struct raid_softc *sc;
	int raid_added;

	/* Find all components and group them into configuration sets. */
	ac_list = rf_find_raid_components();
	config_sets = rf_create_auto_sets(ac_list);

	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
					    sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	return 0;
}
526
527
/*
 * Evaluate the autoconfiguration sets in `config_sets', configure the
 * valid ones (repeating the component scan after each successful pass
 * so stacked/recursive RAID sets are found), and then decide whether
 * one of the configured sets should become the root device.  Consumes
 * the config sets: each is either configured or released, and always
 * cleaned up.
 */
static void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;		/* number of rootable sets found */
	int raid_added;		/* sets configured in the current pass */
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;

	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
					    sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
					if (cset->rootable) {
						/* remember the last rootable set */
						rsc = sc;
						num_root++;
					}
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		aprint_debug("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		dksc = &rsc->sc_dksc;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			aprint_debug("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			aprint_debug("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		aprint_debug("%s: candidate root=%p booted_device=%p "
			     "root_partition=%d contains_boot=%d\n",
		    __func__, candidate_root, booted_device,
		    rsc->sc_r.root_partition,
		    rf_containsboot(&rsc->sc_r, booted_device));
		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
			aprint_debug("%s: set booted_device=%s(%p)\n", __func__,
			    device_xname(booted_device), booted_device);
		}
	} else if (num_root > 1) {
		aprint_debug("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* Narrow the candidates to sets that contain the boot device. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
686
687 static int
688 raidsize(dev_t dev)
689 {
690 struct raid_softc *rs;
691 struct dk_softc *dksc;
692 unsigned int unit;
693
694 unit = raidunit(dev);
695 if ((rs = raidget(unit, false)) == NULL)
696 return -1;
697 dksc = &rs->sc_dksc;
698
699 if ((rs->sc_flags & RAIDF_INITED) == 0)
700 return -1;
701
702 return dk_size(dksc, dev);
703 }
704
705 static int
706 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
707 {
708 unsigned int unit;
709 struct raid_softc *rs;
710 struct dk_softc *dksc;
711
712 unit = raidunit(dev);
713 if ((rs = raidget(unit, false)) == NULL)
714 return ENXIO;
715 dksc = &rs->sc_dksc;
716
717 if ((rs->sc_flags & RAIDF_INITED) == 0)
718 return ENODEV;
719
720 /*
721 Note that blkno is relative to this particular partition.
722 By adding adding RF_PROTECTED_SECTORS, we get a value that
723 is relative to the partition used for the underlying component.
724 */
725 blkno += RF_PROTECTED_SECTORS;
726
727 return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
728 }
729
/*
 * dk diskstart-level dump hook: write `nblk' blocks from `va' at
 * `blkno' directly to one live component of this set.  Only RAID 1
 * sets (one data + one parity column) are supported, since only there
 * does a single component hold a complete copy of the data.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int c, sparecol, j, scol, dumpto;
	int error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the first component
	   2) a used_spare of the first component
	   3) the second component
	   4) a used_spare of the second component
	 */

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live component.
	   If we didn't find a live component, we now check to see
	   if there is a relevant spared component.
	 */

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/* How about this one?  Find which column it spares. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared first
				   component!  We'll take that over
				   anything else found so far.  (We
				   couldn't have found a real first
				   component before, since this is a
				   used spare, and it's saying that
				   it's replacing the first
				   component.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the first
				   component (component0) of this set.
				 */
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared second component.
				   We'll dump to that if we haven't found
				   anything else so far.
				 */
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* Dump through the chosen component's own block device. */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
	    blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
835
836 /* ARGSUSED */
837 static int
838 raidopen(dev_t dev, int flags, int fmt,
839 struct lwp *l)
840 {
841 int unit = raidunit(dev);
842 struct raid_softc *rs;
843 struct dk_softc *dksc;
844 int error = 0;
845 int part, pmask;
846
847 if ((rs = raidget(unit, true)) == NULL)
848 return ENXIO;
849 if ((error = raidlock(rs)) != 0)
850 return error;
851
852 if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
853 error = EBUSY;
854 goto bad;
855 }
856
857 dksc = &rs->sc_dksc;
858
859 part = DISKPART(dev);
860 pmask = (1 << part);
861
862 if (!DK_BUSY(dksc, pmask) &&
863 ((rs->sc_flags & RAIDF_INITED) != 0)) {
864 /* First one... mark things as dirty... Note that we *MUST*
865 have done a configure before this. I DO NOT WANT TO BE
866 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
867 THAT THEY BELONG TOGETHER!!!!! */
868 /* XXX should check to see if we're only open for reading
869 here... If so, we needn't do this, but then need some
870 other way of keeping track of what's happened.. */
871
872 rf_markalldirty(&rs->sc_r);
873 }
874
875 if ((rs->sc_flags & RAIDF_INITED) != 0)
876 error = dk_open(dksc, dev, flags, fmt, l);
877
878 bad:
879 raidunlock(rs);
880
881 return error;
882
883
884 }
885
886 static int
887 raid_lastclose(device_t self)
888 {
889 struct raid_softc *rs = raidsoftc(self);
890
891 /* Last one... device is not unconfigured yet.
892 Device shutdown has taken care of setting the
893 clean bits if RAIDF_INITED is not set
894 mark things as clean... */
895
896 rf_update_component_labels(&rs->sc_r,
897 RF_FINAL_COMPONENT_UPDATE);
898
899 /* pass to unlocked code */
900 if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
901 rs->sc_flags |= RAIDF_DETACH;
902
903 return 0;
904 }
905
906 /* ARGSUSED */
907 static int
908 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
909 {
910 int unit = raidunit(dev);
911 struct raid_softc *rs;
912 struct dk_softc *dksc;
913 cfdata_t cf;
914 int error = 0, do_detach = 0, do_put = 0;
915
916 if ((rs = raidget(unit, false)) == NULL)
917 return ENXIO;
918 dksc = &rs->sc_dksc;
919
920 if ((error = raidlock(rs)) != 0)
921 return error;
922
923 if ((rs->sc_flags & RAIDF_INITED) != 0) {
924 error = dk_close(dksc, dev, flags, fmt, l);
925 if ((rs->sc_flags & RAIDF_DETACH) != 0)
926 do_detach = 1;
927 } else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
928 do_put = 1;
929
930 raidunlock(rs);
931
932 if (do_detach) {
933 /* free the pseudo device attach bits */
934 cf = device_cfdata(dksc->sc_dev);
935 error = config_detach(dksc->sc_dev, 0);
936 if (error == 0)
937 free(cf, M_RAIDFRAME);
938 } else if (do_put) {
939 raidput(rs);
940 }
941
942 return error;
943
944 }
945
/*
 * Signal iodone_cv (under its lock) to wake whatever is sleeping on
 * it -- presumably the RAIDframe I/O service thread -- so queued
 * work gets picked up.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
953
954 static void
955 raidstrategy(struct buf *bp)
956 {
957 unsigned int unit;
958 struct raid_softc *rs;
959 struct dk_softc *dksc;
960 RF_Raid_t *raidPtr;
961
962 unit = raidunit(bp->b_dev);
963 if ((rs = raidget(unit, false)) == NULL) {
964 bp->b_error = ENXIO;
965 goto fail;
966 }
967 if ((rs->sc_flags & RAIDF_INITED) == 0) {
968 bp->b_error = ENXIO;
969 goto fail;
970 }
971 dksc = &rs->sc_dksc;
972 raidPtr = &rs->sc_r;
973
974 /* Queue IO only */
975 if (dk_strategy_defer(dksc, bp))
976 goto done;
977
978 /* schedule the IO to happen at the next convenient time */
979 raid_wakeup(raidPtr);
980
981 done:
982 return;
983
984 fail:
985 bp->b_resid = bp->b_bcount;
986 biodone(bp);
987 }
988
989 static int
990 raid_diskstart(device_t dev, struct buf *bp)
991 {
992 struct raid_softc *rs = raidsoftc(dev);
993 RF_Raid_t *raidPtr;
994
995 raidPtr = &rs->sc_r;
996 if (!raidPtr->valid) {
997 db1_printf(("raid is not valid..\n"));
998 return ENODEV;
999 }
1000
1001 /* XXX */
1002 bp->b_resid = 0;
1003
1004 return raiddoaccess(raidPtr, bp);
1005 }
1006
1007 void
1008 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
1009 {
1010 struct raid_softc *rs;
1011 struct dk_softc *dksc;
1012
1013 rs = raidPtr->softc;
1014 dksc = &rs->sc_dksc;
1015
1016 dk_done(dksc, bp);
1017
1018 rf_lock_mutex2(raidPtr->mutex);
1019 raidPtr->openings++;
1020 rf_unlock_mutex2(raidPtr->mutex);
1021
1022 /* schedule more IO */
1023 raid_wakeup(raidPtr);
1024 }
1025
1026 /* ARGSUSED */
1027 static int
1028 raidread(dev_t dev, struct uio *uio, int flags)
1029 {
1030 int unit = raidunit(dev);
1031 struct raid_softc *rs;
1032
1033 if ((rs = raidget(unit, false)) == NULL)
1034 return ENXIO;
1035
1036 if ((rs->sc_flags & RAIDF_INITED) == 0)
1037 return ENXIO;
1038
1039 return physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
1040
1041 }
1042
1043 /* ARGSUSED */
1044 static int
1045 raidwrite(dev_t dev, struct uio *uio, int flags)
1046 {
1047 int unit = raidunit(dev);
1048 struct raid_softc *rs;
1049
1050 if ((rs = raidget(unit, false)) == NULL)
1051 return ENXIO;
1052
1053 if ((rs->sc_flags & RAIDF_INITED) == 0)
1054 return ENXIO;
1055
1056 return physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio);
1057
1058 }
1059
/*
 * Tear down a RAID set.  "Unlocked" refers to the caller already
 * holding the softc lock (raidlock), not to this routine taking it.
 * Refuses with EBUSY while the device is open or reconstruction,
 * parity rewrite, or copyback is in flight.  A set that was never
 * initialized detaches trivially.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* refuse while open or while background operations are running */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	/* the shutdown request has been acted upon; clear the flag */
	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
1097
/*
 * Return true when `cmd' is an ioctl that requires a configured
 * (RAIDF_INITED) set but this set is NOT initialized -- i.e. the
 * caller should reject the ioctl.  Commands not listed here are
 * permitted regardless of initialization state, so this returns
 * false for them.
 */
static bool
rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
{
	switch (cmd) {
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_SET_LAST_UNIT:
	case RAIDFRAME_SET_ROOT:
	case RAIDFRAME_SHUTDOWN:
		return (rs->sc_flags & RAIDF_INITED) == 0;
	}
	return false;
}
1137
/*
 * Administratively fail the component in column rr->col and spawn a
 * reconstruction thread (rf_ReconThread).  Rejected with EINVAL on a
 * RAID 0 set, an out-of-range column, while a reconstruction is in
 * progress, when a different component has already failed, or when the
 * target component is spared.
 */
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	/* rrint ownership passes to the recon thread. */
	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	/* Failure path for the checks made under the mutex. */
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}
1186
1187 static int
1188 rf_copyinspecificbuf(RF_Config_t *k_cfg)
1189 {
1190 /* allocate a buffer for the layout-specific data, and copy it in */
1191 if (k_cfg->layoutSpecificSize == 0)
1192 return 0;
1193
1194 if (k_cfg->layoutSpecificSize > 10000) {
1195 /* sanity check */
1196 return EINVAL;
1197 }
1198
1199 u_char *specific_buf;
1200 specific_buf = RF_Malloc(k_cfg->layoutSpecificSize);
1201 if (specific_buf == NULL)
1202 return ENOMEM;
1203
1204 int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1205 k_cfg->layoutSpecificSize);
1206 if (retcode) {
1207 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1208 db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
1209 return retcode;
1210 }
1211
1212 k_cfg->layoutSpecific = specific_buf;
1213 return 0;
1214 }
1215
1216 static int
1217 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
1218 {
1219 RF_Config_t *u_cfg = *((RF_Config_t **) data);
1220
1221 if (rs->sc_r.valid) {
1222 /* There is a valid RAID set running on this unit! */
1223 printf("raid%d: Device already configured!\n", rs->sc_unit);
1224 return EINVAL;
1225 }
1226
1227 /* copy-in the configuration information */
1228 /* data points to a pointer to the configuration structure */
1229 *k_cfg = RF_Malloc(sizeof(**k_cfg));
1230 if (*k_cfg == NULL) {
1231 return ENOMEM;
1232 }
1233 int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
1234 if (retcode == 0)
1235 return 0;
1236 RF_Free(*k_cfg, sizeof(RF_Config_t));
1237 db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
1238 rs->sc_flags |= RAIDF_SHUTDOWN;
1239 return retcode;
1240 }
1241
/*
 * Configure the RAID set described by the kernel-resident config
 * k_cfg on unit rs.  Consumes k_cfg (and its layout-specific buffer)
 * in all cases.  On any failure RAIDF_SHUTDOWN is set so the unit is
 * detached when it is finally closed.
 */
int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode, i;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	/* Pull the layout-specific blob into kernel space, if any. */
	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* Force nul-termination on all strings. */
#define ZERO_FINAL(s) do { s[sizeof(s) - 1] = '\0'; } while (0)
	for (i = 0; i < RF_MAXCOL; i++) {
		ZERO_FINAL(k_cfg->devnames[0][i]);
	}
	for (i = 0; i < RF_MAXSPARE; i++) {
		ZERO_FINAL(k_cfg->spare_names[i]);
	}
	for (i = 0; i < RF_MAXDBGV; i++) {
		ZERO_FINAL(k_cfg->debugVars[i]);
	}
#undef ZERO_FINAL

	/* Check some basic limits. */
	if (k_cfg->numCol >= RF_MAXCOL || k_cfg->numCol < 0) {
		retcode = EINVAL;
		goto out;
	}
	if (k_cfg->numSpare >= RF_MAXSPARE || k_cfg->numSpare < 0) {
		retcode = EINVAL;
		goto out;
	}

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 * there is no stale data left in the case of a
	 * reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		/* Attach the dk/disk machinery and start serving I/O. */
		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers.  No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}
1317
#if RF_DISABLED
/*
 * Overwrite the in-core component label for clabel->column with the
 * user-supplied label and flush it to disk.  Deliberately compiled
 * out: only minimal validation is performed, and users are expected
 * to re-init labels rather than patch them.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
#endif
1356
1357 static int
1358 rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
1359 {
1360 /*
1361 we only want the serial number from
1362 the above. We get all the rest of the information
1363 from the config that was used to create this RAID
1364 set.
1365 */
1366
1367 raidPtr->serial_number = clabel->serial_number;
1368
1369 for (int column = 0; column < raidPtr->numCol; column++) {
1370 RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
1371 if (RF_DEAD_DISK(diskPtr->status))
1372 continue;
1373 RF_ComponentLabel_t *ci_label = raidget_component_label(
1374 raidPtr, column);
1375 /* Zeroing this is important. */
1376 memset(ci_label, 0, sizeof(*ci_label));
1377 raid_init_component_label(raidPtr, ci_label);
1378 ci_label->serial_number = raidPtr->serial_number;
1379 ci_label->row = 0; /* we dont' pretend to support more */
1380 rf_component_label_set_partitionsize(ci_label,
1381 diskPtr->partitionSize);
1382 ci_label->column = column;
1383 raidflush_component_label(raidPtr, column);
1384 /* XXXjld what about the spares? */
1385 }
1386
1387 return 0;
1388 }
1389
/*
 * Reconstruct the component in componentPtr->column back onto the
 * same disk ("in place"), by spawning rf_ReconstructInPlaceThread.
 * Rejected with EINVAL on RAID 0, an out-of-range column, a running
 * recon, when some other component has already failed, or when the
 * target component is reconstructing or spared.
 */
static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	/* Copy the request so we don't depend on the caller's storage. */
	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		    raidPtr->raidid);
		printf("raid%d: Col: %d Too many failures.\n",
		    raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		    raidPtr->raidid);
		printf("raid%d: Col: %d "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		/* Can't rebuild a spared component in place. */
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	/* The request record is handed off to the recon thread. */
	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}
1457
1458 static int
1459 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
1460 {
1461 /*
1462 * This makes no sense on a RAID 0, or if we are not reconstructing
1463 * so tell the user it's done.
1464 */
1465 if (raidPtr->Layout.map->faultsTolerated == 0 ||
1466 raidPtr->status != rf_rs_reconstructing) {
1467 *data = 100;
1468 return 0;
1469 }
1470 if (raidPtr->reconControl->numRUsTotal == 0) {
1471 *data = 0;
1472 return 0;
1473 }
1474 *data = (raidPtr->reconControl->numRUsComplete * 100
1475 / raidPtr->reconControl->numRUsTotal);
1476 return 0;
1477 }
1478
1479 /*
1480 * Copy a RF_SingleComponent_t from 'data', ensuring nul-termination
1481 * on the component_name[] array.
1482 */
1483 static void
1484 rf_copy_single_component(RF_SingleComponent_t *component, void *data)
1485 {
1486
1487 memcpy(component, data, sizeof *component);
1488 component->component_name[sizeof(component->component_name) - 1] = '\0';
1489 }
1490
1491 static int
1492 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
1493 {
1494 int unit = raidunit(dev);
1495 int part, pmask;
1496 struct raid_softc *rs;
1497 struct dk_softc *dksc;
1498 RF_Config_t *k_cfg;
1499 RF_Raid_t *raidPtr;
1500 RF_AccTotals_t *totals;
1501 RF_SingleComponent_t component;
1502 RF_DeviceConfig_t *d_cfg, *ucfgp;
1503 int retcode = 0;
1504 int column;
1505 RF_ComponentLabel_t *clabel;
1506 int d;
1507
1508 if ((rs = raidget(unit, false)) == NULL)
1509 return ENXIO;
1510
1511 dksc = &rs->sc_dksc;
1512 raidPtr = &rs->sc_r;
1513
1514 db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1515 (int) DISKPART(dev), (int) unit, cmd));
1516
1517 /* Must be initialized for these... */
1518 if (rf_must_be_initialized(rs, cmd))
1519 return ENXIO;
1520
1521 switch (cmd) {
1522 /* configure the system */
1523 case RAIDFRAME_CONFIGURE:
1524 if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
1525 return retcode;
1526 return rf_construct(rs, k_cfg);
1527
1528 /* shutdown the system */
1529 case RAIDFRAME_SHUTDOWN:
1530
1531 part = DISKPART(dev);
1532 pmask = (1 << part);
1533
1534 if ((retcode = raidlock(rs)) != 0)
1535 return retcode;
1536
1537 if (DK_BUSY(dksc, pmask) ||
1538 raidPtr->recon_in_progress != 0 ||
1539 raidPtr->parity_rewrite_in_progress != 0 ||
1540 raidPtr->copyback_in_progress != 0)
1541 retcode = EBUSY;
1542 else {
1543 /* detach and free on close */
1544 rs->sc_flags |= RAIDF_SHUTDOWN;
1545 retcode = 0;
1546 }
1547
1548 raidunlock(rs);
1549
1550 return retcode;
1551 case RAIDFRAME_GET_COMPONENT_LABEL:
1552 return rf_get_component_label(raidPtr, data);
1553
1554 #if RF_DISABLED
1555 case RAIDFRAME_SET_COMPONENT_LABEL:
1556 return rf_set_component_label(raidPtr, data);
1557 #endif
1558
1559 case RAIDFRAME_INIT_LABELS:
1560 return rf_init_component_label(raidPtr, data);
1561
1562 case RAIDFRAME_SET_AUTOCONFIG:
1563 d = rf_set_autoconfig(raidPtr, *(int *) data);
1564 printf("raid%d: New autoconfig value is: %d\n",
1565 raidPtr->raidid, d);
1566 *(int *) data = d;
1567 return retcode;
1568
1569 case RAIDFRAME_SET_ROOT:
1570 d = rf_set_rootpartition(raidPtr, *(int *) data);
1571 printf("raid%d: New rootpartition value is: %d\n",
1572 raidPtr->raidid, d);
1573 *(int *) data = d;
1574 return retcode;
1575
1576 /* initialize all parity */
1577 case RAIDFRAME_REWRITEPARITY:
1578
1579 if (raidPtr->Layout.map->faultsTolerated == 0) {
1580 /* Parity for RAID 0 is trivially correct */
1581 raidPtr->parity_good = RF_RAID_CLEAN;
1582 return 0;
1583 }
1584
1585 if (raidPtr->parity_rewrite_in_progress == 1) {
1586 /* Re-write is already in progress! */
1587 return EINVAL;
1588 }
1589
1590 return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1591 rf_RewriteParityThread, raidPtr,"raid_parity");
1592
1593 case RAIDFRAME_ADD_HOT_SPARE:
1594 rf_copy_single_component(&component, data);
1595 return rf_add_hot_spare(raidPtr, &component);
1596
1597 case RAIDFRAME_REMOVE_HOT_SPARE:
1598 return retcode;
1599
1600 case RAIDFRAME_DELETE_COMPONENT:
1601 rf_copy_single_component(&component, data);
1602 return rf_delete_component(raidPtr, &component);
1603
1604 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1605 rf_copy_single_component(&component, data);
1606 return rf_incorporate_hot_spare(raidPtr, &component);
1607
1608 case RAIDFRAME_REBUILD_IN_PLACE:
1609 return rf_rebuild_in_place(raidPtr, data);
1610
1611 case RAIDFRAME_GET_INFO:
1612 ucfgp = *(RF_DeviceConfig_t **)data;
1613 d_cfg = RF_Malloc(sizeof(*d_cfg));
1614 if (d_cfg == NULL)
1615 return ENOMEM;
1616 retcode = rf_get_info(raidPtr, d_cfg);
1617 if (retcode == 0) {
1618 retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
1619 }
1620 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1621 return retcode;
1622
1623 case RAIDFRAME_CHECK_PARITY:
1624 *(int *) data = raidPtr->parity_good;
1625 return 0;
1626
1627 case RAIDFRAME_PARITYMAP_STATUS:
1628 if (rf_paritymap_ineligible(raidPtr))
1629 return EINVAL;
1630 rf_paritymap_status(raidPtr->parity_map, data);
1631 return 0;
1632
1633 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1634 if (rf_paritymap_ineligible(raidPtr))
1635 return EINVAL;
1636 if (raidPtr->parity_map == NULL)
1637 return ENOENT; /* ??? */
1638 if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
1639 return EINVAL;
1640 return 0;
1641
1642 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1643 if (rf_paritymap_ineligible(raidPtr))
1644 return EINVAL;
1645 *(int *) data = rf_paritymap_get_disable(raidPtr);
1646 return 0;
1647
1648 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1649 if (rf_paritymap_ineligible(raidPtr))
1650 return EINVAL;
1651 rf_paritymap_set_disable(raidPtr, *(int *)data);
1652 /* XXX should errors be passed up? */
1653 return 0;
1654
1655 case RAIDFRAME_RESCAN:
1656 return rf_rescan();
1657
1658 case RAIDFRAME_RESET_ACCTOTALS:
1659 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1660 return 0;
1661
1662 case RAIDFRAME_GET_ACCTOTALS:
1663 totals = (RF_AccTotals_t *) data;
1664 *totals = raidPtr->acc_totals;
1665 return 0;
1666
1667 case RAIDFRAME_KEEP_ACCTOTALS:
1668 raidPtr->keep_acc_totals = *(int *)data;
1669 return 0;
1670
1671 case RAIDFRAME_GET_SIZE:
1672 *(int *) data = raidPtr->totalSectors;
1673 return 0;
1674
1675 case RAIDFRAME_FAIL_DISK:
1676 return rf_fail_disk(raidPtr, data);
1677
1678 /* invoke a copyback operation after recon on whatever disk
1679 * needs it, if any */
1680 case RAIDFRAME_COPYBACK:
1681
1682 if (raidPtr->Layout.map->faultsTolerated == 0) {
1683 /* This makes no sense on a RAID 0!! */
1684 return EINVAL;
1685 }
1686
1687 if (raidPtr->copyback_in_progress == 1) {
1688 /* Copyback is already in progress! */
1689 return EINVAL;
1690 }
1691
1692 return RF_CREATE_THREAD(raidPtr->copyback_thread,
1693 rf_CopybackThread, raidPtr, "raid_copyback");
1694
1695 /* return the percentage completion of reconstruction */
1696 case RAIDFRAME_CHECK_RECON_STATUS:
1697 return rf_check_recon_status(raidPtr, data);
1698
1699 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1700 rf_check_recon_status_ext(raidPtr, data);
1701 return 0;
1702
1703 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1704 if (raidPtr->Layout.map->faultsTolerated == 0) {
1705 /* This makes no sense on a RAID 0, so tell the
1706 user it's done. */
1707 *(int *) data = 100;
1708 return 0;
1709 }
1710 if (raidPtr->parity_rewrite_in_progress == 1) {
1711 *(int *) data = 100 *
1712 raidPtr->parity_rewrite_stripes_done /
1713 raidPtr->Layout.numStripe;
1714 } else {
1715 *(int *) data = 100;
1716 }
1717 return 0;
1718
1719 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1720 rf_check_parityrewrite_status_ext(raidPtr, data);
1721 return 0;
1722
1723 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1724 if (raidPtr->Layout.map->faultsTolerated == 0) {
1725 /* This makes no sense on a RAID 0 */
1726 *(int *) data = 100;
1727 return 0;
1728 }
1729 if (raidPtr->copyback_in_progress == 1) {
1730 *(int *) data = 100 * raidPtr->copyback_stripes_done /
1731 raidPtr->Layout.numStripe;
1732 } else {
1733 *(int *) data = 100;
1734 }
1735 return 0;
1736
1737 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1738 rf_check_copyback_status_ext(raidPtr, data);
1739 return 0;
1740
1741 case RAIDFRAME_SET_LAST_UNIT:
1742 for (column = 0; column < raidPtr->numCol; column++)
1743 if (raidPtr->Disks[column].status != rf_ds_optimal)
1744 return EBUSY;
1745
1746 for (column = 0; column < raidPtr->numCol; column++) {
1747 clabel = raidget_component_label(raidPtr, column);
1748 clabel->last_unit = *(int *)data;
1749 raidflush_component_label(raidPtr, column);
1750 }
1751 rs->sc_cflags |= RAIDF_UNIT_CHANGED;
1752 return 0;
1753
1754 /* the sparetable daemon calls this to wait for the kernel to
1755 * need a spare table. this ioctl does not return until a
1756 * spare table is needed. XXX -- calling mpsleep here in the
1757 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1758 * -- I should either compute the spare table in the kernel,
1759 * or have a different -- XXX XXX -- interface (a different
1760 * character device) for delivering the table -- XXX */
1761 #if RF_DISABLED
1762 case RAIDFRAME_SPARET_WAIT:
1763 rf_lock_mutex2(rf_sparet_wait_mutex);
1764 while (!rf_sparet_wait_queue)
1765 rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1766 RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
1767 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1768 rf_unlock_mutex2(rf_sparet_wait_mutex);
1769
1770 /* structure assignment */
1771 *((RF_SparetWait_t *) data) = *waitreq;
1772
1773 RF_Free(waitreq, sizeof(*waitreq));
1774 return 0;
1775
1776 /* wakes up a process waiting on SPARET_WAIT and puts an error
1777 * code in it that will cause the dameon to exit */
1778 case RAIDFRAME_ABORT_SPARET_WAIT:
1779 waitreq = RF_Malloc(sizeof(*waitreq));
1780 waitreq->fcol = -1;
1781 rf_lock_mutex2(rf_sparet_wait_mutex);
1782 waitreq->next = rf_sparet_wait_queue;
1783 rf_sparet_wait_queue = waitreq;
1784 rf_broadcast_cond2(rf_sparet_wait_cv);
1785 rf_unlock_mutex2(rf_sparet_wait_mutex);
1786 return 0;
1787
1788 /* used by the spare table daemon to deliver a spare table
1789 * into the kernel */
1790 case RAIDFRAME_SEND_SPARET:
1791
1792 /* install the spare table */
1793 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1794
1795 /* respond to the requestor. the return status of the spare
1796 * table installation is passed in the "fcol" field */
1797 waitred = RF_Malloc(sizeof(*waitreq));
1798 waitreq->fcol = retcode;
1799 rf_lock_mutex2(rf_sparet_wait_mutex);
1800 waitreq->next = rf_sparet_resp_queue;
1801 rf_sparet_resp_queue = waitreq;
1802 rf_broadcast_cond2(rf_sparet_resp_cv);
1803 rf_unlock_mutex2(rf_sparet_wait_mutex);
1804
1805 return retcode;
1806 #endif
1807 default:
1808 /*
1809 * Don't bother trying to load compat modules
1810 * if it is not our ioctl. This is more efficient
1811 * and makes rump tests not depend on compat code
1812 */
1813 if (IOCGROUP(cmd) != 'r')
1814 break;
1815 #ifdef _LP64
1816 if ((l->l_proc->p_flag & PK_32) != 0) {
1817 module_autoload("compat_netbsd32_raid",
1818 MODULE_CLASS_EXEC);
1819 MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
1820 (rs, cmd, data), enosys(), retcode);
1821 if (retcode != EPASSTHROUGH)
1822 return retcode;
1823 }
1824 #endif
1825 module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
1826 MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
1827 (rs, cmd, data), enosys(), retcode);
1828 if (retcode != EPASSTHROUGH)
1829 return retcode;
1830
1831 module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
1832 MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
1833 (rs, cmd, data), enosys(), retcode);
1834 if (retcode != EPASSTHROUGH)
1835 return retcode;
1836 break; /* fall through to the os-specific code below */
1837
1838 }
1839
1840 if (!raidPtr->valid)
1841 return EINVAL;
1842
1843 /*
1844 * Add support for "regular" device ioctls here.
1845 */
1846
1847 switch (cmd) {
1848 case DIOCGCACHE:
1849 retcode = rf_get_component_caches(raidPtr, (int *)data);
1850 break;
1851
1852 case DIOCCACHESYNC:
1853 retcode = rf_sync_component_caches(raidPtr, *(int *)data);
1854 break;
1855
1856 default:
1857 retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
1858 break;
1859 }
1860
1861 return retcode;
1862
1863 }
1864
1865
/* raidinit -- complete the rest of the initialization for the
   RAIDframe device: attach the pseudo-device, wire up the dk(9) and
   disk(9) layers, allocate the buffer queue, mark the unit INITED and
   kick off wedge discovery.  Called from rf_construct() after a
   successful rf_Configure(). */


static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		/* NOTE(review): failure leaves the unit without
		 * RAIDF_INITED set; no error is reported to the caller. */
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe. */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	/* First-come-first-served queueing for the RAID device. */
	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	/* Probe for wedges (GPT partitions etc.) on the new disk. */
	dkwedge_discover(&dksc->sc_dkdev);
}
1925
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device were requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int retcode;

	/* Queue the request and wake the sparetable daemon. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Pop the daemon's response off the response queue. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* The daemon reports its status in the fcol field. */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return retcode;
}
#endif
1960
/* a wrapper around rf_DoAccess that extracts appropriate info from the
 * bp & passes it down.
 * any calls originating in the kernel must use non-blocking I/O
 * do some extra sanity checking to return "appropriate" error values for
 * certain conditions (to make some standard utilities work)
 *
 * Formerly known as: rf_DoAccessKernel
 */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* Labels are rewritten with the mutex dropped; it is
		 * retaken only to decrement the failure counter. */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
		    RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* Not (or no longer) configured: drop the request on the floor. */
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* Have the common disk framework feed us queued buffers. */
	dk_start(dksc, NULL);
}
1995
/*
 * Push one buffer's worth of I/O at RAIDframe.  Returns EAGAIN when
 * no openings are currently free (caller re-queues the buf), ENOSPC
 * when the request runs past the end of the set or is not a whole
 * number of sectors, otherwise the return value of rf_DoAccess().
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int rc;

	/* No openings free right now: tell the caller to retry later. */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		(int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	/* pb accounts for a trailing partial sector, if any. */
	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	if (1 || rf_debugKernelAccess) {
		/* NOTE(review): "1 ||" makes this unconditional;
		 * db1_printf itself gates the actual output. */
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* Reject requests past the end of the set; the (sum < x) tests
	 * catch arithmetic wrap-around. */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* Partial-sector transfers are not supported. */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* Consume one opening for this access. */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
	    RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
	    raid_addr, num_blocks,
	    bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
2062
/* invoke an I/O from kernel mode.  Disk queue should be locked upon entry.
 * For READ/WRITE the component buf is set up via InitBP() and handed to
 * bdev_strategy(); completion arrives through KernelWakeupFunc().  NOP
 * requests complete immediately through the same callback. */

int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf("%s: WAKEUP CALLED\n", __func__);
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* Complete immediately via the normal I/O callback. */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Aim the buf at the component device for this queue. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			    (int) req->sectorOffset, (int) req->numSector,
			    (int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			    (int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return 0;
}
/* this is the callback function associated with a I/O invoked from
   kernel code.  Records the component I/O result in the request, fails
   the component on error (when the set can tolerate it), and hands the
   request to the raidio thread via the iodone queue.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* rf_DispatchKernelIO()/InitBP() stashed the request here. */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		    rf_ds_optimal) ||
		    (queue->raidPtr->Disks[queue->col].status ==
		    rf_ds_used_spare)) &&
		    (queue->raidPtr->numFailures <
		    queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			    queue->raidPtr->raidid,
			    bp->b_error,
			    queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures triggers a component-label
			 * update from raidstart(). */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2207
2208
2209 /*
2210 * initialize a buf structure for doing an I/O in the kernel.
2211 */
2212 static void
2213 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2214 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2215 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
2216 {
2217 bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
2218 bp->b_oflags = 0;
2219 bp->b_cflags = 0;
2220 bp->b_bcount = numSect << logBytesPerSector;
2221 bp->b_bufsize = bp->b_bcount;
2222 bp->b_error = 0;
2223 bp->b_dev = dev;
2224 bp->b_data = bf;
2225 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2226 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2227 if (bp->b_bcount == 0) {
2228 panic("bp->b_bcount is zero in InitBP!!");
2229 }
2230 bp->b_iodone = cbFunc;
2231 bp->b_private = cbArg;
2232 }
2233
2234 /*
2235 * Wait interruptibly for an exclusive lock.
2236 *
2237 * XXX
2238 * Several drivers do this; it should be abstracted and made MP-safe.
2239 * (Hmm... where have we seen this warning before :-> GO )
2240 */
2241 static int
2242 raidlock(struct raid_softc *rs)
2243 {
2244 int error;
2245
2246 error = 0;
2247 mutex_enter(&rs->sc_mutex);
2248 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2249 rs->sc_flags |= RAIDF_WANTED;
2250 error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2251 if (error != 0)
2252 goto done;
2253 }
2254 rs->sc_flags |= RAIDF_LOCKED;
2255 done:
2256 mutex_exit(&rs->sc_mutex);
2257 return error;
2258 }
2259 /*
2260 * Unlock and wake up any waiters.
2261 */
2262 static void
2263 raidunlock(struct raid_softc *rs)
2264 {
2265
2266 mutex_enter(&rs->sc_mutex);
2267 rs->sc_flags &= ~RAIDF_LOCKED;
2268 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2269 rs->sc_flags &= ~RAIDF_WANTED;
2270 cv_broadcast(&rs->sc_cv);
2271 }
2272 mutex_exit(&rs->sc_mutex);
2273 }
2274
2275
2276 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2277 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2278 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
2279
2280 static daddr_t
2281 rf_component_info_offset(void)
2282 {
2283
2284 return RF_COMPONENT_INFO_OFFSET;
2285 }
2286
2287 static daddr_t
2288 rf_component_info_size(unsigned secsize)
2289 {
2290 daddr_t info_size;
2291
2292 KASSERT(secsize);
2293 if (secsize > RF_COMPONENT_INFO_SIZE)
2294 info_size = secsize;
2295 else
2296 info_size = RF_COMPONENT_INFO_SIZE;
2297
2298 return info_size;
2299 }
2300
2301 static daddr_t
2302 rf_parity_map_offset(RF_Raid_t *raidPtr)
2303 {
2304 daddr_t map_offset;
2305
2306 KASSERT(raidPtr->bytesPerSector);
2307 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2308 map_offset = raidPtr->bytesPerSector;
2309 else
2310 map_offset = RF_COMPONENT_INFO_SIZE;
2311 map_offset += rf_component_info_offset();
2312
2313 return map_offset;
2314 }
2315
2316 static daddr_t
2317 rf_parity_map_size(RF_Raid_t *raidPtr)
2318 {
2319 daddr_t map_size;
2320
2321 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2322 map_size = raidPtr->bytesPerSector;
2323 else
2324 map_size = RF_PARITY_MAP_SIZE;
2325
2326 return map_size;
2327 }
2328
2329 int
2330 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2331 {
2332 RF_ComponentLabel_t *clabel;
2333
2334 clabel = raidget_component_label(raidPtr, col);
2335 clabel->clean = RF_RAID_CLEAN;
2336 raidflush_component_label(raidPtr, col);
2337 return(0);
2338 }
2339
2340
2341 int
2342 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2343 {
2344 RF_ComponentLabel_t *clabel;
2345
2346 clabel = raidget_component_label(raidPtr, col);
2347 clabel->clean = RF_RAID_DIRTY;
2348 raidflush_component_label(raidPtr, col);
2349 return(0);
2350 }
2351
/*
 * Read the component label for column `col' from disk into the
 * in-core copy held in raid_cinfo[].ci_label.  Returns 0 or an
 * error from the underlying read.
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);

	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
2362
/*
 * Return a pointer to the in-core component label for column `col'.
 * The caller may modify it and then write it out with
 * raidflush_component_label().
 */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
2368
/*
 * Write the in-core component label for column `col' out to disk,
 * stamping it with the set's current mod_counter first (and keeping
 * the parity-map modcount in sync when parity maps are enabled).
 */
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
2383
2384 /*
2385 * Swap the label endianness.
2386 *
2387 * Everything in the component label is 4-byte-swapped except the version,
2388 * which is kept in the byte-swapped version at all times, and indicates
2389 * for the writer that a swap is necessary.
2390 *
2391 * For reads it is expected that out_label == clabel, but writes expect
2392 * separate labels so only the re-swapped label is written out to disk,
2393 * leaving the swapped-except-version internally.
2394 *
2395 * Only support swapping label version 2.
2396 */
2397 static void
2398 rf_swap_label(RF_ComponentLabel_t *clabel, RF_ComponentLabel_t *out_label)
2399 {
2400 int *in, *out, *in_last;
2401
2402 KASSERT(clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION));
2403
2404 /* Don't swap the label, but do copy it. */
2405 out_label->version = clabel->version;
2406
2407 in = &clabel->serial_number;
2408 in_last = &clabel->future_use2[42];
2409 out = &out_label->serial_number;
2410
2411 for (; in < in_last; in++, out++)
2412 *out = bswap32(*in);
2413 }
2414
/*
 * Read a component label from `dev'/`b_vp' into *clabel.  If the
 * on-disk label is in the opposite byte order (its version field
 * reads as the byte-swapped label version), swap it in place so the
 * in-core copy is always usable natively (except for the version
 * field -- see rf_swap_label).
 */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	int error;

	error = raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));

	if (error == 0 &&
	    clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
		rf_swap_label(clabel, clabel);
	}

	return error;
}
2433
/*
 * Read `dsize' bytes from byte offset `offset' of the raw component
 * device and copy the first `msize' bytes into `data'.  Used for both
 * component labels and parity maps.
 */
/* ARGSUSED */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
	bp->b_resid = dsize;

	/* Synchronous read straight from the block device. */
	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
2471
/*
 * Write *clabel out as the component label on `dev'/`b_vp'.  If the
 * in-core label is byte-swapped (version field holds the swapped
 * label version), re-swap into a local copy first so the on-disk
 * image keeps its original byte order while the in-core copy stays
 * untouched.
 */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	RF_ComponentLabel_t *clabel_write = clabel;
	RF_ComponentLabel_t lclabel;
	int error;

	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
		clabel_write = &lclabel;
		rf_swap_label(clabel, clabel_write);
	}
	/* Final 0: synchronous write. */
	error = raidwrite_component_area(dev, b_vp, clabel_write,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);

	return error;
}
2491
2492 /* ARGSUSED */
2493 static int
2494 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2495 size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2496 {
2497 struct buf *bp;
2498 int error;
2499
2500 /* get a block of the appropriate size... */
2501 bp = geteblk((int)dsize);
2502 bp->b_dev = dev;
2503
2504 /* get our ducks in a row for the write */
2505 bp->b_blkno = offset / DEV_BSIZE;
2506 bp->b_bcount = dsize;
2507 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2508 bp->b_resid = dsize;
2509
2510 memset(bp->b_data, 0, dsize);
2511 memcpy(bp->b_data, data, msize);
2512
2513 bdev_strategy(bp);
2514 if (asyncp)
2515 return 0;
2516 error = biowait(bp);
2517 brelse(bp, 0);
2518 if (error) {
2519 #if 1
2520 printf("Failed to write RAID component info!\n");
2521 #endif
2522 }
2523
2524 return(error);
2525 }
2526
/*
 * Write the parity map `map' to every live component of the set.
 * Writes are synchronous; per-component write errors are currently
 * ignored (see the XXXjld note below).
 */
void
rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	int c;

	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		/* XXXjld: what if an error occurs here? */
		raidwrite_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, map,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr), 0);
	}
}
2544
/*
 * Read the parity map from every live component and merge the copies
 * into *map.  The first copy read seeds *map; subsequent copies are
 * folded in with rf_paritymap_merge() so any region marked dirty on
 * any component stays dirty.  Read errors are not checked here.
 */
void
rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	struct rf_paritymap_ondisk tmp;
	int c,first;

	first=1;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		raidread_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr));
		if (first) {
			memcpy(map, &tmp, sizeof(*map));
			first = 0;
		} else {
			rf_paritymap_merge(map, &tmp);
		}
	}
}
2569
/*
 * Mark every live component (and every in-use spare) of the set as
 * dirty, bumping the set's mod_counter first so the flushed labels
 * carry the new value.  Called when the set goes "open"/active.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare is standing in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2629
2630
/*
 * Flush fresh component labels to every optimal component and every
 * in-use spare, bumping mod_counter first.  When `final' is
 * RF_FINAL_COMPONENT_UPDATE (e.g. at shutdown/unconfigure) and parity
 * is known good, the components are additionally marked clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find which column this spare replaced. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2708
2709 void
2710 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2711 {
2712
2713 if (vp != NULL) {
2714 if (auto_configured == 1) {
2715 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2716 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2717 vput(vp);
2718
2719 } else {
2720 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2721 }
2722 }
2723 }
2724
2725
2726 void
2727 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2728 {
2729 int r,c;
2730 struct vnode *vp;
2731 int acd;
2732
2733
2734 /* We take this opportunity to close the vnodes like we should.. */
2735
2736 for (c = 0; c < raidPtr->numCol; c++) {
2737 vp = raidPtr->raid_cinfo[c].ci_vp;
2738 acd = raidPtr->Disks[c].auto_configured;
2739 rf_close_component(raidPtr, vp, acd);
2740 raidPtr->raid_cinfo[c].ci_vp = NULL;
2741 raidPtr->Disks[c].auto_configured = 0;
2742 }
2743
2744 for (r = 0; r < raidPtr->numSpare; r++) {
2745 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2746 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2747 rf_close_component(raidPtr, vp, acd);
2748 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2749 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2750 }
2751 }
2752
2753
/*
 * Kernel thread body: fail the requested component and (optionally)
 * reconstruct onto a spare.  `req' is freed here; the thread exits
 * when done.  RF_FDFLAGS_RECON_FORCE temporarily sets forceRecon
 * around the operation.
 */
static void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 1;
	}

	/* Second arg: initiate reconstruction iff RF_FDFLAGS_RECON set. */
	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 0;
	}

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2783
/*
 * Kernel thread body: rewrite all parity on the set.  On success the
 * set is marked parity-clean; anyone blocked in shutdown waiting for
 * the rewrite is woken.  The thread exits when done.
 */
static void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit! If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop? If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2816
2817
/*
 * Kernel thread body: copy reconstructed data from the spare back to
 * the (replaced) original component, then exit.
 */
static void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2832
2833
/*
 * Kernel thread body: reconstruct a component "in place" (onto the
 * same column, e.g. after a disk replacement).  `req' is freed here;
 * the thread exits when done.  RF_FDFLAGS_RECON_FORCE temporarily
 * sets forceRecon around the operation.
 */
static void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;

	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 1;
	}

	rf_ReconstructInPlace(raidPtr, req->col);

	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 0;
	}

	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2861
/*
 * Try to read a component label from the open device `dev'/`vp'.
 * If the label looks reasonable and fits the partition, prepend a new
 * RF_AutoConfig_t (taking ownership of vp and the label) to ac_list.
 * Otherwise free the label and close/release the vnode.  Returns the
 * (possibly updated) head of the autoconfig list.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_WAITOK);
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
2903
/*
 * Scan all disk devices in the system for RAIDframe components and
 * build an RF_AutoConfig_t list of everything found.  Wedges are
 * scanned first so a whole-disk wedge wins over the raw partition;
 * on non-wedge disks the disklabel partitions marked FS_RAID are
 * checked, falling back to the raw partition when no labelled RAID
 * partition exists.
 */
static RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		    dv = deviter_next(&di)) {

			/* we are only interested in disks */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CDs. */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md. */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* we don't care about spiflash */
			if (device_is_a(dv, "spiflash")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			VOP_UNLOCK(vp);
			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* Wedge pass: only RAIDframe-typed wedges
				   are considered, and each wedge is a single
				   candidate component. */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Not quite a 'whatever'.  In
					 * this situation we know
					 * there is a FS_RAID
					 * partition, but we can't
					 * open it.  The most likely
					 * reason is that the
					 * partition is already in
					 * use by another RAID set.
					 * So note that we've already
					 * found a partition on this
					 * disk so we don't attempt
					 * to use the raw disk later. */
					rf_part_found = 1;
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
3131
/*
 * Sanity-check a component label read from disk.  Returns 1 if the
 * label has a known version, a valid clean flag, self-consistent
 * row/column geometry, and positive block size/count; 0 otherwise.
 * A label that passes is also scrubbed of known historical garbage
 * via rf_fix_old_label_size() when numsecs is supplied.
 */
int
rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
{

	if ((clabel->version==RF_COMPONENT_LABEL_VERSION_1 ||
	     clabel->version==RF_COMPONENT_LABEL_VERSION ||
	     clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) &&
	    (clabel->clean == RF_RAID_CLEAN ||
	     clabel->clean == RF_RAID_DIRTY) &&
	    clabel->row >=0 &&
	    clabel->column >= 0 &&
	    clabel->num_rows > 0 &&
	    clabel->num_columns > 0 &&
	    clabel->row < clabel->num_rows &&
	    clabel->column < clabel->num_columns &&
	    clabel->blockSize > 0 &&
	    /*
	     * numBlocksHi may contain garbage, but it is ok since
	     * the type is unsigned.  If it is really garbage,
	     * rf_fix_old_label_size() will fix it.
	     */
	    rf_component_label_numblocks(clabel) > 0) {
		/*
		 * label looks reasonable enough...
		 * let's make sure it has no old garbage.
		 */
		if (numsecs)
			rf_fix_old_label_size(clabel, numsecs);
		return(1);
	}
	return(0);
}
3164
3165
3166 /*
3167 * For reasons yet unknown, some old component labels have garbage in
3168 * the newer numBlocksHi region, and this causes lossage. Since those
3169 * disks will also have numsecs set to less than 32 bits of sectors,
3170 * we can determine when this corruption has occurred, and fix it.
3171 *
3172 * The exact same problem, with the same unknown reason, happens to
3173 * the partitionSizeHi member as well.
3174 */
3175 static void
3176 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3177 {
3178
3179 if (numsecs < ((uint64_t)1 << 32)) {
3180 if (clabel->numBlocksHi) {
3181 printf("WARNING: total sectors < 32 bits, yet "
3182 "numBlocksHi set\n"
3183 "WARNING: resetting numBlocksHi to zero.\n");
3184 clabel->numBlocksHi = 0;
3185 }
3186
3187 if (clabel->partitionSizeHi) {
3188 printf("WARNING: total sectors < 32 bits, yet "
3189 "partitionSizeHi set\n"
3190 "WARNING: resetting partitionSizeHi to zero.\n");
3191 clabel->partitionSizeHi = 0;
3192 }
3193 }
3194 }
3195
3196
3197 #ifdef DEBUG
/* Pretty-print the contents of a component label (DEBUG kernels only). */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* Decoded names for root_partition; index masked to 2 bits below. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
3229 #endif
3230
3231 static RF_ConfigSet_t *
3232 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3233 {
3234 RF_AutoConfig_t *ac;
3235 RF_ConfigSet_t *config_sets;
3236 RF_ConfigSet_t *cset;
3237 RF_AutoConfig_t *ac_next;
3238
3239
3240 config_sets = NULL;
3241
3242 /* Go through the AutoConfig list, and figure out which components
3243 belong to what sets. */
3244 ac = ac_list;
3245 while(ac!=NULL) {
3246 /* we're going to putz with ac->next, so save it here
3247 for use at the end of the loop */
3248 ac_next = ac->next;
3249
3250 if (config_sets == NULL) {
3251 /* will need at least this one... */
3252 config_sets = malloc(sizeof(RF_ConfigSet_t),
3253 M_RAIDFRAME, M_WAITOK);
3254 /* this one is easy :) */
3255 config_sets->ac = ac;
3256 config_sets->next = NULL;
3257 config_sets->rootable = 0;
3258 ac->next = NULL;
3259 } else {
3260 /* which set does this component fit into? */
3261 cset = config_sets;
3262 while(cset!=NULL) {
3263 if (rf_does_it_fit(cset, ac)) {
3264 /* looks like it matches... */
3265 ac->next = cset->ac;
3266 cset->ac = ac;
3267 break;
3268 }
3269 cset = cset->next;
3270 }
3271 if (cset==NULL) {
3272 /* didn't find a match above... new set..*/
3273 cset = malloc(sizeof(RF_ConfigSet_t),
3274 M_RAIDFRAME, M_WAITOK);
3275 cset->ac = ac;
3276 ac->next = NULL;
3277 cset->next = config_sets;
3278 cset->rootable = 0;
3279 config_sets = cset;
3280 }
3281 }
3282 ac = ac_next;
3283 }
3284
3285
3286 return(config_sets);
3287 }
3288
/*
 * Decide whether autoconfig component `ac' belongs to config set
 * `cset' by comparing its label against the set's first member.
 * Returns 1 on a match, 0 otherwise.
 */
static int
rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
{
	RF_ComponentLabel_t *clabel1, *clabel2;

	/* If this one matches the *first* one in the set, that's good
	   enough, since the other members of the set would have been
	   through here too... */
	/* note that we are not checking partitionSize here..

	   Note that we are also not checking the mod_counters here.
	   If everything else matches except the mod_counter, that's
	   good enough for this test.  We will deal with the mod_counters
	   a little later in the autoconfiguration process.

	    (clabel1->mod_counter == clabel2->mod_counter) &&

	   The reason we don't check for this is that failed disks
	   will have lower modification counts.  If those disks are
	   not added to the set they used to belong to, then they will
	   form their own set, which may result in 2 different sets,
	   for example, competing to be configured at raid0, and
	   perhaps competing to be the root filesystem set.  If the
	   wrong ones get configured, or both attempt to become /,
	   weird behaviour and or serious lossage will occur.  Thus we
	   need to bring them into the fold here, and kick them out at
	   a later point.

	*/

	clabel1 = cset->ac->clabel;
	clabel2 = ac->clabel;
	if ((clabel1->version == clabel2->version) &&
	    (clabel1->serial_number == clabel2->serial_number) &&
	    (clabel1->num_rows == clabel2->num_rows) &&
	    (clabel1->num_columns == clabel2->num_columns) &&
	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
	    (clabel1->parityConfig == clabel2->parityConfig) &&
	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
	    (clabel1->blockSize == clabel2->blockSize) &&
	    rf_component_label_numblocks(clabel1) ==
	    rf_component_label_numblocks(clabel2) &&
	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
	    (clabel1->root_partition == clabel2->root_partition) &&
	    (clabel1->last_unit == clabel2->last_unit) &&
	    (clabel1->config_order == clabel2->config_order)) {
		/* if it get's here, it almost *has* to be a match */
	} else {
		/* it's not consistent with somebody in the set..
		   punt */
		return(0);
	}
	/* all was fine.. it must fit... */
	return(1);
}
3346
/*
 * Decide whether the component set cset contains enough live,
 * up-to-date components to be configurable.  "Up-to-date" means
 * carrying the newest mod_counter seen in the set.  Returns 1 if the
 * set can be configured, 0 if too many members are missing or stale
 * for the set's parity type.
 */
static int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/*
	 * Determine what the mod_counter is supposed to be for this set:
	 * the highest value found among the members.  Components with a
	 * lower value are stale (e.g. disks that failed earlier) and are
	 * not counted as live below.
	 */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* Look for a current (matching mod_counter) component
		   that claims column c. */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
			/* Didn't find one here! */
			/* special case for RAID 1, especially
			   where there are more than 2
			   components (where RAIDframe treats
			   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						/* Both halves of a mirror
						   pair are gone: the set
						   is unusable. */
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
			/* Just finished the odd half of a mirror
			   pair without bailing out above: reset
			   even_pair_failed before the next pair. */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* How many missing components each parity type can tolerate:
	   RAID 0 none, RAID 4/5 at most one.  (RAID 1 was handled
	   pairwise above.) */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3449
3450 static void
3451 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3452 RF_Raid_t *raidPtr)
3453 {
3454 RF_ComponentLabel_t *clabel;
3455 int i;
3456
3457 clabel = ac->clabel;
3458
3459 /* 1. Fill in the common stuff */
3460 config->numCol = clabel->num_columns;
3461 config->numSpare = 0; /* XXX should this be set here? */
3462 config->sectPerSU = clabel->sectPerSU;
3463 config->SUsPerPU = clabel->SUsPerPU;
3464 config->SUsPerRU = clabel->SUsPerRU;
3465 config->parityConfig = clabel->parityConfig;
3466 /* XXX... */
3467 strcpy(config->diskQueueType,"fifo");
3468 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3469 config->layoutSpecificSize = 0; /* XXX ?? */
3470
3471 while(ac!=NULL) {
3472 /* row/col values will be in range due to the checks
3473 in reasonable_label() */
3474 strcpy(config->devnames[0][ac->clabel->column],
3475 ac->devname);
3476 ac = ac->next;
3477 }
3478
3479 for(i=0;i<RF_MAXDBGV;i++) {
3480 config->debugVars[i][0] = 0;
3481 }
3482 }
3483
3484 static int
3485 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3486 {
3487 RF_ComponentLabel_t *clabel;
3488 int column;
3489 int sparecol;
3490
3491 raidPtr->autoconfigure = new_value;
3492
3493 for(column=0; column<raidPtr->numCol; column++) {
3494 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3495 clabel = raidget_component_label(raidPtr, column);
3496 clabel->autoconfigure = new_value;
3497 raidflush_component_label(raidPtr, column);
3498 }
3499 }
3500 for(column = 0; column < raidPtr->numSpare ; column++) {
3501 sparecol = raidPtr->numCol + column;
3502 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3503 clabel = raidget_component_label(raidPtr, sparecol);
3504 clabel->autoconfigure = new_value;
3505 raidflush_component_label(raidPtr, sparecol);
3506 }
3507 }
3508 return(new_value);
3509 }
3510
3511 static int
3512 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3513 {
3514 RF_ComponentLabel_t *clabel;
3515 int column;
3516 int sparecol;
3517
3518 raidPtr->root_partition = new_value;
3519 for(column=0; column<raidPtr->numCol; column++) {
3520 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3521 clabel = raidget_component_label(raidPtr, column);
3522 clabel->root_partition = new_value;
3523 raidflush_component_label(raidPtr, column);
3524 }
3525 }
3526 for(column = 0; column < raidPtr->numSpare ; column++) {
3527 sparecol = raidPtr->numCol + column;
3528 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3529 clabel = raidget_component_label(raidPtr, sparecol);
3530 clabel->root_partition = new_value;
3531 raidflush_component_label(raidPtr, sparecol);
3532 }
3533 }
3534 return(new_value);
3535 }
3536
3537 static void
3538 rf_release_all_vps(RF_ConfigSet_t *cset)
3539 {
3540 RF_AutoConfig_t *ac;
3541
3542 ac = cset->ac;
3543 while(ac!=NULL) {
3544 /* Close the vp, and give it back */
3545 if (ac->vp) {
3546 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3547 VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3548 vput(ac->vp);
3549 ac->vp = NULL;
3550 }
3551 ac = ac->next;
3552 }
3553 }
3554
3555
3556 static void
3557 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3558 {
3559 RF_AutoConfig_t *ac;
3560 RF_AutoConfig_t *next_ac;
3561
3562 ac = cset->ac;
3563 while(ac!=NULL) {
3564 next_ac = ac->next;
3565 /* nuke the label */
3566 free(ac->clabel, M_RAIDFRAME);
3567 /* cleanup the config structure */
3568 free(ac, M_RAIDFRAME);
3569 /* "next.." */
3570 ac = next_ac;
3571 }
3572 /* and, finally, nuke the config set */
3573 free(cset, M_RAIDFRAME);
3574 }
3575
3576
/*
 * Initialize *clabel from the current in-core state of raidPtr so it
 * can be written out to a component: identity (serial number,
 * mod_counter), geometry (columns, stripe layout, block counts) and
 * policy (autoconfigure, root_partition, config_order).
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* avoid over-writing byteswapped version. */
	if (clabel->version != bswap32(RF_COMPONENT_LABEL_VERSION))
		clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* multiple rows are no longer used; always one */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
3610
/*
 * Configure the RAID set described by cset.  The unit number recorded
 * in the component labels (last_unit) is tried first, falling back to
 * the next free unit.  On success the attached softc is returned and
 * the set's root eligibility is noted in cset->rootable; on failure
 * NULL is returned and the claimed unit is released.
 */
static struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	raidID = cset->ac->clabel->last_unit;
	/* Walk forward from last_unit until a unit that is not already
	   configured (valid == 0) or not yet allocated is found. */
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);
	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* Configuration failed: release the unit claimed above. */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
3682
3683 void
3684 rf_pool_init(RF_Raid_t *raidPtr, char *w_chan, struct pool *p, size_t size, const char *pool_name,
3685 size_t xmin, size_t xmax)
3686 {
3687
3688 /* Format: raid%d_foo */
3689 snprintf(w_chan, RF_MAX_POOLNAMELEN, "raid%d_%s", raidPtr->raidid, pool_name);
3690
3691 pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
3692 pool_sethiwat(p, xmax);
3693 pool_prime(p, xmin);
3694 }
3695
3696
3697 /*
3698 * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
3699 * to see if there is IO pending and if that IO could possibly be done
3700 * for a given RAID set. Returns 0 if IO is waiting and can be done, 1
3701 * otherwise.
3702 *
3703 */
3704 int
3705 rf_buf_queue_check(RF_Raid_t *raidPtr)
3706 {
3707 struct raid_softc *rs;
3708 struct dk_softc *dksc;
3709
3710 rs = raidPtr->softc;
3711 dksc = &rs->sc_dksc;
3712
3713 if ((rs->sc_flags & RAIDF_INITED) == 0)
3714 return 1;
3715
3716 if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3717 /* there is work to do */
3718 return 0;
3719 }
3720 /* default is nothing to do */
3721 return 1;
3722 }
3723
3724 int
3725 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3726 {
3727 uint64_t numsecs;
3728 unsigned secsize;
3729 int error;
3730
3731 error = getdisksize(vp, &numsecs, &secsize);
3732 if (error == 0) {
3733 diskPtr->blockSize = secsize;
3734 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3735 diskPtr->partitionSize = numsecs;
3736 return 0;
3737 }
3738 return error;
3739 }
3740
/*
 * Autoconfiguration match function: raid pseudo-devices always match.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3746
/*
 * Autoconfiguration attach function: intentionally empty; no
 * per-device work is done at attach time.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
3751
3752
3753 static int
3754 raid_detach(device_t self, int flags)
3755 {
3756 int error;
3757 struct raid_softc *rs = raidsoftc(self);
3758
3759 if (rs == NULL)
3760 return ENXIO;
3761
3762 if ((error = raidlock(rs)) != 0)
3763 return error;
3764
3765 error = raid_detach_unlocked(rs);
3766
3767 raidunlock(rs);
3768
3769 /* XXX raid can be referenced here */
3770
3771 if (error)
3772 return error;
3773
3774 /* Free the softc */
3775 raidput(rs);
3776
3777 return 0;
3778 }
3779
/*
 * Publish the disk geometry of the RAID set to the disk(9) layer.
 * The values are derived from the RAID layout rather than any physical
 * disk; fields not set below are left zeroed for disk_set_info() to
 * fill in.
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
3795
3796 /*
3797 * Get cache info for all the components (including spares).
3798 * Returns intersection of all the cache flags of all disks, or first
3799 * error if any encountered.
3800 * XXXfua feature flags can change as spares are added - lock down somehow
3801 */
3802 static int
3803 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
3804 {
3805 int c;
3806 int error;
3807 int dkwhole = 0, dkpart;
3808
3809 for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
3810 /*
3811 * Check any non-dead disk, even when currently being
3812 * reconstructed.
3813 */
3814 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
3815 || raidPtr->Disks[c].status == rf_ds_reconstructing) {
3816 error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
3817 DIOCGCACHE, &dkpart, FREAD, NOCRED);
3818 if (error) {
3819 if (error != ENODEV) {
3820 printf("raid%d: get cache for component %s failed\n",
3821 raidPtr->raidid,
3822 raidPtr->Disks[c].devname);
3823 }
3824
3825 return error;
3826 }
3827
3828 if (c == 0)
3829 dkwhole = dkpart;
3830 else
3831 dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
3832 }
3833 }
3834
3835 *data = dkwhole;
3836
3837 return 0;
3838 }
3839
3840 /*
3841 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3842 * We end up returning whatever error was returned by the first cache flush
3843 * that fails.
3844 */
3845
3846 static int
3847 rf_sync_component_cache(RF_Raid_t *raidPtr, int c, int force)
3848 {
3849 int e = 0;
3850 for (int i = 0; i < 5; i++) {
3851 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3852 &force, FWRITE, NOCRED);
3853 if (!e || e == ENODEV)
3854 return e;
3855 printf("raid%d: cache flush[%d] to component %s failed (%d)\n",
3856 raidPtr->raidid, i, raidPtr->Disks[c].devname, e);
3857 }
3858 return e;
3859 }
3860
3861 int
3862 rf_sync_component_caches(RF_Raid_t *raidPtr, int force)
3863 {
3864 int c, error;
3865
3866 error = 0;
3867 for (c = 0; c < raidPtr->numCol; c++) {
3868 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3869 int e = rf_sync_component_cache(raidPtr, c, force);
3870 if (e && !error)
3871 error = e;
3872 }
3873 }
3874
3875 for (c = 0; c < raidPtr->numSpare ; c++) {
3876 int sparecol = raidPtr->numCol + c;
3877 /* Need to ensure that the reconstruct actually completed! */
3878 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3879 int e = rf_sync_component_cache(raidPtr, sparecol,
3880 force);
3881 if (e && !error)
3882 error = e;
3883 }
3884 }
3885 return error;
3886 }
3887
3888 /* Fill in info with the current status */
3889 void
3890 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3891 {
3892
3893 memset(info, 0, sizeof(*info));
3894
3895 if (raidPtr->status != rf_rs_reconstructing) {
3896 info->total = 100;
3897 info->completed = 100;
3898 } else {
3899 info->total = raidPtr->reconControl->numRUsTotal;
3900 info->completed = raidPtr->reconControl->numRUsComplete;
3901 }
3902 info->remaining = info->total - info->completed;
3903 }
3904
3905 /* Fill in info with the current status */
3906 void
3907 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3908 {
3909
3910 memset(info, 0, sizeof(*info));
3911
3912 if (raidPtr->parity_rewrite_in_progress == 1) {
3913 info->total = raidPtr->Layout.numStripe;
3914 info->completed = raidPtr->parity_rewrite_stripes_done;
3915 } else {
3916 info->completed = 100;
3917 info->total = 100;
3918 }
3919 info->remaining = info->total - info->completed;
3920 }
3921
3922 /* Fill in info with the current status */
3923 void
3924 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3925 {
3926
3927 memset(info, 0, sizeof(*info));
3928
3929 if (raidPtr->copyback_in_progress == 1) {
3930 info->total = raidPtr->Layout.numStripe;
3931 info->completed = raidPtr->copyback_stripes_done;
3932 info->remaining = info->total - info->completed;
3933 } else {
3934 info->remaining = 0;
3935 info->completed = 100;
3936 info->total = 100;
3937 }
3938 }
3939
3940 /* Fill in config with the current info */
3941 int
3942 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
3943 {
3944 int d, i, j;
3945
3946 if (!raidPtr->valid)
3947 return ENODEV;
3948 config->cols = raidPtr->numCol;
3949 config->ndevs = raidPtr->numCol;
3950 if (config->ndevs >= RF_MAX_DISKS)
3951 return ENOMEM;
3952 config->nspares = raidPtr->numSpare;
3953 if (config->nspares >= RF_MAX_DISKS)
3954 return ENOMEM;
3955 config->maxqdepth = raidPtr->maxQueueDepth;
3956 d = 0;
3957 for (j = 0; j < config->cols; j++) {
3958 config->devs[d] = raidPtr->Disks[j];
3959 d++;
3960 }
3961 for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
3962 config->spares[i] = raidPtr->Disks[j];
3963 if (config->spares[i].status == rf_ds_rebuilding_spare) {
3964 /* XXX: raidctl(8) expects to see this as a used spare */
3965 config->spares[i].status = rf_ds_used_spare;
3966 }
3967 }
3968 return 0;
3969 }
3970
3971 int
3972 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
3973 {
3974 RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
3975 RF_ComponentLabel_t *raid_clabel;
3976 int column = clabel->column;
3977
3978 if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
3979 return EINVAL;
3980 raid_clabel = raidget_component_label(raidPtr, column);
3981 memcpy(clabel, raid_clabel, sizeof *clabel);
3982 /* Fix-up for userland. */
3983 if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION))
3984 clabel->version = RF_COMPONENT_LABEL_VERSION;
3985
3986 return 0;
3987 }
3988
3989 /*
3990 * Module interface
3991 */
3992
3993 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
3994
3995 #ifdef _MODULE
3996 CFDRIVER_DECL(raid, DV_DISK, NULL);
3997 #endif
3998
3999 static int raid_modcmd(modcmd_t, void *);
4000 static int raid_modcmd_init(void);
4001 static int raid_modcmd_fini(void);
4002
4003 static int
4004 raid_modcmd(modcmd_t cmd, void *data)
4005 {
4006 int error;
4007
4008 error = 0;
4009 switch (cmd) {
4010 case MODULE_CMD_INIT:
4011 error = raid_modcmd_init();
4012 break;
4013 case MODULE_CMD_FINI:
4014 error = raid_modcmd_fini();
4015 break;
4016 default:
4017 error = ENOTTY;
4018 break;
4019 }
4020 return error;
4021 }
4022
/*
 * Module initialization: attach the device switch and autoconf glue,
 * boot the RAIDframe engine, and register a finalizer that will
 * auto-configure RAID sets once all real hardware has been found.
 * Each failure path undoes the steps already completed.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 lets the kernel pick the majors; EEXIST (already attached)
	   is tolerated. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* undo devsw_attach */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* undo cfdriver and devsw attachments */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* NOTE(review): error is always 0 at this point -- every
	   non-zero value returned above -- so this check is vestigial
	   but harmless. */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		/* Not fatal: the module still loads, RAID sets just
		   won't be auto-configured. */
		error = 0;
	}

	return error;
}
4093
/*
 * Module teardown: refuse to unload while any raid device exists;
 * otherwise detach the autoconf glue and device switch and shut the
 * RAIDframe engine down.  A failed cfdriver detach re-attaches the
 * cfattach so the module is left in a consistent state.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* roll back the cfattach detach done above */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	devsw_detach(&raid_bdevsw, &raid_cdevsw);
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
4134