/* $NetBSD: rf_netbsdkintf.c,v 1.408 2022/08/10 01:16:38 mrg Exp $ */

/*-
 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Greg Oster; Jason R. Thorpe.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California. All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: cd.c 1.6 90/11/28$
 *
 *	@(#)cd.c	8.2 (Berkeley) 11/16/93
 */

/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Mark Holland, Jim Zelenka
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/***********************************************************
 *
 * rf_kintf.c -- the kernel interface routines for RAIDframe
 *
 ***********************************************************/

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.408 2022/08/10 01:16:38 mrg Exp $");

#ifdef _KERNEL_OPT
#include "opt_raid_autoconfig.h"
#include "opt_compat_netbsd32.h"
#endif

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/disk.h>
#include <sys/device.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/disklabel.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/reboot.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <sys/compat_stub.h>

#include <prop/proplib.h>

#include <dev/raidframe/raidframevar.h>
#include <dev/raidframe/raidframeio.h>
#include <dev/raidframe/rf_paritymap.h>

#include "rf_raid.h"
#include "rf_copyback.h"
#include "rf_dag.h"
#include "rf_dagflags.h"
#include "rf_desc.h"
#include "rf_diskqueue.h"
#include "rf_etimer.h"
#include "rf_general.h"
#include "rf_kintf.h"
#include "rf_options.h"
#include "rf_driver.h"
#include "rf_parityscan.h"
#include "rf_threadstuff.h"

#include "ioconf.h"

#ifdef DEBUG
int rf_kdebug_level = 0;
#define db1_printf(a) if (rf_kdebug_level > 0) printf a
#else /* DEBUG */
#define db1_printf(a) { }
#endif /* DEBUG */

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
static rf_declare_mutex2(rf_sparet_wait_mutex);
static rf_declare_cond2(rf_sparet_wait_cv);
static rf_declare_cond2(rf_sparet_resp_cv);

static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
						 * spare table */
static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
						 * installation process */
#endif

const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);
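
/*
 * Buf flags that are propagated unchanged from the original request to
 * the component I/O buffers (see InitBP() below); the driver constructs
 * the rest of each component buf itself.
 */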

MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");

/* prototypes */
static void KernelWakeupFunc(struct buf *);
static void InitBP(struct buf *, struct vnode *, unsigned,
    dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    void *, int);
static void raidinit(struct raid_softc *);
static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);

static int raid_match(device_t, cfdata_t, void *);
static void raid_attach(device_t, device_t, void *);
static int raid_detach(device_t, int);

static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t);
static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t, int);

static int raidwrite_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);
static int raidread_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);

static int raid_diskstart(device_t, struct buf *bp);
static int raid_dumpblocks(device_t, void *, daddr_t, int);
static int raid_lastclose(device_t);

static dev_type_open(raidopen);
static dev_type_close(raidclose);
static dev_type_read(raidread);
static dev_type_write(raidwrite);
static dev_type_ioctl(raidioctl);
static dev_type_strategy(raidstrategy);
static dev_type_dump(raiddump);
static dev_type_size(raidsize);

const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};

#define raidunit(x)	DISKUNIT(x)
#define raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
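/*
 * raidunit() extracts the unit number from a dev_t; raidsoftc() maps an
 * autoconf device_t back to its raid_softc via the backpointer that
 * raidinit() stores when the pseudo-device is attached.
 */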

extern struct cfdriver raid_cd;
CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    DVF_DETACH_SHUTDOWN);

/* Internal representation of a rf_recon_req */
struct rf_recon_req_internal {
	RF_RowCol_t col;
	RF_ReconReqFlags_t flags;
	void *raidPtr;
};
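
/*
 * Filled in by rf_fail_disk() and rf_rebuild_in_place() as a
 * kernel-private copy of the user's request, so the reconstruction
 * threads never have to touch the user's buffer.
 */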

/*
 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
 * Be aware that large numbers can allow the driver to consume a lot of
 * kernel memory, especially on writes, and in degraded mode reads.
 *
 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
 * a single 64K write will typically require 64K for the old data,
 * 64K for the old parity, and 64K for the new parity, for a total
 * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
 *
 * Now in degraded mode, for example, a 64K read on the above setup may
 * require data reconstruction, which will require *all* of the 4 remaining
 * disks to participate -- 4 * 32K/disk == 128K again.
 */

#ifndef RAIDOUTSTANDING
#define RAIDOUTSTANDING   6
#endif
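
/*
 * The default can be overridden at kernel build time, e.g. with a line
 * like the following in the kernel config file (the value here is
 * purely illustrative):
 *
 *	options 	RAIDOUTSTANDING=10
 */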

#define RAIDLABELDEV(dev)	\
	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))

/* declared here, and made public, for the benefit of KVM stuff.. */

static int raidlock(struct raid_softc *);
static void raidunlock(struct raid_softc *);

static int raid_detach_unlocked(struct raid_softc *);

static void rf_markalldirty(RF_Raid_t *);
static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);

static void rf_ReconThread(struct rf_recon_req_internal *);
static void rf_RewriteParityThread(RF_Raid_t *raidPtr);
static void rf_CopybackThread(RF_Raid_t *raidPtr);
static void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
static int rf_autoconfig(device_t);
static int rf_rescan(void);
static void rf_buildroothack(RF_ConfigSet_t *);

static RF_AutoConfig_t *rf_find_raid_components(void);
static RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
static int rf_does_it_fit(RF_ConfigSet_t *, RF_AutoConfig_t *);
static void rf_create_configuration(RF_AutoConfig_t *, RF_Config_t *, RF_Raid_t *);
static int rf_set_autoconfig(RF_Raid_t *, int);
static int rf_set_rootpartition(RF_Raid_t *, int);
static void rf_release_all_vps(RF_ConfigSet_t *);
static void rf_cleanup_config_set(RF_ConfigSet_t *);
static int rf_have_enough_components(RF_ConfigSet_t *);
static struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);

/*
 * Debugging, mostly. Set to 0 to not allow autoconfig to take place.
 * Note that this is overridden by having RAID_AUTOCONFIG as an option
 * in the kernel config file.
 */
#ifdef RAID_AUTOCONFIG
int raidautoconfig = 1;
#else
int raidautoconfig = 0;
#endif
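
/*
 * Example (kernel config file):
 *
 *	options 	RAID_AUTOCONFIG
 */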
static bool raidautoconfigdone = false;

struct pool rf_alloclist_pool;	/* AllocList */

static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
static kmutex_t raid_lock;

static struct raid_softc *
raidcreate(int unit) {
	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
	sc->sc_unit = unit;
	cv_init(&sc->sc_cv, "raidunit");
	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
	return sc;
}

static void
raiddestroy(struct raid_softc *sc) {
	cv_destroy(&sc->sc_cv);
	mutex_destroy(&sc->sc_mutex);
	kmem_free(sc, sizeof(*sc));
}

static struct raid_softc *
raidget(int unit, bool create) {
	struct raid_softc *sc;
	if (unit < 0) {
#ifdef DIAGNOSTIC
		panic("%s: unit %d!", __func__, unit);
#endif
		return NULL;
	}
	mutex_enter(&raid_lock);
	LIST_FOREACH(sc, &raids, sc_link) {
		if (sc->sc_unit == unit) {
			mutex_exit(&raid_lock);
			return sc;
		}
	}
	mutex_exit(&raid_lock);
	if (!create)
		return NULL;
	sc = raidcreate(unit);
	mutex_enter(&raid_lock);
	LIST_INSERT_HEAD(&raids, sc, sc_link);
	mutex_exit(&raid_lock);
	return sc;
}

static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
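
/*
 * Softc lifecycle: raidget(unit, true) either finds an existing unit on
 * the global list or creates a fresh one; raidput() unlinks and destroys
 * it again. List manipulation is serialized by raid_lock.
 */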

void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}

static int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return 0;

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}

int
rf_inited(const struct raid_softc *rs) {
	return (rs->sc_flags & RAIDF_INITED) != 0;
}

RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}

int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}

static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname;
	size_t len;

	/* if bdv is NULL, the set can't contain it. exit early. */
	if (bdv == NULL)
		return 0;

	bootname = device_xname(bdv);
	len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}

static int
rf_rescan(void)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets, *cset, *next_cset;
	struct raid_softc *sc;
	int raid_added;

	ac_list = rf_find_raid_components();
	config_sets = rf_create_auto_sets(ac_list);

	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
					    sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	return 0;
}


static void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	int raid_added;
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;

	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
					    sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
					if (cset->rootable) {
						rsc = sc;
						num_root++;
					}
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		aprint_debug("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		dksc = &rsc->sc_dksc;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			aprint_debug("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			aprint_debug("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		aprint_debug("%s: candidate root=%p booted_device=%p "
		    "root_partition=%d contains_boot=%d\n",
		    __func__, candidate_root, booted_device,
		    rsc->sc_r.root_partition,
		    rf_containsboot(&rsc->sc_r, booted_device));
		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
			aprint_debug("%s: set booted_device=%s(%p)\n", __func__,
			    device_xname(booted_device), booted_device);
		}
	} else if (num_root > 1) {
		aprint_debug("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}

static int
raidsize(dev_t dev)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;
	unsigned int unit;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return -1;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return -1;

	return dk_size(dksc, dev);
}

static int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENODEV;

	/*
	   Note that blkno is relative to this particular partition.
	   By adding RF_PROTECTED_SECTORS, we get a value that
	   is relative to the partition used for the underlying component.
	 */
	blkno += RF_PROTECTED_SECTORS;

	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
}

static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int c, sparecol, j, scol, dumpto;
	int error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to. The preference for the
	   component to dump to is as follows:
	   1) the first component
	   2) a used_spare of the first component
	   3) the second component
	   4) a used_spare of the second component
	 */

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live component.
	   If we didn't find a live component, we now check to see
	   if there is a relevant spared component.
	 */

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			for (j = 0; j < raidPtr->numCol; j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared first
				   component! We'll take that over
				   anything else found so far. (We
				   couldn't have found a real first
				   component before, since this is a
				   used spare, and it's saying that
				   it's replacing the first
				   component.) On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the first
				   component (component0) of this set.
				 */
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared second component.
				   We'll dump to that if we haven't found
				   anything else so far.
				 */
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
	    blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}

/* ARGSUSED */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int error = 0;
	int part, pmask;

	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return error;

	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		   have done a configure before this. I DO NOT WANT TO BE
		   SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		   THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return error;


}

static int
raid_lastclose(device_t self)
{
	struct raid_softc *rs = raidsoftc(self);

	/* Last one... device is not unconfigured yet.
	   Device shutdown has taken care of setting the
	   clean bits if RAIDF_INITED is not set
	   mark things as clean... */

	rf_update_component_labels(&rs->sc_r,
	    RF_FINAL_COMPONENT_UPDATE);

	/* pass to unlocked code */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		rs->sc_flags |= RAIDF_DETACH;

	return 0;
}

/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return error;

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return error;

}

static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
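
/*
 * raid_wakeup() pokes the per-set RAIDframe iodone thread; it is called
 * whenever new work has been queued or an opening has been freed up.
 */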

static void
raidstrategy(struct buf *bp)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Raid_t *raidPtr;

	unit = raidunit(bp->b_dev);
	if ((rs = raidget(unit, false)) == NULL) {
		bp->b_error = ENXIO;
		goto fail;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto fail;
	}
	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	/* Queue IO only */
	if (dk_strategy_defer(dksc, bp))
		goto done;

	/* schedule the IO to happen at the next convenient time */
	raid_wakeup(raidPtr);

done:
	return;

fail:
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}

static int
raid_diskstart(device_t dev, struct buf *bp)
{
	struct raid_softc *rs = raidsoftc(dev);
	RF_Raid_t *raidPtr;

	raidPtr = &rs->sc_r;
	if (!raidPtr->valid) {
		db1_printf(("raid is not valid..\n"));
		return ENODEV;
	}

	/* XXX */
	bp->b_resid = 0;

	return raiddoaccess(raidPtr, bp);
}

void
raiddone(RF_Raid_t *raidPtr, struct buf *bp)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;

	dk_done(dksc, bp);

	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings++;
	rf_unlock_mutex2(raidPtr->mutex);

	/* schedule more IO */
	raid_wakeup(raidPtr);
}

/* ARGSUSED */
static int
raidread(dev_t dev, struct uio *uio, int flags)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	return physio(raidstrategy, NULL, dev, B_READ, minphys, uio);

}

/* ARGSUSED */
static int
raidwrite(dev_t dev, struct uio *uio, int flags)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	return physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio);

}

static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}

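/*
 * Return true when `cmd' requires a configured (RAIDF_INITED) set and
 * this unit does not have one; raidioctl() then fails with ENXIO.
 */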
static bool
rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
{
	switch (cmd) {
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_SET_ROOT:
		return (rs->sc_flags & RAIDF_INITED) == 0;
	}
	return false;
}

int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed. Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}

static int
rf_copyinspecificbuf(RF_Config_t *k_cfg)
{
	/* allocate a buffer for the layout-specific data, and copy it in */
	if (k_cfg->layoutSpecificSize == 0)
		return 0;

	if (k_cfg->layoutSpecificSize > 10000) {
		/* sanity check */
		return EINVAL;
	}

	u_char *specific_buf;
	specific_buf = RF_Malloc(k_cfg->layoutSpecificSize);
	if (specific_buf == NULL)
		return ENOMEM;

	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
	    k_cfg->layoutSpecificSize);
	if (retcode) {
		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
		return retcode;
	}

	k_cfg->layoutSpecific = specific_buf;
	return 0;
}
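
/*
 * On success k_cfg->layoutSpecific has been replaced with a kernel copy
 * of the user's layout-specific data; rf_construct() later frees it
 * using k_cfg->layoutSpecificSize.
 */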

static int
rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
{
	RF_Config_t *u_cfg = *((RF_Config_t **) data);

	if (rs->sc_r.valid) {
		/* There is a valid RAID set running on this unit! */
		printf("raid%d: Device already configured!\n", rs->sc_unit);
		return EINVAL;
	}

	/* copy-in the configuration information */
	/* data points to a pointer to the configuration structure */
	*k_cfg = RF_Malloc(sizeof(**k_cfg));
	if (*k_cfg == NULL) {
		return ENOMEM;
	}
	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
	if (retcode == 0)
		return 0;
	RF_Free(*k_cfg, sizeof(RF_Config_t));
	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
	rs->sc_flags |= RAIDF_SHUTDOWN;
	return retcode;
}

int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode, i;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* Force nul-termination on all strings. */
#define ZERO_FINAL(s) do { s[sizeof(s) - 1] = '\0'; } while (0)
	for (i = 0; i < RF_MAXCOL; i++) {
		ZERO_FINAL(k_cfg->devnames[0][i]);
	}
	for (i = 0; i < RF_MAXSPARE; i++) {
		ZERO_FINAL(k_cfg->spare_names[i]);
	}
	for (i = 0; i < RF_MAXDBGV; i++) {
		ZERO_FINAL(k_cfg->debugVars[i]);
	}
#undef ZERO_FINAL

	/* Check some basic limits. */
	if (k_cfg->numCol >= RF_MAXCOL || k_cfg->numCol < 0) {
		retcode = EINVAL;
		goto out;
	}
	if (k_cfg->numSpare >= RF_MAXSPARE || k_cfg->numSpare < 0) {
		retcode = EINVAL;
		goto out;
	}

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 * there is no stale data left in the case of a
	 * reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers. No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}

#if RF_DISABLED
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif /* DEBUG */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
#endif

static int
rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/*
	   we only want the serial number from
	   the above. We get all the rest of the information
	   from the config that was used to create this RAID
	   set.
	 */

	raidPtr->serial_number = clabel->serial_number;

	for (int column = 0; column < raidPtr->numCol; column++) {
		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
		if (RF_DEAD_DISK(diskPtr->status))
			continue;
		RF_ComponentLabel_t *ci_label = raidget_component_label(
		    raidPtr, column);
		/* Zeroing this is important. */
		memset(ci_label, 0, sizeof(*ci_label));
		raid_init_component_label(raidPtr, ci_label);
		ci_label->serial_number = raidPtr->serial_number;
		ci_label->row = 0; /* we don't pretend to support more */
		rf_component_label_set_partitionsize(ci_label,
		    diskPtr->partitionSize);
		ci_label->column = column;
		raidflush_component_label(raidPtr, column);
		/* XXXjld what about the spares? */
	}

	return 0;
}

static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		    raidPtr->raidid);
		printf("raid%d: Col: %d Too many failures.\n",
		    raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		    raidPtr->raidid);
		printf("raid%d: Col: %d "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}

static int
rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
{
	/*
	 * This makes no sense on a RAID 0, or if we are not reconstructing
	 * so tell the user it's done.
	 */
	if (raidPtr->Layout.map->faultsTolerated == 0 ||
	    raidPtr->status != rf_rs_reconstructing) {
		*data = 100;
		return 0;
	}
	if (raidPtr->reconControl->numRUsTotal == 0) {
		*data = 0;
		return 0;
	}
	*data = (raidPtr->reconControl->numRUsComplete * 100
	    / raidPtr->reconControl->numRUsTotal);
	return 0;
}

/*
 * Copy a RF_SingleComponent_t from 'data', ensuring nul-termination
 * on the component_name[] array.
 */
static void
rf_copy_single_component(RF_SingleComponent_t *component, void *data)
{

	memcpy(component, data, sizeof *component);
	component->component_name[sizeof(component->component_name) - 1] = '\0';
}

static int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int unit = raidunit(dev);
	int part, pmask;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Config_t *k_cfg;
	RF_Raid_t *raidPtr;
	RF_AccTotals_t *totals;
	RF_SingleComponent_t component;
	RF_DeviceConfig_t *d_cfg, *ucfgp;
	int retcode = 0;
	int column;
	RF_ComponentLabel_t *clabel;
	int d;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
	    (int) DISKPART(dev), (int) unit, cmd));

	/* Must be initialized for these... */
	if (rf_must_be_initialized(rs, cmd))
		return ENXIO;

	switch (cmd) {
		/* configure the system */
	case RAIDFRAME_CONFIGURE:
		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
			return retcode;
		return rf_construct(rs, k_cfg);

		/* shutdown the system */
	case RAIDFRAME_SHUTDOWN:

		part = DISKPART(dev);
		pmask = (1 << part);

		if ((retcode = raidlock(rs)) != 0)
			return retcode;

		if (DK_BUSY(dksc, pmask) ||
		    raidPtr->recon_in_progress != 0 ||
		    raidPtr->parity_rewrite_in_progress != 0 ||
		    raidPtr->copyback_in_progress != 0)
			retcode = EBUSY;
		else {
			/* detach and free on close */
			rs->sc_flags |= RAIDF_SHUTDOWN;
			retcode = 0;
		}

		raidunlock(rs);

		return retcode;
	case RAIDFRAME_GET_COMPONENT_LABEL:
		return rf_get_component_label(raidPtr, data);

#if RF_DISABLED
	case RAIDFRAME_SET_COMPONENT_LABEL:
		return rf_set_component_label(raidPtr, data);
#endif

	case RAIDFRAME_INIT_LABELS:
		return rf_init_component_label(raidPtr, data);

	case RAIDFRAME_SET_AUTOCONFIG:
		d = rf_set_autoconfig(raidPtr, *(int *) data);
		printf("raid%d: New autoconfig value is: %d\n",
		    raidPtr->raidid, d);
		*(int *) data = d;
		return retcode;

	case RAIDFRAME_SET_ROOT:
		d = rf_set_rootpartition(raidPtr, *(int *) data);
		printf("raid%d: New rootpartition value is: %d\n",
		    raidPtr->raidid, d);
		*(int *) data = d;
		return retcode;

		/* initialize all parity */
	case RAIDFRAME_REWRITEPARITY:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Parity for RAID 0 is trivially correct */
			raidPtr->parity_good = RF_RAID_CLEAN;
			return 0;
		}

		if (raidPtr->parity_rewrite_in_progress == 1) {
			/* Re-write is already in progress! */
			return EINVAL;
		}

		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
		    rf_RewriteParityThread, raidPtr, "raid_parity");

	case RAIDFRAME_ADD_HOT_SPARE:
		rf_copy_single_component(&component, data);
		return rf_add_hot_spare(raidPtr, &component);

	case RAIDFRAME_REMOVE_HOT_SPARE:
		return retcode;

	case RAIDFRAME_DELETE_COMPONENT:
		rf_copy_single_component(&component, data);
		return rf_delete_component(raidPtr, &component);

	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		rf_copy_single_component(&component, data);
		return rf_incorporate_hot_spare(raidPtr, &component);

	case RAIDFRAME_REBUILD_IN_PLACE:
		return rf_rebuild_in_place(raidPtr, data);

	case RAIDFRAME_GET_INFO:
		ucfgp = *(RF_DeviceConfig_t **)data;
		d_cfg = RF_Malloc(sizeof(*d_cfg));
		if (d_cfg == NULL)
			return ENOMEM;
		retcode = rf_get_info(raidPtr, d_cfg);
		if (retcode == 0) {
			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
		}
		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
		return retcode;

	case RAIDFRAME_CHECK_PARITY:
		*(int *) data = raidPtr->parity_good;
		return 0;

	case RAIDFRAME_PARITYMAP_STATUS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_status(raidPtr->parity_map, data);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		if (raidPtr->parity_map == NULL)
			return ENOENT; /* ??? */
		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
			return EINVAL;
		return 0;

	case RAIDFRAME_PARITYMAP_GET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		*(int *) data = rf_paritymap_get_disable(raidPtr);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_set_disable(raidPtr, *(int *)data);
		/* XXX should errors be passed up? */
		return 0;

	case RAIDFRAME_RESCAN:
		return rf_rescan();

	case RAIDFRAME_RESET_ACCTOTALS:
		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
		return 0;

	case RAIDFRAME_GET_ACCTOTALS:
		totals = (RF_AccTotals_t *) data;
		*totals = raidPtr->acc_totals;
		return 0;

	case RAIDFRAME_KEEP_ACCTOTALS:
		raidPtr->keep_acc_totals = *(int *)data;
		return 0;

	case RAIDFRAME_GET_SIZE:
		*(int *) data = raidPtr->totalSectors;
		return 0;

	case RAIDFRAME_FAIL_DISK:
		return rf_fail_disk(raidPtr, data);

		/* invoke a copyback operation after recon on whatever disk
		 * needs it, if any */
	case RAIDFRAME_COPYBACK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0!! */
			return EINVAL;
		}

		if (raidPtr->copyback_in_progress == 1) {
			/* Copyback is already in progress! */
			return EINVAL;
		}

		return RF_CREATE_THREAD(raidPtr->copyback_thread,
		    rf_CopybackThread, raidPtr, "raid_copyback");

		/* return the percentage completion of reconstruction */
	case RAIDFRAME_CHECK_RECON_STATUS:
		return rf_check_recon_status(raidPtr, data);

	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
		rf_check_recon_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return 0;
		}
		if (raidPtr->parity_rewrite_in_progress == 1) {
			*(int *) data = 100 *
			    raidPtr->parity_rewrite_stripes_done /
			    raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return 0;

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
		rf_check_parityrewrite_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_CHECK_COPYBACK_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0 */
			*(int *) data = 100;
			return 0;
		}
		if (raidPtr->copyback_in_progress == 1) {
			*(int *) data = 100 * raidPtr->copyback_stripes_done /
			    raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return 0;

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
		rf_check_copyback_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_SET_LAST_UNIT:
		for (column = 0; column < raidPtr->numCol; column++)
			if (raidPtr->Disks[column].status != rf_ds_optimal)
				return EBUSY;

		for (column = 0; column < raidPtr->numCol; column++) {
			clabel = raidget_component_label(raidPtr, column);
			clabel->last_unit = *(int *)data;
			raidflush_component_label(raidPtr, column);
		}
		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
		return 0;

		/* the sparetable daemon calls this to wait for the kernel to
		 * need a spare table. this ioctl does not return until a
		 * spare table is needed. XXX -- calling mpsleep here in the
		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
		 * -- I should either compute the spare table in the kernel,
		 * or have a different -- XXX XXX -- interface (a different
		 * character device) for delivering the table -- XXX */
#if RF_DISABLED
	case RAIDFRAME_SPARET_WAIT:
		rf_lock_mutex2(rf_sparet_wait_mutex);
		while (!rf_sparet_wait_queue)
			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		/* structure assignment */
		*((RF_SparetWait_t *) data) = *waitreq;

		RF_Free(waitreq, sizeof(*waitreq));
		return 0;

		/* wakes up a process waiting on SPARET_WAIT and puts an error
		 * code in it that will cause the daemon to exit */
	case RAIDFRAME_ABORT_SPARET_WAIT:
		waitreq = RF_Malloc(sizeof(*waitreq));
		waitreq->fcol = -1;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_wait_queue;
		rf_sparet_wait_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_wait_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);
		return 0;

		/* used by the spare table daemon to deliver a spare table
		 * into the kernel */
	case RAIDFRAME_SEND_SPARET:

		/* install the spare table */
		retcode = rf_SetSpareTable(raidPtr, *(void **) data);

		/* respond to the requestor. the return status of the spare
		 * table installation is passed in the "fcol" field */
		waitreq = RF_Malloc(sizeof(*waitreq));
		waitreq->fcol = retcode;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_resp_queue;
		rf_sparet_resp_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_resp_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		return retcode;
#endif
	default:
		/*
		 * Don't bother trying to load compat modules
		 * if it is not our ioctl. This is more efficient
		 * and makes rump tests not depend on compat code
		 */
		if (IOCGROUP(cmd) != 'r')
			break;
#ifdef _LP64
		if ((l->l_proc->p_flag & PK_32) != 0) {
			module_autoload("compat_netbsd32_raid",
			    MODULE_CLASS_EXEC);
			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
			    (rs, cmd, data), enosys(), retcode);
			if (retcode != EPASSTHROUGH)
				return retcode;
		}
#endif
		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
		    (rs, cmd, data), enosys(), retcode);
		if (retcode != EPASSTHROUGH)
			return retcode;

		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
		    (rs, cmd, data), enosys(), retcode);
		if (retcode != EPASSTHROUGH)
			return retcode;
		break; /* fall through to the os-specific code below */

	}

	if (!raidPtr->valid)
		return EINVAL;

	/*
	 * Add support for "regular" device ioctls here.
	 */

	switch (cmd) {
	case DIOCGCACHE:
		retcode = rf_get_component_caches(raidPtr, (int *)data);
		break;

	case DIOCCACHESYNC:
		retcode = rf_sync_component_caches(raidPtr, *(int *)data);
		break;

	default:
		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
		break;
	}

	return retcode;

}


/* raidinit -- complete the rest of the initialization for the
   RAIDframe device. */


static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe. */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	dkwedge_discover(&dksc->sc_dkdev);
}

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
1930 * XXX
1931 *
1932 * XXX This code is not currently used. GO
1933 */
1934 int
1935 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
1936 {
1937 int retcode;
1938
1939 rf_lock_mutex2(rf_sparet_wait_mutex);
1940 req->next = rf_sparet_wait_queue;
1941 rf_sparet_wait_queue = req;
1942 rf_broadcast_cond2(rf_sparet_wait_cv);
1943
1944 /* rf_wait_cond2() releases the mutex while we sleep */
1945 while (!rf_sparet_resp_queue) {
1946 rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
1947 }
1948 req = rf_sparet_resp_queue;
1949 rf_sparet_resp_queue = req->next;
1950 rf_unlock_mutex2(rf_sparet_wait_mutex);
1951
1952 retcode = req->fcol;
1953 RF_Free(req, sizeof(*req)); /* this is not the same req as we
1954 * alloc'd */
1955 return retcode;
1956 }
1957 #endif
1958
1959 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1960 * bp & passes it down.
1961 * any calls originating in the kernel must use non-blocking I/O.
1962 * do some extra sanity checking to return "appropriate" error values for
1963 * certain conditions (to make some standard utilities work)
1964 *
1965 * Formerly known as: rf_DoAccessKernel
1966 */
1967 void
1968 raidstart(RF_Raid_t *raidPtr)
1969 {
1970 struct raid_softc *rs;
1971 struct dk_softc *dksc;
1972
1973 rs = raidPtr->softc;
1974 dksc = &rs->sc_dksc;
1975 /* quick check to see if anything has died recently */
1976 rf_lock_mutex2(raidPtr->mutex);
1977 if (raidPtr->numNewFailures > 0) {
1978 rf_unlock_mutex2(raidPtr->mutex);
1979 rf_update_component_labels(raidPtr,
1980 RF_NORMAL_COMPONENT_UPDATE);
1981 rf_lock_mutex2(raidPtr->mutex);
1982 raidPtr->numNewFailures--;
1983 }
1984 rf_unlock_mutex2(raidPtr->mutex);
1985
1986 if ((rs->sc_flags & RAIDF_INITED) == 0) {
1987 printf("raid%d: raidstart not ready\n", raidPtr->raidid);
1988 return;
1989 }
1990
1991 dk_start(dksc, NULL);
1992 }
1993
1994 static int
1995 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
1996 {
1997 RF_SectorCount_t num_blocks, pb, sum;
1998 RF_RaidAddr_t raid_addr;
1999 daddr_t blocknum;
2000 int rc;
2001
2002 rf_lock_mutex2(raidPtr->mutex);
2003 if (raidPtr->openings == 0) {
2004 rf_unlock_mutex2(raidPtr->mutex);
2005 return EAGAIN;
2006 }
2007 rf_unlock_mutex2(raidPtr->mutex);
2008
2009 blocknum = bp->b_rawblkno;
2010
2011 db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
2012 (int) blocknum));
2013
2014 db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
2015 db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
2016
2017 /* *THIS* is where we adjust what block we're going to...
2018 * but DO NOT TOUCH bp->b_blkno!!! */
2019 raid_addr = blocknum;
2020
2021 num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
2022 pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
2023 sum = raid_addr + num_blocks + pb;
2024 if (rf_debugKernelAccess) {
2025 db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
2026 (int) raid_addr, (int) sum, (int) num_blocks,
2027 (int) pb, (int) bp->b_resid));
2028 }
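/* reject accesses that run past the end of the array, or whose
 * size computation wrapped around (any of the sums overflowing) */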
2029 if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
2030 || (sum < num_blocks) || (sum < pb)) {
2031 rc = ENOSPC;
2032 goto done;
2033 }
2034 /*
2035 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
2036 */
2037
2038 if (bp->b_bcount & raidPtr->sectorMask) {
2039 rc = ENOSPC;
2040 goto done;
2041 }
2042 db1_printf(("Calling DoAccess..\n"));
2043
2044
2045 rf_lock_mutex2(raidPtr->mutex);
2046 raidPtr->openings--;
2047 rf_unlock_mutex2(raidPtr->mutex);
2048
2049 /* don't ever condition on bp->b_flags & B_WRITE.
2050 * always condition on B_READ instead */
2051
2052 rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
2053 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
2054 raid_addr, num_blocks,
2055 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
2056
2057 done:
2058 return rc;
2059 }
2060
2061 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
2062
2063 int
2064 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
2065 {
2066 int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
2067 struct buf *bp;
2068
2069 req->queue = queue;
2070 bp = req->bp;
2071
2072 switch (req->type) {
2073 case RF_IO_TYPE_NOP: /* used primarily to unlock a locked queue */
2074 /* XXX need to do something extra here.. */
2075 /* I'm leaving this in, as I've never actually seen it used,
2076 * and I'd like folks to report it... GO */
2077 printf("%s: WAKEUP CALLED\n", __func__);
2078 queue->numOutstanding++;
2079
2080 bp->b_flags = 0;
2081 bp->b_private = req;
2082
2083 KernelWakeupFunc(bp);
2084 break;
2085
2086 case RF_IO_TYPE_READ:
2087 case RF_IO_TYPE_WRITE:
2088 #if RF_ACC_TRACE > 0
2089 if (req->tracerec) {
2090 RF_ETIMER_START(req->tracerec->timer);
2091 }
2092 #endif
2093 InitBP(bp, queue->rf_cinfo->ci_vp,
2094 op, queue->rf_cinfo->ci_dev,
2095 req->sectorOffset, req->numSector,
2096 req->buf, KernelWakeupFunc, (void *) req,
2097 queue->raidPtr->logBytesPerSector);
2098
2099 if (rf_debugKernelAccess) {
2100 db1_printf(("dispatch: bp->b_blkno = %ld\n",
2101 (long) bp->b_blkno));
2102 }
2103 queue->numOutstanding++;
2104 queue->last_deq_sector = req->sectorOffset;
2105 /* acc wouldn't have been let in if there were any pending
2106 * reqs at any other priority */
2107 queue->curPriority = req->priority;
2108
2109 db1_printf(("Going for %c to unit %d col %d\n",
2110 req->type, queue->raidPtr->raidid,
2111 queue->col));
2112 db1_printf(("sector %d count %d (%d bytes) %d\n",
2113 (int) req->sectorOffset, (int) req->numSector,
2114 (int) (req->numSector <<
2115 queue->raidPtr->logBytesPerSector),
2116 (int) queue->raidPtr->logBytesPerSector));
2117
2118 /*
2119 * XXX: drop lock here since this can block at
2120 * least with backing SCSI devices. Retake it
2121 * to minimize fuss with calling interfaces.
2122 */
2123
2124 RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
2125 bdev_strategy(bp);
2126 RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
2127 break;
2128
2129 default:
2130 panic("bad req->type in rf_DispatchKernelIO");
2131 }
2132 db1_printf(("Exiting from DispatchKernelIO\n"));
2133
2134 return 0;
2135 }
2136 /* this is the callback function associated with an I/O invoked from
2137  * kernel code; installed as b_iodone by InitBP(), it is called via
2138  * biodone() when the component I/O completes. */
2139 static void
2140 KernelWakeupFunc(struct buf *bp)
2141 {
2142 RF_DiskQueueData_t *req = NULL;
2143 RF_DiskQueue_t *queue;
2144
2145 db1_printf(("recovering the request queue:\n"));
2146
2147 req = bp->b_private;
2148
2149 queue = (RF_DiskQueue_t *) req->queue;
2150
2151 rf_lock_mutex2(queue->raidPtr->iodone_lock);
2152
2153 #if RF_ACC_TRACE > 0
2154 if (req->tracerec) {
2155 RF_ETIMER_STOP(req->tracerec->timer);
2156 RF_ETIMER_EVAL(req->tracerec->timer);
2157 rf_lock_mutex2(rf_tracing_mutex);
2158 req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2159 req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2160 req->tracerec->num_phys_ios++;
2161 rf_unlock_mutex2(rf_tracing_mutex);
2162 }
2163 #endif
2164
2165 /* XXX Ok, let's get aggressive... If b_error is set, let's go
2166 * ballistic, and mark the component as hosed... */
2167
2168 if (bp->b_error != 0) {
2169 /* Mark the disk as dead */
2170 /* but only mark it once... */
2171 /* and only if it wouldn't leave this RAID set
2172 completely broken */
2173 if (((queue->raidPtr->Disks[queue->col].status ==
2174 rf_ds_optimal) ||
2175 (queue->raidPtr->Disks[queue->col].status ==
2176 rf_ds_used_spare)) &&
2177 (queue->raidPtr->numFailures <
2178 queue->raidPtr->Layout.map->faultsTolerated)) {
2179 printf("raid%d: IO Error (%d). Marking %s as failed.\n",
2180 queue->raidPtr->raidid,
2181 bp->b_error,
2182 queue->raidPtr->Disks[queue->col].devname);
2183 queue->raidPtr->Disks[queue->col].status =
2184 rf_ds_failed;
2185 queue->raidPtr->status = rf_rs_degraded;
2186 queue->raidPtr->numFailures++;
2187 queue->raidPtr->numNewFailures++;
2188 } else { /* Disk is already dead... */
2189 /* printf("Disk already marked as dead!\n"); */
2190 }
2191
2192 }
2193
2194 /* Fill in the error value */
2195 req->error = bp->b_error;
2196
2197 /* Drop this one on the "finished" queue... */
2198 TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
2199
2200 /* Let the raidio thread know there is work to be done. */
2201 rf_signal_cond2(queue->raidPtr->iodone_cv);
2202
2203 rf_unlock_mutex2(queue->raidPtr->iodone_lock);
2204 }
2205
2206
2207 /*
2208 * initialize a buf structure for doing an I/O in the kernel.
2209 */
2210 static void
2211 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2212 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2213 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
2214 {
2215 bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
2216 bp->b_oflags = 0;
2217 bp->b_cflags = 0;
2218 bp->b_bcount = numSect << logBytesPerSector;
2219 bp->b_bufsize = bp->b_bcount;
2220 bp->b_error = 0;
2221 bp->b_dev = dev;
2222 bp->b_data = bf;
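/* convert the RAIDframe sector number to bytes, then to DEV_BSIZE
 * blocks for b_blkno */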
2223 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2224 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2225 if (bp->b_bcount == 0) {
2226 panic("bp->b_bcount is zero in InitBP!!");
2227 }
2228 bp->b_iodone = cbFunc;
2229 bp->b_private = cbArg;
2230 }
2231
2232 /*
2233 * Wait interruptibly for an exclusive lock.
2234 *
2235 * XXX
2236 * Several drivers do this; it should be abstracted and made MP-safe.
2237 * (Hmm... where have we seen this warning before :-> GO )
2238 */
2239 static int
2240 raidlock(struct raid_softc *rs)
2241 {
2242 int error;
2243
2244 error = 0;
2245 mutex_enter(&rs->sc_mutex);
2246 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2247 rs->sc_flags |= RAIDF_WANTED;
2248 error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2249 if (error != 0)
2250 goto done;
2251 }
2252 rs->sc_flags |= RAIDF_LOCKED;
2253 done:
2254 mutex_exit(&rs->sc_mutex);
2255 return error;
2256 }
2257 /*
2258 * Unlock and wake up any waiters.
2259 */
2260 static void
2261 raidunlock(struct raid_softc *rs)
2262 {
2263
2264 mutex_enter(&rs->sc_mutex);
2265 rs->sc_flags &= ~RAIDF_LOCKED;
2266 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2267 rs->sc_flags &= ~RAIDF_WANTED;
2268 cv_broadcast(&rs->sc_cv);
2269 }
2270 mutex_exit(&rs->sc_mutex);
2271 }
2272
2273
2274 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2275 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2276 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
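/*
 * On-disk metadata layout, as implemented by the helpers below:
 * the component label lives RF_COMPONENT_INFO_OFFSET bytes into
 * the component and occupies max(sector size, RF_COMPONENT_INFO_SIZE)
 * bytes; the parity map immediately follows it.
 */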
2277
2278 static daddr_t
2279 rf_component_info_offset(void)
2280 {
2281
2282 return RF_COMPONENT_INFO_OFFSET;
2283 }
2284
2285 static daddr_t
2286 rf_component_info_size(unsigned secsize)
2287 {
2288 daddr_t info_size;
2289
2290 KASSERT(secsize);
2291 if (secsize > RF_COMPONENT_INFO_SIZE)
2292 info_size = secsize;
2293 else
2294 info_size = RF_COMPONENT_INFO_SIZE;
2295
2296 return info_size;
2297 }
2298
2299 static daddr_t
2300 rf_parity_map_offset(RF_Raid_t *raidPtr)
2301 {
2302 daddr_t map_offset;
2303
2304 KASSERT(raidPtr->bytesPerSector);
2305 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2306 map_offset = raidPtr->bytesPerSector;
2307 else
2308 map_offset = RF_COMPONENT_INFO_SIZE;
2309 map_offset += rf_component_info_offset();
2310
2311 return map_offset;
2312 }
2313
2314 static daddr_t
2315 rf_parity_map_size(RF_Raid_t *raidPtr)
2316 {
2317 daddr_t map_size;
2318
2319 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2320 map_size = raidPtr->bytesPerSector;
2321 else
2322 map_size = RF_PARITY_MAP_SIZE;
2323
2324 return map_size;
2325 }
2326
2327 int
2328 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2329 {
2330 RF_ComponentLabel_t *clabel;
2331
2332 clabel = raidget_component_label(raidPtr, col);
2333 clabel->clean = RF_RAID_CLEAN;
2334 raidflush_component_label(raidPtr, col);
2335 return(0);
2336 }
2337
2338
2339 int
2340 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2341 {
2342 RF_ComponentLabel_t *clabel;
2343
2344 clabel = raidget_component_label(raidPtr, col);
2345 clabel->clean = RF_RAID_DIRTY;
2346 raidflush_component_label(raidPtr, col);
2347 return(0);
2348 }
2349
2350 int
2351 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2352 {
2353 KASSERT(raidPtr->bytesPerSector);
2354
2355 return raidread_component_label(raidPtr->bytesPerSector,
2356 raidPtr->Disks[col].dev,
2357 raidPtr->raid_cinfo[col].ci_vp,
2358 &raidPtr->raid_cinfo[col].ci_label);
2359 }
2360
2361 RF_ComponentLabel_t *
2362 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2363 {
2364 return &raidPtr->raid_cinfo[col].ci_label;
2365 }
2366
2367 int
2368 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2369 {
2370 RF_ComponentLabel_t *label;
2371
2372 label = &raidPtr->raid_cinfo[col].ci_label;
2373 label->mod_counter = raidPtr->mod_counter;
2374 #ifndef RF_NO_PARITY_MAP
2375 label->parity_map_modcount = label->mod_counter;
2376 #endif
2377 return raidwrite_component_label(raidPtr->bytesPerSector,
2378 raidPtr->Disks[col].dev,
2379 raidPtr->raid_cinfo[col].ci_vp, label);
2380 }
2381
2382 /*
2383 * Swap the label endianness.
2384 *
2385 * Everything in the component label is 4-byte-swapped except the version,
2386  * which is kept byte-swapped at all times and tells the writer that a
2387  * swap is necessary.
2388  *
2389  * For reads it is expected that out_label == clabel, but writes expect
2390  * separate labels so only the re-swapped label is written out to disk,
2391  * leaving the in-core copy swapped except for the version field.
2392 *
2393 * Only support swapping label version 2.
2394 */
2395 static void
2396 rf_swap_label(RF_ComponentLabel_t *clabel, RF_ComponentLabel_t *out_label)
2397 {
2398 int *in, *out, *in_last;
2399
2400 KASSERT(clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION));
2401
2402 /* Don't swap the label, but do copy it. */
2403 out_label->version = clabel->version;
2404
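/* byte-swap each 32-bit word of the label body, from serial_number
 * up to (but not including) the in_last bound */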
2405 in = &clabel->serial_number;
2406 in_last = &clabel->future_use2[42];
2407 out = &out_label->serial_number;
2408
2409 for (; in < in_last; in++, out++)
2410 *out = bswap32(*in);
2411 }
2412
2413 static int
2414 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2415 RF_ComponentLabel_t *clabel)
2416 {
2417 int error;
2418
2419 error = raidread_component_area(dev, b_vp, clabel,
2420 sizeof(RF_ComponentLabel_t),
2421 rf_component_info_offset(),
2422 rf_component_info_size(secsize));
2423
2424 if (error == 0 &&
2425 clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
2426 rf_swap_label(clabel, clabel);
2427 }
2428
2429 return error;
2430 }
2431
2432 /* ARGSUSED */
2433 static int
2434 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2435 size_t msize, daddr_t offset, daddr_t dsize)
2436 {
2437 struct buf *bp;
2438 int error;
2439
2440 /* XXX should probably ensure that we don't try to do this if
2441 someone has changed rf_protected_sectors. */
2442
2443 if (b_vp == NULL) {
2444 /* For whatever reason, this component is not valid.
2445 Don't try to read a component label from it. */
2446 return(EINVAL);
2447 }
2448
2449 /* get a block of the appropriate size... */
2450 bp = geteblk((int)dsize);
2451 bp->b_dev = dev;
2452
2453 /* get our ducks in a row for the read */
2454 bp->b_blkno = offset / DEV_BSIZE;
2455 bp->b_bcount = dsize;
2456 bp->b_flags |= B_READ;
2457 bp->b_resid = dsize;
2458
2459 bdev_strategy(bp);
2460 error = biowait(bp);
2461
2462 if (!error) {
2463 memcpy(data, bp->b_data, msize);
2464 }
2465
2466 brelse(bp, 0);
2467 return(error);
2468 }
2469
2470 static int
2471 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2472 RF_ComponentLabel_t *clabel)
2473 {
2474 RF_ComponentLabel_t *clabel_write = clabel;
2475 RF_ComponentLabel_t lclabel;
2476 int error;
2477
2478 if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
2479 clabel_write = &lclabel;
2480 rf_swap_label(clabel, clabel_write);
2481 }
2482 error = raidwrite_component_area(dev, b_vp, clabel_write,
2483 sizeof(RF_ComponentLabel_t),
2484 rf_component_info_offset(),
2485 rf_component_info_size(secsize), 0);
2486
2487 return error;
2488 }
2489
2490 /* ARGSUSED */
2491 static int
2492 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2493 size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2494 {
2495 struct buf *bp;
2496 int error;
2497
2498 /* get a block of the appropriate size... */
2499 bp = geteblk((int)dsize);
2500 bp->b_dev = dev;
2501
2502 /* get our ducks in a row for the write */
2503 bp->b_blkno = offset / DEV_BSIZE;
2504 bp->b_bcount = dsize;
2505 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2506 bp->b_resid = dsize;
2507
2508 memset(bp->b_data, 0, dsize);
2509 memcpy(bp->b_data, data, msize);
2510
2511 bdev_strategy(bp);
2512 if (asyncp)
2513 return 0;
2514 error = biowait(bp);
2515 brelse(bp, 0);
2516 if (error) {
2518 printf("Failed to write RAID component info!\n");
2520 }
2521
2522 return(error);
2523 }
2524
2525 void
2526 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2527 {
2528 int c;
2529
2530 for (c = 0; c < raidPtr->numCol; c++) {
2531 /* Skip dead disks. */
2532 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2533 continue;
2534 /* XXXjld: what if an error occurs here? */
2535 raidwrite_component_area(raidPtr->Disks[c].dev,
2536 raidPtr->raid_cinfo[c].ci_vp, map,
2537 RF_PARITYMAP_NBYTE,
2538 rf_parity_map_offset(raidPtr),
2539 rf_parity_map_size(raidPtr), 0);
2540 }
2541 }
2542
2543 void
2544 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2545 {
2546 struct rf_paritymap_ondisk tmp;
2547 int c,first;
2548
2549 first=1;
2550 for (c = 0; c < raidPtr->numCol; c++) {
2551 /* Skip dead disks. */
2552 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2553 continue;
2554 raidread_component_area(raidPtr->Disks[c].dev,
2555 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2556 RF_PARITYMAP_NBYTE,
2557 rf_parity_map_offset(raidPtr),
2558 rf_parity_map_size(raidPtr));
2559 if (first) {
2560 memcpy(map, &tmp, sizeof(*map));
2561 first = 0;
2562 } else {
2563 rf_paritymap_merge(map, &tmp);
2564 }
2565 }
2566 }
2567
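/*
 * Bump the modification counter and mark every live component
 * (and every used spare) dirty, so that an unclean shutdown can
 * be detected later.
 */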
2568 void
2569 rf_markalldirty(RF_Raid_t *raidPtr)
2570 {
2571 RF_ComponentLabel_t *clabel;
2572 int sparecol;
2573 int c;
2574 int j;
2575 int scol = -1;
2576
2577 raidPtr->mod_counter++;
2578 for (c = 0; c < raidPtr->numCol; c++) {
2579 /* we don't want to touch (at all) a disk that has
2580 failed */
2581 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
2582 clabel = raidget_component_label(raidPtr, c);
2583 if (clabel->status == rf_ds_spared) {
2584 /* XXX do something special...
2585 but whatever you do, don't
2586 try to access it!! */
2587 } else {
2588 raidmarkdirty(raidPtr, c);
2589 }
2590 }
2591 }
2592
2593 for( c = 0; c < raidPtr->numSpare ; c++) {
2594 sparecol = raidPtr->numCol + c;
2595 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
2596 /*
2597  * We claim this disk is "optimal" if it's rf_ds_used_spare,
2598  * as that means it should be directly substitutable for
2599  * the disk it replaced.  We note that too...
2600  */
2604
2605 for(j=0;j<raidPtr->numCol;j++) {
2606 if (raidPtr->Disks[j].spareCol == sparecol) {
2607 scol = j;
2608 break;
2609 }
2610 }
2611
2612 clabel = raidget_component_label(raidPtr, sparecol);
2613 /* make sure status is noted */
2614
2615 raid_init_component_label(raidPtr, clabel);
2616
2617 clabel->row = 0;
2618 clabel->column = scol;
2619 /* Note: we *don't* change status from rf_ds_used_spare
2620 to rf_ds_optimal */
2621 /* clabel.status = rf_ds_optimal; */
2622
2623 raidmarkdirty(raidPtr, sparecol);
2624 }
2625 }
2626 }
2627
2628
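/*
 * Refresh the component labels on every optimal component and
 * used spare. On a final update (RF_FINAL_COMPONENT_UPDATE) with
 * good parity, the components are also marked clean.
 */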
2629 void
2630 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
2631 {
2632 RF_ComponentLabel_t *clabel;
2633 int sparecol;
2634 int c;
2635 int j;
2636 int scol;
2637 struct raid_softc *rs = raidPtr->softc;
2638
2639 scol = -1;
2640
2641 /* XXX should do extra checks to make sure things really are clean,
2642 rather than blindly setting the clean bit... */
2643
2644 raidPtr->mod_counter++;
2645
2646 for (c = 0; c < raidPtr->numCol; c++) {
2647 if (raidPtr->Disks[c].status == rf_ds_optimal) {
2648 clabel = raidget_component_label(raidPtr, c);
2649 /* make sure status is noted */
2650 clabel->status = rf_ds_optimal;
2651
2652 /* note what unit we are configured as */
2653 if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
2654 clabel->last_unit = raidPtr->raidid;
2655
2656 raidflush_component_label(raidPtr, c);
2657 if (final == RF_FINAL_COMPONENT_UPDATE) {
2658 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2659 raidmarkclean(raidPtr, c);
2660 }
2661 }
2662 }
2663 /* else we don't touch it.. */
2664 }
2665
2666 for( c = 0; c < raidPtr->numSpare ; c++) {
2667 sparecol = raidPtr->numCol + c;
2668 /* Need to ensure that the reconstruct actually completed! */
2669 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
2670 /*
2671  * We claim this disk is "optimal" if it's rf_ds_used_spare,
2672  * as that means it should be directly substitutable for
2673  * the disk it replaced.  We note that too...
2674  */
2678
2679 for(j=0;j<raidPtr->numCol;j++) {
2680 if (raidPtr->Disks[j].spareCol == sparecol) {
2681 scol = j;
2682 break;
2683 }
2684 }
2685
2686 /* XXX shouldn't *really* need this... */
2687 clabel = raidget_component_label(raidPtr, sparecol);
2688 /* make sure status is noted */
2689
2690 raid_init_component_label(raidPtr, clabel);
2691
2692 clabel->column = scol;
2693 clabel->status = rf_ds_optimal;
2694 if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
2695 clabel->last_unit = raidPtr->raidid;
2696
2697 raidflush_component_label(raidPtr, sparecol);
2698 if (final == RF_FINAL_COMPONENT_UPDATE) {
2699 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2700 raidmarkclean(raidPtr, sparecol);
2701 }
2702 }
2703 }
2704 }
2705 }
2706
2707 void
2708 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2709 {
2710
2711 if (vp != NULL) {
2712 if (auto_configured == 1) {
2713 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2714 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2715 vput(vp);
2716
2717 } else {
2718 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2719 }
2720 }
2721 }
2722
2723
2724 void
2725 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2726 {
2727 int r,c;
2728 struct vnode *vp;
2729 int acd;
2730
2731
2732 /* We take this opportunity to close the vnodes like we should.. */
2733
2734 for (c = 0; c < raidPtr->numCol; c++) {
2735 vp = raidPtr->raid_cinfo[c].ci_vp;
2736 acd = raidPtr->Disks[c].auto_configured;
2737 rf_close_component(raidPtr, vp, acd);
2738 raidPtr->raid_cinfo[c].ci_vp = NULL;
2739 raidPtr->Disks[c].auto_configured = 0;
2740 }
2741
2742 for (r = 0; r < raidPtr->numSpare; r++) {
2743 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2744 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2745 rf_close_component(raidPtr, vp, acd);
2746 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2747 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2748 }
2749 }
2750
2751
2752 static void
2753 rf_ReconThread(struct rf_recon_req_internal *req)
2754 {
2755 int s;
2756 RF_Raid_t *raidPtr;
2757
2758 s = splbio();
2759 raidPtr = (RF_Raid_t *) req->raidPtr;
2760 raidPtr->recon_in_progress = 1;
2761
2762 if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2763 raidPtr->forceRecon = 1;
2764 }
2765
2766 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2767 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2768
2769 if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2770 raidPtr->forceRecon = 0;
2771 }
2772
2773 RF_Free(req, sizeof(*req));
2774
2775 raidPtr->recon_in_progress = 0;
2776 splx(s);
2777
2778 /* That's all... */
2779 kthread_exit(0); /* does not return */
2780 }
2781
2782 static void
2783 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2784 {
2785 int retcode;
2786 int s;
2787
2788 raidPtr->parity_rewrite_stripes_done = 0;
2789 raidPtr->parity_rewrite_in_progress = 1;
2790 s = splbio();
2791 retcode = rf_RewriteParity(raidPtr);
2792 splx(s);
2793 if (retcode) {
2794 printf("raid%d: Error re-writing parity (%d)!\n",
2795 raidPtr->raidid, retcode);
2796 } else {
2797 /* set the clean bit! If we shut down correctly,
2798 the clean bit on each component label will get
2799 set */
2800 raidPtr->parity_good = RF_RAID_CLEAN;
2801 }
2802 raidPtr->parity_rewrite_in_progress = 0;
2803
2804 /* Anyone waiting for us to stop? If so, inform them... */
2805 if (raidPtr->waitShutdown) {
2806 rf_lock_mutex2(raidPtr->rad_lock);
2807 cv_broadcast(&raidPtr->parity_rewrite_cv);
2808 rf_unlock_mutex2(raidPtr->rad_lock);
2809 }
2810
2811 /* That's all... */
2812 kthread_exit(0); /* does not return */
2813 }
2814
2815
2816 static void
2817 rf_CopybackThread(RF_Raid_t *raidPtr)
2818 {
2819 int s;
2820
2821 raidPtr->copyback_in_progress = 1;
2822 s = splbio();
2823 rf_CopybackReconstructedData(raidPtr);
2824 splx(s);
2825 raidPtr->copyback_in_progress = 0;
2826
2827 /* That's all... */
2828 kthread_exit(0); /* does not return */
2829 }
2830
2831
2832 static void
2833 rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
2834 {
2835 int s;
2836 RF_Raid_t *raidPtr;
2837
2838 s = splbio();
2839 raidPtr = req->raidPtr;
2840 raidPtr->recon_in_progress = 1;
2841
2842 if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2843 raidPtr->forceRecon = 1;
2844 }
2845
2846 rf_ReconstructInPlace(raidPtr, req->col);
2847
2848 if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2849 raidPtr->forceRecon = 0;
2850 }
2851
2852 RF_Free(req, sizeof(*req));
2853 raidPtr->recon_in_progress = 0;
2854 splx(s);
2855
2856 /* That's all... */
2857 kthread_exit(0); /* does not return */
2858 }
2859
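/*
 * Read the component label from the given device. If it looks
 * reasonable, prepend a new RF_AutoConfig_t entry to ac_list;
 * otherwise free the label and close the vnode.
 */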
2860 static RF_AutoConfig_t *
2861 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
2862 const char *cname, RF_SectorCount_t size, uint64_t numsecs,
2863 unsigned secsize)
2864 {
2865 int good_one = 0;
2866 RF_ComponentLabel_t *clabel;
2867 RF_AutoConfig_t *ac;
2868
2869 clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);
2870
2871 if (!raidread_component_label(secsize, dev, vp, clabel)) {
2872 /* Got the label. Does it look reasonable? */
2873 if (rf_reasonable_label(clabel, numsecs) &&
2874 (rf_component_label_partitionsize(clabel) <= size)) {
2875 #ifdef DEBUG
2876 printf("Component on: %s: %llu\n",
2877 cname, (unsigned long long)size);
2878 rf_print_component_label(clabel);
2879 #endif
2880 /* if it's reasonable, add it, else ignore it. */
2881 ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
2882 M_WAITOK);
2883 strlcpy(ac->devname, cname, sizeof(ac->devname));
2884 ac->dev = dev;
2885 ac->vp = vp;
2886 ac->clabel = clabel;
2887 ac->next = ac_list;
2888 ac_list = ac;
2889 good_one = 1;
2890 }
2891 }
2892 if (!good_one) {
2893 /* cleanup */
2894 free(clabel, M_RAIDFRAME);
2895 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2896 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2897 vput(vp);
2898 }
2899 return ac_list;
2900 }
2901
2902 static RF_AutoConfig_t *
2903 rf_find_raid_components(void)
2904 {
2905 struct vnode *vp;
2906 struct disklabel label;
2907 device_t dv;
2908 deviter_t di;
2909 dev_t dev;
2910 int bmajor, bminor, wedge, rf_part_found;
2911 int error;
2912 int i;
2913 RF_AutoConfig_t *ac_list;
2914 uint64_t numsecs;
2915 unsigned secsize;
2916 int dowedges;
2917
2918 /* initialize the AutoConfig list */
2919 ac_list = NULL;
2920
2921 /*
2922 * we begin by trolling through *all* the devices on the system *twice*:
2923  * first we scan for wedges, then for other devices. This avoids
2924  * using a raw partition instead of a wedge that covers the whole disk.
2925 */
2926
2927 for (dowedges=1; dowedges>=0; --dowedges) {
2928 for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
2929 dv = deviter_next(&di)) {
2930
2931 /* we are only interested in disks */
2932 if (device_class(dv) != DV_DISK)
2933 continue;
2934
2935 /* we don't care about floppies */
2936 if (device_is_a(dv, "fd")) {
2937 continue;
2938 }
2939
2940 /* we don't care about CDs. */
2941 if (device_is_a(dv, "cd")) {
2942 continue;
2943 }
2944
2945 /* we don't care about md. */
2946 if (device_is_a(dv, "md")) {
2947 continue;
2948 }
2949
2950 /* hdfd is the Atari/Hades floppy driver */
2951 if (device_is_a(dv, "hdfd")) {
2952 continue;
2953 }
2954
2955 /* fdisa is the Atari/Milan floppy driver */
2956 if (device_is_a(dv, "fdisa")) {
2957 continue;
2958 }
2959
2960 /* we don't care about spiflash */
2961 if (device_is_a(dv, "spiflash")) {
2962 continue;
2963 }
2964
2965 /* are we in the wedges pass ? */
2966 wedge = device_is_a(dv, "dk");
2967 if (wedge != dowedges) {
2968 continue;
2969 }
2970
2971 /* need to find the device_name_to_block_device_major stuff */
2972 bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
2973
2974 rf_part_found = 0; /* no raid partition as yet */
2975
2976 /* get a vnode for the raw partition of this disk */
2977 bminor = minor(device_unit(dv));
2978 dev = wedge ? makedev(bmajor, bminor) :
2979 MAKEDISKDEV(bmajor, bminor, RAW_PART);
2980 if (bdevvp(dev, &vp))
2981 panic("RAID can't alloc vnode");
2982
2983 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2984 error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
2985
2986 if (error) {
2987 /* "Who cares." Continue looking
2988 for something that exists*/
2989 vput(vp);
2990 continue;
2991 }
2992
2993 VOP_UNLOCK(vp);
2994 error = getdisksize(vp, &numsecs, &secsize);
2995 if (error) {
2996 /*
2997 * Pseudo devices like vnd and cgd can be
2998 * opened but may still need some configuration.
2999 * Ignore these quietly.
3000 */
3001 if (error != ENXIO)
3002 printf("RAIDframe: can't get disk size"
3003 " for dev %s (%d)\n",
3004 device_xname(dv), error);
3005 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3006 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
3007 vput(vp);
3008 continue;
3009 }
3010 if (wedge) {
3011 struct dkwedge_info dkw;
3012 error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
3013 NOCRED);
3014 if (error) {
3015 printf("RAIDframe: can't get wedge info for "
3016 "dev %s (%d)\n", device_xname(dv), error);
3017 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3018 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
3019 vput(vp);
3020 continue;
3021 }
3022
3023 if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
3024 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3025 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
3026 vput(vp);
3027 continue;
3028 }
3029
3030 ac_list = rf_get_component(ac_list, dev, vp,
3031 device_xname(dv), dkw.dkw_size, numsecs, secsize);
3032 rf_part_found = 1; /* there is a raid component on this disk */
3033 continue;
3034 }
3035
3036 /* Ok, the disk exists. Go get the disklabel. */
3037 error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
3038 if (error) {
3039 /*
3040 * XXX can't happen - open() would
3041 * have errored out (or faked up one)
3042 */
3043 if (error != ENOTTY)
3044 printf("RAIDframe: can't get label for dev "
3045 "%s (%d)\n", device_xname(dv), error);
3046 }
3047
3048 /* don't need this any more. We'll allocate it again
3049 a little later if we really do... */
3050 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3051 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
3052 vput(vp);
3053
3054 if (error)
3055 continue;
3056
3057 rf_part_found = 0; /* no raid partitions yet */
3058 for (i = 0; i < label.d_npartitions; i++) {
3059 char cname[sizeof(ac_list->devname)];
3060
3061 /* We only support partitions marked as RAID */
3062 if (label.d_partitions[i].p_fstype != FS_RAID)
3063 continue;
3064
3065 dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
3066 if (bdevvp(dev, &vp))
3067 panic("RAID can't alloc vnode");
3068
3069 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3070 error = VOP_OPEN(vp, FREAD, NOCRED);
3071 if (error) {
3072 /* Not quite a 'whatever'. In
3073 * this situation we know
3074 * there is a FS_RAID
3075 * partition, but we can't
3076 * open it. The most likely
3077 * reason is that the
3078 * partition is already in
3079 * use by another RAID set.
3080 * So note that we've already
3081 * found a partition on this
3082 * disk so we don't attempt
3083 * to use the raw disk later. */
3084 rf_part_found = 1;
3085 vput(vp);
3086 continue;
3087 }
3088 VOP_UNLOCK(vp);
3089 snprintf(cname, sizeof(cname), "%s%c",
3090 device_xname(dv), 'a' + i);
3091 ac_list = rf_get_component(ac_list, dev, vp, cname,
3092 label.d_partitions[i].p_size, numsecs, secsize);
3093 rf_part_found = 1; /* there is at least one raid partition on this disk */
3094 }
3095
3096 /*
3097  * If there is no raid component on this disk, either in a
3098  * disklabel or inside a wedge, check the raw partition as well,
3099  * as it is possible to configure raid components on raw disk
3100  * devices.
3101 */
3102
3103 if (!rf_part_found) {
3104 char cname[sizeof(ac_list->devname)];
3105
3106 dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
3107 if (bdevvp(dev, &vp))
3108 panic("RAID can't alloc vnode");
3109
3110 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3111
3112 error = VOP_OPEN(vp, FREAD, NOCRED);
3113 if (error) {
3114 /* Whatever... */
3115 vput(vp);
3116 continue;
3117 }
3118 VOP_UNLOCK(vp);
3119 snprintf(cname, sizeof(cname), "%s%c",
3120 device_xname(dv), 'a' + RAW_PART);
3121 ac_list = rf_get_component(ac_list, dev, vp, cname,
3122 label.d_partitions[RAW_PART].p_size, numsecs, secsize);
3123 }
3124 }
3125 deviter_release(&di);
3126 }
3127 return ac_list;
3128 }
3129
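/*
 * Sanity-check a component label read from disk: accept it only if
 * the version, row/column geometry and block counts are all in
 * range. Returns 1 if the label looks reasonable, 0 otherwise.
 */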
3130 int
3131 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3132 {
3133
3134 if ((clabel->version==RF_COMPONENT_LABEL_VERSION_1 ||
3135 clabel->version==RF_COMPONENT_LABEL_VERSION ||
3136 clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) &&
3137 (clabel->clean == RF_RAID_CLEAN ||
3138 clabel->clean == RF_RAID_DIRTY) &&
3139 clabel->row >=0 &&
3140 clabel->column >= 0 &&
3141 clabel->num_rows > 0 &&
3142 clabel->num_columns > 0 &&
3143 clabel->row < clabel->num_rows &&
3144 clabel->column < clabel->num_columns &&
3145 clabel->blockSize > 0 &&
3146 /*
3147 * numBlocksHi may contain garbage, but it is ok since
3148 * the type is unsigned. If it is really garbage,
3149 * rf_fix_old_label_size() will fix it.
3150 */
3151 rf_component_label_numblocks(clabel) > 0) {
3152 /*
3153 * label looks reasonable enough...
3154 * let's make sure it has no old garbage.
3155 */
3156 if (numsecs)
3157 rf_fix_old_label_size(clabel, numsecs);
3158 return(1);
3159 }
3160 return(0);
3161 }
3162
3163
3164 /*
3165 * For reasons yet unknown, some old component labels have garbage in
3166 * the newer numBlocksHi region, and this causes lossage. Since those
3167 * disks will also have numsecs set to less than 32 bits of sectors,
3168 * we can determine when this corruption has occurred, and fix it.
3169 *
3170 * The exact same problem, with the same unknown reason, happens to
3171 * the partitionSizeHi member as well.
3172 */
3173 static void
3174 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3175 {
3176
3177 if (numsecs < ((uint64_t)1 << 32)) {
3178 if (clabel->numBlocksHi) {
3179 printf("WARNING: total sectors < 32 bits, yet "
3180 "numBlocksHi set\n"
3181 "WARNING: resetting numBlocksHi to zero.\n");
3182 clabel->numBlocksHi = 0;
3183 }
3184
3185 if (clabel->partitionSizeHi) {
3186 printf("WARNING: total sectors < 32 bits, yet "
3187 "partitionSizeHi set\n"
3188 "WARNING: resetting partitionSizeHi to zero.\n");
3189 clabel->partitionSizeHi = 0;
3190 }
3191 }
3192 }
3193
3194
3195 #ifdef DEBUG
3196 void
3197 rf_print_component_label(RF_ComponentLabel_t *clabel)
3198 {
3199 uint64_t numBlocks;
3200 static const char *rp[] = {
3201 "No", "Force", "Soft", "*invalid*"
3202 };
3203
3204
3205 numBlocks = rf_component_label_numblocks(clabel);
3206
3207 printf(" Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
3208 clabel->row, clabel->column,
3209 clabel->num_rows, clabel->num_columns);
3210 printf(" Version: %d Serial Number: %d Mod Counter: %d\n",
3211 clabel->version, clabel->serial_number,
3212 clabel->mod_counter);
3213 printf(" Clean: %s Status: %d\n",
3214 clabel->clean ? "Yes" : "No", clabel->status);
3215 printf(" sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
3216 clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
3217 printf(" RAID Level: %c blocksize: %d numBlocks: %"PRIu64"\n",
3218 (char) clabel->parityConfig, clabel->blockSize, numBlocks);
3219 printf(" Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
3220 printf(" Root partition: %s\n", rp[clabel->root_partition & 3]);
3221 printf(" Last configured as: raid%d\n", clabel->last_unit);
3222 #if 0
3223 printf(" Config order: %d\n", clabel->config_order);
3224 #endif
3225
3226 }
3227 #endif
3228
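/*
 * Sort the discovered components into configuration sets: each
 * component joins the first existing set whose label it matches
 * (see rf_does_it_fit()), or else starts a new set.
 */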
3229 static RF_ConfigSet_t *
3230 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3231 {
3232 RF_AutoConfig_t *ac;
3233 RF_ConfigSet_t *config_sets;
3234 RF_ConfigSet_t *cset;
3235 RF_AutoConfig_t *ac_next;
3236
3237
3238 config_sets = NULL;
3239
3240 /* Go through the AutoConfig list, and figure out which components
3241 belong to what sets. */
3242 ac = ac_list;
3243 while(ac!=NULL) {
3244 /* we're going to putz with ac->next, so save it here
3245 for use at the end of the loop */
3246 ac_next = ac->next;
3247
3248 if (config_sets == NULL) {
3249 /* will need at least this one... */
3250 config_sets = malloc(sizeof(RF_ConfigSet_t),
3251 M_RAIDFRAME, M_WAITOK);
3252 /* this one is easy :) */
3253 config_sets->ac = ac;
3254 config_sets->next = NULL;
3255 config_sets->rootable = 0;
3256 ac->next = NULL;
3257 } else {
3258 /* which set does this component fit into? */
3259 cset = config_sets;
3260 while(cset!=NULL) {
3261 if (rf_does_it_fit(cset, ac)) {
3262 /* looks like it matches... */
3263 ac->next = cset->ac;
3264 cset->ac = ac;
3265 break;
3266 }
3267 cset = cset->next;
3268 }
3269 if (cset==NULL) {
3270 /* didn't find a match above... new set..*/
3271 cset = malloc(sizeof(RF_ConfigSet_t),
3272 M_RAIDFRAME, M_WAITOK);
3273 cset->ac = ac;
3274 ac->next = NULL;
3275 cset->next = config_sets;
3276 cset->rootable = 0;
3277 config_sets = cset;
3278 }
3279 }
3280 ac = ac_next;
3281 }
3282
3283
3284 return(config_sets);
3285 }
3286
3287 static int
3288 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3289 {
3290 RF_ComponentLabel_t *clabel1, *clabel2;
3291
3292 /* If this one matches the *first* one in the set, that's good
3293 enough, since the other members of the set would have been
3294 through here too... */
3295 /* note that we are not checking partitionSize here..
3296
3297 Note that we are also not checking the mod_counters here.
3298 If everything else matches except the mod_counter, that's
3299 good enough for this test. We will deal with the mod_counters
3300 a little later in the autoconfiguration process.
3301
3302 (clabel1->mod_counter == clabel2->mod_counter) &&
3303
3304 The reason we don't check for this is that failed disks
3305 will have lower modification counts. If those disks are
3306 not added to the set they used to belong to, then they will
3307 form their own set, which may result in 2 different sets,
3308 for example, competing to be configured at raid0, and
3309 perhaps competing to be the root filesystem set. If the
3310 wrong ones get configured, or both attempt to become /,
3311 weird behaviour and/or serious lossage will occur. Thus we
3312 need to bring them into the fold here, and kick them out at
3313 a later point.
3314
3315 */
3316
3317 clabel1 = cset->ac->clabel;
3318 clabel2 = ac->clabel;
3319 if ((clabel1->version == clabel2->version) &&
3320 (clabel1->serial_number == clabel2->serial_number) &&
3321 (clabel1->num_rows == clabel2->num_rows) &&
3322 (clabel1->num_columns == clabel2->num_columns) &&
3323 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3324 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3325 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3326 (clabel1->parityConfig == clabel2->parityConfig) &&
3327 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3328 (clabel1->blockSize == clabel2->blockSize) &&
3329 rf_component_label_numblocks(clabel1) ==
3330 rf_component_label_numblocks(clabel2) &&
3331 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3332 (clabel1->root_partition == clabel2->root_partition) &&
3333 (clabel1->last_unit == clabel2->last_unit) &&
3334 (clabel1->config_order == clabel2->config_order)) {
3335 /* if it gets here, it almost *has* to be a match */
3336 } else {
3337 /* it's not consistent with somebody in the set..
3338 punt */
3339 return(0);
3340 }
3341 /* all was fine.. it must fit... */
3342 return(1);
3343 }
3344
3345 static int
3346 rf_have_enough_components(RF_ConfigSet_t *cset)
3347 {
3348 RF_AutoConfig_t *ac;
3349 RF_AutoConfig_t *auto_config;
3350 RF_ComponentLabel_t *clabel;
3351 int c;
3352 int num_cols;
3353 int num_missing;
3354 int mod_counter;
3355 int mod_counter_found;
3356 int even_pair_failed;
3357 char parity_type;
3358
3359
3360 /* check to see that we have enough 'live' components
3361 of this set. If so, we can configure it if necessary */
3362
3363 num_cols = cset->ac->clabel->num_columns;
3364 parity_type = cset->ac->clabel->parityConfig;
3365
3366 /* XXX Check for duplicate components!?!?!? */
3367
3368 /* Determine what the mod_counter is supposed to be for this set. */
3369
3370 mod_counter_found = 0;
3371 mod_counter = 0;
3372 ac = cset->ac;
3373 while(ac!=NULL) {
3374 if (mod_counter_found==0) {
3375 mod_counter = ac->clabel->mod_counter;
3376 mod_counter_found = 1;
3377 } else {
3378 if (ac->clabel->mod_counter > mod_counter) {
3379 mod_counter = ac->clabel->mod_counter;
3380 }
3381 }
3382 ac = ac->next;
3383 }
3384
3385 num_missing = 0;
3386 auto_config = cset->ac;
3387
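/* RAIDframe RAID 1 mirrors components in pairs (0,1), (2,3), ...;
 * the set is unusable only if both halves of some pair are missing,
 * which is what even_pair_failed tracks below. */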
3388 even_pair_failed = 0;
3389 for(c=0; c<num_cols; c++) {
3390 ac = auto_config;
3391 while(ac!=NULL) {
3392 if ((ac->clabel->column == c) &&
3393 (ac->clabel->mod_counter == mod_counter)) {
3394 /* it's this one... */
3395 #ifdef DEBUG
3396 printf("Found: %s at %d\n",
3397 ac->devname,c);
3398 #endif
3399 break;
3400 }
3401 ac=ac->next;
3402 }
3403 if (ac==NULL) {
3404 /* Didn't find one here! */
3405 /* special case for RAID 1, especially
3406 where there are more than 2
3407 components (where RAIDframe treats
3408 things a little differently :( ) */
3409 if (parity_type == '1') {
3410 if (c%2 == 0) { /* even component */
3411 even_pair_failed = 1;
3412 } else { /* odd component. If
3413 we're failed, and
3414 so is the even
3415 component, it's
3416 "Good Night, Charlie" */
3417 if (even_pair_failed == 1) {
3418 return(0);
3419 }
3420 }
3421 } else {
3422 /* normal accounting */
3423 num_missing++;
3424 }
3425 }
3426 if ((parity_type == '1') && (c%2 == 1)) {
3427 /* Just finished the odd half of a pair, and we didn't
3428  bail.. reset the even_pair_failed flag,
3429  and go on to the next pair.... */
3430 even_pair_failed = 0;
3431 }
3432 }
3433
3434 clabel = cset->ac->clabel;
3435
3436 if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
3437 ((clabel->parityConfig == '4') && (num_missing > 1)) ||
3438 ((clabel->parityConfig == '5') && (num_missing > 1))) {
3439 /* XXX this needs to be made *much* more general */
3440 /* Too many failures */
3441 return(0);
3442 }
3443 /* otherwise, all is well, and we've got enough to take a kick
3444 at autoconfiguring this set */
3445 return(1);
3446 }
3447
3448 static void
3449 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3450 RF_Raid_t *raidPtr)
3451 {
3452 RF_ComponentLabel_t *clabel;
3453 int i;
3454
3455 clabel = ac->clabel;
3456
3457 /* 1. Fill in the common stuff */
3458 config->numCol = clabel->num_columns;
3459 config->numSpare = 0; /* XXX should this be set here? */
3460 config->sectPerSU = clabel->sectPerSU;
3461 config->SUsPerPU = clabel->SUsPerPU;
3462 config->SUsPerRU = clabel->SUsPerRU;
3463 config->parityConfig = clabel->parityConfig;
3464 /* XXX... */
3465 strcpy(config->diskQueueType,"fifo");
3466 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3467 config->layoutSpecificSize = 0; /* XXX ?? */
3468
3469 while(ac!=NULL) {
3470 /* row/col values will be in range due to the checks
3471  in rf_reasonable_label() */
3472 strcpy(config->devnames[0][ac->clabel->column],
3473 ac->devname);
3474 ac = ac->next;
3475 }
3476
3477 for(i=0;i<RF_MAXDBGV;i++) {
3478 config->debugVars[i][0] = 0;
3479 }
3480 }
3481
3482 static int
3483 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3484 {
3485 RF_ComponentLabel_t *clabel;
3486 int column;
3487 int sparecol;
3488
3489 raidPtr->autoconfigure = new_value;
3490
3491 for(column=0; column<raidPtr->numCol; column++) {
3492 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3493 clabel = raidget_component_label(raidPtr, column);
3494 clabel->autoconfigure = new_value;
3495 raidflush_component_label(raidPtr, column);
3496 }
3497 }
3498 for(column = 0; column < raidPtr->numSpare ; column++) {
3499 sparecol = raidPtr->numCol + column;
3500 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3501 clabel = raidget_component_label(raidPtr, sparecol);
3502 clabel->autoconfigure = new_value;
3503 raidflush_component_label(raidPtr, sparecol);
3504 }
3505 }
3506 return(new_value);
3507 }
3508
3509 static int
3510 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3511 {
3512 RF_ComponentLabel_t *clabel;
3513 int column;
3514 int sparecol;
3515
3516 raidPtr->root_partition = new_value;
3517 for(column=0; column<raidPtr->numCol; column++) {
3518 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3519 clabel = raidget_component_label(raidPtr, column);
3520 clabel->root_partition = new_value;
3521 raidflush_component_label(raidPtr, column);
3522 }
3523 }
3524 for(column = 0; column < raidPtr->numSpare ; column++) {
3525 sparecol = raidPtr->numCol + column;
3526 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3527 clabel = raidget_component_label(raidPtr, sparecol);
3528 clabel->root_partition = new_value;
3529 raidflush_component_label(raidPtr, sparecol);
3530 }
3531 }
3532 return(new_value);
3533 }
3534
3535 static void
3536 rf_release_all_vps(RF_ConfigSet_t *cset)
3537 {
3538 RF_AutoConfig_t *ac;
3539
3540 ac = cset->ac;
3541 while(ac!=NULL) {
3542 /* Close the vp, and give it back */
3543 if (ac->vp) {
3544 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3545 VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3546 vput(ac->vp);
3547 ac->vp = NULL;
3548 }
3549 ac = ac->next;
3550 }
3551 }
3552
3553
3554 static void
3555 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3556 {
3557 RF_AutoConfig_t *ac;
3558 RF_AutoConfig_t *next_ac;
3559
3560 ac = cset->ac;
3561 while(ac!=NULL) {
3562 next_ac = ac->next;
3563 /* nuke the label */
3564 free(ac->clabel, M_RAIDFRAME);
3565 /* cleanup the config structure */
3566 free(ac, M_RAIDFRAME);
3567 /* "next.." */
3568 ac = next_ac;
3569 }
3570 /* and, finally, nuke the config set */
3571 free(cset, M_RAIDFRAME);
3572 }
3573
3574
3575 void
3576 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
3577 {
3578 /* avoid overwriting the byte-swapped version. */
3579 if (clabel->version != bswap32(RF_COMPONENT_LABEL_VERSION))
3580 clabel->version = RF_COMPONENT_LABEL_VERSION;
3581 clabel->serial_number = raidPtr->serial_number;
3582 clabel->mod_counter = raidPtr->mod_counter;
3583
3584 clabel->num_rows = 1;
3585 clabel->num_columns = raidPtr->numCol;
3586 clabel->clean = RF_RAID_DIRTY; /* not clean */
3587 clabel->status = rf_ds_optimal; /* "It's good!" */
3588
3589 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
3590 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
3591 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
3592
3593 clabel->blockSize = raidPtr->bytesPerSector;
3594 rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
3595
3596 /* XXX not portable */
3597 clabel->parityConfig = raidPtr->Layout.map->parityConfig;
3598 clabel->maxOutstanding = raidPtr->maxOutstanding;
3599 clabel->autoconfigure = raidPtr->autoconfigure;
3600 clabel->root_partition = raidPtr->root_partition;
3601 clabel->last_unit = raidPtr->raidid;
3602 clabel->config_order = raidPtr->config_order;
3603
3604 #ifndef RF_NO_PARITY_MAP
3605 rf_paritymap_init_label(raidPtr->parity_map, clabel);
3606 #endif
3607 }
3608
3609 static struct raid_softc *
3610 rf_auto_config_set(RF_ConfigSet_t *cset)
3611 {
3612 RF_Raid_t *raidPtr;
3613 RF_Config_t *config;
3614 int raidID;
3615 struct raid_softc *sc;
3616
3617 #ifdef DEBUG
3618 printf("RAID autoconfigure\n");
3619 #endif
3620
3621 /* 1. Create a config structure */
3622 config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);
3623
3624 /*
3625 2. Figure out what RAID ID this one is supposed to live at
3626 See if we can get the same RAID dev that it was configured
3627 on last time..
3628 */
3629
3630 raidID = cset->ac->clabel->last_unit;
3631 for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
3632 sc = raidget(++raidID, false))
3633 continue;
3634 #ifdef DEBUG
3635 printf("Configuring raid%d:\n",raidID);
3636 #endif
3637
3638 if (sc == NULL)
3639 sc = raidget(raidID, true);
3640 raidPtr = &sc->sc_r;
3641
3642 /* XXX all this stuff should be done SOMEWHERE ELSE! */
3643 raidPtr->softc = sc;
3644 raidPtr->raidid = raidID;
3645 raidPtr->openings = RAIDOUTSTANDING;
3646
3647 /* 3. Build the configuration structure */
3648 rf_create_configuration(cset->ac, config, raidPtr);
3649
3650 /* 4. Do the configuration */
3651 if (rf_Configure(raidPtr, config, cset->ac) == 0) {
3652 raidinit(sc);
3653
3654 rf_markalldirty(raidPtr);
3655 raidPtr->autoconfigure = 1; /* XXX do this here? */
3656 switch (cset->ac->clabel->root_partition) {
3657 case 1: /* Force Root */
3658 case 2: /* Soft Root: root when the boot partition is part of the raid */
3659 /*
3660 * everything configured just fine. Make a note
3661 * that this set is eligible to be root,
3662 * or forced to be root
3663 */
3664 cset->rootable = cset->ac->clabel->root_partition;
3665 /* XXX do this here? */
3666 raidPtr->root_partition = cset->rootable;
3667 break;
3668 default:
3669 break;
3670 }
3671 } else {
3672 raidput(sc);
3673 sc = NULL;
3674 }
3675
3676 /* 5. Cleanup */
3677 free(config, M_RAIDFRAME);
3678 return sc;
3679 }
3680
3681 void
3682 rf_pool_init(RF_Raid_t *raidPtr, char *w_chan, struct pool *p, size_t size, const char *pool_name,
3683 size_t xmin, size_t xmax)
3684 {
3685
3686 /* Format: raid%d_foo */
3687 snprintf(w_chan, RF_MAX_POOLNAMELEN, "raid%d_%s", raidPtr->raidid, pool_name);
3688
3689 pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
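/* cache at most xmax idle items, and preallocate xmin items */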
3690 pool_sethiwat(p, xmax);
3691 pool_prime(p, xmin);
3692 }
3693
3694
3695 /*
3696 * rf_buf_queue_check(RF_Raid_t *raidPtr) -- looks into the buffer queue
3697 * to see if there is IO pending and if that IO could possibly be done
3698 * for a given RAID set. Returns 0 if IO is waiting and can be done, 1
3699 * otherwise.
3700 *
3701 */
3702 int
3703 rf_buf_queue_check(RF_Raid_t *raidPtr)
3704 {
3705 struct raid_softc *rs;
3706 struct dk_softc *dksc;
3707
3708 rs = raidPtr->softc;
3709 dksc = &rs->sc_dksc;
3710
3711 if ((rs->sc_flags & RAIDF_INITED) == 0)
3712 return 1;
3713
3714 if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3715 /* there is work to do */
3716 return 0;
3717 }
3718 /* default is nothing to do */
3719 return 1;
3720 }
3721
3722 int
3723 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3724 {
3725 uint64_t numsecs;
3726 unsigned secsize;
3727 int error;
3728
3729 error = getdisksize(vp, &numsecs, &secsize);
3730 if (error == 0) {
3731 diskPtr->blockSize = secsize;
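/* the first rf_protectedSectors sectors of each component are
 * reserved for RAIDframe metadata and excluded from the usable
 * size */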
3732 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3733 diskPtr->partitionSize = numsecs;
3734 return 0;
3735 }
3736 return error;
3737 }
3738
3739 static int
3740 raid_match(device_t self, cfdata_t cfdata, void *aux)
3741 {
3742 return 1;
3743 }
3744
3745 static void
3746 raid_attach(device_t parent, device_t self, void *aux)
3747 {
3748 }
3749
3750
3751 static int
3752 raid_detach(device_t self, int flags)
3753 {
3754 int error;
3755 struct raid_softc *rs = raidsoftc(self);
3756
3757 if (rs == NULL)
3758 return ENXIO;
3759
3760 if ((error = raidlock(rs)) != 0)
3761 return error;
3762
3763 error = raid_detach_unlocked(rs);
3764
3765 raidunlock(rs);
3766
3767 /* XXX raid can be referenced here */
3768
3769 if (error)
3770 return error;
3771
3772 /* Free the softc */
3773 raidput(rs);
3774
3775 return 0;
3776 }
3777
3778 static void
3779 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
3780 {
3781 struct dk_softc *dksc = &rs->sc_dksc;
3782 struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
3783
3784 memset(dg, 0, sizeof(*dg));
3785
3786 dg->dg_secperunit = raidPtr->totalSectors;
3787 dg->dg_secsize = raidPtr->bytesPerSector;
3788 dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
3789 dg->dg_ntracks = 4 * raidPtr->numCol;
3790
3791 disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
3792 }
3793
3794 /*
3795 * Get cache info for all the components (including spares).
3796 * Returns intersection of all the cache flags of all disks, or first
3797 * error if any encountered.
3798 * XXXfua feature flags can change as spares are added - lock down somehow
3799 */
3800 static int
3801 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
3802 {
3803 int c;
3804 int error;
3805 int dkwhole = 0, dkpart;
3806
3807 for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
3808 /*
3809 * Check any non-dead disk, even when currently being
3810 * reconstructed.
3811 */
3812 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
3813 || raidPtr->Disks[c].status == rf_ds_reconstructing) {
3814 error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
3815 DIOCGCACHE, &dkpart, FREAD, NOCRED);
3816 if (error) {
3817 if (error != ENODEV) {
3818 printf("raid%d: get cache for component %s failed\n",
3819 raidPtr->raidid,
3820 raidPtr->Disks[c].devname);
3821 }
3822
3823 return error;
3824 }
3825
3826 if (c == 0)
3827 dkwhole = dkpart;
3828 else
3829 dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
3830 }
3831 }
3832
3833 *data = dkwhole;
3834
3835 return 0;
3836 }
3837
3838 /*
3839 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3840 * We end up returning whatever error was returned by the first cache flush
3841 * that fails.
3842 */
3843
3844 static int
3845 rf_sync_component_cache(RF_Raid_t *raidPtr, int c, int force)
3846 {
3847 int e = 0;
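/* retry the cache flush a few times before reporting failure */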
3848 for (int i = 0; i < 5; i++) {
3849 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3850 &force, FWRITE, NOCRED);
3851 if (!e || e == ENODEV)
3852 return e;
3853 printf("raid%d: cache flush[%d] to component %s failed (%d)\n",
3854 raidPtr->raidid, i, raidPtr->Disks[c].devname, e);
3855 }
3856 return e;
3857 }
3858
3859 int
3860 rf_sync_component_caches(RF_Raid_t *raidPtr, int force)
3861 {
3862 int c, error;
3863
3864 error = 0;
3865 for (c = 0; c < raidPtr->numCol; c++) {
3866 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3867 int e = rf_sync_component_cache(raidPtr, c, force);
3868 if (e && !error)
3869 error = e;
3870 }
3871 }
3872
3873 for (c = 0; c < raidPtr->numSpare ; c++) {
3874 int sparecol = raidPtr->numCol + c;
3875 /* Need to ensure that the reconstruct actually completed! */
3876 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3877 int e = rf_sync_component_cache(raidPtr, sparecol,
3878 force);
3879 if (e && !error)
3880 error = e;
3881 }
3882 }
3883 return error;
3884 }
3885
3886 /* Fill in info with the current status */
3887 void
3888 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3889 {
3890
3891 memset(info, 0, sizeof(*info));
3892
3893 if (raidPtr->status != rf_rs_reconstructing) {
3894 info->total = 100;
3895 info->completed = 100;
3896 } else {
3897 info->total = raidPtr->reconControl->numRUsTotal;
3898 info->completed = raidPtr->reconControl->numRUsComplete;
3899 }
3900 info->remaining = info->total - info->completed;
3901 }
3902
3903 /* Fill in info with the current status */
3904 void
3905 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3906 {
3907
3908 memset(info, 0, sizeof(*info));
3909
3910 if (raidPtr->parity_rewrite_in_progress == 1) {
3911 info->total = raidPtr->Layout.numStripe;
3912 info->completed = raidPtr->parity_rewrite_stripes_done;
3913 } else {
3914 info->completed = 100;
3915 info->total = 100;
3916 }
3917 info->remaining = info->total - info->completed;
3918 }
3919
3920 /* Fill in info with the current status */
3921 void
3922 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3923 {
3924
3925 memset(info, 0, sizeof(*info));
3926
3927 if (raidPtr->copyback_in_progress == 1) {
3928 info->total = raidPtr->Layout.numStripe;
3929 info->completed = raidPtr->copyback_stripes_done;
3930 info->remaining = info->total - info->completed;
3931 } else {
3932 info->remaining = 0;
3933 info->completed = 100;
3934 info->total = 100;
3935 }
3936 }
3937
3938 /* Fill in config with the current info */
int
rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
{
	int d, i, j;

	if (!raidPtr->valid)
		return ENODEV;
	config->cols = raidPtr->numCol;
	config->ndevs = raidPtr->numCol;
	if (config->ndevs >= RF_MAX_DISKS)
		return ENOMEM;
	config->nspares = raidPtr->numSpare;
	if (config->nspares >= RF_MAX_DISKS)
		return ENOMEM;
	config->maxqdepth = raidPtr->maxQueueDepth;
	d = 0;
	for (j = 0; j < config->cols; j++) {
		config->devs[d] = raidPtr->Disks[j];
		d++;
	}
	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
		config->spares[i] = raidPtr->Disks[j];
		if (config->spares[i].status == rf_ds_rebuilding_spare) {
			/* XXX: raidctl(8) expects to see this as a used spare */
			config->spares[i].status = rf_ds_used_spare;
		}
	}
	return 0;
}
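
/*
 * Hypothetical userland sketch of the call above.  Note the historic
 * RAIDFRAME_GET_INFO interface: the ioctl argument has traditionally been
 * a pointer to a pointer to the (large) RF_DeviceConfig_t, not the
 * structure itself, so the caller passes the address of a pointer:
 *
 *	RF_DeviceConfig_t *cfg = malloc(sizeof(*cfg));
 *	void *cfg_ptr = cfg;
 *	if (ioctl(fd, RAIDFRAME_GET_INFO, &cfg_ptr) == 0)
 *		printf("%d columns, %d spares\n", cfg->cols, cfg->nspares);
 */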

int
rf_get_component_label(RF_Raid_t *raidPtr, void *data)
{
	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
	RF_ComponentLabel_t *raid_clabel;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
		return EINVAL;
	raid_clabel = raidget_component_label(raidPtr, column);
	memcpy(clabel, raid_clabel, sizeof *clabel);
	/*
	 * Fix-up for userland: a label from a component written on an
	 * opposite-endian host is kept byte-swapped in the kernel, so
	 * present the version field in native byte order.
	 */
	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION))
		clabel->version = RF_COMPONENT_LABEL_VERSION;

	return 0;
}

/*
 * Module interface
 */

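/*
 * The third argument to MODULE() names the modules this one requires:
 * raid needs the disk driver subroutines and the FCFS buffer queue
 * strategy to be present before it can initialize.
 */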
MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");

#ifdef _MODULE
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);

static int
raid_modcmd(modcmd_t cmd, void *data)
{
	int error;

	error = 0;
	switch (cmd) {
	case MODULE_CMD_INIT:
		error = raid_modcmd_init();
		break;
	case MODULE_CMD_FINI:
		error = raid_modcmd_fini();
		break;
	default:
		error = ENOTTY;
		break;
	}
	return error;
}

static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

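	/*
	 * Register in three steps: the device switch, then (for a loadable
	 * module) the cfdriver, then the cfattach.  Each failure path below
	 * unwinds the earlier registrations in reverse order.
	 */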
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}

static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist. */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

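	/*
	 * Tear down in the reverse order of raid_modcmd_init(): cfattach,
	 * then cfdriver, then the device switch.  If detaching the
	 * cfdriver fails, re-attach the cfattach so the module is left
	 * in a consistent state.
	 */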
	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n", __func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n", __func__);
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	devsw_detach(&raid_bdevsw, &raid_cdevsw);
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}