1 /* $NetBSD: rf_netbsdkintf.c,v 1.417 2023/10/09 21:55:48 oster Exp $ */
2
3 /*-
4 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Greg Oster; Jason R. Thorpe.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1988 University of Utah.
34 * Copyright (c) 1990, 1993
35 * The Regents of the University of California. All rights reserved.
36 *
37 * This code is derived from software contributed to Berkeley by
38 * the Systems Programming Group of the University of Utah Computer
39 * Science Department.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * from: Utah $Hdr: cd.c 1.6 90/11/28$
66 *
67 * @(#)cd.c 8.2 (Berkeley) 11/16/93
68 */
69
70 /*
71 * Copyright (c) 1995 Carnegie-Mellon University.
72 * All rights reserved.
73 *
74 * Authors: Mark Holland, Jim Zelenka
75 *
76 * Permission to use, copy, modify and distribute this software and
77 * its documentation is hereby granted, provided that both the copyright
78 * notice and this permission notice appear in all copies of the
79 * software, derivative works or modified versions, and any portions
80 * thereof, and that both notices appear in supporting documentation.
81 *
82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85 *
86 * Carnegie Mellon requests users of this software to return to
87 *
88 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
89 * School of Computer Science
90 * Carnegie Mellon University
91 * Pittsburgh PA 15213-3890
92 *
93 * any improvements or extensions that they make and grant Carnegie the
94 * rights to redistribute these changes.
95 */
96
97 /***********************************************************
98 *
99 * rf_kintf.c -- the kernel interface routines for RAIDframe
100 *
101 ***********************************************************/
102
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.417 2023/10/09 21:55:48 oster Exp $");
105
106 #ifdef _KERNEL_OPT
107 #include "opt_raid_autoconfig.h"
108 #include "opt_compat_netbsd32.h"
109 #endif
110
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129 #include <sys/module.h>
130 #include <sys/compat_stub.h>
131
132 #include <prop/proplib.h>
133
134 #include <dev/raidframe/raidframevar.h>
135 #include <dev/raidframe/raidframeio.h>
136 #include <dev/raidframe/rf_paritymap.h>
137
138 #include "rf_raid.h"
139 #include "rf_dag.h"
140 #include "rf_dagflags.h"
141 #include "rf_desc.h"
142 #include "rf_diskqueue.h"
143 #include "rf_etimer.h"
144 #include "rf_general.h"
145 #include "rf_kintf.h"
146 #include "rf_options.h"
147 #include "rf_driver.h"
148 #include "rf_parityscan.h"
149 #include "rf_threadstuff.h"
150
151 #include "ioconf.h"
152
153 #ifdef DEBUG
154 int rf_kdebug_level = 0;
155 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
156 #else /* DEBUG */
157 #define db1_printf(a) { }
158 #endif /* DEBUG */
159
160 #define DEVICE_XNAME(dev) ((dev) ? device_xname(dev) : "null")
161
162 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
163 static rf_declare_mutex2(rf_sparet_wait_mutex);
164 static rf_declare_cond2(rf_sparet_wait_cv);
165 static rf_declare_cond2(rf_sparet_resp_cv);
166
167 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a
168 * spare table */
169 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from
170 * installation process */
171 #endif
172
173 const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);
174
175 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
176
177 /* prototypes */
178 static void KernelWakeupFunc(struct buf *);
179 static void InitBP(struct buf *, struct vnode *, unsigned,
180 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
181 void *, int);
182 static void raidinit(struct raid_softc *);
183 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
184 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
185
186 static int raid_match(device_t, cfdata_t, void *);
187 static void raid_attach(device_t, device_t, void *);
188 static int raid_detach(device_t, int);
189
190 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
191 daddr_t, daddr_t);
192 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
193 daddr_t, daddr_t);
194
195 static int raidwrite_component_label(unsigned,
196 dev_t, struct vnode *, RF_ComponentLabel_t *);
197 static int raidread_component_label(unsigned,
198 dev_t, struct vnode *, RF_ComponentLabel_t *);
199
200 static int raid_diskstart(device_t, struct buf *bp);
201 static int raid_dumpblocks(device_t, void *, daddr_t, int);
202 static int raid_lastclose(device_t);
203
204 static dev_type_open(raidopen);
205 static dev_type_close(raidclose);
206 static dev_type_read(raidread);
207 static dev_type_write(raidwrite);
208 static dev_type_ioctl(raidioctl);
209 static dev_type_strategy(raidstrategy);
210 static dev_type_dump(raiddump);
211 static dev_type_size(raidsize);
212
213 const struct bdevsw raid_bdevsw = {
214 .d_open = raidopen,
215 .d_close = raidclose,
216 .d_strategy = raidstrategy,
217 .d_ioctl = raidioctl,
218 .d_dump = raiddump,
219 .d_psize = raidsize,
220 .d_discard = nodiscard,
221 .d_flag = D_DISK
222 };
223
224 const struct cdevsw raid_cdevsw = {
225 .d_open = raidopen,
226 .d_close = raidclose,
227 .d_read = raidread,
228 .d_write = raidwrite,
229 .d_ioctl = raidioctl,
230 .d_stop = nostop,
231 .d_tty = notty,
232 .d_poll = nopoll,
233 .d_mmap = nommap,
234 .d_kqfilter = nokqfilter,
235 .d_discard = nodiscard,
236 .d_flag = D_DISK
237 };
238
239 static struct dkdriver rf_dkdriver = {
240 .d_open = raidopen,
241 .d_close = raidclose,
242 .d_strategy = raidstrategy,
243 .d_diskstart = raid_diskstart,
244 .d_dumpblocks = raid_dumpblocks,
245 .d_lastclose = raid_lastclose,
246 .d_minphys = minphys
247 };
248
249 #define raidunit(x) DISKUNIT(x)
250 #define raidsoftc(dev) (((struct raid_softc *)device_private(dev))->sc_r.softc)
251
252 extern struct cfdriver raid_cd;
253 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
254 raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
255 DVF_DETACH_SHUTDOWN);
256
257 /* Internal representation of a rf_recon_req */
258 struct rf_recon_req_internal {
259 RF_RowCol_t col;
260 RF_ReconReqFlags_t flags;
261 void *raidPtr;
262 };
263
264 /*
265 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
266 * Be aware that large numbers can allow the driver to consume a lot of
267 * kernel memory, especially on writes, and in degraded mode reads.
268 *
269 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
270 * a single 64K write will typically require 64K for the old data,
271 * 64K for the old parity, and 64K for the new parity, for a total
272 * of 192K (if the parity buffer is not re-used immediately).
273 * Even if it is used immediately, that's still 128K, which when multiplied
274 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
275 *
276 * Now in degraded mode, for example, a 64K read on the above setup may
277 * require data reconstruction, which will require *all* of the 4 remaining
278 * disks to participate -- 4 * 32K/disk == 128K again.
279 */
280
281 #ifndef RAIDOUTSTANDING
282 #define RAIDOUTSTANDING 6
283 #endif
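/*
 * Editor's note: a minimal sketch of the worst-case memory arithmetic
 * described above, using the example figures from the comment.  It is
 * never compiled (and is written as if inside a function); the numbers
 * are illustrative assumptions.
 */
#if 0
	const size_t write_kb = 64;		/* one 64K write */
	const size_t extra_kb = 2 * write_kb;	/* old data + re-used parity buffer */
	const size_t nreq = 10;			/* "say 10 requests" */

	size_t scratch_kb = nreq * extra_kb;	/* 10 * 128K == 1280K of scratch */
	size_t incoming_kb = nreq * write_kb;	/* on top of 640K of incoming data */
#endif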
284
285 #define RAIDLABELDEV(dev) \
286 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
287
288 /* declared here, and made public, for the benefit of KVM stuff.. */
289
290 static int raidlock(struct raid_softc *);
291 static void raidunlock(struct raid_softc *);
292
293 static int raid_detach_unlocked(struct raid_softc *);
294
295 static void rf_markalldirty(RF_Raid_t *);
296 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
297
298 static void rf_ReconThread(struct rf_recon_req_internal *);
299 static void rf_RewriteParityThread(RF_Raid_t *raidPtr);
300 static void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
301 static int rf_autoconfig(device_t);
302 static int rf_rescan(void);
303 static void rf_buildroothack(RF_ConfigSet_t *);
304
305 static RF_AutoConfig_t *rf_find_raid_components(void);
306 static RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
307 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
308 static void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
309 static int rf_set_autoconfig(RF_Raid_t *, int);
310 static int rf_set_rootpartition(RF_Raid_t *, int);
311 static void rf_release_all_vps(RF_ConfigSet_t *);
312 static void rf_cleanup_config_set(RF_ConfigSet_t *);
313 static int rf_have_enough_components(RF_ConfigSet_t *);
314 static struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
315 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
316
317 /*
318 * Debugging, mostly. Set to 0 to not allow autoconfig to take place.
319 * Note that this is overridden by having RAID_AUTOCONFIG as an option
320 * in the kernel config file.
321 */
322 #ifdef RAID_AUTOCONFIG
323 int raidautoconfig = 1;
324 #else
325 int raidautoconfig = 0;
326 #endif
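/*
 * Editor's note: the RAID_AUTOCONFIG option referred to above is enabled
 * from the kernel configuration file, e.g.:
 *
 *	options 	RAID_AUTOCONFIG
 */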
327 static bool raidautoconfigdone = false;
328
329 struct pool rf_alloclist_pool; /* AllocList */
330
331 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
332 static kmutex_t raid_lock;
333
334 static struct raid_softc *
335 raidcreate(int unit) {
336 struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
337 sc->sc_unit = unit;
338 cv_init(&sc->sc_cv, "raidunit");
339 mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
340 return sc;
341 }
342
343 static void
344 raiddestroy(struct raid_softc *sc) {
345 cv_destroy(&sc->sc_cv);
346 mutex_destroy(&sc->sc_mutex);
347 kmem_free(sc, sizeof(*sc));
348 }
349
350 static struct raid_softc *
351 raidget(int unit, bool create) {
352 struct raid_softc *sc;
353 if (unit < 0) {
354 #ifdef DIAGNOSTIC
355 panic("%s: unit %d!", __func__, unit);
356 #endif
357 return NULL;
358 }
359 mutex_enter(&raid_lock);
360 LIST_FOREACH(sc, &raids, sc_link) {
361 if (sc->sc_unit == unit) {
362 mutex_exit(&raid_lock);
363 return sc;
364 }
365 }
366 mutex_exit(&raid_lock);
367 if (!create)
368 return NULL;
369 sc = raidcreate(unit);
370 mutex_enter(&raid_lock);
371 LIST_INSERT_HEAD(&raids, sc, sc_link);
372 mutex_exit(&raid_lock);
373 return sc;
374 }
375
376 static void
377 raidput(struct raid_softc *sc) {
378 mutex_enter(&raid_lock);
379 LIST_REMOVE(sc, sc_link);
380 mutex_exit(&raid_lock);
381 raiddestroy(sc);
382 }
383
384 void
385 raidattach(int num)
386 {
387
388 /*
389 * Device attachment and associated initialization now occurs
390 * as part of the module initialization.
391 */
392 }
393
394 static int
395 rf_autoconfig(device_t self)
396 {
397 RF_AutoConfig_t *ac_list;
398 RF_ConfigSet_t *config_sets;
399
400 if (!raidautoconfig || raidautoconfigdone == true)
401 return 0;
402
403 /* XXX This code can only be run once. */
404 raidautoconfigdone = true;
405
406 #ifdef __HAVE_CPU_BOOTCONF
407 /*
408 * 0. find the boot device first, if needed, so we can use it later.
409 * This needs to be done before we autoconfigure any raid sets,
410 * because if we use wedges we are not going to be able to open
411 * the boot device later
412 */
413 if (booted_device == NULL)
414 cpu_bootconf();
415 #endif
416 /* 1. locate all RAID components on the system */
417 aprint_debug("Searching for RAID components...\n");
418 ac_list = rf_find_raid_components();
419
420 /* 2. Sort them into their respective sets. */
421 config_sets = rf_create_auto_sets(ac_list);
422
423 /*
424 * 3. Evaluate each set and configure the valid ones.
425 * This gets done in rf_buildroothack().
426 */
427 rf_buildroothack(config_sets);
428
429 return 1;
430 }
431
432 int
433 rf_inited(const struct raid_softc *rs) {
434 return (rs->sc_flags & RAIDF_INITED) != 0;
435 }
436
437 RF_Raid_t *
438 rf_get_raid(struct raid_softc *rs) {
439 return &rs->sc_r;
440 }
441
442 int
443 rf_get_unit(const struct raid_softc *rs) {
444 return rs->sc_unit;
445 }
446
447 static int
448 rf_containsboot(RF_Raid_t *r, device_t bdv) {
449 const char *bootname;
450 size_t len;
451
452 /* if bdv is NULL, the set can't contain it. exit early. */
453 if (bdv == NULL)
454 return 0;
455
456 bootname = device_xname(bdv);
457 len = strlen(bootname);
458
459 for (int col = 0; col < r->numCol; col++) {
460 const char *devname = r->Disks[col].devname;
461 devname += sizeof("/dev/") - 1;
462 if (strncmp(devname, "dk", 2) == 0) {
463 const char *parent =
464 dkwedge_get_parent_name(r->Disks[col].dev);
465 if (parent != NULL)
466 devname = parent;
467 }
468 if (strncmp(devname, bootname, len) == 0) {
469 struct raid_softc *sc = r->softc;
470 aprint_debug("raid%d includes boot device %s\n",
471 sc->sc_unit, devname);
472 return 1;
473 }
474 }
475 return 0;
476 }
477
478 static int
479 rf_rescan(void)
480 {
481 RF_AutoConfig_t *ac_list;
482 RF_ConfigSet_t *config_sets, *cset, *next_cset;
483 struct raid_softc *sc;
484 int raid_added;
485
486 ac_list = rf_find_raid_components();
487 config_sets = rf_create_auto_sets(ac_list);
488
489 raid_added = 1;
490 while (raid_added > 0) {
491 raid_added = 0;
492 cset = config_sets;
493 while (cset != NULL) {
494 next_cset = cset->next;
495 if (rf_have_enough_components(cset) &&
496 cset->ac->clabel->autoconfigure == 1) {
497 sc = rf_auto_config_set(cset);
498 if (sc != NULL) {
499 aprint_debug("raid%d: configured ok, rootable %d\n",
500 sc->sc_unit, cset->rootable);
501 /* We added one RAID set */
502 raid_added++;
503 } else {
504 /* The autoconfig didn't work :( */
505 aprint_debug("Autoconfig failed\n");
506 rf_release_all_vps(cset);
507 }
508 } else {
509 /* we're not autoconfiguring this set...
510 release the associated resources */
511 rf_release_all_vps(cset);
512 }
513 /* cleanup */
514 rf_cleanup_config_set(cset);
515 cset = next_cset;
516 }
517 if (raid_added > 0) {
518 /* We added at least one RAID set, so re-scan for recursive RAID */
519 ac_list = rf_find_raid_components();
520 config_sets = rf_create_auto_sets(ac_list);
521 }
522 }
523
524 return 0;
525 }
526
527 /*
528 * Example setup:
529 * dk1 at wd0: "raid@wd0", 171965 blocks at 32802, type: raidframe
530 * dk3 at wd1: "raid@wd1", 171965 blocks at 32802, type: raidframe
531 * raid1: Components: /dev/dk1 /dev/dk3
532 * dk4 at raid1: "empty@raid1", 8192 blocks at 34, type: msdos
533 * dk5 at raid1: "root@raid1", 163517 blocks at 8226, type: ffs
534 *
535 * If booted from wd0, booted_device will be
536 * disk wd0, startblk = 41092, nblks = 163517
537 *
538 * That is, dk5 with startblk computed from the beginning of wd0
539 * instead of beginning of raid1:
540 * 32802 + 64 (RF_PROTECTED_SECTORS) + 8226 = 41092
541 *
542 * In order to find the boot wedge, we must iterate over each component,
543 * find its offset from the beginning of the disk, and look for the boot
544 * wedge with startblk adjusted.
545 */
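/*
 * Editor's note: a minimal sketch of the offset translation described
 * above, using the example numbers from the comment.  It is never
 * compiled (and is written as if inside a function); the values are
 * illustrative assumptions.
 */
#if 0
	daddr_t component_offset = 32802;	/* dk1's offset into wd0 */
	daddr_t wedge_offset = 8226;		/* root@raid1's offset into raid1 */

	/* what the bootloader reports: 32802 + 64 + 8226 == 41092 */
	daddr_t booted_start = component_offset + RF_PROTECTED_SECTORS +
	    wedge_offset;

	/* the inverse, as computed in rf_find_bootwedge() below */
	daddr_t startblk = booted_start - component_offset -
	    RF_PROTECTED_SECTORS;		/* == 8226 again */
#endif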
546 static device_t
547 rf_find_bootwedge(struct raid_softc *rsc)
548 {
549 RF_Raid_t *r = &rsc->sc_r;
550 const char *bootname;
551 size_t len;
552 device_t rdev = NULL;
553
554 if (booted_device == NULL)
555 goto out;
556
557 bootname = device_xname(booted_device);
558 len = strlen(bootname);
559
560 aprint_debug("%s: booted_device %s, startblk = %"PRId64", "
561 "nblks = %"PRId64"\n", __func__,
562 bootname, booted_startblk, booted_nblks);
563
564 for (int col = 0; col < r->numCol; col++) {
565 const char *devname = r->Disks[col].devname;
566 const char *parent;
567 struct disk *dk;
568 u_int nwedges;
569 struct dkwedge_info *dkwi;
570 struct dkwedge_list dkwl;
571 size_t dkwi_len;
572 int i;
573
574 devname += sizeof("/dev/") - 1;
575 if (strncmp(devname, "dk", 2) != 0)
576 continue;
577
578 parent = dkwedge_get_parent_name(r->Disks[col].dev);
579 if (parent == NULL) {
580 aprint_debug("%s: cannot find parent for "
581 "component /dev/%s\n", __func__, devname);
582 continue;
583 }
584
585 if (strncmp(parent, bootname, len) != 0)
586 continue;
587
588 aprint_debug("%s: looking up wedge %s in device %s\n",
589 __func__, devname, parent);
590
591 dk = disk_find(parent);
592 nwedges = dk->dk_nwedges;
593 dkwi_len = sizeof(*dkwi) * nwedges;
594 dkwi = RF_Malloc(dkwi_len);
595
596 dkwl.dkwl_buf = dkwi;
597 dkwl.dkwl_bufsize = dkwi_len;
598 dkwl.dkwl_nwedges = 0;
599 dkwl.dkwl_ncopied = 0;
600
601 if (dkwedge_list(dk, &dkwl, curlwp) == 0) {
602 daddr_t startblk;
603
604 for (i = 0; i < dkwl.dkwl_ncopied; i++) {
605 if (strcmp(dkwi[i].dkw_devname, devname) == 0)
606 break;
607 }
608
609 KASSERT(i < dkwl.dkwl_ncopied);
610
611 aprint_debug("%s: wedge %s, "
612 "startblk = %"PRId64", "
613 "nblks = %"PRId64"\n",
614 __func__,
615 dkwi[i].dkw_devname,
616 dkwi[i].dkw_offset,
617 dkwi[i].dkw_size);
618
619 startblk = booted_startblk
620 - dkwi[i].dkw_offset
621 - RF_PROTECTED_SECTORS;
622
623 aprint_debug("%s: looking for wedge in %s, "
624 "startblk = %"PRId64", "
625 "nblks = %"PRId64"\n",
626 __func__,
627 DEVICE_XNAME(rsc->sc_dksc.sc_dev),
628 startblk, booted_nblks);
629
630 rdev = dkwedge_find_partition(rsc->sc_dksc.sc_dev,
631 startblk,
632 booted_nblks);
633 if (rdev) {
634 aprint_debug("%s: root candidate wedge %s "
635 "shifted from %s\n", __func__,
636 device_xname(rdev),
637 dkwi[i].dkw_devname);
638 goto done;
639 } else {
640 aprint_debug("%s: not found\n", __func__);
641 }
642 }
643
644 aprint_debug("%s: nothing found for col %d\n", __func__, col);
645 done:
646 RF_Free(dkwi, dkwi_len);
647 }
648
649 out:
650 if (!rdev)
651 aprint_debug("%s: nothing found\n", __func__);
652
653 return rdev;
654 }
655
656 static void
657 rf_buildroothack(RF_ConfigSet_t *config_sets)
658 {
659 RF_AutoConfig_t *ac_list;
660 RF_ConfigSet_t *cset;
661 RF_ConfigSet_t *next_cset;
662 int num_root;
663 int raid_added;
664 struct raid_softc *sc, *rsc;
665 struct dk_softc *dksc = NULL; /* XXX gcc -Os: may be used uninit. */
666
667 sc = rsc = NULL;
668 num_root = 0;
669
670 raid_added = 1;
671 while (raid_added > 0) {
672 raid_added = 0;
673 cset = config_sets;
674 while (cset != NULL) {
675 next_cset = cset->next;
676 if (rf_have_enough_components(cset) &&
677 cset->ac->clabel->autoconfigure == 1) {
678 sc = rf_auto_config_set(cset);
679 if (sc != NULL) {
680 aprint_debug("raid%d: configured ok, rootable %d\n",
681 sc->sc_unit, cset->rootable);
682 /* We added one RAID set */
683 raid_added++;
684 if (cset->rootable) {
685 rsc = sc;
686 num_root++;
687 }
688 } else {
689 /* The autoconfig didn't work :( */
690 aprint_debug("Autoconfig failed\n");
691 rf_release_all_vps(cset);
692 }
693 } else {
694 /* we're not autoconfiguring this set...
695 release the associated resources */
696 rf_release_all_vps(cset);
697 }
698 /* cleanup */
699 rf_cleanup_config_set(cset);
700 cset = next_cset;
701 }
702 if (raid_added > 0) {
703 /* We added at least one RAID set, so re-scan for recursive RAID */
704 ac_list = rf_find_raid_components();
705 config_sets = rf_create_auto_sets(ac_list);
706 }
707 }
708
709 /* if the user has specified what the root device should be
710 then we don't touch booted_device or boothowto... */
711
712 if (rootspec != NULL) {
713 aprint_debug("%s: rootspec %s\n", __func__, rootspec);
714 return;
715 }
716
717 /* we found something bootable... */
718 if (num_root == 1) {
719 device_t candidate_root = NULL;
720 dksc = &rsc->sc_dksc;
721
722 if (dksc->sc_dkdev.dk_nwedges != 0) {
723
724 /* Find the wedge we booted from */
725 candidate_root = rf_find_bootwedge(rsc);
726
727 /* Try first partition */
728 if (candidate_root == NULL) {
729 size_t i = 0;
730 candidate_root = dkwedge_find_by_parent(
731 device_xname(dksc->sc_dev), &i);
732 }
733 aprint_debug("%s: candidate wedge root %s\n",
734 __func__, DEVICE_XNAME(candidate_root));
735 } else {
736 candidate_root = dksc->sc_dev;
737 }
738
739 aprint_debug("%s: candidate root = %s, booted_device = %s, "
740 "root_partition = %d, contains_boot=%d\n",
741 __func__, DEVICE_XNAME(candidate_root),
742 DEVICE_XNAME(booted_device), rsc->sc_r.root_partition,
743 rf_containsboot(&rsc->sc_r, booted_device));
744
745 /* XXX the check for booted_device == NULL can probably be
746 * dropped, now that rf_containsboot handles that case.
747 */
748 if (booted_device == NULL ||
749 rsc->sc_r.root_partition == 1 ||
750 rf_containsboot(&rsc->sc_r, booted_device)) {
751 booted_device = candidate_root;
752 booted_method = "raidframe/single";
753 booted_partition = 0; /* XXX assume 'a' */
754 aprint_debug("%s: set booted_device = %s\n", __func__,
755 DEVICE_XNAME(booted_device));
756 }
757 } else if (num_root > 1) {
758 aprint_debug("%s: many roots=%d, %s\n", __func__, num_root,
759 DEVICE_XNAME(booted_device));
760
761 /*
762 * Maybe the MD code can help. If it cannot, then
763 * setroot() will discover that we have no
764 * booted_device and will ask the user if nothing was
765 * hardwired in the kernel config file
766 */
767 if (booted_device == NULL)
768 return;
769
770 num_root = 0;
771 mutex_enter(&raid_lock);
772 LIST_FOREACH(sc, &raids, sc_link) {
773 RF_Raid_t *r = &sc->sc_r;
774 if (r->valid == 0)
775 continue;
776
777 if (r->root_partition == 0)
778 continue;
779
780 if (rf_containsboot(r, booted_device)) {
781 num_root++;
782 rsc = sc;
783 dksc = &rsc->sc_dksc;
784 }
785 }
786 mutex_exit(&raid_lock);
787
788 if (num_root == 1) {
789 booted_device = dksc->sc_dev;
790 booted_method = "raidframe/multi";
791 booted_partition = 0; /* XXX assume 'a' */
792 } else {
793 /* we can't guess.. require the user to answer... */
794 boothowto |= RB_ASKNAME;
795 }
796 }
797 }
798
799 static int
800 raidsize(dev_t dev)
801 {
802 struct raid_softc *rs;
803 struct dk_softc *dksc;
804 unsigned int unit;
805
806 unit = raidunit(dev);
807 if ((rs = raidget(unit, false)) == NULL)
808 return -1;
809 dksc = &rs->sc_dksc;
810
811 if ((rs->sc_flags & RAIDF_INITED) == 0)
812 return -1;
813
814 return dk_size(dksc, dev);
815 }
816
817 static int
818 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
819 {
820 unsigned int unit;
821 struct raid_softc *rs;
822 struct dk_softc *dksc;
823
824 unit = raidunit(dev);
825 if ((rs = raidget(unit, false)) == NULL)
826 return ENXIO;
827 dksc = &rs->sc_dksc;
828
829 if ((rs->sc_flags & RAIDF_INITED) == 0)
830 return ENODEV;
831
832 /*
833 Note that blkno is relative to this particular partition.
834 By adding RF_PROTECTED_SECTORS, we get a value that
835 is relative to the partition used for the underlying component.
836 */
837 blkno += RF_PROTECTED_SECTORS;
838
839 return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
840 }
841
842 static int
843 raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
844 {
845 struct raid_softc *rs = raidsoftc(dev);
846 const struct bdevsw *bdev;
847 RF_Raid_t *raidPtr;
848 int c, sparecol, j, scol, dumpto;
849 int error = 0;
850
851 raidPtr = &rs->sc_r;
852
853 /* we only support dumping to RAID 1 sets */
854 if (raidPtr->Layout.numDataCol != 1 ||
855 raidPtr->Layout.numParityCol != 1)
856 return EINVAL;
857
858 if ((error = raidlock(rs)) != 0)
859 return error;
860
861 /* figure out what device is alive.. */
862
863 /*
864 Look for a component to dump to. The preference for the
865 component to dump to is as follows:
866 1) the first component
867 2) a used_spare of the first component
868 3) the second component
869 4) a used_spare of the second component
870 */
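	/*
	 * Editor's note: for example, if component 0 has failed and a hot
	 * spare has taken over for it (status rf_ds_used_spare), the loops
	 * below will prefer that spare over a healthy second component.
	 */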
871
872 dumpto = -1;
873 for (c = 0; c < raidPtr->numCol; c++) {
874 if (raidPtr->Disks[c].status == rf_ds_optimal) {
875 /* this might be the one */
876 dumpto = c;
877 break;
878 }
879 }
880
881 /*
882 At this point we have possibly selected a live component.
883 If we didn't find a live component, we now check to see
884 if there is a relevant spared component.
885 */
886
887 for (c = 0; c < raidPtr->numSpare; c++) {
888 sparecol = raidPtr->numCol + c;
889
890 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
891 /* How about this one? */
892 scol = -1;
893 for(j=0;j<raidPtr->numCol;j++) {
894 if (raidPtr->Disks[j].spareCol == sparecol) {
895 scol = j;
896 break;
897 }
898 }
899 if (scol == 0) {
900 /*
901 We must have found a spared first
902 component! We'll take that over
903 anything else found so far. (We
904 couldn't have found a real first
905 component before, since this is a
906 used spare, and it's saying that
907 it's replacing the first
908 component.) On reboot (with
909 autoconfiguration turned on)
910 sparecol will become the first
911 component (component0) of this set.
912 */
913 dumpto = sparecol;
914 break;
915 } else if (scol != -1) {
916 /*
917 Must be a spared second component.
918 We'll dump to that if we haven't found
919 anything else so far.
920 */
921 if (dumpto == -1)
922 dumpto = sparecol;
923 }
924 }
925 }
926
927 if (dumpto == -1) {
928 /* we couldn't find any live components to dump to!?!?
929 */
930 error = EINVAL;
931 goto out;
932 }
933
934 bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
935 if (bdev == NULL) {
936 error = ENXIO;
937 goto out;
938 }
939
940 error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
941 blkno, va, nblk * raidPtr->bytesPerSector);
942
943 out:
944 raidunlock(rs);
945
946 return error;
947 }
948
949 /* ARGSUSED */
950 static int
951 raidopen(dev_t dev, int flags, int fmt,
952 struct lwp *l)
953 {
954 int unit = raidunit(dev);
955 struct raid_softc *rs;
956 struct dk_softc *dksc;
957 int error = 0;
958 int part, pmask;
959
960 if ((rs = raidget(unit, true)) == NULL)
961 return ENXIO;
962 if ((error = raidlock(rs)) != 0)
963 return error;
964
965 if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
966 error = EBUSY;
967 goto bad;
968 }
969
970 dksc = &rs->sc_dksc;
971
972 part = DISKPART(dev);
973 pmask = (1 << part);
974
975 if (!DK_BUSY(dksc, pmask) &&
976 ((rs->sc_flags & RAIDF_INITED) != 0)) {
977 /* First one... mark things as dirty... Note that we *MUST*
978 have done a configure before this. I DO NOT WANT TO BE
979 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
980 THAT THEY BELONG TOGETHER!!!!! */
981 /* XXX should check to see if we're only open for reading
982 here... If so, we needn't do this, but then need some
983 other way of keeping track of what's happened.. */
984
985 rf_markalldirty(&rs->sc_r);
986 }
987
988 if ((rs->sc_flags & RAIDF_INITED) != 0)
989 error = dk_open(dksc, dev, flags, fmt, l);
990
991 bad:
992 raidunlock(rs);
993
994 return error;
995
996
997 }
998
999 static int
1000 raid_lastclose(device_t self)
1001 {
1002 struct raid_softc *rs = raidsoftc(self);
1003
1004 /* Last one... device is not unconfigured yet.
1005 Device shutdown has taken care of setting the
1006 clean bits if RAIDF_INITED is not set;
1007 mark things as clean... */
1008
1009 rf_update_component_labels(&rs->sc_r,
1010 RF_FINAL_COMPONENT_UPDATE);
1011
1012 /* pass to unlocked code */
1013 if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
1014 rs->sc_flags |= RAIDF_DETACH;
1015
1016 return 0;
1017 }
1018
1019 /* ARGSUSED */
1020 static int
1021 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
1022 {
1023 int unit = raidunit(dev);
1024 struct raid_softc *rs;
1025 struct dk_softc *dksc;
1026 cfdata_t cf;
1027 int error = 0, do_detach = 0, do_put = 0;
1028
1029 if ((rs = raidget(unit, false)) == NULL)
1030 return ENXIO;
1031 dksc = &rs->sc_dksc;
1032
1033 if ((error = raidlock(rs)) != 0)
1034 return error;
1035
1036 if ((rs->sc_flags & RAIDF_INITED) != 0) {
1037 error = dk_close(dksc, dev, flags, fmt, l);
1038 if ((rs->sc_flags & RAIDF_DETACH) != 0)
1039 do_detach = 1;
1040 } else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
1041 do_put = 1;
1042
1043 raidunlock(rs);
1044
1045 if (do_detach) {
1046 /* free the pseudo device attach bits */
1047 cf = device_cfdata(dksc->sc_dev);
1048 error = config_detach(dksc->sc_dev, 0);
1049 if (error == 0)
1050 free(cf, M_RAIDFRAME);
1051 } else if (do_put) {
1052 raidput(rs);
1053 }
1054
1055 return error;
1056
1057 }
1058
1059 static void
1060 raid_wakeup(RF_Raid_t *raidPtr)
1061 {
1062 rf_lock_mutex2(raidPtr->iodone_lock);
1063 rf_signal_cond2(raidPtr->iodone_cv);
1064 rf_unlock_mutex2(raidPtr->iodone_lock);
1065 }
1066
1067 static void
1068 raidstrategy(struct buf *bp)
1069 {
1070 unsigned int unit;
1071 struct raid_softc *rs;
1072 struct dk_softc *dksc;
1073 RF_Raid_t *raidPtr;
1074
1075 unit = raidunit(bp->b_dev);
1076 if ((rs = raidget(unit, false)) == NULL) {
1077 bp->b_error = ENXIO;
1078 goto fail;
1079 }
1080 if ((rs->sc_flags & RAIDF_INITED) == 0) {
1081 bp->b_error = ENXIO;
1082 goto fail;
1083 }
1084 dksc = &rs->sc_dksc;
1085 raidPtr = &rs->sc_r;
1086
1087 /* Queue IO only */
1088 if (dk_strategy_defer(dksc, bp))
1089 goto done;
1090
1091 /* schedule the IO to happen at the next convenient time */
1092 raid_wakeup(raidPtr);
1093
1094 done:
1095 return;
1096
1097 fail:
1098 bp->b_resid = bp->b_bcount;
1099 biodone(bp);
1100 }
1101
1102 static int
1103 raid_diskstart(device_t dev, struct buf *bp)
1104 {
1105 struct raid_softc *rs = raidsoftc(dev);
1106 RF_Raid_t *raidPtr;
1107
1108 raidPtr = &rs->sc_r;
1109 if (!raidPtr->valid) {
1110 db1_printf(("raid is not valid..\n"));
1111 return ENODEV;
1112 }
1113
1114 /* XXX */
1115 bp->b_resid = 0;
1116
1117 return raiddoaccess(raidPtr, bp);
1118 }
1119
1120 void
1121 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
1122 {
1123 struct raid_softc *rs;
1124 struct dk_softc *dksc;
1125
1126 rs = raidPtr->softc;
1127 dksc = &rs->sc_dksc;
1128
1129 dk_done(dksc, bp);
1130
1131 rf_lock_mutex2(raidPtr->mutex);
1132 raidPtr->openings++;
1133 rf_unlock_mutex2(raidPtr->mutex);
1134
1135 /* schedule more IO */
1136 raid_wakeup(raidPtr);
1137 }
1138
1139 /* ARGSUSED */
1140 static int
1141 raidread(dev_t dev, struct uio *uio, int flags)
1142 {
1143 int unit = raidunit(dev);
1144 struct raid_softc *rs;
1145
1146 if ((rs = raidget(unit, false)) == NULL)
1147 return ENXIO;
1148
1149 if ((rs->sc_flags & RAIDF_INITED) == 0)
1150 return ENXIO;
1151
1152 return physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
1153
1154 }
1155
1156 /* ARGSUSED */
1157 static int
1158 raidwrite(dev_t dev, struct uio *uio, int flags)
1159 {
1160 int unit = raidunit(dev);
1161 struct raid_softc *rs;
1162
1163 if ((rs = raidget(unit, false)) == NULL)
1164 return ENXIO;
1165
1166 if ((rs->sc_flags & RAIDF_INITED) == 0)
1167 return ENXIO;
1168
1169 return physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio);
1170
1171 }
1172
1173 static int
1174 raid_detach_unlocked(struct raid_softc *rs)
1175 {
1176 struct dk_softc *dksc = &rs->sc_dksc;
1177 RF_Raid_t *raidPtr;
1178 int error;
1179
1180 raidPtr = &rs->sc_r;
1181
1182 if (DK_BUSY(dksc, 0) ||
1183 raidPtr->recon_in_progress != 0 ||
1184 raidPtr->parity_rewrite_in_progress != 0)
1185 return EBUSY;
1186
1187 if ((rs->sc_flags & RAIDF_INITED) == 0)
1188 return 0;
1189
1190 rs->sc_flags &= ~RAIDF_SHUTDOWN;
1191
1192 if ((error = rf_Shutdown(raidPtr)) != 0)
1193 return error;
1194
1195 rs->sc_flags &= ~RAIDF_INITED;
1196
1197 /* Kill off any queued buffers */
1198 dk_drain(dksc);
1199 bufq_free(dksc->sc_bufq);
1200
1201 /* Detach the disk. */
1202 dkwedge_delall(&dksc->sc_dkdev);
1203 disk_detach(&dksc->sc_dkdev);
1204 disk_destroy(&dksc->sc_dkdev);
1205 dk_detach(dksc);
1206
1207 return 0;
1208 }
1209
1210 int
1211 rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
1212 {
1213 struct rf_recon_req_internal *rrint;
1214
1215 if (raidPtr->Layout.map->faultsTolerated == 0) {
1216 /* Can't do this on a RAID 0!! */
1217 return EINVAL;
1218 }
1219
1220 if (rr->col < 0 || rr->col >= raidPtr->numCol) {
1221 /* bad column */
1222 return EINVAL;
1223 }
1224
1225 rf_lock_mutex2(raidPtr->mutex);
1226 if (raidPtr->status == rf_rs_reconstructing) {
1227 raidPtr->abortRecon[rr->col] = 1;
1228 }
1229 if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
1230 (raidPtr->numFailures > 0)) {
1231 /* some other component has failed. Let's not make
1232 things worse. XXX wrong for RAID6 */
1233 goto out;
1234 }
1235 if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1236 int spareCol = raidPtr->Disks[rr->col].spareCol;
1237
1238 if (spareCol < raidPtr->numCol ||
1239 spareCol >= raidPtr->numCol + raidPtr->numSpare)
1240 goto out;
1241
1242 /*
1243 * Fail the spare disk so that we can
1244 * reconstruct on another one.
1245 */
1246 raidPtr->Disks[spareCol].status = rf_ds_failed;
1247
1248 }
1249 rf_unlock_mutex2(raidPtr->mutex);
1250
1251 /* make a copy of the recon request so that we don't rely on
1252 * the user's buffer */
1253 rrint = RF_Malloc(sizeof(*rrint));
1254 if (rrint == NULL)
1255 return(ENOMEM);
1256 rrint->col = rr->col;
1257 rrint->flags = rr->flags;
1258 rrint->raidPtr = raidPtr;
1259
1260 return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
1261 rrint, "raid_recon");
1262 out:
1263 rf_unlock_mutex2(raidPtr->mutex);
1264 return EINVAL;
1265 }
1266
1267 static int
1268 rf_copyinspecificbuf(RF_Config_t *k_cfg)
1269 {
1270 /* allocate a buffer for the layout-specific data, and copy it in */
1271 if (k_cfg->layoutSpecificSize == 0)
1272 return 0;
1273
1274 if (k_cfg->layoutSpecificSize > 10000) {
1275 /* sanity check */
1276 return EINVAL;
1277 }
1278
1279 u_char *specific_buf;
1280 specific_buf = RF_Malloc(k_cfg->layoutSpecificSize);
1281 if (specific_buf == NULL)
1282 return ENOMEM;
1283
1284 int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1285 k_cfg->layoutSpecificSize);
1286 if (retcode) {
1287 RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1288 db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
1289 return retcode;
1290 }
1291
1292 k_cfg->layoutSpecific = specific_buf;
1293 return 0;
1294 }
1295
1296 static int
1297 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
1298 {
1299 RF_Config_t *u_cfg = *((RF_Config_t **) data);
1300
1301 if (rs->sc_r.valid) {
1302 /* There is a valid RAID set running on this unit! */
1303 printf("raid%d: Device already configured!\n", rs->sc_unit);
1304 return EINVAL;
1305 }
1306
1307 /* copy-in the configuration information */
1308 /* data points to a pointer to the configuration structure */
1309 *k_cfg = RF_Malloc(sizeof(**k_cfg));
1310 if (*k_cfg == NULL) {
1311 return ENOMEM;
1312 }
1313 int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
1314 if (retcode == 0)
1315 return 0;
1316 RF_Free(*k_cfg, sizeof(RF_Config_t));
1317 db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
1318 rs->sc_flags |= RAIDF_SHUTDOWN;
1319 return retcode;
1320 }
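/*
 * Editor's note: a minimal userland sketch (assumed code, not part of this
 * file) of how a configuration reaches rf_getConfiguration() above: the
 * RAIDFRAME_CONFIGURE argument is a pointer to a pointer to the RF_Config_t,
 * which the kernel dereferences and copies in.  The device path is an
 * assumed example.
 */
#if 0
	RF_Config_t cfg, *cfgp = &cfg;
	int fd = open("/dev/rraid0d", O_RDWR);

	if (fd == -1)
		err(1, "open");
	memset(&cfg, 0, sizeof(cfg));
	/* fill in numCol, numSpare, devnames[0][], parityConfig, ... */
	if (ioctl(fd, RAIDFRAME_CONFIGURE, &cfgp) == -1)
		err(1, "RAIDFRAME_CONFIGURE");
#endif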
1321
1322 int
1323 rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
1324 {
1325 int retcode, i;
1326 RF_Raid_t *raidPtr = &rs->sc_r;
1327
1328 rs->sc_flags &= ~RAIDF_SHUTDOWN;
1329
1330 if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
1331 goto out;
1332
1333 /* should do some kind of sanity check on the configuration.
1334 * Store the sum of all the bytes in the last byte? */
1335
1336 /* Force nul-termination on all strings. */
1337 #define ZERO_FINAL(s) do { s[sizeof(s) - 1] = '\0'; } while (0)
1338 for (i = 0; i < RF_MAXCOL; i++) {
1339 ZERO_FINAL(k_cfg->devnames[0][i]);
1340 }
1341 for (i = 0; i < RF_MAXSPARE; i++) {
1342 ZERO_FINAL(k_cfg->spare_names[i]);
1343 }
1344 for (i = 0; i < RF_MAXDBGV; i++) {
1345 ZERO_FINAL(k_cfg->debugVars[i]);
1346 }
1347 #undef ZERO_FINAL
1348
1349 /* Check some basic limits. */
1350 if (k_cfg->numCol >= RF_MAXCOL || k_cfg->numCol < 0) {
1351 retcode = EINVAL;
1352 goto out;
1353 }
1354 if (k_cfg->numSpare >= RF_MAXSPARE || k_cfg->numSpare < 0) {
1355 retcode = EINVAL;
1356 goto out;
1357 }
1358
1359 /* configure the system */
1360
1361 /*
1362 * Clear the entire RAID descriptor, just to make sure
1363 * there is no stale data left in the case of a
1364 * reconfiguration
1365 */
1366 memset(raidPtr, 0, sizeof(*raidPtr));
1367 raidPtr->softc = rs;
1368 raidPtr->raidid = rs->sc_unit;
1369
1370 retcode = rf_Configure(raidPtr, k_cfg, NULL);
1371
1372 if (retcode == 0) {
1373 /* allow this many simultaneous IO's to
1374 this RAID device */
1375 raidPtr->openings = RAIDOUTSTANDING;
1376
1377 raidinit(rs);
1378 raid_wakeup(raidPtr);
1379 rf_markalldirty(raidPtr);
1380 }
1381
1382 /* free the buffers. No return code here. */
1383 if (k_cfg->layoutSpecificSize) {
1384 RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
1385 }
1386 out:
1387 RF_Free(k_cfg, sizeof(RF_Config_t));
1388 if (retcode) {
1389 /*
1390 * If configuration failed, set sc_flags so that we
1391 * will detach the device when we close it.
1392 */
1393 rs->sc_flags |= RAIDF_SHUTDOWN;
1394 }
1395 return retcode;
1396 }
1397
1398 #if RF_DISABLED
1399 static int
1400 rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
1401 {
1402
1403 /* XXX check the label for valid stuff... */
1404 /* Note that some things *should not* get modified --
1405 the user should be re-initing the labels instead of
1406 trying to patch things.
1407 */
1408 #ifdef DEBUG
1409 int raidid = raidPtr->raidid;
1410 printf("raid%d: Got component label:\n", raidid);
1411 printf("raid%d: Version: %d\n", raidid, clabel->version);
1412 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1413 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1414 printf("raid%d: Column: %d\n", raidid, clabel->column);
1415 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1416 printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1417 printf("raid%d: Status: %d\n", raidid, clabel->status);
1418 #endif /* DEBUG */
1419 clabel->row = 0;
1420 int column = clabel->column;
1421
1422 if ((column < 0) || (column >= raidPtr->numCol)) {
1423 return(EINVAL);
1424 }
1425
1426 /* XXX this isn't allowed to do anything for now :-) */
1427
1428 /* XXX and before it is, we need to fill in the rest
1429 of the fields!?!?!?! */
1430 memcpy(raidget_component_label(raidPtr, column),
1431 clabel, sizeof(*clabel));
1432 raidflush_component_label(raidPtr, column);
1433 return 0;
1434 }
1435 #endif
1436
1437 static int
1438 rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
1439 {
1440 /*
1441 we only want the serial number from
1442 the above. We get all the rest of the information
1443 from the config that was used to create this RAID
1444 set.
1445 */
1446
1447 raidPtr->serial_number = clabel->serial_number;
1448
1449 for (int column = 0; column < raidPtr->numCol; column++) {
1450 RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
1451 if (RF_DEAD_DISK(diskPtr->status))
1452 continue;
1453 RF_ComponentLabel_t *ci_label = raidget_component_label(
1454 raidPtr, column);
1455 /* Zeroing this is important. */
1456 memset(ci_label, 0, sizeof(*ci_label));
1457 raid_init_component_label(raidPtr, ci_label);
1458 ci_label->serial_number = raidPtr->serial_number;
1459 ci_label->row = 0; /* we don't pretend to support more */
1460 rf_component_label_set_partitionsize(ci_label,
1461 diskPtr->partitionSize);
1462 ci_label->column = column;
1463 raidflush_component_label(raidPtr, column);
1464 /* XXXjld what about the spares? */
1465 }
1466
1467 return 0;
1468 }
1469
1470 static int
1471 rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
1472 {
1473
1474 if (raidPtr->Layout.map->faultsTolerated == 0) {
1475 /* Can't do this on a RAID 0!! */
1476 return EINVAL;
1477 }
1478
1479 if (raidPtr->recon_in_progress == 1) {
1480 /* a reconstruct is already in progress! */
1481 return EINVAL;
1482 }
1483
1484 RF_SingleComponent_t component;
1485 memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
1486 component.row = 0; /* we don't support any more */
1487 int column = component.column;
1488
1489 if ((column < 0) || (column >= raidPtr->numCol)) {
1490 return EINVAL;
1491 }
1492
1493 rf_lock_mutex2(raidPtr->mutex);
1494 if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1495 (raidPtr->numFailures > 0)) {
1496 /* XXX 0 above shouldn't be constant!!! */
1497 /* some component other than this has failed.
1498 Let's not make things worse than they already
1499 are... */
1500 printf("raid%d: Unable to reconstruct to disk at:\n",
1501 raidPtr->raidid);
1502 printf("raid%d: Col: %d Too many failures.\n",
1503 raidPtr->raidid, column);
1504 rf_unlock_mutex2(raidPtr->mutex);
1505 return EINVAL;
1506 }
1507
1508 if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
1509 printf("raid%d: Unable to reconstruct to disk at:\n",
1510 raidPtr->raidid);
1511 printf("raid%d: Col: %d "
1512 "Reconstruction already occurring!\n",
1513 raidPtr->raidid, column);
1514
1515 rf_unlock_mutex2(raidPtr->mutex);
1516 return EINVAL;
1517 }
1518
1519 if (raidPtr->Disks[column].status == rf_ds_spared) {
1520 rf_unlock_mutex2(raidPtr->mutex);
1521 return EINVAL;
1522 }
1523
1524 rf_unlock_mutex2(raidPtr->mutex);
1525
1526 struct rf_recon_req_internal *rrint;
1527 rrint = RF_Malloc(sizeof(*rrint));
1528 if (rrint == NULL)
1529 return ENOMEM;
1530
1531 rrint->col = column;
1532 rrint->raidPtr = raidPtr;
1533
1534 return RF_CREATE_THREAD(raidPtr->recon_thread,
1535 rf_ReconstructInPlaceThread, rrint, "raid_reconip");
1536 }
1537
1538 static int
1539 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
1540 {
1541 /*
1542 * This makes no sense on a RAID 0, or if we are not reconstructing,
1543 * so tell the user it's done.
1544 */
1545 if (raidPtr->Layout.map->faultsTolerated == 0 ||
1546 raidPtr->status != rf_rs_reconstructing) {
1547 *data = 100;
1548 return 0;
1549 }
1550 if (raidPtr->reconControl->numRUsTotal == 0) {
1551 *data = 0;
1552 return 0;
1553 }
1554 *data = (raidPtr->reconControl->numRUsComplete * 100
1555 / raidPtr->reconControl->numRUsTotal);
1556 return 0;
1557 }
1558
1559 /*
1560 * Copy a RF_SingleComponent_t from 'data', ensuring nul-termination
1561 * on the component_name[] array.
1562 */
1563 static void
1564 rf_copy_single_component(RF_SingleComponent_t *component, void *data)
1565 {
1566
1567 memcpy(component, data, sizeof *component);
1568 component->component_name[sizeof(component->component_name) - 1] = '\0';
1569 }
1570
1571 static int
1572 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
1573 {
1574 int unit = raidunit(dev);
1575 int part, pmask;
1576 struct raid_softc *rs;
1577 struct dk_softc *dksc;
1578 RF_Config_t *k_cfg;
1579 RF_Raid_t *raidPtr;
1580 RF_AccTotals_t *totals;
1581 RF_SingleComponent_t component;
1582 RF_DeviceConfig_t *d_cfg, *ucfgp;
1583 int retcode = 0;
1584 int column;
1585 RF_ComponentLabel_t *clabel;
1586 int d;
1587
1588 if ((rs = raidget(unit, false)) == NULL)
1589 return ENXIO;
1590
1591 dksc = &rs->sc_dksc;
1592 raidPtr = &rs->sc_r;
1593
1594 db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1595 (int) DISKPART(dev), (int) unit, cmd));
1596
1597 /* Only CONFIGURE and RESCAN can be done without the RAID being initialized. */
1598 switch (cmd) {
1599 case RAIDFRAME_CONFIGURE:
1600 case RAIDFRAME_RESCAN:
1601 break;
1602 default:
1603 if (!rf_inited(rs))
1604 return ENXIO;
1605 }
1606
1607 switch (cmd) {
1608 /* configure the system */
1609 case RAIDFRAME_CONFIGURE:
1610 if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
1611 return retcode;
1612 return rf_construct(rs, k_cfg);
1613
1614 /* shutdown the system */
1615 case RAIDFRAME_SHUTDOWN:
1616
1617 part = DISKPART(dev);
1618 pmask = (1 << part);
1619
1620 if ((retcode = raidlock(rs)) != 0)
1621 return retcode;
1622
1623 if (DK_BUSY(dksc, pmask) ||
1624 raidPtr->recon_in_progress != 0 ||
1625 raidPtr->parity_rewrite_in_progress != 0)
1626 retcode = EBUSY;
1627 else {
1628 /* detach and free on close */
1629 rs->sc_flags |= RAIDF_SHUTDOWN;
1630 retcode = 0;
1631 }
1632
1633 raidunlock(rs);
1634
1635 return retcode;
1636 case RAIDFRAME_GET_COMPONENT_LABEL:
1637 return rf_get_component_label(raidPtr, data);
1638
1639 #if RF_DISABLED
1640 case RAIDFRAME_SET_COMPONENT_LABEL:
1641 return rf_set_component_label(raidPtr, data);
1642 #endif
1643
1644 case RAIDFRAME_INIT_LABELS:
1645 return rf_init_component_label(raidPtr, data);
1646
1647 case RAIDFRAME_SET_AUTOCONFIG:
1648 d = rf_set_autoconfig(raidPtr, *(int *) data);
1649 printf("raid%d: New autoconfig value is: %d\n",
1650 raidPtr->raidid, d);
1651 *(int *) data = d;
1652 return retcode;
1653
1654 case RAIDFRAME_SET_ROOT:
1655 d = rf_set_rootpartition(raidPtr, *(int *) data);
1656 printf("raid%d: New rootpartition value is: %d\n",
1657 raidPtr->raidid, d);
1658 *(int *) data = d;
1659 return retcode;
1660
1661 /* initialize all parity */
1662 case RAIDFRAME_REWRITEPARITY:
1663
1664 if (raidPtr->Layout.map->faultsTolerated == 0) {
1665 /* Parity for RAID 0 is trivially correct */
1666 raidPtr->parity_good = RF_RAID_CLEAN;
1667 return 0;
1668 }
1669
1670 if (raidPtr->parity_rewrite_in_progress == 1) {
1671 /* Re-write is already in progress! */
1672 return EINVAL;
1673 }
1674
1675 return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1676 rf_RewriteParityThread, raidPtr,"raid_parity");
1677
1678 case RAIDFRAME_ADD_HOT_SPARE:
1679 rf_copy_single_component(&component, data);
1680 return rf_add_hot_spare(raidPtr, &component);
1681
1682 /* Remove a non-hot-spare component; never implemented in userland */
1683 case RAIDFRAME_DELETE_COMPONENT:
1684 rf_copy_single_component(&component, data);
1685 return rf_delete_component(raidPtr, &component);
1686
1687 case RAIDFRAME_REMOVE_COMPONENT:
1688 rf_copy_single_component(&component, data);
1689 return rf_remove_component(raidPtr, &component);
1690
1691 case RAIDFRAME_INCORPORATE_HOT_SPARE:
1692 rf_copy_single_component(&component, data);
1693 return rf_incorporate_hot_spare(raidPtr, &component);
1694
1695 case RAIDFRAME_REBUILD_IN_PLACE:
1696 return rf_rebuild_in_place(raidPtr, data);
1697
1698 case RAIDFRAME_GET_INFO:
1699 ucfgp = *(RF_DeviceConfig_t **)data;
1700 d_cfg = RF_Malloc(sizeof(*d_cfg));
1701 if (d_cfg == NULL)
1702 return ENOMEM;
1703 retcode = rf_get_info(raidPtr, d_cfg);
1704 if (retcode == 0) {
1705 retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
1706 }
1707 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1708 return retcode;
1709
1710 case RAIDFRAME_CHECK_PARITY:
1711 *(int *) data = raidPtr->parity_good;
1712 return 0;
1713
1714 case RAIDFRAME_PARITYMAP_STATUS:
1715 if (rf_paritymap_ineligible(raidPtr))
1716 return EINVAL;
1717 rf_paritymap_status(raidPtr->parity_map, data);
1718 return 0;
1719
1720 case RAIDFRAME_PARITYMAP_SET_PARAMS:
1721 if (rf_paritymap_ineligible(raidPtr))
1722 return EINVAL;
1723 if (raidPtr->parity_map == NULL)
1724 return ENOENT; /* ??? */
1725 if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
1726 return EINVAL;
1727 return 0;
1728
1729 case RAIDFRAME_PARITYMAP_GET_DISABLE:
1730 if (rf_paritymap_ineligible(raidPtr))
1731 return EINVAL;
1732 *(int *) data = rf_paritymap_get_disable(raidPtr);
1733 return 0;
1734
1735 case RAIDFRAME_PARITYMAP_SET_DISABLE:
1736 if (rf_paritymap_ineligible(raidPtr))
1737 return EINVAL;
1738 rf_paritymap_set_disable(raidPtr, *(int *)data);
1739 /* XXX should errors be passed up? */
1740 return 0;
1741
1742 case RAIDFRAME_RESCAN:
1743 return rf_rescan();
1744
1745 case RAIDFRAME_RESET_ACCTOTALS:
1746 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1747 return 0;
1748
1749 case RAIDFRAME_GET_ACCTOTALS:
1750 totals = (RF_AccTotals_t *) data;
1751 *totals = raidPtr->acc_totals;
1752 return 0;
1753
1754 case RAIDFRAME_KEEP_ACCTOTALS:
1755 raidPtr->keep_acc_totals = *(int *)data;
1756 return 0;
1757
1758 case RAIDFRAME_GET_SIZE:
1759 *(int *) data = raidPtr->totalSectors;
1760 return 0;
1761
1762 case RAIDFRAME_FAIL_DISK:
1763 return rf_fail_disk(raidPtr, data);
1764
1765 /* copyback is no longer supported */
1766 case RAIDFRAME_COPYBACK:
1767 return EINVAL;
1768
1769 /* return the percentage completion of reconstruction */
1770 case RAIDFRAME_CHECK_RECON_STATUS:
1771 return rf_check_recon_status(raidPtr, data);
1772
1773 case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1774 rf_check_recon_status_ext(raidPtr, data);
1775 return 0;
1776
1777 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1778 if (raidPtr->Layout.map->faultsTolerated == 0) {
1779 /* This makes no sense on a RAID 0, so tell the
1780 user it's done. */
1781 *(int *) data = 100;
1782 return 0;
1783 }
1784 if (raidPtr->parity_rewrite_in_progress == 1) {
1785 *(int *) data = 100 *
1786 raidPtr->parity_rewrite_stripes_done /
1787 raidPtr->Layout.numStripe;
1788 } else {
1789 *(int *) data = 100;
1790 }
1791 return 0;
1792
1793 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1794 rf_check_parityrewrite_status_ext(raidPtr, data);
1795 return 0;
1796
1797 case RAIDFRAME_CHECK_COPYBACK_STATUS:
1798 *(int *) data = 100;
1799 return 0;
1800
1801 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1802 rf_check_copyback_status_ext(raidPtr, data);
1803 return 0;
1804
1805 case RAIDFRAME_SET_LAST_UNIT:
1806 for (column = 0; column < raidPtr->numCol; column++)
1807 if (raidPtr->Disks[column].status != rf_ds_optimal)
1808 return EBUSY;
1809
1810 for (column = 0; column < raidPtr->numCol; column++) {
1811 clabel = raidget_component_label(raidPtr, column);
1812 clabel->last_unit = *(int *)data;
1813 raidflush_component_label(raidPtr, column);
1814 }
1815 rs->sc_cflags |= RAIDF_UNIT_CHANGED;
1816 return 0;
1817
1818 /* the sparetable daemon calls this to wait for the kernel to
1819 * need a spare table. this ioctl does not return until a
1820 * spare table is needed. XXX -- calling mpsleep here in the
1821 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1822 * -- I should either compute the spare table in the kernel,
1823 * or have a different -- XXX XXX -- interface (a different
1824 * character device) for delivering the table -- XXX */
1825 #if RF_DISABLED
1826 case RAIDFRAME_SPARET_WAIT:
1827 rf_lock_mutex2(rf_sparet_wait_mutex);
1828 while (!rf_sparet_wait_queue)
1829 rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1830 RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
1831 rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1832 rf_unlock_mutex2(rf_sparet_wait_mutex);
1833
1834 /* structure assignment */
1835 *((RF_SparetWait_t *) data) = *waitreq;
1836
1837 RF_Free(waitreq, sizeof(*waitreq));
1838 return 0;
1839
1840 /* wakes up a process waiting on SPARET_WAIT and puts an error
1841 * code in it that will cause the daemon to exit */
1842 case RAIDFRAME_ABORT_SPARET_WAIT:
1843 waitreq = RF_Malloc(sizeof(*waitreq));
1844 waitreq->fcol = -1;
1845 rf_lock_mutex2(rf_sparet_wait_mutex);
1846 waitreq->next = rf_sparet_wait_queue;
1847 rf_sparet_wait_queue = waitreq;
1848 rf_broadcast_cond2(rf_sparet_wait_cv);
1849 rf_unlock_mutex2(rf_sparet_wait_mutex);
1850 return 0;
1851
1852 /* used by the spare table daemon to deliver a spare table
1853 * into the kernel */
1854 case RAIDFRAME_SEND_SPARET:
1855
1856 /* install the spare table */
1857 retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1858
1859 /* respond to the requestor. the return status of the spare
1860 * table installation is passed in the "fcol" field */
1861 waitreq = RF_Malloc(sizeof(*waitreq));
1862 waitreq->fcol = retcode;
1863 rf_lock_mutex2(rf_sparet_wait_mutex);
1864 waitreq->next = rf_sparet_resp_queue;
1865 rf_sparet_resp_queue = waitreq;
1866 rf_broadcast_cond2(rf_sparet_resp_cv);
1867 rf_unlock_mutex2(rf_sparet_wait_mutex);
1868
1869 return retcode;
1870 #endif
1871 default:
1872 /*
1873 * Don't bother trying to load compat modules
1874 * if it is not our ioctl. This is more efficient
1875 * and makes rump tests not depend on compat code
1876 */
1877 if (IOCGROUP(cmd) != 'r')
1878 break;
1879 #ifdef _LP64
1880 if ((l->l_proc->p_flag & PK_32) != 0) {
1881 module_autoload("compat_netbsd32_raid",
1882 MODULE_CLASS_EXEC);
1883 MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
1884 (rs, cmd, data), enosys(), retcode);
1885 if (retcode != EPASSTHROUGH)
1886 return retcode;
1887 }
1888 #endif
1889 module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
1890 MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
1891 (rs, cmd, data), enosys(), retcode);
1892 if (retcode != EPASSTHROUGH)
1893 return retcode;
1894
1895 module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
1896 MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
1897 (rs, cmd, data), enosys(), retcode);
1898 if (retcode != EPASSTHROUGH)
1899 return retcode;
1900 break; /* fall through to the os-specific code below */
1901
1902 }
1903
1904 if (!raidPtr->valid)
1905 return EINVAL;
1906
1907 /*
1908 * Add support for "regular" device ioctls here.
1909 */
1910
1911 switch (cmd) {
1912 case DIOCGCACHE:
1913 retcode = rf_get_component_caches(raidPtr, (int *)data);
1914 break;
1915
1916 case DIOCCACHESYNC:
1917 retcode = rf_sync_component_caches(raidPtr, *(int *)data);
1918 break;
1919
1920 default:
1921 retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
1922 break;
1923 }
1924
1925 return retcode;
1926
1927 }
1928
1929
1930 /* raidinit -- complete the rest of the initialization for the
1931 RAIDframe device. */
1932
1933
1934 static void
1935 raidinit(struct raid_softc *rs)
1936 {
1937 cfdata_t cf;
1938 unsigned int unit;
1939 struct dk_softc *dksc = &rs->sc_dksc;
1940 RF_Raid_t *raidPtr = &rs->sc_r;
1941 device_t dev;
1942
1943 unit = raidPtr->raidid;
1944
1945 /* XXX doesn't check bounds. */
1946 snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);
1947
1948 /* attach the pseudo device */
1949 cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
1950 cf->cf_name = raid_cd.cd_name;
1951 cf->cf_atname = raid_cd.cd_name;
1952 cf->cf_unit = unit;
1953 cf->cf_fstate = FSTATE_STAR;
1954
1955 dev = config_attach_pseudo(cf);
1956 if (dev == NULL) {
1957 printf("raid%d: config_attach_pseudo failed\n",
1958 raidPtr->raidid);
1959 free(cf, M_RAIDFRAME);
1960 return;
1961 }
1962
1963 /* provide a backpointer to the real softc */
1964 raidsoftc(dev) = rs;
1965
1966 /* disk_attach actually creates space for the CPU disklabel, among
1967 * other things, so it's critical to call this *BEFORE* we try putzing
1968 * with disklabels. */
1969 dk_init(dksc, dev, DKTYPE_RAID);
1970 disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);
1971
1972 /* XXX There may be a weird interaction here between this, and
1973 * protectedSectors, as used in RAIDframe. */
1974
1975 rs->sc_size = raidPtr->totalSectors;
1976
1977 /* Attach dk and disk subsystems */
1978 dk_attach(dksc);
1979 disk_attach(&dksc->sc_dkdev);
1980 rf_set_geometry(rs, raidPtr);
1981
1982 bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);
1983
	/* mark unit as usable */
1985 rs->sc_flags |= RAIDF_INITED;
1986
1987 dkwedge_discover(&dksc->sc_dkdev);
1988 }
1989
1990 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1991 /* wake up the daemon & tell it to get us a spare table
1992 * XXX
1993 * the entries in the queues should be tagged with the raidPtr
1994 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
1996 * XXX
1997 *
1998 * XXX This code is not currently used. GO
1999 */
2000 int
2001 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
2002 {
2003 int retcode;
2004
2005 rf_lock_mutex2(rf_sparet_wait_mutex);
2006 req->next = rf_sparet_wait_queue;
2007 rf_sparet_wait_queue = req;
2008 rf_broadcast_cond2(rf_sparet_wait_cv);
2009
	/* rf_wait_cond2() drops the mutex while sleeping and retakes it */
2011 while (!rf_sparet_resp_queue) {
2012 rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
2013 }
2014 req = rf_sparet_resp_queue;
2015 rf_sparet_resp_queue = req->next;
2016 rf_unlock_mutex2(rf_sparet_wait_mutex);
2017
2018 retcode = req->fcol;
2019 RF_Free(req, sizeof(*req)); /* this is not the same req as we
2020 * alloc'd */
2021 return retcode;
2022 }
2023 #endif
2024
/* a wrapper around rf_DoAccess that extracts the appropriate info from
 * the bp and passes it down.
 * Any calls originating in the kernel must use non-blocking I/O.
 * Does some extra sanity checking to return "appropriate" error values
 * for certain conditions (to make some standard utilities work).
 *
 * Formerly known as: rf_DoAccessKernel
 */
2033 void
2034 raidstart(RF_Raid_t *raidPtr)
2035 {
2036 struct raid_softc *rs;
2037 struct dk_softc *dksc;
2038
2039 rs = raidPtr->softc;
2040 dksc = &rs->sc_dksc;
2041 /* quick check to see if anything has died recently */
2042 rf_lock_mutex2(raidPtr->mutex);
2043 if (raidPtr->numNewFailures > 0) {
2044 rf_unlock_mutex2(raidPtr->mutex);
2045 rf_update_component_labels(raidPtr,
2046 RF_NORMAL_COMPONENT_UPDATE);
2047 rf_lock_mutex2(raidPtr->mutex);
2048 raidPtr->numNewFailures--;
2049 }
2050 rf_unlock_mutex2(raidPtr->mutex);
2051
2052 if ((rs->sc_flags & RAIDF_INITED) == 0) {
2053 printf("raid%d: raidstart not ready\n", raidPtr->raidid);
2054 return;
2055 }
2056
2057 dk_start(dksc, NULL);
2058 }
2059
2060 static int
2061 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
2062 {
2063 RF_SectorCount_t num_blocks, pb, sum;
2064 RF_RaidAddr_t raid_addr;
2065 daddr_t blocknum;
2066 int rc;
2067
2068 rf_lock_mutex2(raidPtr->mutex);
2069 if (raidPtr->openings == 0) {
2070 rf_unlock_mutex2(raidPtr->mutex);
2071 return EAGAIN;
2072 }
2073 rf_unlock_mutex2(raidPtr->mutex);
2074
2075 blocknum = bp->b_rawblkno;
2076
2077 db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
2078 (int) blocknum));
2079
2080 db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
2081 db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));
2082
2083 /* *THIS* is where we adjust what block we're going to...
2084 * but DO NOT TOUCH bp->b_blkno!!! */
2085 raid_addr = blocknum;
2086
2087 num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
2088 pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
2089 sum = raid_addr + num_blocks + pb;
2090 if (1 || rf_debugKernelAccess) {
2091 db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
2092 (int) raid_addr, (int) sum, (int) num_blocks,
2093 (int) pb, (int) bp->b_resid));
2094 }
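	/*
	 * Reject accesses that run past the end of the array, and any
	 * whose arithmetic wrapped around (a sum smaller than one of
	 * its operands means an overflow occurred above).
	 */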
2095 if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
2096 || (sum < num_blocks) || (sum < pb)) {
2097 rc = ENOSPC;
2098 goto done;
2099 }
2100 /*
2101 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
2102 */
2103
2104 if (bp->b_bcount & raidPtr->sectorMask) {
2105 rc = ENOSPC;
2106 goto done;
2107 }
2108 db1_printf(("Calling DoAccess..\n"));
2109
2110
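	/*
	 * Consume one of the available openings; it is returned once
	 * this access completes.
	 */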
2111 rf_lock_mutex2(raidPtr->mutex);
2112 raidPtr->openings--;
2113 rf_unlock_mutex2(raidPtr->mutex);
2114
2115 /* don't ever condition on bp->b_flags & B_WRITE.
2116 * always condition on B_READ instead */
2117
2118 rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
2119 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
2120 raid_addr, num_blocks,
2121 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);
2122
2123 done:
2124 return rc;
2125 }
2126
2127 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
2128
2129 int
2130 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
2131 {
2132 int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
2133 struct buf *bp;
2134
2135 req->queue = queue;
2136 bp = req->bp;
2137
2138 switch (req->type) {
2139 case RF_IO_TYPE_NOP: /* used primarily to unlock a locked queue */
2140 /* XXX need to do something extra here.. */
2141 /* I'm leaving this in, as I've never actually seen it used,
2142 * and I'd like folks to report it... GO */
2143 printf("%s: WAKEUP CALLED\n", __func__);
2144 queue->numOutstanding++;
2145
2146 bp->b_flags = 0;
2147 bp->b_private = req;
2148
2149 KernelWakeupFunc(bp);
2150 break;
2151
2152 case RF_IO_TYPE_READ:
2153 case RF_IO_TYPE_WRITE:
2154 #if RF_ACC_TRACE > 0
2155 if (req->tracerec) {
2156 RF_ETIMER_START(req->tracerec->timer);
2157 }
2158 #endif
2159 InitBP(bp, queue->rf_cinfo->ci_vp,
2160 op, queue->rf_cinfo->ci_dev,
2161 req->sectorOffset, req->numSector,
2162 req->buf, KernelWakeupFunc, (void *) req,
2163 queue->raidPtr->logBytesPerSector);
2164
2165 if (rf_debugKernelAccess) {
2166 db1_printf(("dispatch: bp->b_blkno = %ld\n",
2167 (long) bp->b_blkno));
2168 }
2169 queue->numOutstanding++;
2170 queue->last_deq_sector = req->sectorOffset;
2171 /* acc wouldn't have been let in if there were any pending
2172 * reqs at any other priority */
2173 queue->curPriority = req->priority;
2174
2175 db1_printf(("Going for %c to unit %d col %d\n",
2176 req->type, queue->raidPtr->raidid,
2177 queue->col));
2178 db1_printf(("sector %d count %d (%d bytes) %d\n",
2179 (int) req->sectorOffset, (int) req->numSector,
2180 (int) (req->numSector <<
2181 queue->raidPtr->logBytesPerSector),
2182 (int) queue->raidPtr->logBytesPerSector));
2183
2184 /*
2185 * XXX: drop lock here since this can block at
2186 * least with backing SCSI devices. Retake it
2187 * to minimize fuss with calling interfaces.
2188 */
2189
2190 RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
2191 bdev_strategy(bp);
2192 RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
2193 break;
2194
2195 default:
2196 panic("bad req->type in rf_DispatchKernelIO");
2197 }
2198 db1_printf(("Exiting from DispatchKernelIO\n"));
2199
2200 return 0;
2201 }
/* this is the callback function associated with an I/O invoked from
 * kernel code.
 */
2205 static void
2206 KernelWakeupFunc(struct buf *bp)
2207 {
2208 RF_DiskQueueData_t *req = NULL;
2209 RF_DiskQueue_t *queue;
2210
2211 db1_printf(("recovering the request queue:\n"));
2212
2213 req = bp->b_private;
2214
2215 queue = (RF_DiskQueue_t *) req->queue;
2216
2217 rf_lock_mutex2(queue->raidPtr->iodone_lock);
2218
2219 #if RF_ACC_TRACE > 0
2220 if (req->tracerec) {
2221 RF_ETIMER_STOP(req->tracerec->timer);
2222 RF_ETIMER_EVAL(req->tracerec->timer);
2223 rf_lock_mutex2(rf_tracing_mutex);
2224 req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2225 req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2226 req->tracerec->num_phys_ios++;
2227 rf_unlock_mutex2(rf_tracing_mutex);
2228 }
2229 #endif
2230
2231 /* XXX Ok, let's get aggressive... If b_error is set, let's go
2232 * ballistic, and mark the component as hosed... */
2233
2234 if (bp->b_error != 0) {
2235 /* Mark the disk as dead */
2236 /* but only mark it once... */
2237 /* and only if it wouldn't leave this RAID set
2238 completely broken */
2239 if (((queue->raidPtr->Disks[queue->col].status ==
2240 rf_ds_optimal) ||
2241 (queue->raidPtr->Disks[queue->col].status ==
2242 rf_ds_used_spare)) &&
2243 (queue->raidPtr->numFailures <
2244 queue->raidPtr->Layout.map->faultsTolerated)) {
2245 printf("raid%d: IO Error (%d). Marking %s as failed.\n",
2246 queue->raidPtr->raidid,
2247 bp->b_error,
2248 queue->raidPtr->Disks[queue->col].devname);
2249 queue->raidPtr->Disks[queue->col].status =
2250 rf_ds_failed;
2251 queue->raidPtr->status = rf_rs_degraded;
2252 queue->raidPtr->numFailures++;
2253 queue->raidPtr->numNewFailures++;
2254 } else { /* Disk is already dead... */
2255 /* printf("Disk already marked as dead!\n"); */
2256 }
2257
2258 }
2259
2260 /* Fill in the error value */
2261 req->error = bp->b_error;
2262
2263 /* Drop this one on the "finished" queue... */
2264 TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);
2265
2266 /* Let the raidio thread know there is work to be done. */
2267 rf_signal_cond2(queue->raidPtr->iodone_cv);
2268
2269 rf_unlock_mutex2(queue->raidPtr->iodone_lock);
2270 }
2271
2272
2273 /*
2274 * initialize a buf structure for doing an I/O in the kernel.
2275 */
2276 static void
2277 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2278 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2279 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
2280 {
2281 bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
2282 bp->b_oflags = 0;
2283 bp->b_cflags = 0;
2284 bp->b_bcount = numSect << logBytesPerSector;
2285 bp->b_bufsize = bp->b_bcount;
2286 bp->b_error = 0;
2287 bp->b_dev = dev;
2288 bp->b_data = bf;
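	/* startSect is in RAIDframe sectors; b_blkno wants DEV_BSIZE units */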
2289 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2290 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
2291 if (bp->b_bcount == 0) {
2292 panic("bp->b_bcount is zero in InitBP!!");
2293 }
2294 bp->b_iodone = cbFunc;
2295 bp->b_private = cbArg;
2296 }
2297
2298 /*
2299 * Wait interruptibly for an exclusive lock.
2300 *
2301 * XXX
2302 * Several drivers do this; it should be abstracted and made MP-safe.
2303 * (Hmm... where have we seen this warning before :-> GO )
2304 */
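/*
 * Typical usage (see raid_detach() below):
 *
 *	if ((error = raidlock(rs)) != 0)
 *		return error;
 *	...
 *	raidunlock(rs);
 */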
2305 static int
2306 raidlock(struct raid_softc *rs)
2307 {
2308 int error;
2309
2310 error = 0;
2311 mutex_enter(&rs->sc_mutex);
2312 while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2313 rs->sc_flags |= RAIDF_WANTED;
2314 error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2315 if (error != 0)
2316 goto done;
2317 }
2318 rs->sc_flags |= RAIDF_LOCKED;
2319 done:
2320 mutex_exit(&rs->sc_mutex);
2321 return error;
2322 }
2323 /*
2324 * Unlock and wake up any waiters.
2325 */
2326 static void
2327 raidunlock(struct raid_softc *rs)
2328 {
2329
2330 mutex_enter(&rs->sc_mutex);
2331 rs->sc_flags &= ~RAIDF_LOCKED;
2332 if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2333 rs->sc_flags &= ~RAIDF_WANTED;
2334 cv_broadcast(&rs->sc_cv);
2335 }
2336 mutex_exit(&rs->sc_mutex);
2337 }
2338
2339
2340 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */
2341 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */
2342 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE
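
/*
 * Layout of the metadata kept near the front of each component, as
 * implemented by the helpers below: the component label lives at a
 * fixed byte offset (RF_COMPONENT_INFO_OFFSET) and the parity map
 * follows the label area.  Each region is rounded up to the
 * component's sector size whenever sectors are larger than the
 * nominal region size.
 */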
2343
2344 static daddr_t
2345 rf_component_info_offset(void)
2346 {
2347
2348 return RF_COMPONENT_INFO_OFFSET;
2349 }
2350
2351 static daddr_t
2352 rf_component_info_size(unsigned secsize)
2353 {
2354 daddr_t info_size;
2355
2356 KASSERT(secsize);
2357 if (secsize > RF_COMPONENT_INFO_SIZE)
2358 info_size = secsize;
2359 else
2360 info_size = RF_COMPONENT_INFO_SIZE;
2361
2362 return info_size;
2363 }
2364
2365 static daddr_t
2366 rf_parity_map_offset(RF_Raid_t *raidPtr)
2367 {
2368 daddr_t map_offset;
2369
2370 KASSERT(raidPtr->bytesPerSector);
2371 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2372 map_offset = raidPtr->bytesPerSector;
2373 else
2374 map_offset = RF_COMPONENT_INFO_SIZE;
2375 map_offset += rf_component_info_offset();
2376
2377 return map_offset;
2378 }
2379
2380 static daddr_t
2381 rf_parity_map_size(RF_Raid_t *raidPtr)
2382 {
2383 daddr_t map_size;
2384
2385 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2386 map_size = raidPtr->bytesPerSector;
2387 else
2388 map_size = RF_PARITY_MAP_SIZE;
2389
2390 return map_size;
2391 }
2392
2393 int
2394 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2395 {
2396 RF_ComponentLabel_t *clabel;
2397
2398 clabel = raidget_component_label(raidPtr, col);
2399 clabel->clean = RF_RAID_CLEAN;
2400 raidflush_component_label(raidPtr, col);
2401 return(0);
2402 }
2403
2404
2405 int
2406 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2407 {
2408 RF_ComponentLabel_t *clabel;
2409
2410 clabel = raidget_component_label(raidPtr, col);
2411 clabel->clean = RF_RAID_DIRTY;
2412 raidflush_component_label(raidPtr, col);
2413 return(0);
2414 }
2415
2416 int
2417 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2418 {
2419 KASSERT(raidPtr->bytesPerSector);
2420
2421 return raidread_component_label(raidPtr->bytesPerSector,
2422 raidPtr->Disks[col].dev,
2423 raidPtr->raid_cinfo[col].ci_vp,
2424 &raidPtr->raid_cinfo[col].ci_label);
2425 }
2426
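/*
 * Return a pointer to the in-core copy of the component label.  Callers
 * that modify it must call raidflush_component_label() to push the
 * change out to disk.
 */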
2427 RF_ComponentLabel_t *
2428 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2429 {
2430 return &raidPtr->raid_cinfo[col].ci_label;
2431 }
2432
2433 int
2434 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
2435 {
2436 RF_ComponentLabel_t *label;
2437
2438 label = &raidPtr->raid_cinfo[col].ci_label;
2439 label->mod_counter = raidPtr->mod_counter;
2440 #ifndef RF_NO_PARITY_MAP
2441 label->parity_map_modcount = label->mod_counter;
2442 #endif
2443 return raidwrite_component_label(raidPtr->bytesPerSector,
2444 raidPtr->Disks[col].dev,
2445 raidPtr->raid_cinfo[col].ci_vp, label);
2446 }
2447
2448 /*
2449 * Swap the label endianness.
2450 *
2451 * Everything in the component label is 4-byte-swapped except the version,
2452 * which is kept in the byte-swapped version at all times, and indicates
2453 * for the writer that a swap is necessary.
2454 *
 * For reads it is expected that out_label == clabel, but writes expect
 * separate labels so only the re-swapped label is written out to disk,
 * leaving the swapped-except-version copy in memory.
2458 *
2459 * Only support swapping label version 2.
2460 */
2461 static void
2462 rf_swap_label(RF_ComponentLabel_t *clabel, RF_ComponentLabel_t *out_label)
2463 {
2464 int *in, *out, *in_last;
2465
2466 KASSERT(clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION));
2467
2468 /* Don't swap the label, but do copy it. */
2469 out_label->version = clabel->version;
2470
2471 in = &clabel->serial_number;
2472 in_last = &clabel->future_use2[42];
2473 out = &out_label->serial_number;
2474
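	/*
	 * Byte-swap the label as a flat array of 32-bit words, from
	 * serial_number through the end of the structure (future_use2
	 * is assumed to be the final member).
	 */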
2475 for (; in < in_last; in++, out++)
2476 *out = bswap32(*in);
2477 }
2478
2479 static int
2480 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2481 RF_ComponentLabel_t *clabel)
2482 {
2483 int error;
2484
2485 error = raidread_component_area(dev, b_vp, clabel,
2486 sizeof(RF_ComponentLabel_t),
2487 rf_component_info_offset(),
2488 rf_component_info_size(secsize));
2489
2490 if (error == 0 &&
2491 clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
2492 rf_swap_label(clabel, clabel);
2493 }
2494
2495 return error;
2496 }
2497
2498 /* ARGSUSED */
2499 static int
2500 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2501 size_t msize, daddr_t offset, daddr_t dsize)
2502 {
2503 struct buf *bp;
2504 int error;
2505
2506 /* XXX should probably ensure that we don't try to do this if
2507 someone has changed rf_protected_sectors. */
2508
2509 if (b_vp == NULL) {
2510 /* For whatever reason, this component is not valid.
2511 Don't try to read a component label from it. */
2512 return(EINVAL);
2513 }
2514
2515 /* get a block of the appropriate size... */
2516 bp = geteblk((int)dsize);
2517 bp->b_dev = dev;
2518
2519 /* get our ducks in a row for the read */
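	/* offset and dsize are in bytes; b_blkno is in DEV_BSIZE units */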
2520 bp->b_blkno = offset / DEV_BSIZE;
2521 bp->b_bcount = dsize;
2522 bp->b_flags |= B_READ;
2523 bp->b_resid = dsize;
2524
2525 bdev_strategy(bp);
2526 error = biowait(bp);
2527
2528 if (!error) {
2529 memcpy(data, bp->b_data, msize);
2530 }
2531
2532 brelse(bp, 0);
2533 return(error);
2534 }
2535
2536 static int
2537 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2538 RF_ComponentLabel_t *clabel)
2539 {
2540 RF_ComponentLabel_t *clabel_write = clabel;
2541 RF_ComponentLabel_t lclabel;
2542 int error;
2543
2544 if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
2545 clabel_write = &lclabel;
2546 rf_swap_label(clabel, clabel_write);
2547 }
2548 error = raidwrite_component_area(dev, b_vp, clabel_write,
2549 sizeof(RF_ComponentLabel_t),
2550 rf_component_info_offset(),
2551 rf_component_info_size(secsize));
2552
2553 return error;
2554 }
2555
2556 /* ARGSUSED */
2557 static int
2558 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2559 size_t msize, daddr_t offset, daddr_t dsize)
2560 {
2561 struct buf *bp;
2562 int error;
2563
2564 /* get a block of the appropriate size... */
2565 bp = geteblk((int)dsize);
2566 bp->b_dev = dev;
2567
2568 /* get our ducks in a row for the write */
2569 bp->b_blkno = offset / DEV_BSIZE;
2570 bp->b_bcount = dsize;
2571 bp->b_flags |= B_WRITE;
2572 bp->b_resid = dsize;
2573
2574 memset(bp->b_data, 0, dsize);
2575 memcpy(bp->b_data, data, msize);
2576
2577 bdev_strategy(bp);
2578 error = biowait(bp);
2579 brelse(bp, 0);
2580 if (error) {
2581 #if 1
2582 printf("Failed to write RAID component info!\n");
2583 #endif
2584 }
2585
2586 return(error);
2587 }
2588
2589 void
2590 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2591 {
2592 int c;
2593
2594 for (c = 0; c < raidPtr->numCol; c++) {
2595 /* Skip dead disks. */
2596 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2597 continue;
2598 /* XXXjld: what if an error occurs here? */
2599 raidwrite_component_area(raidPtr->Disks[c].dev,
2600 raidPtr->raid_cinfo[c].ci_vp, map,
2601 RF_PARITYMAP_NBYTE,
2602 rf_parity_map_offset(raidPtr),
2603 rf_parity_map_size(raidPtr));
2604 }
2605 }
2606
2607 void
2608 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2609 {
2610 struct rf_paritymap_ondisk tmp;
	int c, first;

	first = 1;
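	/*
	 * Read the on-disk parity map from every live component: the
	 * first copy read seeds the result, and each later copy is
	 * merged in, so a region flagged on any component stays flagged.
	 */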
2614 for (c = 0; c < raidPtr->numCol; c++) {
2615 /* Skip dead disks. */
2616 if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2617 continue;
2618 raidread_component_area(raidPtr->Disks[c].dev,
2619 raidPtr->raid_cinfo[c].ci_vp, &tmp,
2620 RF_PARITYMAP_NBYTE,
2621 rf_parity_map_offset(raidPtr),
2622 rf_parity_map_size(raidPtr));
2623 if (first) {
2624 memcpy(map, &tmp, sizeof(*map));
2625 first = 0;
2626 } else {
2627 rf_paritymap_merge(map, &tmp);
2628 }
2629 }
2630 }
2631
2632 void
2633 rf_markalldirty(RF_Raid_t *raidPtr)
2634 {
2635 RF_ComponentLabel_t *clabel;
2636 int sparecol;
2637 int c;
2638 int j;
2639 int scol = -1;
2640
2641 raidPtr->mod_counter++;
2642 for (c = 0; c < raidPtr->numCol; c++) {
2643 /* we don't want to touch (at all) a disk that has
2644 failed */
2645 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
2646 clabel = raidget_component_label(raidPtr, c);
2647 if (clabel->status == rf_ds_spared) {
2648 /* XXX do something special...
2649 but whatever you do, don't
2650 try to access it!! */
2651 } else {
2652 raidmarkdirty(raidPtr, c);
2653 }
2654 }
2655 }
2656
2657 for (c = 0; c < raidPtr->numSpare ; c++) {
2658 sparecol = raidPtr->numCol + c;
2659
2660 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*
			 * We claim this disk is "optimal" if it's
			 * rf_ds_used_spare, as that means it should be
			 * directly substitutable for the disk it replaced.
			 * We note that too...
			 */
2669
2670 for(j=0;j<raidPtr->numCol;j++) {
2671 if (raidPtr->Disks[j].spareCol == sparecol) {
2672 scol = j;
2673 break;
2674 }
2675 }
2676
2677 clabel = raidget_component_label(raidPtr, sparecol);
2678 /* make sure status is noted */
2679
2680 raid_init_component_label(raidPtr, clabel);
2681
2682 clabel->row = 0;
2683 clabel->column = scol;
2684 /* Note: we *don't* change status from rf_ds_used_spare
2685 to rf_ds_optimal */
2686 /* clabel.status = rf_ds_optimal; */
2687
2688 raidmarkdirty(raidPtr, sparecol);
2689 }
2690 }
2691 }
2692
2693
2694 void
2695 rf_update_component_labels(RF_Raid_t *raidPtr, int final)
2696 {
2697 RF_ComponentLabel_t *clabel;
2698 int sparecol;
2699 int c;
2700 int j;
2701 int scol;
2702 struct raid_softc *rs = raidPtr->softc;
2703
2704 scol = -1;
2705
2706 /* XXX should do extra checks to make sure things really are clean,
2707 rather than blindly setting the clean bit... */
2708
2709 raidPtr->mod_counter++;
2710
2711 for (c = 0; c < raidPtr->numCol; c++) {
2712 if (raidPtr->Disks[c].status == rf_ds_optimal) {
2713 clabel = raidget_component_label(raidPtr, c);
2714 /* make sure status is noted */
2715 clabel->status = rf_ds_optimal;
2716
2717 /* note what unit we are configured as */
2718 if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
2719 clabel->last_unit = raidPtr->raidid;
2720
2721 raidflush_component_label(raidPtr, c);
2722 if (final == RF_FINAL_COMPONENT_UPDATE) {
2723 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2724 raidmarkclean(raidPtr, c);
2725 }
2726 }
2727 }
2728 /* else we don't touch it.. */
2729 }
2730
2731 for (c = 0; c < raidPtr->numSpare ; c++) {
2732 sparecol = raidPtr->numCol + c;
2733
2734 /* Need to ensure that the reconstruct actually completed! */
2735 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*
			 * We claim this disk is "optimal" if it's
			 * rf_ds_used_spare, as that means it should be
			 * directly substitutable for the disk it replaced.
			 * We note that too...
			 */
2744
2745 for(j=0;j<raidPtr->numCol;j++) {
2746 if (raidPtr->Disks[j].spareCol == sparecol) {
2747 scol = j;
2748 break;
2749 }
2750 }
2751
2752 /* XXX shouldn't *really* need this... */
2753 clabel = raidget_component_label(raidPtr, sparecol);
2754 /* make sure status is noted */
2755
2756 raid_init_component_label(raidPtr, clabel);
2757
2758 clabel->column = scol;
2759 clabel->status = rf_ds_optimal;
2760 if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
2761 clabel->last_unit = raidPtr->raidid;
2762
2763 raidflush_component_label(raidPtr, sparecol);
2764 if (final == RF_FINAL_COMPONENT_UPDATE) {
2765 if (raidPtr->parity_good == RF_RAID_CLEAN) {
2766 raidmarkclean(raidPtr, sparecol);
2767 }
2768 }
2769 }
2770 }
2771 }
2772
2773 void
2774 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2775 {
2776
2777 if (vp != NULL) {
2778 if (auto_configured == 1) {
2779 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2780 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2781 vput(vp);
2782
2783 } else {
2784 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2785 }
2786 }
2787 }
2788
2789
2790 void
2791 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2792 {
2793 int r,c;
2794 struct vnode *vp;
2795 int acd;
2796
2797
2798 /* We take this opportunity to close the vnodes like we should.. */
2799
2800 for (c = 0; c < raidPtr->numCol; c++) {
2801 vp = raidPtr->raid_cinfo[c].ci_vp;
2802 acd = raidPtr->Disks[c].auto_configured;
2803 rf_close_component(raidPtr, vp, acd);
2804 raidPtr->raid_cinfo[c].ci_vp = NULL;
2805 raidPtr->Disks[c].auto_configured = 0;
2806 }
2807
2808 for (r = 0; r < raidPtr->numSpare; r++) {
2809 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2810 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2811 rf_close_component(raidPtr, vp, acd);
2812 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2813 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2814 }
2815 }
2816
2817
2818 static void
2819 rf_ReconThread(struct rf_recon_req_internal *req)
2820 {
2821 int s;
2822 RF_Raid_t *raidPtr;
2823
2824 s = splbio();
2825 raidPtr = (RF_Raid_t *) req->raidPtr;
2826 raidPtr->recon_in_progress = 1;
2827
2828 if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2829 raidPtr->forceRecon = 1;
2830 }
2831
2832 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
2833 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));
2834
2835 if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2836 raidPtr->forceRecon = 0;
2837 }
2838
2839 RF_Free(req, sizeof(*req));
2840
2841 raidPtr->recon_in_progress = 0;
2842 splx(s);
2843
2844 /* That's all... */
2845 kthread_exit(0); /* does not return */
2846 }
2847
2848 static void
2849 rf_RewriteParityThread(RF_Raid_t *raidPtr)
2850 {
2851 int retcode;
2852 int s;
2853
2854 raidPtr->parity_rewrite_stripes_done = 0;
2855 raidPtr->parity_rewrite_in_progress = 1;
2856 s = splbio();
2857 retcode = rf_RewriteParity(raidPtr);
2858 splx(s);
2859 if (retcode) {
2860 printf("raid%d: Error re-writing parity (%d)!\n",
2861 raidPtr->raidid, retcode);
2862 } else {
2863 /* set the clean bit! If we shutdown correctly,
2864 the clean bit on each component label will get
2865 set */
2866 raidPtr->parity_good = RF_RAID_CLEAN;
2867 }
2868 raidPtr->parity_rewrite_in_progress = 0;
2869
2870 /* Anyone waiting for us to stop? If so, inform them... */
2871 if (raidPtr->waitShutdown) {
2872 rf_lock_mutex2(raidPtr->rad_lock);
2873 cv_broadcast(&raidPtr->parity_rewrite_cv);
2874 rf_unlock_mutex2(raidPtr->rad_lock);
2875 }
2876
2877 /* That's all... */
2878 kthread_exit(0); /* does not return */
2879 }
2880
2881 static void
2882 rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
2883 {
2884 int s;
2885 RF_Raid_t *raidPtr;
2886
2887 s = splbio();
2888 raidPtr = req->raidPtr;
2889 raidPtr->recon_in_progress = 1;
2890
2891 if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2892 raidPtr->forceRecon = 1;
2893 }
2894
2895 rf_ReconstructInPlace(raidPtr, req->col);
2896
2897 if (req->flags & RF_FDFLAGS_RECON_FORCE) {
2898 raidPtr->forceRecon = 0;
2899 }
2900
2901 RF_Free(req, sizeof(*req));
2902 raidPtr->recon_in_progress = 0;
2903 splx(s);
2904
2905 /* That's all... */
2906 kthread_exit(0); /* does not return */
2907 }
2908
2909 static RF_AutoConfig_t *
2910 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
2911 const char *cname, RF_SectorCount_t size, uint64_t numsecs,
2912 unsigned secsize)
2913 {
2914 int good_one = 0;
2915 RF_ComponentLabel_t *clabel;
2916 RF_AutoConfig_t *ac;
2917
2918 clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);
2919
2920 if (!raidread_component_label(secsize, dev, vp, clabel)) {
2921 /* Got the label. Does it look reasonable? */
2922 if (rf_reasonable_label(clabel, numsecs) &&
2923 (rf_component_label_partitionsize(clabel) <= size)) {
2924 #ifdef DEBUG
2925 printf("Component on: %s: %llu\n",
2926 cname, (unsigned long long)size);
2927 rf_print_component_label(clabel);
2928 #endif
2929 /* if it's reasonable, add it, else ignore it. */
2930 ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
2931 M_WAITOK);
2932 strlcpy(ac->devname, cname, sizeof(ac->devname));
2933 ac->dev = dev;
2934 ac->vp = vp;
2935 ac->clabel = clabel;
2936 ac->next = ac_list;
2937 ac_list = ac;
2938 good_one = 1;
2939 }
2940 }
2941 if (!good_one) {
2942 /* cleanup */
2943 free(clabel, M_RAIDFRAME);
2944 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2945 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2946 vput(vp);
2947 }
2948 return ac_list;
2949 }
2950
2951 static RF_AutoConfig_t *
2952 rf_find_raid_components(void)
2953 {
2954 struct vnode *vp;
2955 struct disklabel label;
2956 device_t dv;
2957 deviter_t di;
2958 dev_t dev;
2959 int bmajor, bminor, wedge, rf_part_found;
2960 int error;
2961 int i;
2962 RF_AutoConfig_t *ac_list;
2963 uint64_t numsecs;
2964 unsigned secsize;
2965 int dowedges;
2966
2967 /* initialize the AutoConfig list */
2968 ac_list = NULL;
2969
2970 /*
	 * we begin by trolling through *all* the devices on the system,
	 * *twice*: first we scan for wedges, then for other devices.  This
	 * avoids using a raw partition instead of a wedge that covers the
	 * whole disk.
2974 */
2975
2976 for (dowedges=1; dowedges>=0; --dowedges) {
2977 for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
2978 dv = deviter_next(&di)) {
2979
2980 /* we are only interested in disks */
2981 if (device_class(dv) != DV_DISK)
2982 continue;
2983
2984 /* we don't care about floppies */
2985 if (device_is_a(dv, "fd")) {
2986 continue;
2987 }
2988
2989 /* we don't care about CDs. */
2990 if (device_is_a(dv, "cd")) {
2991 continue;
2992 }
2993
2994 /* we don't care about md. */
2995 if (device_is_a(dv, "md")) {
2996 continue;
2997 }
2998
2999 /* hdfd is the Atari/Hades floppy driver */
3000 if (device_is_a(dv, "hdfd")) {
3001 continue;
3002 }
3003
3004 /* fdisa is the Atari/Milan floppy driver */
3005 if (device_is_a(dv, "fdisa")) {
3006 continue;
3007 }
3008
3009 /* we don't care about spiflash */
3010 if (device_is_a(dv, "spiflash")) {
3011 continue;
3012 }
3013
3014 /* are we in the wedges pass ? */
3015 wedge = device_is_a(dv, "dk");
3016 if (wedge != dowedges) {
3017 continue;
3018 }
3019
3020 /* need to find the device_name_to_block_device_major stuff */
3021 bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
3022
			rf_part_found = 0;	/* No raid partition as yet */
3024
3025 /* get a vnode for the raw partition of this disk */
3026 bminor = minor(device_unit(dv));
3027 dev = wedge ? makedev(bmajor, bminor) :
3028 MAKEDISKDEV(bmajor, bminor, RAW_PART);
3029 if (bdevvp(dev, &vp))
3030 panic("RAID can't alloc vnode");
3031
3032 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3033 error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);
3034
3035 if (error) {
3036 /* "Who cares." Continue looking
3037 for something that exists*/
3038 vput(vp);
3039 continue;
3040 }
3041
3042 error = getdisksize(vp, &numsecs, &secsize);
3043 if (error) {
3044 /*
3045 * Pseudo devices like vnd and cgd can be
3046 * opened but may still need some configuration.
3047 * Ignore these quietly.
3048 */
3049 if (error != ENXIO)
3050 printf("RAIDframe: can't get disk size"
3051 " for dev %s (%d)\n",
3052 device_xname(dv), error);
3053 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
3054 vput(vp);
3055 continue;
3056 }
3057 if (wedge) {
3058 struct dkwedge_info dkw;
3059 error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
3060 NOCRED);
3061 if (error) {
3062 printf("RAIDframe: can't get wedge info for "
3063 "dev %s (%d)\n", device_xname(dv), error);
3064 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
3065 vput(vp);
3066 continue;
3067 }
3068
3069 if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
3070 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
3071 vput(vp);
3072 continue;
3073 }
3074
3075 VOP_UNLOCK(vp);
3076 ac_list = rf_get_component(ac_list, dev, vp,
3077 device_xname(dv), dkw.dkw_size, numsecs, secsize);
				/* There is a raid component on this disk */
				rf_part_found = 1;
3079 continue;
3080 }
3081
3082 /* Ok, the disk exists. Go get the disklabel. */
3083 error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
3084 if (error) {
3085 /*
3086 * XXX can't happen - open() would
3087 * have errored out (or faked up one)
3088 */
3089 if (error != ENOTTY)
3090 printf("RAIDframe: can't get label for dev "
3091 "%s (%d)\n", device_xname(dv), error);
3092 }
3093
3094 /* don't need this any more. We'll allocate it again
3095 a little later if we really do... */
3096 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
3097 vput(vp);
3098
3099 if (error)
3100 continue;
3101
			rf_part_found = 0;	/* No raid partitions yet */
3103 for (i = 0; i < label.d_npartitions; i++) {
3104 char cname[sizeof(ac_list->devname)];
3105
3106 /* We only support partitions marked as RAID */
3107 if (label.d_partitions[i].p_fstype != FS_RAID)
3108 continue;
3109
3110 dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
3111 if (bdevvp(dev, &vp))
3112 panic("RAID can't alloc vnode");
3113
3114 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3115 error = VOP_OPEN(vp, FREAD, NOCRED);
3116 if (error) {
3117 /* Not quite a 'whatever'. In
3118 * this situation we know
3119 * there is a FS_RAID
3120 * partition, but we can't
3121 * open it. The most likely
3122 * reason is that the
3123 * partition is already in
3124 * use by another RAID set.
3125 * So note that we've already
3126 * found a partition on this
3127 * disk so we don't attempt
3128 * to use the raw disk later. */
3129 rf_part_found = 1;
3130 vput(vp);
3131 continue;
3132 }
3133 VOP_UNLOCK(vp);
3134 snprintf(cname, sizeof(cname), "%s%c",
3135 device_xname(dv), 'a' + i);
3136 ac_list = rf_get_component(ac_list, dev, vp, cname,
3137 label.d_partitions[i].p_size, numsecs, secsize);
				/* There is at least one raid partition
				   on this disk */
				rf_part_found = 1;
3139 }
3140
			/*
			 * If there is no raid component on this disk,
			 * either in a disklabel or inside a wedge, check
			 * the raw partition as well, as it is possible
			 * to configure raid components on raw disk
			 * devices.
			 */
3147
3148 if (!rf_part_found) {
3149 char cname[sizeof(ac_list->devname)];
3150
3151 dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
3152 if (bdevvp(dev, &vp))
3153 panic("RAID can't alloc vnode");
3154
3155 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3156
3157 error = VOP_OPEN(vp, FREAD, NOCRED);
3158 if (error) {
3159 /* Whatever... */
3160 vput(vp);
3161 continue;
3162 }
3163 VOP_UNLOCK(vp);
3164 snprintf(cname, sizeof(cname), "%s%c",
3165 device_xname(dv), 'a' + RAW_PART);
3166 ac_list = rf_get_component(ac_list, dev, vp, cname,
3167 label.d_partitions[RAW_PART].p_size, numsecs, secsize);
3168 }
3169 }
3170 deviter_release(&di);
3171 }
3172 return ac_list;
3173 }
3174
3175 int
3176 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3177 {
3178
3179 if ((clabel->version==RF_COMPONENT_LABEL_VERSION_1 ||
3180 clabel->version==RF_COMPONENT_LABEL_VERSION ||
3181 clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) &&
3182 (clabel->clean == RF_RAID_CLEAN ||
3183 clabel->clean == RF_RAID_DIRTY) &&
	    clabel->row >= 0 &&
3185 clabel->column >= 0 &&
3186 clabel->num_rows > 0 &&
3187 clabel->num_columns > 0 &&
3188 clabel->row < clabel->num_rows &&
3189 clabel->column < clabel->num_columns &&
3190 clabel->blockSize > 0 &&
3191 /*
3192 * numBlocksHi may contain garbage, but it is ok since
3193 * the type is unsigned. If it is really garbage,
3194 * rf_fix_old_label_size() will fix it.
3195 */
3196 rf_component_label_numblocks(clabel) > 0) {
3197 /*
3198 * label looks reasonable enough...
3199 * let's make sure it has no old garbage.
3200 */
3201 if (numsecs)
3202 rf_fix_old_label_size(clabel, numsecs);
3203 return(1);
3204 }
3205 return(0);
3206 }
3207
3208
3209 /*
3210 * For reasons yet unknown, some old component labels have garbage in
3211 * the newer numBlocksHi region, and this causes lossage. Since those
3212 * disks will also have numsecs set to less than 32 bits of sectors,
3213 * we can determine when this corruption has occurred, and fix it.
3214 *
3215 * The exact same problem, with the same unknown reason, happens to
3216 * the partitionSizeHi member as well.
3217 */
3218 static void
3219 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3220 {
3221
3222 if (numsecs < ((uint64_t)1 << 32)) {
3223 if (clabel->numBlocksHi) {
3224 printf("WARNING: total sectors < 32 bits, yet "
3225 "numBlocksHi set\n"
3226 "WARNING: resetting numBlocksHi to zero.\n");
3227 clabel->numBlocksHi = 0;
3228 }
3229
3230 if (clabel->partitionSizeHi) {
3231 printf("WARNING: total sectors < 32 bits, yet "
3232 "partitionSizeHi set\n"
3233 "WARNING: resetting partitionSizeHi to zero.\n");
3234 clabel->partitionSizeHi = 0;
3235 }
3236 }
3237 }
3238
3239
3240 #ifdef DEBUG
3241 void
3242 rf_print_component_label(RF_ComponentLabel_t *clabel)
3243 {
3244 uint64_t numBlocks;
3245 static const char *rp[] = {
3246 "No", "Force", "Soft", "*invalid*"
3247 };
3248
3249
3250 numBlocks = rf_component_label_numblocks(clabel);
3251
3252 printf(" Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
3253 clabel->row, clabel->column,
3254 clabel->num_rows, clabel->num_columns);
3255 printf(" Version: %d Serial Number: %d Mod Counter: %d\n",
3256 clabel->version, clabel->serial_number,
3257 clabel->mod_counter);
3258 printf(" Clean: %s Status: %d\n",
3259 clabel->clean ? "Yes" : "No", clabel->status);
3260 printf(" sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
3261 clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
3262 printf(" RAID Level: %c blocksize: %d numBlocks: %"PRIu64"\n",
3263 (char) clabel->parityConfig, clabel->blockSize, numBlocks);
3264 printf(" Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
3265 printf(" Root partition: %s\n", rp[clabel->root_partition & 3]);
3266 printf(" Last configured as: raid%d\n", clabel->last_unit);
3267 #if 0
3268 printf(" Config order: %d\n", clabel->config_order);
3269 #endif
3270
3271 }
3272 #endif
3273
3274 static RF_ConfigSet_t *
3275 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3276 {
3277 RF_AutoConfig_t *ac;
3278 RF_ConfigSet_t *config_sets;
3279 RF_ConfigSet_t *cset;
3280 RF_AutoConfig_t *ac_next;
3281
3282
3283 config_sets = NULL;
3284
3285 /* Go through the AutoConfig list, and figure out which components
3286 belong to what sets. */
3287 ac = ac_list;
3288 while(ac!=NULL) {
3289 /* we're going to putz with ac->next, so save it here
3290 for use at the end of the loop */
3291 ac_next = ac->next;
3292
3293 if (config_sets == NULL) {
3294 /* will need at least this one... */
3295 config_sets = malloc(sizeof(RF_ConfigSet_t),
3296 M_RAIDFRAME, M_WAITOK);
3297 /* this one is easy :) */
3298 config_sets->ac = ac;
3299 config_sets->next = NULL;
3300 config_sets->rootable = 0;
3301 ac->next = NULL;
3302 } else {
3303 /* which set does this component fit into? */
3304 cset = config_sets;
3305 while(cset!=NULL) {
3306 if (rf_does_it_fit(cset, ac)) {
3307 /* looks like it matches... */
3308 ac->next = cset->ac;
3309 cset->ac = ac;
3310 break;
3311 }
3312 cset = cset->next;
3313 }
3314 if (cset==NULL) {
3315 /* didn't find a match above... new set..*/
3316 cset = malloc(sizeof(RF_ConfigSet_t),
3317 M_RAIDFRAME, M_WAITOK);
3318 cset->ac = ac;
3319 ac->next = NULL;
3320 cset->next = config_sets;
3321 cset->rootable = 0;
3322 config_sets = cset;
3323 }
3324 }
3325 ac = ac_next;
3326 }
3327
3328
3329 return(config_sets);
3330 }
3331
3332 static int
3333 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3334 {
3335 RF_ComponentLabel_t *clabel1, *clabel2;
3336
3337 /* If this one matches the *first* one in the set, that's good
3338 enough, since the other members of the set would have been
3339 through here too... */
3340 /* note that we are not checking partitionSize here..
3341
3342 Note that we are also not checking the mod_counters here.
3343 If everything else matches except the mod_counter, that's
3344 good enough for this test. We will deal with the mod_counters
3345 a little later in the autoconfiguration process.
3346
3347 (clabel1->mod_counter == clabel2->mod_counter) &&
3348
3349 The reason we don't check for this is that failed disks
3350 will have lower modification counts. If those disks are
3351 not added to the set they used to belong to, then they will
3352 form their own set, which may result in 2 different sets,
3353 for example, competing to be configured at raid0, and
3354 perhaps competing to be the root filesystem set. If the
3355 wrong ones get configured, or both attempt to become /,
	   weird behaviour and/or serious lossage will occur.  Thus we
3357 need to bring them into the fold here, and kick them out at
3358 a later point.
3359
3360 */
3361
3362 clabel1 = cset->ac->clabel;
3363 clabel2 = ac->clabel;
3364 if ((clabel1->version == clabel2->version) &&
3365 (clabel1->serial_number == clabel2->serial_number) &&
3366 (clabel1->num_rows == clabel2->num_rows) &&
3367 (clabel1->num_columns == clabel2->num_columns) &&
3368 (clabel1->sectPerSU == clabel2->sectPerSU) &&
3369 (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3370 (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3371 (clabel1->parityConfig == clabel2->parityConfig) &&
3372 (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3373 (clabel1->blockSize == clabel2->blockSize) &&
3374 rf_component_label_numblocks(clabel1) ==
3375 rf_component_label_numblocks(clabel2) &&
3376 (clabel1->autoconfigure == clabel2->autoconfigure) &&
3377 (clabel1->root_partition == clabel2->root_partition) &&
3378 (clabel1->last_unit == clabel2->last_unit) &&
3379 (clabel1->config_order == clabel2->config_order)) {
		/* if it gets here, it almost *has* to be a match */
3381 } else {
3382 /* it's not consistent with somebody in the set..
3383 punt */
3384 return(0);
3385 }
3386 /* all was fine.. it must fit... */
3387 return(1);
3388 }
3389
3390 static int
3391 rf_have_enough_components(RF_ConfigSet_t *cset)
3392 {
3393 RF_AutoConfig_t *ac;
3394 RF_AutoConfig_t *auto_config;
3395 RF_ComponentLabel_t *clabel;
3396 int c;
3397 int num_cols;
3398 int num_missing;
3399 int mod_counter;
3400 int mod_counter_found;
3401 int even_pair_failed;
3402 char parity_type;
3403
3404
3405 /* check to see that we have enough 'live' components
3406 of this set. If so, we can configure it if necessary */
3407
3408 num_cols = cset->ac->clabel->num_columns;
3409 parity_type = cset->ac->clabel->parityConfig;
3410
3411 /* XXX Check for duplicate components!?!?!? */
3412
3413 /* Determine what the mod_counter is supposed to be for this set. */
3414
3415 mod_counter_found = 0;
3416 mod_counter = 0;
3417 ac = cset->ac;
3418 while(ac!=NULL) {
3419 if (mod_counter_found==0) {
3420 mod_counter = ac->clabel->mod_counter;
3421 mod_counter_found = 1;
3422 } else {
3423 if (ac->clabel->mod_counter > mod_counter) {
3424 mod_counter = ac->clabel->mod_counter;
3425 }
3426 }
3427 ac = ac->next;
3428 }
3429
3430 num_missing = 0;
3431 auto_config = cset->ac;
3432
3433 even_pair_failed = 0;
3434 for(c=0; c<num_cols; c++) {
3435 ac = auto_config;
3436 while(ac!=NULL) {
3437 if ((ac->clabel->column == c) &&
3438 (ac->clabel->mod_counter == mod_counter)) {
3439 /* it's this one... */
3440 #ifdef DEBUG
3441 printf("Found: %s at %d\n",
3442 ac->devname,c);
3443 #endif
3444 break;
3445 }
3446 ac=ac->next;
3447 }
3448 if (ac==NULL) {
3449 /* Didn't find one here! */
3450 /* special case for RAID 1, especially
3451 where there are more than 2
3452 components (where RAIDframe treats
3453 things a little differently :( ) */
3454 if (parity_type == '1') {
3455 if (c%2 == 0) { /* even component */
3456 even_pair_failed = 1;
3457 } else { /* odd component. If
3458 we're failed, and
3459 so is the even
3460 component, it's
3461 "Good Night, Charlie" */
3462 if (even_pair_failed == 1) {
3463 return(0);
3464 }
3465 }
3466 } else {
3467 /* normal accounting */
3468 num_missing++;
3469 }
3470 }
3471 if ((parity_type == '1') && (c%2 == 1)) {
3472 /* Just did an even component, and we didn't
3473 bail.. reset the even_pair_failed flag,
3474 and go on to the next component.... */
3475 even_pair_failed = 0;
3476 }
3477 }
3478
3479 clabel = cset->ac->clabel;
3480
3481 if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
3482 ((clabel->parityConfig == '4') && (num_missing > 1)) ||
3483 ((clabel->parityConfig == '5') && (num_missing > 1))) {
3484 /* XXX this needs to be made *much* more general */
3485 /* Too many failures */
3486 return(0);
3487 }
3488 /* otherwise, all is well, and we've got enough to take a kick
3489 at autoconfiguring this set */
3490 return(1);
3491 }
3492
3493 static void
3494 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3495 RF_Raid_t *raidPtr)
3496 {
3497 RF_ComponentLabel_t *clabel;
3498 int i;
3499
3500 clabel = ac->clabel;
3501
3502 /* 1. Fill in the common stuff */
3503 config->numCol = clabel->num_columns;
3504 config->numSpare = 0; /* XXX should this be set here? */
3505 config->sectPerSU = clabel->sectPerSU;
3506 config->SUsPerPU = clabel->SUsPerPU;
3507 config->SUsPerRU = clabel->SUsPerRU;
3508 config->parityConfig = clabel->parityConfig;
3509 /* XXX... */
3510 strcpy(config->diskQueueType,"fifo");
3511 config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3512 config->layoutSpecificSize = 0; /* XXX ?? */
3513
3514 while(ac!=NULL) {
3515 /* row/col values will be in range due to the checks
3516 in reasonable_label() */
3517 strcpy(config->devnames[0][ac->clabel->column],
3518 ac->devname);
3519 ac = ac->next;
3520 }
3521
3522 for(i=0;i<RF_MAXDBGV;i++) {
3523 config->debugVars[i][0] = 0;
3524 }
3525 }
3526
3527 static int
3528 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3529 {
3530 RF_ComponentLabel_t *clabel;
3531 int column;
3532 int sparecol;
3533
3534 raidPtr->autoconfigure = new_value;
3535
3536 for(column=0; column<raidPtr->numCol; column++) {
3537 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3538 clabel = raidget_component_label(raidPtr, column);
3539 clabel->autoconfigure = new_value;
3540 raidflush_component_label(raidPtr, column);
3541 }
3542 }
3543 for(column = 0; column < raidPtr->numSpare ; column++) {
3544 sparecol = raidPtr->numCol + column;
3545
3546 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3547 clabel = raidget_component_label(raidPtr, sparecol);
3548 clabel->autoconfigure = new_value;
3549 raidflush_component_label(raidPtr, sparecol);
3550 }
3551 }
3552 return(new_value);
3553 }
3554
3555 static int
3556 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3557 {
3558 RF_ComponentLabel_t *clabel;
3559 int column;
3560 int sparecol;
3561
3562 raidPtr->root_partition = new_value;
3563 for(column=0; column<raidPtr->numCol; column++) {
3564 if (raidPtr->Disks[column].status == rf_ds_optimal) {
3565 clabel = raidget_component_label(raidPtr, column);
3566 clabel->root_partition = new_value;
3567 raidflush_component_label(raidPtr, column);
3568 }
3569 }
3570 for (column = 0; column < raidPtr->numSpare ; column++) {
3571 sparecol = raidPtr->numCol + column;
3572
3573 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3574 clabel = raidget_component_label(raidPtr, sparecol);
3575 clabel->root_partition = new_value;
3576 raidflush_component_label(raidPtr, sparecol);
3577 }
3578 }
3579 return(new_value);
3580 }
3581
3582 static void
3583 rf_release_all_vps(RF_ConfigSet_t *cset)
3584 {
3585 RF_AutoConfig_t *ac;
3586
3587 ac = cset->ac;
3588 while(ac!=NULL) {
3589 /* Close the vp, and give it back */
3590 if (ac->vp) {
3591 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3592 VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3593 vput(ac->vp);
3594 ac->vp = NULL;
3595 }
3596 ac = ac->next;
3597 }
3598 }
3599
3600
3601 static void
3602 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3603 {
3604 RF_AutoConfig_t *ac;
3605 RF_AutoConfig_t *next_ac;
3606
3607 ac = cset->ac;
3608 while(ac!=NULL) {
3609 next_ac = ac->next;
3610 /* nuke the label */
3611 free(ac->clabel, M_RAIDFRAME);
3612 /* cleanup the config structure */
3613 free(ac, M_RAIDFRAME);
3614 /* "next.." */
3615 ac = next_ac;
3616 }
3617 /* and, finally, nuke the config set */
3618 free(cset, M_RAIDFRAME);
3619 }
3620
3621
3622 void
3623 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
3624 {
3625 /* avoid over-writing byteswapped version. */
3626 if (clabel->version != bswap32(RF_COMPONENT_LABEL_VERSION))
3627 clabel->version = RF_COMPONENT_LABEL_VERSION;
3628 clabel->serial_number = raidPtr->serial_number;
3629 clabel->mod_counter = raidPtr->mod_counter;
3630
3631 clabel->num_rows = 1;
3632 clabel->num_columns = raidPtr->numCol;
3633 clabel->clean = RF_RAID_DIRTY; /* not clean */
3634 clabel->status = rf_ds_optimal; /* "It's good!" */
3635
3636 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
3637 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
3638 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
3639
3640 clabel->blockSize = raidPtr->bytesPerSector;
3641 rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);
3642
3643 /* XXX not portable */
3644 clabel->parityConfig = raidPtr->Layout.map->parityConfig;
3645 clabel->maxOutstanding = raidPtr->maxOutstanding;
3646 clabel->autoconfigure = raidPtr->autoconfigure;
3647 clabel->root_partition = raidPtr->root_partition;
3648 clabel->last_unit = raidPtr->raidid;
3649 clabel->config_order = raidPtr->config_order;
3650
3651 #ifndef RF_NO_PARITY_MAP
3652 rf_paritymap_init_label(raidPtr->parity_map, clabel);
3653 #endif
3654 }
3655
3656 static struct raid_softc *
3657 rf_auto_config_set(RF_ConfigSet_t *cset)
3658 {
3659 RF_Raid_t *raidPtr;
3660 RF_Config_t *config;
3661 int raidID;
3662 struct raid_softc *sc;
3663
3664 #ifdef DEBUG
3665 printf("RAID autoconfigure\n");
3666 #endif
3667
3668 /* 1. Create a config structure */
3669 config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);
3670
	/*
	 * 2. Figure out what RAID ID this one is supposed to live at.
	 *    See if we can get the same RAID dev that it was configured
	 *    on last time..
	 */
3676
3677 raidID = cset->ac->clabel->last_unit;
3678 for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
3679 sc = raidget(++raidID, false))
3680 continue;
3681 #ifdef DEBUG
3682 printf("Configuring raid%d:\n",raidID);
3683 #endif
3684
3685 if (sc == NULL)
3686 sc = raidget(raidID, true);
3687 raidPtr = &sc->sc_r;
3688
3689 /* XXX all this stuff should be done SOMEWHERE ELSE! */
3690 raidPtr->softc = sc;
3691 raidPtr->raidid = raidID;
3692 raidPtr->openings = RAIDOUTSTANDING;
3693
3694 /* 3. Build the configuration structure */
3695 rf_create_configuration(cset->ac, config, raidPtr);
3696
3697 /* 4. Do the configuration */
3698 if (rf_Configure(raidPtr, config, cset->ac) == 0) {
3699 raidinit(sc);
3700
3701 rf_markalldirty(raidPtr);
3702 raidPtr->autoconfigure = 1; /* XXX do this here? */
3703 switch (cset->ac->clabel->root_partition) {
3704 case 1: /* Force Root */
3705 case 2: /* Soft Root: root when boot partition part of raid */
3706 /*
3707 * everything configured just fine. Make a note
3708 * that this set is eligible to be root,
3709 * or forced to be root
3710 */
3711 cset->rootable = cset->ac->clabel->root_partition;
3712 /* XXX do this here? */
3713 raidPtr->root_partition = cset->rootable;
3714 break;
3715 default:
3716 break;
3717 }
3718 } else {
3719 raidput(sc);
3720 sc = NULL;
3721 }
3722
3723 /* 5. Cleanup */
3724 free(config, M_RAIDFRAME);
3725 return sc;
3726 }
3727
3728 void
3729 rf_pool_init(RF_Raid_t *raidPtr, char *w_chan, struct pool *p, size_t size, const char *pool_name,
3730 size_t xmin, size_t xmax)
3731 {
3732
3733 /* Format: raid%d_foo */
3734 snprintf(w_chan, RF_MAX_POOLNAMELEN, "raid%d_%s", raidPtr->raidid, pool_name);
3735
3736 pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
3737 pool_sethiwat(p, xmax);
3738 pool_prime(p, xmin);
3739 }
3740
3741
3742 /*
 * rf_buf_queue_check(RF_Raid_t *raidPtr) -- looks into the buffer queue
3744 * to see if there is IO pending and if that IO could possibly be done
3745 * for a given RAID set. Returns 0 if IO is waiting and can be done, 1
3746 * otherwise.
3747 *
3748 */
3749 int
3750 rf_buf_queue_check(RF_Raid_t *raidPtr)
3751 {
3752 struct raid_softc *rs;
3753 struct dk_softc *dksc;
3754
3755 rs = raidPtr->softc;
3756 dksc = &rs->sc_dksc;
3757
3758 if ((rs->sc_flags & RAIDF_INITED) == 0)
3759 return 1;
3760
3761 if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3762 /* there is work to do */
3763 return 0;
3764 }
3765 /* default is nothing to do */
3766 return 1;
3767 }
3768
3769 int
3770 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3771 {
3772 uint64_t numsecs;
3773 unsigned secsize;
3774 int error;
3775
3776 error = getdisksize(vp, &numsecs, &secsize);
3777 if (error == 0) {
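		/*
		 * The first rf_protectedSectors of each component are
		 * reserved for RAIDframe metadata (the component label
		 * area), so they are excluded from the usable size.
		 */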
3778 diskPtr->blockSize = secsize;
3779 diskPtr->numBlocks = numsecs - rf_protectedSectors;
3780 diskPtr->partitionSize = numsecs;
3781 return 0;
3782 }
3783 return error;
3784 }
3785
3786 static int
3787 raid_match(device_t self, cfdata_t cfdata, void *aux)
3788 {
3789 return 1;
3790 }
3791
3792 static void
3793 raid_attach(device_t parent, device_t self, void *aux)
3794 {
3795 }
3796
3797
3798 static int
3799 raid_detach(device_t self, int flags)
3800 {
3801 int error;
3802 struct raid_softc *rs = raidsoftc(self);
3803
3804 if (rs == NULL)
3805 return ENXIO;
3806
3807 if ((error = raidlock(rs)) != 0)
3808 return error;
3809
3810 error = raid_detach_unlocked(rs);
3811
3812 raidunlock(rs);
3813
3814 /* XXX raid can be referenced here */
3815
3816 if (error)
3817 return error;
3818
3819 /* Free the softc */
3820 raidput(rs);
3821
3822 return 0;
3823 }
3824
3825 static void
3826 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
3827 {
3828 struct dk_softc *dksc = &rs->sc_dksc;
3829 struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;
3830
3831 memset(dg, 0, sizeof(*dg));
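	/*
	 * Synthesize a geometry: cylinders and tracks have no physical
	 * meaning for an array, so the values below merely give the
	 * disk(9) layer something plausible to work with.
	 */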
3832
3833 dg->dg_secperunit = raidPtr->totalSectors;
3834 dg->dg_secsize = raidPtr->bytesPerSector;
3835 dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
3836 dg->dg_ntracks = 4 * raidPtr->numCol;
3837
3838 disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
3839 }
3840
3841 /*
3842 * Get cache info for all the components (including spares).
3843 * Returns intersection of all the cache flags of all disks, or first
3844 * error if any encountered.
3845 * XXXfua feature flags can change as spares are added - lock down somehow
3846 */
3847 static int
3848 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
3849 {
3850 int c;
3851 int error;
3852 int dkwhole = 0, dkpart;
3853
3854 for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
3855 /*
3856 * Check any non-dead disk, even when currently being
3857 * reconstructed.
3858 */
3859 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
3860 error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
3861 DIOCGCACHE, &dkpart, FREAD, NOCRED);
3862 if (error) {
3863 if (error != ENODEV) {
3864 printf("raid%d: get cache for component %s failed\n",
3865 raidPtr->raidid,
3866 raidPtr->Disks[c].devname);
3867 }
3868
3869 return error;
3870 }
3871
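			/*
			 * Accumulate the intersection of the component
			 * cache flags: a feature is reported for the
			 * array only if every component has it.
			 */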
3872 if (c == 0)
3873 dkwhole = dkpart;
3874 else
3875 dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
3876 }
3877 }
3878
3879 *data = dkwhole;
3880
3881 return 0;
3882 }
3883
3884 /*
3885 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3886 * We end up returning whatever error was returned by the first cache flush
3887 * that fails.
3888 */
3889
3890 static int
3891 rf_sync_component_cache(RF_Raid_t *raidPtr, int c, int force)
3892 {
3893 int e = 0;
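	/* Retry the flush a few times before giving up on the component. */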
3894 for (int i = 0; i < 5; i++) {
3895 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3896 &force, FWRITE, NOCRED);
3897 if (!e || e == ENODEV)
3898 return e;
3899 printf("raid%d: cache flush[%d] to component %s failed (%d)\n",
3900 raidPtr->raidid, i, raidPtr->Disks[c].devname, e);
3901 }
3902 return e;
3903 }
3904
3905 int
3906 rf_sync_component_caches(RF_Raid_t *raidPtr, int force)
3907 {
3908 int c, error;
3909
3910 error = 0;
3911 for (c = 0; c < raidPtr->numCol; c++) {
3912 if (raidPtr->Disks[c].status == rf_ds_optimal) {
3913 int e = rf_sync_component_cache(raidPtr, c, force);
3914 if (e && !error)
3915 error = e;
3916 }
3917 }
3918
3919 for (c = 0; c < raidPtr->numSpare ; c++) {
3920 int sparecol = raidPtr->numCol + c;
3921
3922 /* Need to ensure that the reconstruct actually completed! */
3923 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3924 int e = rf_sync_component_cache(raidPtr, sparecol,
3925 force);
3926 if (e && !error)
3927 error = e;
3928 }
3929 }
3930 return error;
3931 }
3932
3933 /* Fill in info with the current status */
3934 void
3935 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3936 {
3937
3938 memset(info, 0, sizeof(*info));
3939
3940 if (raidPtr->status != rf_rs_reconstructing) {
3941 info->total = 100;
3942 info->completed = 100;
3943 } else {
3944 info->total = raidPtr->reconControl->numRUsTotal;
3945 info->completed = raidPtr->reconControl->numRUsComplete;
3946 }
3947 info->remaining = info->total - info->completed;
3948 }

/* Fill in info with the current parity-rewrite status. */
void
rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
{

	memset(info, 0, sizeof(*info));

	if (raidPtr->parity_rewrite_in_progress == 1) {
		info->total = raidPtr->Layout.numStripe;
		info->completed = raidPtr->parity_rewrite_stripes_done;
	} else {
		info->completed = 100;
		info->total = 100;
	}
	info->remaining = info->total - info->completed;
}

/* Fill in info with the copyback status: always reported as complete. */
void
rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
{

	memset(info, 0, sizeof(*info));
	info->remaining = 0;
	info->completed = 100;
	info->total = 100;
}

/* Fill in config with the current info */
int
rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
{
	int d, i, j;

	if (!raidPtr->valid)
		return ENODEV;
	config->cols = raidPtr->numCol;
	config->ndevs = raidPtr->numCol;
	if (config->ndevs >= RF_MAX_DISKS)
		return ENOMEM;
	config->nspares = raidPtr->numSpare;
	if (config->nspares >= RF_MAX_DISKS)
		return ENOMEM;
	config->maxqdepth = raidPtr->maxQueueDepth;
	d = 0;
	for (j = 0; j < config->cols; j++) {
		config->devs[d] = raidPtr->Disks[j];
		d++;
	}
	for (i = 0; i < config->nspares; i++) {
		config->spares[i] = raidPtr->Disks[raidPtr->numCol + i];
		if (config->spares[i].status == rf_ds_rebuilding_spare) {
			/* raidctl(8) expects to see this as a used spare */
			config->spares[i].status = rf_ds_used_spare;
		}
	}
	return 0;
}
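
/*
 * Illustrative sketch (not part of the driver): walking the
 * RF_DeviceConfig_t filled in above, the way a status tool might.
 * raidctl(8) obtains the structure through the RAIDFRAME_GET_INFO
 * ioctl; the iteration itself is just an example, assuming cfg has
 * already been filled in.
 *
 *	RF_DeviceConfig_t cfg;
 *	int i;
 *
 *	for (i = 0; i < cfg.cols; i++)
 *		printf("%s: status %d\n", cfg.devs[i].devname,
 *		    (int)cfg.devs[i].status);
 *	for (i = 0; i < cfg.nspares; i++)
 *		printf("%s (spare): status %d\n", cfg.spares[i].devname,
 *		    (int)cfg.spares[i].status);
 */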

int
rf_get_component_label(RF_Raid_t *raidPtr, void *data)
{
	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
	RF_ComponentLabel_t *raid_clabel;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
		return EINVAL;
	raid_clabel = raidget_component_label(raidPtr, column);
	memcpy(clabel, raid_clabel, sizeof *clabel);
	/*
	 * Fix-up for userland: a label written on a machine of the
	 * opposite byte order is stored byte-swapped, so present the
	 * version field in native order.
	 */
	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION))
		clabel->version = RF_COMPONENT_LABEL_VERSION;

	return 0;
}
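
/*
 * Illustrative sketch (not part of the driver): fetching a component
 * label from userland.  The caller selects the component by setting
 * the column field first; RAIDFRAME_GET_COMPONENT_LABEL is the ioctl
 * raidctl(8) uses for this.
 *
 *	RF_ComponentLabel_t clabel;
 *
 *	memset(&clabel, 0, sizeof(clabel));
 *	clabel.column = 0;
 *	if (ioctl(fd, RAIDFRAME_GET_COMPONENT_LABEL, &clabel) == 0)
 *		printf("serial %d, mod_counter %d\n",
 *		    clabel.serial_number, clabel.mod_counter);
 */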

/*
 * Module interface
 */

MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");

#ifdef _MODULE
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);

static int
raid_modcmd(modcmd_t cmd, void *data)
{
	int error;

	error = 0;
	switch (cmd) {
	case MODULE_CMD_INIT:
		error = raid_modcmd_init();
		break;
	case MODULE_CMD_FINI:
		error = raid_modcmd_fini();
		break;
	default:
		error = ENOTTY;
		break;
	}
	return error;
}
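
/*
 * Illustrative note: when RAIDframe is built as a module, the dispatch
 * above is driven by the standard module commands, e.g. from userland:
 *
 *	modload raid	(MODULE_CMD_INIT -> raid_modcmd_init())
 *	modunload raid	(MODULE_CMD_FINI -> raid_modcmd_fini())
 *
 * raid_modcmd_fini() refuses the unload with EBUSY while any raid
 * device still exists.
 */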

static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-configure
	 * RAID sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}

static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist. */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n", __func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n", __func__);
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	devsw_detach(&raid_bdevsw, &raid_cdevsw);
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}